diff options
| -rw-r--r-- | kafka/topics.md | 8 | ||||
| -rw-r--r-- | proposals/20200207_pdftrio.md | 101 | ||||
| -rw-r--r-- | sql/migrations/2019-12-19-060141_init/up.sql | 13 | 
3 files changed, 122 insertions, 0 deletions
diff --git a/kafka/topics.md b/kafka/topics.md index 2735d51..0ce8610 100644 --- a/kafka/topics.md +++ b/kafka/topics.md @@ -41,6 +41,12 @@ retention (on both a size and time basis).          => 6 partitions          => can't think of a good key, so none; no compaction +    sandcrawler-ENV.pdftrio-output +        => output of each pdftrio ML classification +        => schema is JSON; see pdftrio proposal for fields. small objects. +        => 6 partitions +        => key is sha1hex of PDF; enable key compaction +      fatcat-ENV.api-crossref      fatcat-ENV.api-datacite          => all new and updated DOIs (regardless of type) @@ -119,6 +125,8 @@ exists`; this seems safe, and the settings won't be over-ridden.      ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 12 --topic sandcrawler-qa.ingest-file-requests-bulk --config retention.ms=7889400000 --config cleanup.policy=delete      ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions  6 --topic sandcrawler-qa.ingest-file-results +    ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic sandcrawler-qa.pdftrio-output --config cleanup.policy=compact +      ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.changelog      ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.release-updates-v03      ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.work-updates diff --git a/proposals/20200207_pdftrio.md b/proposals/20200207_pdftrio.md new file mode 100644 index 0000000..b1b09f9 --- /dev/null +++ b/proposals/20200207_pdftrio.md @@ -0,0 +1,101 @@ + +status: in progress + +PDF Trio (ML Classification) +============================== + +This document describes how we intend to integrate the first generation of PDF 
+classification work into the sandcrawler processing system. + +- abstractions (APIs) +- schemas +- how models and dependencies are deployed +- what code is released where under what license + + +## Code Structure + +Major components: + +**Training code, documentation, datasets:** Not used at run-time (does not need +to be deployed). Should be public. The datasets (PDFs) are copyrighted, so we +should only release URL lists that point to wayback. + +**Models:** all are static, uploaded to archive.org items, simple download to +deploy. Should be versioned, and have unique versioned file names or directory +paths (aka, deploy in parallel). + +**Image classifier backend:** vanilla tensorflow serving docker image, with a +bunch of invocation configs, plus static models. + +**BERT backend:** vanilla tensorflow serving docker image, plus config, plus +models. Basically same as image classifier. + +**API service:** currently Flask. Depends on tools like imagemagick, fasttext, +pdftotext. Seems like apt+pipenv should work? + + +## API Refactors + +Changes: + +- probably re-write README? +- refactor python code into directories +- add python tests +- tweak schema +- proper parallelization: uwsgi? async? + +New features: + +- option to send images, raw text in batches in addition to PDFs. + +## Client Code + +Basically just like GROBID client for now. Requests, JSON. + +## JSON Schema + +Output that goes in Kafka topic: + +    pdftrio +        status +        status_code +        ensemble_score +        bert_score +        image_score +        linear_score +        versions +            pdftrio_version (string) +            models_date (string, ISO date) +            git_rev (string) +            bert_model (string) +            image_model (string) +            linear_model (string) +        timing +            ... (might be added?) +    file_meta +        sha1hex +        ... + + +## SQL Schema + +Ensemble model versions are summarized as a date. 
+ +    CREATE TABLE IF NOT EXISTS pdftrio ( +        sha1hex             TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), +        updated             TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, +        status_code         INT NOT NULL, +        status              TEXT CHECK (octet_length(status) >= 1) NOT NULL, +        pdftrio_version     TEXT CHECK (octet_length(pdftrio_version) >= 1), +        models_date         DATE, +        ensemble_score      REAL, +        bert_score          REAL, +        linear_score        REAL, +        image_score         REAL +    ); + +## Kafka Topic + +sandcrawler-qa.pdftrio-output + diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql index 0b2b19c..a27796b 100644 --- a/sql/migrations/2019-12-19-060141_init/up.sql +++ b/sql/migrations/2019-12-19-060141_init/up.sql @@ -74,6 +74,19 @@ CREATE TABLE IF NOT EXISTS grobid (  );  -- CREATE INDEX grobid_fatcat_release_idx ON grobid(fatcat_release); +CREATE TABLE IF NOT EXISTS pdftrio ( +    sha1hex             TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), +    updated             TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, +    status_code         INT NOT NULL, +    status              TEXT CHECK (octet_length(status) >= 1) NOT NULL, +    pdftrio_version     TEXT CHECK (octet_length(pdftrio_version) >= 1), +    models_date         DATE, +    ensemble_score      REAL, +    bert_score          REAL, +    linear_score        REAL, +    image_score         REAL +); +  CREATE TABLE IF NOT EXISTS ingest_request (      link_source             TEXT NOT NULL CHECK (octet_length(link_source) >= 1),      link_source_id          TEXT NOT NULL CHECK (octet_length(link_source_id) >= 1),  | 
