From c61cb13ae42e3a170c29d4710ea2fc484081ee96 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 12 Feb 2020 19:01:44 -0800 Subject: pdftrio proposal and start on schema+kafka --- kafka/topics.md | 8 +++ proposals/20200207_pdftrio.md | 101 +++++++++++++++++++++++++++ sql/migrations/2019-12-19-060141_init/up.sql | 13 ++++ 3 files changed, 122 insertions(+) create mode 100644 proposals/20200207_pdftrio.md diff --git a/kafka/topics.md b/kafka/topics.md index 2735d51..0ce8610 100644 --- a/kafka/topics.md +++ b/kafka/topics.md @@ -41,6 +41,12 @@ retention (on both a size and time basis). => 6 partitions => can't think of a good key, so none; no compaction + sandcrawler-ENV.pdftrio-output + => output of each pdftrio ML classification + => schema is JSON; see pdftrio proposal for fields. small objects. + => 6 partitions + => key is sha1hex of PDF; enable key compaction + fatcat-ENV.api-crossref fatcat-ENV.api-datacite => all new and updated DOIs (regardless of type) @@ -119,6 +125,8 @@ exists`; this seems safe, and the settings won't be over-ridden. 
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 12 --topic sandcrawler-qa.ingest-file-requests-bulk --config retention.ms=7889400000 --config cleanup.policy=delete ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic sandcrawler-qa.ingest-file-results + ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic sandcrawler-qa.pdftrio-output --config cleanup.policy=compact + ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.changelog ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.release-updates-v03 ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.work-updates diff --git a/proposals/20200207_pdftrio.md b/proposals/20200207_pdftrio.md new file mode 100644 index 0000000..b1b09f9 --- /dev/null +++ b/proposals/20200207_pdftrio.md @@ -0,0 +1,101 @@ + +status: in progress + +PDF Trio (ML Classification) +============================== + +This document describes how we intend to integrate the first generation of PDF +classification work into the sandcrawler processing system. + +- abstractions (APIs) +- schemas +- how models and dependencies are deployed +- what code is released where under what license + + +## Code Structure + +Major components: + +**Training code, documentation, datasets:** Not used at run-time (does not need +to be deployed). Should be public. The datasets (PDFs) are copyrighted, so we +should only release URL lists that point to wayback. + +**Models:** all are static, uploaded to archive.org items, simple download to +deploy. Should be versioned, and have unique versioned file names or directory +paths (aka, deploy in parallel). 
+ +**Image classifier backend:** vanilla tensorflow serving docker image, with a +bunch of invocation configs, plus static models. + +**BERT backend:** vanilla tensorflow serving docker image, plus config, plus +models. Basically same as image classifier. + +**API service:** currently Flask. Depends on tools like imagemagick, fasttext, +pdftotext. Seems like apt+pipenv should work? + + +## API Refactors + +Changes: + +- probably re-write README? +- refactor python code into directories +- add python tests +- tweak schema +- proper parallelization: uwsgi? async? + +New features: + +- option to send images, raw text in batches in addition to PDFs. + +## Client Code + +Basically just like GROBID client for now. Requests, JSON. + +## JSON Schema + +Output that goes in Kafka topic: + + pdftrio + status + status_code + ensemble_score + bert_score + image_score + linear_score + versions + pdftrio_version (string) + models_date (string, ISO date) + git_rev (string) + bert_model (string) + image_model (string) + linear_model (string) + timing + ... (might be added?) + file_meta + sha1hex + ... + + +## SQL Schema + +Ensemble model versions are summarized as a date. 
+ + CREATE TABLE IF NOT EXISTS pdftrio ( + sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), + updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, + status_code INT NOT NULL, + status TEXT CHECK (octet_length(status) >= 1) NOT NULL, + pdftrio_version TEXT CHECK (octet_length(pdftrio_version) >= 1), + models_date DATE, + ensemble_score REAL, + bert_score REAL, + linear_score REAL, + image_score REAL + ); + +## Kafka Topic + +sandcrawler-qa.pdftrio-output + diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql index 0b2b19c..a27796b 100644 --- a/sql/migrations/2019-12-19-060141_init/up.sql +++ b/sql/migrations/2019-12-19-060141_init/up.sql @@ -74,6 +74,19 @@ CREATE TABLE IF NOT EXISTS grobid ( ); -- CREATE INDEX grobid_fatcat_release_idx ON grobid(fatcat_release); +CREATE TABLE IF NOT EXISTS pdftrio ( + sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), + updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, + status_code INT NOT NULL, + status TEXT CHECK (octet_length(status) >= 1) NOT NULL, + pdftrio_version TEXT CHECK (octet_length(pdftrio_version) >= 1), + models_date DATE, + ensemble_score REAL, + bert_score REAL, + linear_score REAL, + image_score REAL +); + CREATE TABLE IF NOT EXISTS ingest_request ( link_source TEXT NOT NULL CHECK (octet_length(link_source) >= 1), link_source_id TEXT NOT NULL CHECK (octet_length(link_source_id) >= 1), -- cgit v1.2.3