pdftrio proposal and start on schema+kafka

author: Bryan Newbold <bnewbold@archive.org> 2020-02-12 19:01:44 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-02-12 19:01:44 -0800
commit: c61cb13ae42e3a170c29d4710ea2fc484081ee96 (patch)
tree: 7db2a372a72b69126341d04cc010a732b4cec46c
parent: c32d64f7a7b9e01ceb4c3dc161e0ab267cf63654 (diff)
download: sandcrawler-c61cb13ae42e3a170c29d4710ea2fc484081ee96.tar.gz
sandcrawler-c61cb13ae42e3a170c29d4710ea2fc484081ee96.zip
3 files changed, 122 insertions, 0 deletions
diff --git a/kafka/topics.md b/kafka/topics.md
index 2735d51..0ce8610 100644
--- a/kafka/topics.md
+++ b/kafka/topics.md
@@ -41,6 +41,12 @@ retention (on both a size and time basis).
         => 6 partitions
         => can't think of a good key, so none; no compaction
 
+    sandcrawler-ENV.pdftrio-output
+        => output of each pdftrio ML classification
+        => schema is JSON; see pdftrio proposal for fields. small objects.
+        => 6 partitions
+        => key is sha1hex of PDF; enable key compaction
+
     fatcat-ENV.api-crossref
     fatcat-ENV.api-datacite
         => all new and updated DOIs (regardless of type)
@@ -119,6 +125,8 @@ exists`; this seems safe, and the settings won't be over-ridden.
     ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 12 --topic sandcrawler-qa.ingest-file-requests-bulk --config retention.ms=7889400000 --config cleanup.policy=delete
     ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions  6 --topic sandcrawler-qa.ingest-file-results
 
+    ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic sandcrawler-qa.pdftrio-output --config cleanup.policy=compact
+
     ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.changelog
     ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.release-updates-v03
     ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.work-updates
diff --git a/proposals/20200207_pdftrio.md b/proposals/20200207_pdftrio.md
new file mode 100644
index 0000000..b1b09f9
--- /dev/null
+++ b/proposals/20200207_pdftrio.md
@@ -0,0 +1,101 @@
+
+status: in progress
+
+PDF Trio (ML Classification)
+==============================
+
+This document describes how we intend to integrate the first generation of PDF
+classification work into the sandcrawler processing system.
+
+- abstractions (APIs)
+- schemas
+- how models and dependencies are deployed
+- what code is released where, under what license
+
+
+## Code Structure
+
+Major components:
+
+**Training code, documentation, datasets:** Not used at run-time (does not need
+to be deployed). Should be public. The datasets (PDFs) are copyrighted, so we
+should only release URL lists that point to wayback.
+
+**Models:** all are static, uploaded to archive.org items, simple download to
+deploy. Should be versioned, and have unique versioned file names or directory
+paths (so that multiple versions can be deployed in parallel).
+
+**Image classifier backend:** vanilla tensorflow serving docker image, with a
+bunch of invocation configs, plus static models.
+
+**BERT backend:** vanilla tensorflow serving docker image, plus config, plus
+models. Basically same as image classifier.
+
+**API service:** currently Flask. Depends on tools like imagemagick, fasttext,
+pdftotext. Seems like apt+pipenv should work?
+
+
+## API Refactors
+
+Changes:
+
+- probably re-write README?
+- refactor python code into directories
+- add python tests
+- tweak schema
+- proper parallelization: uwsgi? async?
+
+New features:
+
+- option to send images, raw text in batches in addition to PDFs.
+
+## Client Code
+
+Basically just like GROBID client for now. Requests, JSON.
+
+## JSON Schema
+
+Output that goes in Kafka topic:
+
+    pdftrio
+        status
+        status_code
+        ensemble_score
+        bert_score
+        image_score
+        linear_score
+        versions
+            pdftrio_version (string)
+            models_date (string, ISO date)
+            git_rev (string)
+            bert_model (string)
+            image_model (string)
+            linear_model (string)
+        timing
+            ... (might be added?)
+    file_meta
+        sha1hex
+        ...
+
+
+## SQL Schema
+
+Ensemble model versions are summarized as a date.
+
+    CREATE TABLE IF NOT EXISTS pdftrio (
+        sha1hex             TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
+        updated             TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+        status_code         INT NOT NULL,
+        status              TEXT CHECK (octet_length(status) >= 1) NOT NULL,
+        pdftrio_version     TEXT CHECK (octet_length(pdftrio_version) >= 1),
+        models_date         DATE,
+        ensemble_score      REAL,
+        bert_score          REAL,
+        linear_score        REAL,
+        image_score         REAL
+    );
+
+## Kafka Topic
+
+sandcrawler-qa.pdftrio-output
+
diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql
index 0b2b19c..a27796b 100644
--- a/sql/migrations/2019-12-19-060141_init/up.sql
+++ b/sql/migrations/2019-12-19-060141_init/up.sql
@@ -74,6 +74,19 @@ CREATE TABLE IF NOT EXISTS grobid (
 );
 -- CREATE INDEX grobid_fatcat_release_idx ON grobid(fatcat_release);
 
+CREATE TABLE IF NOT EXISTS pdftrio (
+    sha1hex             TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
+    updated             TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+    status_code         INT NOT NULL,
+    status              TEXT CHECK (octet_length(status) >= 1) NOT NULL,
+    pdftrio_version     TEXT CHECK (octet_length(pdftrio_version) >= 1),
+    models_date         DATE,
+    ensemble_score      REAL,
+    bert_score          REAL,
+    linear_score        REAL,
+    image_score         REAL
+);
+
 CREATE TABLE IF NOT EXISTS ingest_request (
     link_source             TEXT NOT NULL CHECK (octet_length(link_source) >= 1),
     link_source_id          TEXT NOT NULL CHECK (octet_length(link_source_id) >= 1),
author	Bryan Newbold <bnewbold@archive.org>	2020-02-12 19:01:44 -0800
committer	Bryan Newbold <bnewbold@archive.org>	2020-02-12 19:01:44 -0800
commit	c61cb13ae42e3a170c29d4710ea2fc484081ee96 (patch)
tree	7db2a372a72b69126341d04cc010a732b4cec46c
parent	c32d64f7a7b9e01ceb4c3dc161e0ab267cf63654 (diff)
download	sandcrawler-c61cb13ae42e3a170c29d4710ea2fc484081ee96.tar.gz sandcrawler-c61cb13ae42e3a170c29d4710ea2fc484081ee96.zip