aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-02-12 19:01:44 -0800
committerBryan Newbold <bnewbold@archive.org>2020-02-12 19:01:44 -0800
commitc61cb13ae42e3a170c29d4710ea2fc484081ee96 (patch)
tree7db2a372a72b69126341d04cc010a732b4cec46c
parentc32d64f7a7b9e01ceb4c3dc161e0ab267cf63654 (diff)
downloadsandcrawler-c61cb13ae42e3a170c29d4710ea2fc484081ee96.tar.gz
sandcrawler-c61cb13ae42e3a170c29d4710ea2fc484081ee96.zip
pdftrio proposal and start on schema+kafka
-rw-r--r--kafka/topics.md8
-rw-r--r--proposals/20200207_pdftrio.md101
-rw-r--r--sql/migrations/2019-12-19-060141_init/up.sql13
3 files changed, 122 insertions, 0 deletions
diff --git a/kafka/topics.md b/kafka/topics.md
index 2735d51..0ce8610 100644
--- a/kafka/topics.md
+++ b/kafka/topics.md
@@ -41,6 +41,12 @@ retention (on both a size and time basis).
=> 6 partitions
=> can't think of a good key, so none; no compaction
+ sandcrawler-ENV.pdftrio-output
+ => output of each pdftrio ML classification
+ => schema is JSON; see pdftrio proposal for fields. small objects.
+ => 6 partitions
+ => key is sha1hex of PDF; enable key compaction
+
fatcat-ENV.api-crossref
fatcat-ENV.api-datacite
=> all new and updated DOIs (regardless of type)
@@ -119,6 +125,8 @@ exists`; this seems safe, and the settings won't be over-ridden.
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 12 --topic sandcrawler-qa.ingest-file-requests-bulk --config retention.ms=7889400000 --config cleanup.policy=delete
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic sandcrawler-qa.ingest-file-results
+ ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic sandcrawler-qa.pdftrio-output --config cleanup.policy=compact
+
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 1 --topic fatcat-qa.changelog
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.release-updates-v03
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.work-updates
diff --git a/proposals/20200207_pdftrio.md b/proposals/20200207_pdftrio.md
new file mode 100644
index 0000000..b1b09f9
--- /dev/null
+++ b/proposals/20200207_pdftrio.md
@@ -0,0 +1,101 @@
+
+status: in progress
+
+PDF Trio (ML Classification)
+==============================
+
+This document describes how we intend to integrate the first generation of PDF
+classification work into the sandcrawler processing system.
+
+- abstractions (APIs)
+- schemas
+- how models and dependencies are deployed
+- what code is released where, under what license
+
+
+## Code Structure
+
+Major components:
+
+**Training code, documentation, datasets:** Not used at run-time (does not need
+to be deployed). Should be public. The datasets (PDFs) are copyrighted, so we
+should only release URL lists that point to wayback.
+
+**Models:** all are static, uploaded to archive.org items, simple download to
+deploy. Should be versioned, and have unique versioned file names or directory
+paths (i.e., so that multiple versions can be deployed in parallel).
+
+**Image classifier backend:** vanilla tensorflow serving docker image, with a
+bunch of invocation configs, plus static models.
+
+**BERT backend:** vanilla tensorflow serving docker image, plus config, plus
+models. Basically same as image classifier.
+
+**API service:** currently Flask. Depends on tools like ImageMagick, fasttext,
+pdftotext. Seems like apt+pipenv should work?
+
+
+## API Refactors
+
+Changes:
+
+- probably re-write README?
+- refactor python code into directories
+- add python tests
+- tweak schema
+- proper parallelization: uwsgi? async?
+
+New features:
+
+- option to send images, raw text in batches in addition to PDFs.
+
+## Client Code
+
+Basically just like GROBID client for now. Requests, JSON.
+
+## JSON Schema
+
+Output that goes in Kafka topic:
+
+ pdftrio
+ status
+ status_code
+ ensemble_score
+ bert_score
+ image_score
+ linear_score
+ versions
+ pdftrio_version (string)
+ models_date (string, ISO date)
+ git_rev (string)
+ bert_model (string)
+ image_model (string)
+ linear_model (string)
+ timing
+ ... (might be added?)
+ file_meta
+ sha1hex
+ ...
+
+
+## SQL Schema
+
+Ensemble model versions are summarized as a date.
+
+ CREATE TABLE IF NOT EXISTS pdftrio (
+ sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
+ updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+ status_code INT NOT NULL,
+ status TEXT CHECK (octet_length(status) >= 1) NOT NULL,
+ pdftrio_version TEXT CHECK (octet_length(pdftrio_version) >= 1),
+ models_date DATE,
+ ensemble_score REAL,
+ bert_score REAL,
+ linear_score REAL,
+ image_score REAL
+ );
+
+## Kafka Topic
+
+sandcrawler-qa.pdftrio-output
+
diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql
index 0b2b19c..a27796b 100644
--- a/sql/migrations/2019-12-19-060141_init/up.sql
+++ b/sql/migrations/2019-12-19-060141_init/up.sql
@@ -74,6 +74,19 @@ CREATE TABLE IF NOT EXISTS grobid (
);
-- CREATE INDEX grobid_fatcat_release_idx ON grobid(fatcat_release);
+CREATE TABLE IF NOT EXISTS pdftrio (
+ sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
+ updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+ status_code INT NOT NULL,
+ status TEXT CHECK (octet_length(status) >= 1) NOT NULL,
+ pdftrio_version TEXT CHECK (octet_length(pdftrio_version) >= 1),
+ models_date DATE,
+ ensemble_score REAL,
+ bert_score REAL,
+ linear_score REAL,
+ image_score REAL
+);
+
CREATE TABLE IF NOT EXISTS ingest_request (
link_source TEXT NOT NULL CHECK (octet_length(link_source) >= 1),
link_source_id TEXT NOT NULL CHECK (octet_length(link_source_id) >= 1),