From c61cb13ae42e3a170c29d4710ea2fc484081ee96 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 12 Feb 2020 19:01:44 -0800
Subject: pdftrio proposal and start on schema+kafka

---
 proposals/20200207_pdftrio.md | 101 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 101 insertions(+)
 create mode 100644 proposals/20200207_pdftrio.md

(limited to 'proposals')

diff --git a/proposals/20200207_pdftrio.md b/proposals/20200207_pdftrio.md
new file mode 100644
index 0000000..b1b09f9
--- /dev/null
+++ b/proposals/20200207_pdftrio.md
@@ -0,0 +1,101 @@
