diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-02-12 19:40:55 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-02-12 19:42:43 -0800 |
commit | 94912e739c51d2fa4d5f9de878d0b0f0544a4459 (patch) | |
tree | af7803bee388beba7dd6dce2113e3632284537ac /python/sandcrawler_worker.py | |
parent | 6b3ce3169847a16fe6c0ab00f3a8af8b8ad099ab (diff) | |
download | sandcrawler-94912e739c51d2fa4d5f9de878d0b0f0544a4459.tar.gz sandcrawler-94912e739c51d2fa4d5f9de878d0b0f0544a4459.zip |
pdftrio basic python code
This is basically just a copy/paste of GROBID code, only simpler!
Diffstat (limited to 'python/sandcrawler_worker.py')
-rwxr-xr-x | python/sandcrawler_worker.py | 19 |
1 files changed, 19 insertions, 0 deletions
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py
index f13116a..02d075c 100755
--- a/python/sandcrawler_worker.py
+++ b/python/sandcrawler_worker.py
@@ -69,6 +69,21 @@ def run_persist_grobid(args):
     )
     pusher.run()
 
+def run_persist_pdftrio(args):
+    consume_topic = "sandcrawler-{}.pdftrio-output".format(args.env)
+    worker = PersistPdfTrioWorker(
+        db_url=args.db_url,
+    )
+    pusher = KafkaJsonPusher(
+        worker=worker,
+        kafka_hosts=args.kafka_hosts,
+        consume_topic=consume_topic,
+        group="persist-pdftrio",
+        push_batches=True,
+        batch_size=100,
+    )
+    pusher.run()
+
 def run_ingest_file(args):
     if args.bulk:
         consume_group = "sandcrawler-{}-ingest-file-bulk".format(args.env)
@@ -158,6 +173,10 @@ def main():
         help="only upload TEI-XML to S3 (don't write to database)")
     sub_persist_grobid.set_defaults(func=run_persist_grobid)
 
+    sub_persist_pdftrio = subparsers.add_parser('persist-pdftrio',
+        help="daemon that consumes pdftrio output from Kafka and pushes to postgres")
+    sub_persist_pdftrio.set_defaults(func=run_persist_pdftrio)
+
     sub_ingest_file = subparsers.add_parser('ingest-file',
         help="daemon that consumes requests from Kafka, ingests, pushes results to Kafka")
     sub_ingest_file.add_argument('--bulk',