Diffstat (limited to 'python/sandcrawler_worker.py')
 python/sandcrawler_worker.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+), 0 deletions(-)
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py
index f13116a..02d075c 100755
--- a/python/sandcrawler_worker.py
+++ b/python/sandcrawler_worker.py
@@ -69,6 +69,21 @@ def run_persist_grobid(args):
     )
     pusher.run()
 
+def run_persist_pdftrio(args):
+    consume_topic = "sandcrawler-{}.pdftrio-output".format(args.env)
+    worker = PersistPdfTrioWorker(
+        db_url=args.db_url,
+    )
+    pusher = KafkaJsonPusher(
+        worker=worker,
+        kafka_hosts=args.kafka_hosts,
+        consume_topic=consume_topic,
+        group="persist-pdftrio",
+        push_batches=True,
+        batch_size=100,
+    )
+    pusher.run()
+
 def run_ingest_file(args):
     if args.bulk:
         consume_group = "sandcrawler-{}-ingest-file-bulk".format(args.env)
@@ -158,6 +173,10 @@ def main():
         help="only upload TEI-XML to S3 (don't write to database)")
     sub_persist_grobid.set_defaults(func=run_persist_grobid)
 
+    sub_persist_pdftrio = subparsers.add_parser('persist-pdftrio',
+        help="daemon that consumes pdftrio output from Kafka and pushes to postgres")
+    sub_persist_pdftrio.set_defaults(func=run_persist_pdftrio)
+
     sub_ingest_file = subparsers.add_parser('ingest-file',
         help="daemon that consumes requests from Kafka, ingests, pushes results to Kafka")
     sub_ingest_file.add_argument('--bulk',
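
For reference, a minimal sketch of how the new subcommand might be invoked. The diff itself references args.kafka_hosts, args.env, and args.db_url, so matching top-level --kafka-hosts, --env, and --db-url flags are assumed here; exact flag names, defaults, and host/DB values are placeholders, not confirmed by this diff:

    # hypothetical invocation of the new persist-pdftrio worker
    ./sandcrawler_worker.py --kafka-hosts localhost:9092 --env dev \
        --db-url postgres:///sandcrawler persist-pdftrio

The worker consumes from the sandcrawler-{env}.pdftrio-output topic in batches of 100 (push_batches=True) under the persist-pdftrio consumer group and writes the results to Postgres via PersistPdfTrioWorker.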