diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-11-03 19:27:03 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-11-03 19:27:31 -0800 |
commit | 2885b34ab3e4c862f9e895a237108d42793efb1d (patch) | |
tree | 2a0c197427d39e9809bd120df50afb06e707b9b0 /python/sandcrawler_worker.py | |
parent | 3adcaf9802928346dda597cefd4b66b2e62fa942 (diff) | |
download | sandcrawler-2885b34ab3e4c862f9e895a237108d42793efb1d.tar.gz sandcrawler-2885b34ab3e4c862f9e895a237108d42793efb1d.zip |
ingest: handle publishing XML docs to kafka
Diffstat (limited to 'python/sandcrawler_worker.py')
-rwxr-xr-x | python/sandcrawler_worker.py | 6 |
1 files changed, 6 insertions, 0 deletions
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py
index 537398e..b62fa80 100755
--- a/python/sandcrawler_worker.py
+++ b/python/sandcrawler_worker.py
@@ -174,6 +174,7 @@ def run_ingest_file(args):
     grobid_topic = "sandcrawler-{}.grobid-output-pg".format(args.env)
     pdftext_topic = "sandcrawler-{}.pdf-text".format(args.env)
     thumbnail_topic = "sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.env)
+    xmldoc_topic = "sandcrawler-{}.xml-doc".format(args.env)
     sink = KafkaSink(
         kafka_hosts=args.kafka_hosts,
         produce_topic=produce_topic,
@@ -193,12 +194,17 @@ def run_ingest_file(args):
         kafka_hosts=args.kafka_hosts,
         produce_topic=thumbnail_topic,
     )
+    xmldoc_sink = KafkaSink(
+        kafka_hosts=args.kafka_hosts,
+        produce_topic=xmldoc_topic,
+    )
     worker = IngestFileWorker(
         grobid_client=grobid_client,
         sink=sink,
         grobid_sink=grobid_sink,
         thumbnail_sink=thumbnail_sink,
         pdftext_sink=pdftext_sink,
+        xmldoc_sink=xmldoc_sink,
         # don't SPNv2 for --bulk backfill
         try_spn2=not args.bulk,
     )