diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-11-04 18:10:00 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-11-04 18:10:00 -0800 |
commit | de71aa92d4c7c9d14dfccc0188032d4e7b10090f (patch) | |
tree | 45e231fab99d4e5f576323dae8734ae71568c8f7 /python/sandcrawler_worker.py | |
parent | 2fdba24da0e0bf3d300cfb959514bf57a3cf6701 (diff) | |
download | sandcrawler-de71aa92d4c7c9d14dfccc0188032d4e7b10090f.tar.gz sandcrawler-de71aa92d4c7c9d14dfccc0188032d4e7b10090f.zip |
html: actually publish HTML TEI-XML to body; fix dataflow through ingest a bit
Diffstat (limited to 'python/sandcrawler_worker.py')
-rwxr-xr-x | python/sandcrawler_worker.py | 6 |
1 file changed, 6 insertions, 0 deletions
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py index 3681d7f..6be8bac 100755 --- a/python/sandcrawler_worker.py +++ b/python/sandcrawler_worker.py @@ -212,6 +212,7 @@ def run_ingest_file(args): pdftext_topic = "sandcrawler-{}.pdf-text".format(args.env) thumbnail_topic = "sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.env) xmldoc_topic = "sandcrawler-{}.xml-doc".format(args.env) + htmlteixml_topic = "sandcrawler-{}.html-teixml".format(args.env) sink = KafkaSink( kafka_hosts=args.kafka_hosts, produce_topic=produce_topic, @@ -235,6 +236,10 @@ def run_ingest_file(args): kafka_hosts=args.kafka_hosts, produce_topic=xmldoc_topic, ) + htmlteixml_sink = KafkaSink( + kafka_hosts=args.kafka_hosts, + produce_topic=htmlteixml_topic, + ) worker = IngestFileWorker( grobid_client=grobid_client, sink=sink, @@ -242,6 +247,7 @@ def run_ingest_file(args): thumbnail_sink=thumbnail_sink, pdftext_sink=pdftext_sink, xmldoc_sink=xmldoc_sink, + htmlteixml_sink=htmlteixml_sink, # don't SPNv2 for --bulk backfill try_spn2=not args.bulk, ) |