about | summary | refs | log | tree | commit | diff | stats
path: root/python/sandcrawler_worker.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-11-04 18:10:00 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-11-04 18:10:00 -0800
commit: de71aa92d4c7c9d14dfccc0188032d4e7b10090f (patch)
tree: 45e231fab99d4e5f576323dae8734ae71568c8f7 /python/sandcrawler_worker.py
parent: 2fdba24da0e0bf3d300cfb959514bf57a3cf6701 (diff)
download: sandcrawler-de71aa92d4c7c9d14dfccc0188032d4e7b10090f.tar.gz
sandcrawler-de71aa92d4c7c9d14dfccc0188032d4e7b10090f.zip
html: actually publish HTML TEI-XML to body; fix dataflow through ingest a bit
Diffstat (limited to 'python/sandcrawler_worker.py')
-rwxr-xr-x python/sandcrawler_worker.py 6
1 files changed, 6 insertions, 0 deletions
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py
index 3681d7f..6be8bac 100755
--- a/python/sandcrawler_worker.py
+++ b/python/sandcrawler_worker.py
@@ -212,6 +212,7 @@ def run_ingest_file(args):
pdftext_topic = "sandcrawler-{}.pdf-text".format(args.env)
thumbnail_topic = "sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.env)
xmldoc_topic = "sandcrawler-{}.xml-doc".format(args.env)
+ htmlteixml_topic = "sandcrawler-{}.html-teixml".format(args.env)
sink = KafkaSink(
kafka_hosts=args.kafka_hosts,
produce_topic=produce_topic,
@@ -235,6 +236,10 @@ def run_ingest_file(args):
kafka_hosts=args.kafka_hosts,
produce_topic=xmldoc_topic,
)
+ htmlteixml_sink = KafkaSink(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic=htmlteixml_topic,
+ )
worker = IngestFileWorker(
grobid_client=grobid_client,
sink=sink,
@@ -242,6 +247,7 @@ def run_ingest_file(args):
thumbnail_sink=thumbnail_sink,
pdftext_sink=pdftext_sink,
xmldoc_sink=xmldoc_sink,
+ htmlteixml_sink=htmlteixml_sink,
# don't SPNv2 for --bulk backfill
try_spn2=not args.bulk,
)