aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-01-14 17:02:11 -0800
committerBryan Newbold <bnewbold@archive.org>2020-01-14 17:02:11 -0800
commit506fae13dbd111f89b4ae96c25cbd24ac1ec3ff0 (patch)
treeeee49ec31feb30049ac08b1f6f2904d492ec1f94 /python
parent818a936be9480bb75e40d7e3723aed3ac8c1eee9 (diff)
downloadsandcrawler-506fae13dbd111f89b4ae96c25cbd24ac1ec3ff0.tar.gz
sandcrawler-506fae13dbd111f89b4ae96c25cbd24ac1ec3ff0.zip
grobid-to-kafka support in ingest worker
Diffstat (limited to 'python')
-rwxr-xr-xpython/sandcrawler_worker.py6
1 files changed, 6 insertions, 0 deletions
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py
index 26735ce..9199874 100755
--- a/python/sandcrawler_worker.py
+++ b/python/sandcrawler_worker.py
@@ -67,16 +67,22 @@ def run_persist_grobid(args):
def run_ingest_file(args):
consume_topic = "sandcrawler-{}.ingest-file-requests".format(args.env)
produce_topic = "sandcrawler-{}.ingest-file-results".format(args.env)
+ grobid_topic = "sandcrawler-{}.grobid-output-pg".format(args.env)
sink = KafkaSink(
kafka_hosts=args.kafka_hosts,
produce_topic=produce_topic,
)
+ grobid_sink = KafkaSink(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic=grobid_topic,
+ )
grobid_client = GrobidClient(
host_url=args.grobid_host,
)
worker = IngestFileWorker(
grobid_client=grobid_client,
sink=sink,
+ grobid_sink=grobid_sink,
)
pusher = KafkaJsonPusher(
worker=worker,