diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-01-14 17:02:11 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-14 17:02:11 -0800 |
commit | 506fae13dbd111f89b4ae96c25cbd24ac1ec3ff0 (patch) | |
tree | eee49ec31feb30049ac08b1f6f2904d492ec1f94 | |
parent | 818a936be9480bb75e40d7e3723aed3ac8c1eee9 (diff) | |
download | sandcrawler-506fae13dbd111f89b4ae96c25cbd24ac1ec3ff0.tar.gz sandcrawler-506fae13dbd111f89b4ae96c25cbd24ac1ec3ff0.zip |
grobid-to-kafka support in ingest worker
-rwxr-xr-x | python/sandcrawler_worker.py | 6 |
1 files changed, 6 insertions, 0 deletions
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py index 26735ce..9199874 100755 --- a/python/sandcrawler_worker.py +++ b/python/sandcrawler_worker.py @@ -67,16 +67,22 @@ def run_persist_grobid(args): def run_ingest_file(args): consume_topic = "sandcrawler-{}.ingest-file-requests".format(args.env) produce_topic = "sandcrawler-{}.ingest-file-results".format(args.env) + grobid_topic = "sandcrawler-{}.grobid-output-pg".format(args.env) sink = KafkaSink( kafka_hosts=args.kafka_hosts, produce_topic=produce_topic, ) + grobid_sink = KafkaSink( + kafka_hosts=args.kafka_hosts, + produce_topic=grobid_topic, + ) grobid_client = GrobidClient( host_url=args.grobid_host, ) worker = IngestFileWorker( grobid_client=grobid_client, sink=sink, + grobid_sink=grobid_sink, ) pusher = KafkaJsonPusher( worker=worker, |