diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-06-25 13:30:31 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-06-25 13:30:31 -0700 |
commit | be5a45fdf0622e6b65663d08f577aa41e0e89be0 (patch) | |
tree | 90cb05290aeaf12e8405d21661c958037a8f4ac9 /python | |
parent | 1c4b1bcd0384b655088028474bcbf13778f741c2 (diff) | |
download | sandcrawler-be5a45fdf0622e6b65663d08f577aa41e0e89be0.tar.gz sandcrawler-be5a45fdf0622e6b65663d08f577aa41e0e89be0.zip |
s3-only mode persist workers use different consumer group
Diffstat (limited to 'python')
-rwxr-xr-x | python/sandcrawler_worker.py | 10 |
1 files changed, 8 insertions, 2 deletions
```diff
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py
index 3b49cf2..638dc35 100755
--- a/python/sandcrawler_worker.py
+++ b/python/sandcrawler_worker.py
@@ -89,11 +89,14 @@ def run_persist_grobid(args):
         s3_only=args.s3_only,
         db_only=args.db_only,
     )
+    kafka_group = "persist-grobid"
+    if args.s3_only:
+        kafka_group += "-s3"
     pusher = KafkaJsonPusher(
         worker=worker,
         kafka_hosts=args.kafka_hosts,
         consume_topic=consume_topic,
-        group="persist-grobid",
+        group=kafka_group,
         push_batches=True,
         batch_size=25,
     )
@@ -110,11 +113,14 @@ def run_persist_pdftext(args):
         s3_only=args.s3_only,
         db_only=args.db_only,
     )
+    kafka_group = "persist-pdf-text"
+    if args.s3_only:
+        kafka_group += "-s3"
     pusher = KafkaJsonPusher(
         worker=worker,
         kafka_hosts=args.kafka_hosts,
         consume_topic=consume_topic,
-        group="persist-pdf-text",
+        group=kafka_group,
         push_batches=True,
         batch_size=25,
     )
```