diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2020-06-25 13:30:31 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2020-06-25 13:30:31 -0700 | 
| commit | be5a45fdf0622e6b65663d08f577aa41e0e89be0 (patch) | |
| tree | 90cb05290aeaf12e8405d21661c958037a8f4ac9 /python | |
| parent | 1c4b1bcd0384b655088028474bcbf13778f741c2 (diff) | |
| download | sandcrawler-be5a45fdf0622e6b65663d08f577aa41e0e89be0.tar.gz sandcrawler-be5a45fdf0622e6b65663d08f577aa41e0e89be0.zip  | |
s3-only mode persist workers use different consumer group
Diffstat (limited to 'python')
| -rwxr-xr-x | python/sandcrawler_worker.py | 10 | 
1 files changed, 8 insertions, 2 deletions
```diff
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py
index 3b49cf2..638dc35 100755
--- a/python/sandcrawler_worker.py
+++ b/python/sandcrawler_worker.py
@@ -89,11 +89,14 @@ def run_persist_grobid(args):
         s3_only=args.s3_only,
         db_only=args.db_only,
     )
+    kafka_group = "persist-grobid"
+    if args.s3_only:
+        kafka_group += "-s3"
     pusher = KafkaJsonPusher(
         worker=worker,
         kafka_hosts=args.kafka_hosts,
         consume_topic=consume_topic,
-        group="persist-grobid",
+        group=kafka_group,
         push_batches=True,
         batch_size=25,
     )
@@ -110,11 +113,14 @@ def run_persist_pdftext(args):
         s3_only=args.s3_only,
         db_only=args.db_only,
     )
+    kafka_group = "persist-pdf-text"
+    if args.s3_only:
+        kafka_group += "-s3"
     pusher = KafkaJsonPusher(
         worker=worker,
         kafka_hosts=args.kafka_hosts,
         consume_topic=consume_topic,
-        group="persist-pdf-text",
+        group=kafka_group,
         push_batches=True,
         batch_size=25,
     )
```
