about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-06-25 13:30:31 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-06-25 13:30:31 -0700
commit be5a45fdf0622e6b65663d08f577aa41e0e89be0 (patch)
tree 90cb05290aeaf12e8405d21661c958037a8f4ac9
parent 1c4b1bcd0384b655088028474bcbf13778f741c2 (diff)
download sandcrawler-be5a45fdf0622e6b65663d08f577aa41e0e89be0.tar.gz
sandcrawler-be5a45fdf0622e6b65663d08f577aa41e0e89be0.zip
s3-only mode persist workers use different consumer group
-rwxr-xr-x python/sandcrawler_worker.py 10
1 files changed, 8 insertions, 2 deletions
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py
index 3b49cf2..638dc35 100755
--- a/python/sandcrawler_worker.py
+++ b/python/sandcrawler_worker.py
@@ -89,11 +89,14 @@ def run_persist_grobid(args):
s3_only=args.s3_only,
db_only=args.db_only,
)
+ kafka_group = "persist-grobid"
+ if args.s3_only:
+ kafka_group += "-s3"
pusher = KafkaJsonPusher(
worker=worker,
kafka_hosts=args.kafka_hosts,
consume_topic=consume_topic,
- group="persist-grobid",
+ group=kafka_group,
push_batches=True,
batch_size=25,
)
@@ -110,11 +113,14 @@ def run_persist_pdftext(args):
s3_only=args.s3_only,
db_only=args.db_only,
)
+ kafka_group = "persist-pdf-text"
+ if args.s3_only:
+ kafka_group += "-s3"
pusher = KafkaJsonPusher(
worker=worker,
kafka_hosts=args.kafka_hosts,
consume_topic=consume_topic,
- group="persist-pdf-text",
+ group=kafka_group,
push_batches=True,
batch_size=25,
)