about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2019-10-04 14:28:40 -0700
committer Bryan Newbold <bnewbold@archive.org> 2019-10-04 14:28:40 -0700
commit ab1f3fe806e40122a93c975d2253f2c14035952e (patch)
tree 30d565fa83d1f324bf8cb4d30b908a3d02c9c2bc
parent 1a9df830d128cbd0bf80ff585785e226a6cb9019 (diff)
download sandcrawler-ab1f3fe806e40122a93c975d2253f2c14035952e.tar.gz
sandcrawler-ab1f3fe806e40122a93c975d2253f2c14035952e.zip
grobid-output-pg, not grobid-output-json
-rwxr-xr-x python/grobid_tool.py 6
1 file changed, 2 insertions, 4 deletions
diff --git a/python/grobid_tool.py b/python/grobid_tool.py
index 9af0ab2..e787cdf 100755
--- a/python/grobid_tool.py
+++ b/python/grobid_tool.py
@@ -6,9 +6,7 @@ might go to stdout, or might go to Kafka topic.
Example of large parallel run, locally:
- cat /srv/sandcrawler/tasks/ungrobided.2019-09-23.json \
- | parallel -j6 --pipe \
- ./grobid_tool.py --kafka-env qa --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j10 extract-json -
+ cat /srv/sandcrawler/tasks/ungrobided.2019-09-23.json | pv -l | parallel -j30 --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
"""
import sys
@@ -93,7 +91,7 @@ def main():
args.sink = None
if args.kafka_mode:
- produce_topic = "sandcrawler-{}.grobid-output-json".format(args.kafka_env)
+ produce_topic = "sandcrawler-{}.grobid-output-pg".format(args.kafka_env)
print("Running in kafka output mode, publishing to {}\n".format(produce_topic))
args.sink = KafkaGrobidSink(kafka_hosts=args.kafka_hosts,
produce_topic=produce_topic)