aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler_worker.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2019-11-13 18:49:07 -0800
committer Bryan Newbold <bnewbold@archive.org> 2019-11-13 18:49:07 -0800
commit 148a163dba6a27866893b01c441e7e856429d797 (patch)
tree fdd02a09fab21d56c7c99e3e72cc5b730a452c46 /python/sandcrawler_worker.py
parent b63289720ba8d09621d900d20dadfe69a41cd287 (diff)
download sandcrawler-148a163dba6a27866893b01c441e7e856429d797.tar.gz
sandcrawler-148a163dba6a27866893b01c441e7e856429d797.zip
fix lint errors
Diffstat (limited to 'python/sandcrawler_worker.py')
-rwxr-xr-x python/sandcrawler_worker.py 15
1 files changed, 10 insertions, 5 deletions
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py
index 27d6ebd..1000228 100755
--- a/python/sandcrawler_worker.py
+++ b/python/sandcrawler_worker.py
@@ -21,15 +21,20 @@ def run_grobid_extract(args):
consume_topic = "sandcrawler-{}.ungrobided-pg".format(args.env)
produce_topic = "sandcrawler-{}.grobid-output-pg".format(args.env)
sink = KafkaSink(kafka_hosts=args.kafka_hosts, produce_topic=produce_topic)
- worker = GrobidWorker(host_url=args.grobid_host, sink=sink)
- pusher = KafkaJsonPusher(sink=worker, group="grobid-extract")
+ grobid_client = GrobidClient(host_url=args.grobid_host)
+ wayback_client = WaybackClient(host_url=args.grobid_host)
+ worker = GrobidWorker(grobid_client=grobid_client, wayback_client=wayback_client, sink=sink)
+ pusher = KafkaJsonPusher(worker=worker, kafka_hosts=args.kafka_hosts,
+ consume_topic=consume_topic, group="grobid-extract")
pusher.run()
def run_grobid_persist(args):
consume_topic = "sandcrawler-{}.grobid-output-pg".format(args.env)
- sink = GrobidPersist(consume_topic=consume_topic)
- pusher = KafkaJsonPusher(sink)
- pusher.run()
+ raise NotImplementedError
+ #worker = GrobidPersistWorker()
+ #pusher = KafkaJsonPusher(worker=worker, kafka_hosts=args.kafka_hosts,
+ # consume_topic=consume_topic, group="grobid-persist")
+ #pusher.run()
def run_ingest_file(args):
consume_topic = "sandcrawler-{}.ingest-file-requests".format(args.env)