diff options
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/ingest.py | 2 | ||||
-rwxr-xr-x | python/sandcrawler_worker.py | 15 |
2 files changed, 11 insertions, 6 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index c912af7..8f7220d 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -181,7 +181,7 @@ class IngestFileRequestHandler(BaseHTTPRequestHandler):
         length = int(self.headers.get('content-length'))
         request = json.loads(self.rfile.read(length).decode('utf-8'))
         print("Got request: {}".format(request))
-        ingester = FileIngestWorker()
+        ingester = IngestFileWorker()
         result = ingester.process(request)
         self.send_response(200)
         self.end_headers()
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py
index 27d6ebd..1000228 100755
--- a/python/sandcrawler_worker.py
+++ b/python/sandcrawler_worker.py
@@ -21,15 +21,20 @@ def run_grobid_extract(args):
     consume_topic = "sandcrawler-{}.ungrobided-pg".format(args.env)
     produce_topic = "sandcrawler-{}.grobid-output-pg".format(args.env)
     sink = KafkaSink(kafka_hosts=args.kafka_hosts, produce_topic=produce_topic)
-    worker = GrobidWorker(host_url=args.grobid_host, sink=sink)
-    pusher = KafkaJsonPusher(sink=worker, group="grobid-extract")
+    grobid_client = GrobidClient(host_url=args.grobid_host)
+    wayback_client = WaybackClient(host_url=args.grobid_host)
+    worker = GrobidWorker(grobid_client=grobid_client, wayback_client=wayback_client, sink=sink)
+    pusher = KafkaJsonPusher(worker=worker, kafka_hosts=args.kafka_hosts,
+        consume_topic=consume_topic, group="grobid-extract")
     pusher.run()
 
 def run_grobid_persist(args):
     consume_topic = "sandcrawler-{}.grobid-output-pg".format(args.env)
-    sink = GrobidPersist(consume_topic=consume_topic)
-    pusher = KafkaJsonPusher(sink)
-    pusher.run()
+    raise NotImplementedError
+    #worker = GrobidPersistWorker()
+    #pusher = KafkaJsonPusher(worker=worker, kafka_hosts=args.kafka_hosts,
+    #    consume_topic=consume_topic, group="grobid-persist")
+    #pusher.run()
 
 def run_ingest_file(args):
     consume_topic = "sandcrawler-{}.ingest-file-requests".format(args.env)