diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-06-25 12:46:06 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-06-25 12:46:06 -0700 |
commit | 5570de5a2f20fb654e21cce64687ce5e0d96eb7a (patch) | |
tree | 2f23627b6b33c2c1fedf56dc8984bbb4635be1cb | |
parent | 5ccc4e792a5b5b4f77db34eafe495c39da643120 (diff) | |
download | sandcrawler-5570de5a2f20fb654e21cce64687ce5e0d96eb7a.tar.gz sandcrawler-5570de5a2f20fb654e21cce64687ce5e0d96eb7a.zip |
sandcrawler_worker: remove duplicate run_pdf_extract()
-rwxr-xr-x | python/sandcrawler_worker.py | 29 |
1 files changed, 0 insertions, 29 deletions
```diff
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py
index 024358a..3b49cf2 100755
--- a/python/sandcrawler_worker.py
+++ b/python/sandcrawler_worker.py
@@ -99,35 +99,6 @@ def run_persist_grobid(args):
     )
     pusher.run()
 
-def run_pdf_extract(args):
-    consume_topic = "sandcrawler-{}.unextracted".format(args.env)
-    text_topic = "sandcrawler-{}.pdf-text".format(args.env)
-    thumbnail_topic = "sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.env)
-    text_sink = KafkaSink(
-        kafka_hosts=args.kafka_hosts,
-        produce_topic=text_topic,
-    )
-    thumbnail_sink = KafkaSink(
-        kafka_hosts=args.kafka_hosts,
-        produce_topic=thumbnail_topic,
-    )
-    wayback_client = WaybackClient(
-        host_url=args.grobid_host,
-    )
-    worker = PdfExtractWorker(
-        wayback_client=wayback_client,
-        sink=text_sink,
-        thumbnail_sink=thumbnail_sink,
-    )
-    pusher = KafkaJsonPusher(
-        worker=worker,
-        kafka_hosts=args.kafka_hosts,
-        consume_topic=consume_topic,
-        group="pdf-extract",
-        batch_size=1,
-    )
-    pusher.run()
-
 def run_persist_pdftext(args):
     consume_topic = "sandcrawler-{}.pdf-text".format(args.env)
     worker = PersistPdfTextWorker(
```