about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-06-25 12:46:06 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-06-25 12:46:06 -0700
commit: 5570de5a2f20fb654e21cce64687ce5e0d96eb7a (patch)
tree: 2f23627b6b33c2c1fedf56dc8984bbb4635be1cb
parent: 5ccc4e792a5b5b4f77db34eafe495c39da643120 (diff)
download: sandcrawler-5570de5a2f20fb654e21cce64687ce5e0d96eb7a.tar.gz
sandcrawler-5570de5a2f20fb654e21cce64687ce5e0d96eb7a.zip
sandcrawler_worker: remove duplicate run_pdf_extract()
-rwxr-xr-x python/sandcrawler_worker.py 29
1 files changed, 0 insertions, 29 deletions
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py
index 024358a..3b49cf2 100755
--- a/python/sandcrawler_worker.py
+++ b/python/sandcrawler_worker.py
@@ -99,35 +99,6 @@ def run_persist_grobid(args):
)
pusher.run()
-def run_pdf_extract(args):
- consume_topic = "sandcrawler-{}.unextracted".format(args.env)
- text_topic = "sandcrawler-{}.pdf-text".format(args.env)
- thumbnail_topic = "sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.env)
- text_sink = KafkaSink(
- kafka_hosts=args.kafka_hosts,
- produce_topic=text_topic,
- )
- thumbnail_sink = KafkaSink(
- kafka_hosts=args.kafka_hosts,
- produce_topic=thumbnail_topic,
- )
- wayback_client = WaybackClient(
- host_url=args.grobid_host,
- )
- worker = PdfExtractWorker(
- wayback_client=wayback_client,
- sink=text_sink,
- thumbnail_sink=thumbnail_sink,
- )
- pusher = KafkaJsonPusher(
- worker=worker,
- kafka_hosts=args.kafka_hosts,
- consume_topic=consume_topic,
- group="pdf-extract",
- batch_size=1,
- )
- pusher.run()
-
def run_persist_pdftext(args):
consume_topic = "sandcrawler-{}.pdf-text".format(args.env)
worker = PersistPdfTextWorker(