From 33249f2679851afb64142c428be45d16f35f5539 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Tue, 20 Oct 2020 17:37:52 -0700
Subject: persist PDF extraction in ingest pipeline

Ooof, didn't realize that this wasn't happening. Explains a lot of
missing thumbnails in scholar!
---
 python/sandcrawler_worker.py | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

(limited to 'python')

diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py
index 833b9c4..77c0704 100755
--- a/python/sandcrawler_worker.py
+++ b/python/sandcrawler_worker.py
@@ -51,11 +51,11 @@ def run_grobid_extract(args):

 def run_pdf_extract(args):
     consume_topic = "sandcrawler-{}.unextracted".format(args.env)
-    text_topic = "sandcrawler-{}.pdf-text".format(args.env)
+    pdftext_topic = "sandcrawler-{}.pdf-text".format(args.env)
     thumbnail_topic = "sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.env)
-    text_sink = KafkaCompressSink(
+    pdftext_sink = KafkaCompressSink(
         kafka_hosts=args.kafka_hosts,
-        produce_topic=text_topic,
+        produce_topic=pdftext_topic,
     )
     thumbnail_sink = KafkaSink(
         kafka_hosts=args.kafka_hosts,
@@ -66,7 +66,7 @@ def run_pdf_extract(args):
     )
     worker = PdfExtractWorker(
         wayback_client=wayback_client,
-        sink=text_sink,
+        sink=pdftext_sink,
         thumbnail_sink=thumbnail_sink,
     )
     pusher = KafkaJsonPusher(
@@ -172,6 +172,8 @@ def run_ingest_file(args):
     consume_topic = "sandcrawler-{}.ingest-file-requests".format(args.env)
     produce_topic = "sandcrawler-{}.ingest-file-results".format(args.env)
     grobid_topic = "sandcrawler-{}.grobid-output-pg".format(args.env)
+    pdftext_topic = "sandcrawler-{}.pdf-text".format(args.env)
+    thumbnail_topic = "sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.env)
     sink = KafkaSink(
         kafka_hosts=args.kafka_hosts,
         produce_topic=produce_topic,
@@ -183,10 +185,20 @@ def run_ingest_file(args):
     grobid_client = GrobidClient(
         host_url=args.grobid_host,
     )
+    pdftext_sink = KafkaCompressSink(
+        kafka_hosts=args.kafka_hosts,
+        produce_topic=pdftext_topic,
+    )
+    thumbnail_sink = KafkaSink(
+        kafka_hosts=args.kafka_hosts,
+        produce_topic=thumbnail_topic,
+    )
     worker = IngestFileWorker(
         grobid_client=grobid_client,
         sink=sink,
         grobid_sink=grobid_sink,
+        thumbnail_sink=thumbnail_sink,
+        pdftext_sink=pdftext_sink,
         # don't SPNv2 for --bulk backfill
         try_spn2=not args.bulk,
     )
--
cgit v1.2.3