diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-06-25 11:20:15 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-06-25 11:20:15 -0700 |
commit | a9155b4bec34669552ac348da524154f93ee453a (patch) | |
tree | 6b366dae615727da23298fad8cd91e6be574a15b /python | |
parent | 2a7ef915ad83dbcd2b00fa211f210a80cd561f27 (diff) | |
download | sandcrawler-a9155b4bec34669552ac348da524154f93ee453a.tar.gz sandcrawler-a9155b4bec34669552ac348da524154f93ee453a.zip |
pdfextract worker
Diffstat (limited to 'python')
-rwxr-xr-x | python/sandcrawler_worker.py | 35 |
1 file changed, 34 insertions, 1 deletion
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py
index d85a995..024358a 100755
--- a/python/sandcrawler_worker.py
+++ b/python/sandcrawler_worker.py
@@ -49,6 +49,35 @@ def run_grobid_extract(args):
     )
     pusher.run()
 
+def run_pdf_extract(args):
+    consume_topic = "sandcrawler-{}.unextracted".format(args.env)
+    text_topic = "sandcrawler-{}.pdf-text".format(args.kafka_env)
+    thumbnail_topic = "sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.kafka_env)
+    text_sink = KafkaCompressSink(
+        kafka_hosts=args.kafka_hosts,
+        produce_topic=text_topic,
+    )
+    thumbnail_sink = KafkaSink(
+        kafka_hosts=args.kafka_hosts,
+        produce_topic=thumbnail_topic,
+    )
+    wayback_client = WaybackClient(
+        host_url=args.grobid_host,
+    )
+    worker = PdfExtractWorker(
+        wayback_client=wayback_client,
+        sink=text_sink,
+        thumbnail_sink=thumbnail_sink,
+    )
+    pusher = KafkaJsonPusher(
+        worker=worker,
+        kafka_hosts=args.kafka_hosts,
+        consume_topic=consume_topic,
+        group="pdf-extract",
+        batch_size=1,
+    )
+    pusher.run()
+
 def run_persist_grobid(args):
     consume_topic = "sandcrawler-{}.grobid-output-pg".format(args.env)
     worker = PersistGrobidWorker(
@@ -237,9 +266,13 @@ def main():
     subparsers = parser.add_subparsers()
 
     sub_grobid_extract = subparsers.add_parser('grobid-extract',
-        help="daemon that consumes CDX JSON objects from Kafka, extracts, pushes to Kafka")
+        help="daemon that consumes CDX JSON objects from Kafka, uses GROBID to extract XML, pushes to Kafka")
     sub_grobid_extract.set_defaults(func=run_grobid_extract)
 
+    sub_pdf_extract = subparsers.add_parser('pdf-extract',
+        help="daemon that consumes CDX JSON objects from Kafka, extracts text and thumbnail, pushes to Kafka")
+    sub_pdf_extract.set_defaults(func=run_pdf_extract)
+
     sub_persist_grobid = subparsers.add_parser('persist-grobid',
         help="daemon that consumes GROBID output from Kafka and pushes to minio and postgres")
     sub_persist_grobid.add_argument('--s3-only',