From 7dc382519187e32856f494af76248fa93aef7770 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 3 Nov 2020 19:28:21 -0800 Subject: persist: XML and HTML persist workers --- python/sandcrawler_worker.py | 47 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) (limited to 'python/sandcrawler_worker.py') diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py index b62fa80..24dbdd0 100755 --- a/python/sandcrawler_worker.py +++ b/python/sandcrawler_worker.py @@ -13,6 +13,7 @@ import datetime import raven from sandcrawler import * +from sandcrawler.persist import PersistXmlDocWorker, PersistHtmlTeiXmlWorker # Yep, a global. Gets DSN from `SENTRY_DSN` environment variable try: @@ -148,6 +149,44 @@ def run_persist_thumbnail(args): ) pusher.run() +def run_persist_xml_doc(args: argparse.Namespace) -> None: + consume_topic = f"sandcrawler-{args.env}.xml-doc" + worker = PersistXmlDocWorker( + s3_url=args.s3_url, + s3_bucket=args.s3_bucket, + s3_access_key=args.s3_access_key, + s3_secret_key=args.s3_secret_key, + ) + pusher = KafkaJsonPusher( + worker=worker, + kafka_hosts=args.kafka_hosts, + consume_topic=consume_topic, + group="persist-xml-doc", + push_batches=False, + raw_records=True, + batch_size=25, + ) + pusher.run() + +def run_persist_html_teixml(args: argparse.Namespace) -> None: + consume_topic = f"sandcrawler-{args.env}.html-teixml" + worker = PersistHtmlTeiXmlWorker( + s3_url=args.s3_url, + s3_bucket=args.s3_bucket, + s3_access_key=args.s3_access_key, + s3_secret_key=args.s3_secret_key, + ) + pusher = KafkaJsonPusher( + worker=worker, + kafka_hosts=args.kafka_hosts, + consume_topic=consume_topic, + group="persist-html-teixml", + push_batches=False, + raw_records=True, + batch_size=25, + ) + pusher.run() + def run_persist_pdftrio(args): consume_topic = "sandcrawler-{}.pdftrio-output".format(args.env) worker = PersistPdfTrioWorker( @@ -293,6 +332,14 @@ def main(): help="daemon that consumes thumbnail output from Kafka and pushes to S3 (seaweedfs) and postgres") sub_persist_thumbnail.set_defaults(func=run_persist_thumbnail) + sub_persist_xml_doc = subparsers.add_parser('persist-xml-doc', + help="daemon that consumes xml-doc output from Kafka and pushes to S3 (seaweedfs) bucket") + sub_persist_xml_doc.set_defaults(func=run_persist_xml_doc) + + sub_persist_html_teixml = subparsers.add_parser('persist-html-teixml', + help="daemon that consumes html-teixml output from Kafka and pushes to S3 (seaweedfs) bucket") + sub_persist_html_teixml.set_defaults(func=run_persist_html_teixml) + sub_persist_pdftrio = subparsers.add_parser('persist-pdftrio', help="daemon that consumes pdftrio output from Kafka and pushes to postgres") sub_persist_pdftrio.set_defaults(func=run_persist_pdftrio) -- cgit v1.2.3