diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-11-03 19:28:21 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-11-03 19:28:21 -0800 |
commit | 7dc382519187e32856f494af76248fa93aef7770 (patch) | |
tree | 3cff23b1ebf689239993731b7d0546f94bac8414 /python/sandcrawler_worker.py | |
parent | 2885b34ab3e4c862f9e895a237108d42793efb1d (diff) | |
download | sandcrawler-7dc382519187e32856f494af76248fa93aef7770.tar.gz sandcrawler-7dc382519187e32856f494af76248fa93aef7770.zip |
persist: XML and HTML persist workers
Diffstat (limited to 'python/sandcrawler_worker.py')
-rwxr-xr-x | python/sandcrawler_worker.py | 47 |
1 files changed, 47 insertions, 0 deletions
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py index b62fa80..24dbdd0 100755 --- a/python/sandcrawler_worker.py +++ b/python/sandcrawler_worker.py @@ -13,6 +13,7 @@ import datetime import raven from sandcrawler import * +from sandcrawler.persist import PersistXmlDocWorker, PersistHtmlTeiXmlWorker # Yep, a global. Gets DSN from `SENTRY_DSN` environment variable try: @@ -148,6 +149,44 @@ def run_persist_thumbnail(args): ) pusher.run() +def run_persist_xml_doc(args: argparse.Namespace) -> None: + consume_topic = f"sandcrawler-{args.env}.xml-doc" + worker = PersistXmlDocWorker( + s3_url=args.s3_url, + s3_bucket=args.s3_bucket, + s3_access_key=args.s3_access_key, + s3_secret_key=args.s3_secret_key, + ) + pusher = KafkaJsonPusher( + worker=worker, + kafka_hosts=args.kafka_hosts, + consume_topic=consume_topic, + group="persist-xml-doc", + push_batches=False, + raw_records=True, + batch_size=25, + ) + pusher.run() + +def run_persist_html_teixml(args: argparse.Namespace) -> None: + consume_topic = f"sandcrawler-{args.env}.html-teixml" + worker = PersistHtmlTeiXmlWorker( + s3_url=args.s3_url, + s3_bucket=args.s3_bucket, + s3_access_key=args.s3_access_key, + s3_secret_key=args.s3_secret_key, + ) + pusher = KafkaJsonPusher( + worker=worker, + kafka_hosts=args.kafka_hosts, + consume_topic=consume_topic, + group="persist-html-teixml", + push_batches=False, + raw_records=True, + batch_size=25, + ) + pusher.run() + def run_persist_pdftrio(args): consume_topic = "sandcrawler-{}.pdftrio-output".format(args.env) worker = PersistPdfTrioWorker( @@ -293,6 +332,14 @@ def main(): help="daemon that consumes thumbnail output from Kafka and pushes to S3 (seaweedfs) and postgres") sub_persist_thumbnail.set_defaults(func=run_persist_thumbnail) + sub_persist_xml_doc = subparsers.add_parser('persist-xml-doc', + help="daemon that consumes xml-doc output from Kafka and pushes to S3 (seaweedfs) bucket") + sub_persist_xml_doc.set_defaults(func=run_persist_xml_doc) + + sub_persist_html_teixml = subparsers.add_parser('persist-html-teixml', + help="daemon that consumes html-teixml output from Kafka and pushes to S3 (seaweedfs) bucket") + sub_persist_html_teixml.set_defaults(func=run_persist_html_teixml) + sub_persist_pdftrio = subparsers.add_parser('persist-pdftrio', help="daemon that consumes pdftrio output from Kafka and pushes to postgres") sub_persist_pdftrio.set_defaults(func=run_persist_pdftrio) |