author    Bryan Newbold <bnewbold@archive.org>  2020-11-03 19:28:21 -0800
committer Bryan Newbold <bnewbold@archive.org>  2020-11-03 19:28:21 -0800
commit    7dc382519187e32856f494af76248fa93aef7770
tree      3cff23b1ebf689239993731b7d0546f94bac8414  /python/sandcrawler_worker.py
parent    2885b34ab3e4c862f9e895a237108d42793efb1d
persist: XML and HTML persist workers
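The two new daemons follow the existing persist-worker pattern: each consumes a per-environment Kafka topic and pushes blobs to S3 (seaweedfs). Topic names use the f-string pattern visible in the diff below; as a quick illustration (the "prod" env value is only an assumed example, not taken from this commit):

env = "prod"  # assumed example value for args.env
xml_doc_topic = f"sandcrawler-{env}.xml-doc"          # -> "sandcrawler-prod.xml-doc"
html_teixml_topic = f"sandcrawler-{env}.html-teixml"  # -> "sandcrawler-prod.html-teixml"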
Diffstat (limited to 'python/sandcrawler_worker.py')
-rwxr-xr-x  python/sandcrawler_worker.py  47
1 file changed, 47 insertions(+), 0 deletions(-)
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py
index b62fa80..24dbdd0 100755
--- a/python/sandcrawler_worker.py
+++ b/python/sandcrawler_worker.py
@@ -13,6 +13,7 @@ import datetime
import raven
from sandcrawler import *
+from sandcrawler.persist import PersistXmlDocWorker, PersistHtmlTeiXmlWorker
# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
try:
@@ -148,6 +149,44 @@ def run_persist_thumbnail(args):
)
pusher.run()
+def run_persist_xml_doc(args: argparse.Namespace) -> None:
+ consume_topic = f"sandcrawler-{args.env}.xml-doc"
+ worker = PersistXmlDocWorker(
+ s3_url=args.s3_url,
+ s3_bucket=args.s3_bucket,
+ s3_access_key=args.s3_access_key,
+ s3_secret_key=args.s3_secret_key,
+ )
+ pusher = KafkaJsonPusher(
+ worker=worker,
+ kafka_hosts=args.kafka_hosts,
+ consume_topic=consume_topic,
+ group="persist-xml-doc",
+ push_batches=False,
+ raw_records=True,
+ batch_size=25,
+ )
+ pusher.run()
+
+def run_persist_html_teixml(args: argparse.Namespace) -> None:
+ consume_topic = f"sandcrawler-{args.env}.html-teixml"
+ worker = PersistHtmlTeiXmlWorker(
+ s3_url=args.s3_url,
+ s3_bucket=args.s3_bucket,
+ s3_access_key=args.s3_access_key,
+ s3_secret_key=args.s3_secret_key,
+ )
+ pusher = KafkaJsonPusher(
+ worker=worker,
+ kafka_hosts=args.kafka_hosts,
+ consume_topic=consume_topic,
+ group="persist-html-teixml",
+ push_batches=False,
+ raw_records=True,
+ batch_size=25,
+ )
+ pusher.run()
+
def run_persist_pdftrio(args):
consume_topic = "sandcrawler-{}.pdftrio-output".format(args.env)
worker = PersistPdfTrioWorker(
@@ -293,6 +332,14 @@ def main():
help="daemon that consumes thumbnail output from Kafka and pushes to S3 (seaweedfs) and postgres")
sub_persist_thumbnail.set_defaults(func=run_persist_thumbnail)
+ sub_persist_xml_doc = subparsers.add_parser('persist-xml-doc',
+ help="daemon that consumes xml-doc output from Kafka and pushes to S3 (seaweedfs) bucket")
+ sub_persist_xml_doc.set_defaults(func=run_persist_xml_doc)
+
+ sub_persist_html_teixml = subparsers.add_parser('persist-html-teixml',
+ help="daemon that consumes html-teixml output from Kafka and pushes to S3 (seaweedfs) bucket")
+ sub_persist_html_teixml.set_defaults(func=run_persist_html_teixml)
+
sub_persist_pdftrio = subparsers.add_parser('persist-pdftrio',
help="daemon that consumes pdftrio output from Kafka and pushes to postgres")
sub_persist_pdftrio.set_defaults(func=run_persist_pdftrio)
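
For context on how the new subcommands are dispatched: the set_defaults(func=...) calls above bind each subparser to its run_* function, so main() only needs to parse arguments and call the bound function. A minimal standalone sketch of that pattern, assuming an --env flag and an args.func(args) dispatch in main() (neither is shown in this diff; names below are illustrative, not copied from sandcrawler_worker.py):

import argparse

def run_persist_xml_doc(args: argparse.Namespace) -> None:
    # Placeholder body; the real worker wires up PersistXmlDocWorker + KafkaJsonPusher.
    print(f"would consume sandcrawler-{args.env}.xml-doc")

def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', default='dev')
    subparsers = parser.add_subparsers(dest='command', required=True)

    sub = subparsers.add_parser('persist-xml-doc',
        help="daemon that consumes xml-doc output from Kafka and pushes to S3 bucket")
    sub.set_defaults(func=run_persist_xml_doc)

    args = parser.parse_args()
    args.func(args)  # dispatch to the function bound by set_defaults()

if __name__ == '__main__':
    main()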