diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-29 18:36:53 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-11-04 17:19:52 -0700 |
commit | a859fddb227872ce52f06af1dd9fb80987f348c4 (patch) | |
tree | ced078e2d563feed196fdf00c33cec39a8b42031 /python/sandcrawler_worker.py | |
parent | 16f4b7f45ae8bdcd4018850efe164ed19069e9fe (diff) | |
download | sandcrawler-a859fddb227872ce52f06af1dd9fb80987f348c4.tar.gz sandcrawler-a859fddb227872ce52f06af1dd9fb80987f348c4.zip |
glue, utils, and worker code for crossref and grobid_refs
Diffstat (limited to 'python/sandcrawler_worker.py')
-rwxr-xr-x | python/sandcrawler_worker.py | 33 |
1 files changed, 31 insertions, 2 deletions
```diff
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py
index d42cd8c..73bd444 100755
--- a/python/sandcrawler_worker.py
+++ b/python/sandcrawler_worker.py
@@ -12,7 +12,11 @@ import sys
 import raven
 
 from sandcrawler import *
-from sandcrawler.persist import PersistHtmlTeiXmlWorker, PersistXmlDocWorker
+from sandcrawler.persist import (
+    PersistCrossrefWorker,
+    PersistHtmlTeiXmlWorker,
+    PersistXmlDocWorker,
+)
 
 # Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
 try:
@@ -291,6 +295,22 @@ def run_persist_ingest_file(args):
     pusher.run()
 
 
+def run_persist_crossref(args):
+    grobid_client = GrobidClient(host_url=args.grobid_host)
+    consume_topic = "fatcat-{}.api-crossref".format(args.env)
+    worker = PersistCrossrefWorker(db_url=args.db_url, grobid_client=grobid_client)
+    pusher = KafkaJsonPusher(
+        worker=worker,
+        kafka_hosts=args.kafka_hosts,
+        consume_topic=consume_topic,
+        group="persist-ingest",
+        push_batches=True,
+        # small batch size because doing GROBID processing
+        batch_size=20,
+    )
+    pusher.run()
+
+
 def main():
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument(
@@ -302,7 +322,7 @@ def main():
         "--env", default="dev", help="Kafka topic namespace to use (eg, prod, qa, dev)"
     )
     parser.add_argument(
-        "--grobid-host", default="http://grobid.qa.fatcat.wiki", help="GROBID API host/port"
+        "--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port"
     )
     parser.add_argument(
         "--db-url",
@@ -417,6 +437,15 @@ def main():
     )
     sub_persist_ingest_file.set_defaults(func=run_persist_ingest_file)
 
+    sub_persist_crossref = subparsers.add_parser(
+        "persist-crossref",
+        help="daemon that persists crossref to postgres; also does GROBID ref transform",
+    )
+    sub_persist_crossref.add_argument(
+        "--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port"
+    )
+    sub_persist_crossref.set_defaults(func=run_persist_crossref)
+
     args = parser.parse_args()
     if not args.__dict__.get("func"):
         parser.print_help(file=sys.stderr)
```