diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2021-10-29 18:36:53 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2021-11-04 17:19:52 -0700 | 
| commit | a859fddb227872ce52f06af1dd9fb80987f348c4 (patch) | |
| tree | ced078e2d563feed196fdf00c33cec39a8b42031 /python/sandcrawler_worker.py | |
| parent | 16f4b7f45ae8bdcd4018850efe164ed19069e9fe (diff) | |
| download | sandcrawler-a859fddb227872ce52f06af1dd9fb80987f348c4.tar.gz sandcrawler-a859fddb227872ce52f06af1dd9fb80987f348c4.zip  | |
glue, utils, and worker code for crossref and grobid_refs
Diffstat (limited to 'python/sandcrawler_worker.py')
| -rwxr-xr-x | python/sandcrawler_worker.py | 33 | 
1 files changed, 31 insertions, 2 deletions
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py index d42cd8c..73bd444 100755 --- a/python/sandcrawler_worker.py +++ b/python/sandcrawler_worker.py @@ -12,7 +12,11 @@ import sys  import raven  from sandcrawler import * -from sandcrawler.persist import PersistHtmlTeiXmlWorker, PersistXmlDocWorker +from sandcrawler.persist import ( +    PersistCrossrefWorker, +    PersistHtmlTeiXmlWorker, +    PersistXmlDocWorker, +)  # Yep, a global. Gets DSN from `SENTRY_DSN` environment variable  try: @@ -291,6 +295,22 @@ def run_persist_ingest_file(args):      pusher.run() +def run_persist_crossref(args): +    grobid_client = GrobidClient(host_url=args.grobid_host) +    consume_topic = "fatcat-{}.api-crossref".format(args.env) +    worker = PersistCrossrefWorker(db_url=args.db_url, grobid_client=grobid_client) +    pusher = KafkaJsonPusher( +        worker=worker, +        kafka_hosts=args.kafka_hosts, +        consume_topic=consume_topic, +        group="persist-ingest", +        push_batches=True, +        # small batch size because doing GROBID processing +        batch_size=20, +    ) +    pusher.run() + +  def main():      parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)      parser.add_argument( @@ -302,7 +322,7 @@ def main():          "--env", default="dev", help="Kafka topic namespace to use (eg, prod, qa, dev)"      )      parser.add_argument( -        "--grobid-host", default="http://grobid.qa.fatcat.wiki", help="GROBID API host/port" +        "--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port"      )      parser.add_argument(          "--db-url", @@ -417,6 +437,15 @@ def main():      )      sub_persist_ingest_file.set_defaults(func=run_persist_ingest_file) +    sub_persist_crossref = subparsers.add_parser( +        "persist-crossref", +        help="daemon that persists crossref to postgres; also does GROBID ref transform", +    ) +    sub_persist_crossref.add_argument( +        "--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port" +    ) +    sub_persist_crossref.set_defaults(func=run_persist_crossref) +      args = parser.parse_args()      if not args.__dict__.get("func"):          parser.print_help(file=sys.stderr)  | 
