diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-11-01 17:54:42 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-11-04 17:19:52 -0700 |
commit | da87108eecfd94e02d949a4fe4fc7998a489b934 (patch) | |
tree | 75b93602d3ec49615234ade1fb6d60abe0c21020 /python/sandcrawler_worker.py | |
parent | 59af5ddd0a9587eaf53b4f6965c0d6290295ce55 (diff) | |
download | sandcrawler-da87108eecfd94e02d949a4fe4fc7998a489b934.tar.gz sandcrawler-da87108eecfd94e02d949a4fe4fc7998a489b934.zip |
crossref persist: make GROBID ref parsing an option (not default)
Diffstat (limited to 'python/sandcrawler_worker.py')
-rwxr-xr-x | python/sandcrawler_worker.py | 13 |
1 file changed, 11 insertions, 2 deletions
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py
index 73bd444..c7eca86 100755
--- a/python/sandcrawler_worker.py
+++ b/python/sandcrawler_worker.py
@@ -298,12 +298,16 @@ def run_persist_ingest_file(args):
 def run_persist_crossref(args):
     grobid_client = GrobidClient(host_url=args.grobid_host)
     consume_topic = "fatcat-{}.api-crossref".format(args.env)
-    worker = PersistCrossrefWorker(db_url=args.db_url, grobid_client=grobid_client)
+    worker = PersistCrossrefWorker(
+        db_url=args.db_url,
+        grobid_client=grobid_client,
+        parse_refs=args.parse_refs,
+    )
     pusher = KafkaJsonPusher(
         worker=worker,
         kafka_hosts=args.kafka_hosts,
         consume_topic=consume_topic,
-        group="persist-ingest",
+        group="persist-crossref",
         push_batches=True,
         # small batch size because doing GROBID processing
         batch_size=20,
@@ -444,6 +448,11 @@ def main():
     sub_persist_crossref.add_argument(
         "--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port"
     )
+    sub_persist_crossref.add_argument(
+        "--parse-refs",
+        action="store_true",
+        help="use GROBID to parse any unstructured references (default is to not)",
+    )
     sub_persist_crossref.set_defaults(func=run_persist_crossref)
 
     args = parser.parse_args()