diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-11-03 18:35:44 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-11-04 17:19:52 -0700 |
commit | 8577d50b644dd45bce5275675eed4d43bb816b67 (patch) | |
tree | 105f570ba6e9a394ef5e2a86edbfa5864054eb22 | |
parent | 4315b44a93ca31725b9b0a2a55c310725ac55efe (diff) | |
download | sandcrawler-8577d50b644dd45bce5275675eed4d43bb816b67.tar.gz sandcrawler-8577d50b644dd45bce5275675eed4d43bb816b67.zip |
crossref persist: batch size depends on whether parsing refs
-rwxr-xr-x | python/persist_tool.py | 5 | ||||
-rwxr-xr-x | python/sandcrawler_worker.py | 5 |
2 files changed, 8 insertions, 2 deletions
diff --git a/python/persist_tool.py b/python/persist_tool.py
index 5cf5776..069bef7 100755
--- a/python/persist_tool.py
+++ b/python/persist_tool.py
@@ -120,6 +120,9 @@ def run_ingest_request(args):
 
 
 def run_crossref(args):
+    batch_size = 200
+    if args.parse_refs:
+        batch_size = 10
     grobid_client = GrobidClient(
         host_url=args.grobid_host,
     )
@@ -131,7 +134,7 @@ def run_crossref(args):
     pusher = JsonLinePusher(
         worker,
         args.json_file,
-        batch_size=10,
+        batch_size=batch_size,
     )
     pusher.run()
 
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py
index c7eca86..52d126a 100755
--- a/python/sandcrawler_worker.py
+++ b/python/sandcrawler_worker.py
@@ -296,6 +296,9 @@ def run_persist_ingest_file(args):
 
 
 def run_persist_crossref(args):
+    batch_size = 200
+    if args.parse_refs:
+        batch_size = 10
     grobid_client = GrobidClient(host_url=args.grobid_host)
     consume_topic = "fatcat-{}.api-crossref".format(args.env)
     worker = PersistCrossrefWorker(
@@ -310,7 +313,7 @@ def run_persist_crossref(args):
         group="persist-crossref",
         push_batches=True,
         # small batch size because doing GROBID processing
-        batch_size=20,
+        batch_size=batch_size,
     )
     pusher.run()