diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2021-11-03 18:35:44 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2021-11-04 17:19:52 -0700 | 
| commit | 8577d50b644dd45bce5275675eed4d43bb816b67 (patch) | |
| tree | 105f570ba6e9a394ef5e2a86edbfa5864054eb22 /python | |
| parent | 4315b44a93ca31725b9b0a2a55c310725ac55efe (diff) | |
| download | sandcrawler-8577d50b644dd45bce5275675eed4d43bb816b67.tar.gz sandcrawler-8577d50b644dd45bce5275675eed4d43bb816b67.zip | |
crossref persist: batch size depends on whether parsing refs
Diffstat (limited to 'python')
| -rwxr-xr-x | python/persist_tool.py | 5 | ||||
| -rwxr-xr-x | python/sandcrawler_worker.py | 5 | 
2 files changed, 8 insertions, 2 deletions
```diff
diff --git a/python/persist_tool.py b/python/persist_tool.py
index 5cf5776..069bef7 100755
--- a/python/persist_tool.py
+++ b/python/persist_tool.py
@@ -120,6 +120,9 @@ def run_ingest_request(args):
 
 
 def run_crossref(args):
+    batch_size = 200
+    if args.parse_refs:
+        batch_size = 10
     grobid_client = GrobidClient(
         host_url=args.grobid_host,
     )
@@ -131,7 +134,7 @@ def run_crossref(args):
     pusher = JsonLinePusher(
         worker,
         args.json_file,
-        batch_size=10,
+        batch_size=batch_size,
     )
     pusher.run()
 
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py
index c7eca86..52d126a 100755
--- a/python/sandcrawler_worker.py
+++ b/python/sandcrawler_worker.py
@@ -296,6 +296,9 @@ def run_persist_ingest_file(args):
 
 
 def run_persist_crossref(args):
+    batch_size = 200
+    if args.parse_refs:
+        batch_size = 10
     grobid_client = GrobidClient(host_url=args.grobid_host)
     consume_topic = "fatcat-{}.api-crossref".format(args.env)
     worker = PersistCrossrefWorker(
@@ -310,7 +313,7 @@ def run_persist_crossref(args):
         group="persist-crossref",
         push_batches=True,
         # small batch size because doing GROBID processing
-        batch_size=20,
+        batch_size=batch_size,
     )
     pusher.run()
 
```
