about | summary | refs | log | tree | commit | diff | stats
path: root/python
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2021-11-03 18:35:44 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-11-04 17:19:52 -0700
commit: 8577d50b644dd45bce5275675eed4d43bb816b67 (patch)
tree: 105f570ba6e9a394ef5e2a86edbfa5864054eb22 /python
parent: 4315b44a93ca31725b9b0a2a55c310725ac55efe (diff)
download: sandcrawler-8577d50b644dd45bce5275675eed4d43bb816b67.tar.gz
download: sandcrawler-8577d50b644dd45bce5275675eed4d43bb816b67.zip
crossref persist: batch size depends on whether parsing refs
Diffstat (limited to 'python')
-rwxr-xr-x python/persist_tool.py 5
-rwxr-xr-x python/sandcrawler_worker.py 5
2 files changed, 8 insertions, 2 deletions
diff --git a/python/persist_tool.py b/python/persist_tool.py
index 5cf5776..069bef7 100755
--- a/python/persist_tool.py
+++ b/python/persist_tool.py
@@ -120,6 +120,9 @@ def run_ingest_request(args):
def run_crossref(args):
+ batch_size = 200
+ if args.parse_refs:
+ batch_size = 10
grobid_client = GrobidClient(
host_url=args.grobid_host,
)
@@ -131,7 +134,7 @@ def run_crossref(args):
pusher = JsonLinePusher(
worker,
args.json_file,
- batch_size=10,
+ batch_size=batch_size,
)
pusher.run()
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py
index c7eca86..52d126a 100755
--- a/python/sandcrawler_worker.py
+++ b/python/sandcrawler_worker.py
@@ -296,6 +296,9 @@ def run_persist_ingest_file(args):
def run_persist_crossref(args):
+ batch_size = 200
+ if args.parse_refs:
+ batch_size = 10
grobid_client = GrobidClient(host_url=args.grobid_host)
consume_topic = "fatcat-{}.api-crossref".format(args.env)
worker = PersistCrossrefWorker(
@@ -310,7 +313,7 @@ def run_persist_crossref(args):
group="persist-crossref",
push_batches=True,
# small batch size because doing GROBID processing
- batch_size=20,
+ batch_size=batch_size,
)
pusher.run()