aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-12-03 16:38:28 -0800
committer Bryan Newbold <bnewbold@archive.org> 2021-12-07 19:10:23 -0800
commit 5c82ee1b965e1f3901294c752d8b2d24c6bdc974 (patch)
tree c8e34aa3fcef0f064f2b9e6ea4bbb1b767e85dea
parent 57441fda8be33594898c1836fba22b12fb3e94e8 (diff)
download sandcrawler-5c82ee1b965e1f3901294c752d8b2d24c6bdc974.tar.gz
sandcrawler-5c82ee1b965e1f3901294c752d8b2d24c6bdc974.zip
ingest tool: allow configuration of GROBID endpoint
-rwxr-xr-x python/ingest_tool.py 7
1 files changed, 7 insertions, 0 deletions
diff --git a/python/ingest_tool.py b/python/ingest_tool.py
index 60a59d2..1843e0b 100755
--- a/python/ingest_tool.py
+++ b/python/ingest_tool.py
@@ -27,9 +27,13 @@ def run_single_ingest(args):
ingest_file_result_stdout=True,
)
else:
+ grobid_client = GrobidClient(
+ host_url=args.grobid_host,
+ )
ingester = IngestFileWorker(
try_spn2=not args.no_spn2,
html_quick_mode=args.html_quick_mode,
+ grobid_client=grobid_client,
)
result = ingester.process(request)
print(json.dumps(result, sort_keys=True))
@@ -140,6 +144,9 @@ def main():
help="don't fetch individual sub-resources, just use CDX",
)
sub_single.add_argument("url", help="URL of paper to fetch")
+ sub_single.add_argument(
+ "--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port"
+ )
sub_requests = subparsers.add_parser(
"requests", help="takes a series of ingest requests (JSON, per line) and runs each"