| author | Bryan Newbold <bnewbold@archive.org> | 2021-11-10 15:34:13 -0800 |
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2021-11-10 15:34:13 -0800 |
| commit | e109996661a2b0f89e226b96e525c04dfdd952c1 (patch) | |
| tree | f0a56ccc056ee91a6a0b893c9848601db47f3ee4 | |
| parent | 357b1b07d071cce7d4cd2289ee3965018c89646c (diff) | |
| download | sandcrawler-e109996661a2b0f89e226b96e525c04dfdd952c1.tar.gz, sandcrawler-e109996661a2b0f89e226b96e525c04dfdd952c1.zip | |
grobid_tool: helper to process a single file
| -rwxr-xr-x | python/grobid_tool.py | 15 |

1 files changed, 15 insertions, 0 deletions
```diff
diff --git a/python/grobid_tool.py b/python/grobid_tool.py
index 782bc13..029cbf1 100755
--- a/python/grobid_tool.py
+++ b/python/grobid_tool.py
@@ -18,6 +18,13 @@ from sandcrawler import *
 from sandcrawler.grobid import CrossrefRefsWorker
 
 
+def run_single(args):
+    grobid_client = GrobidClient(host_url=args.grobid_host)
+    resp = grobid_client.process_fulltext(blob=args.pdf_file.read())
+    resp["_metadata"] = grobid_client.metadata(resp)
+    print(json.dumps(resp, sort_keys=True))
+
+
 def run_extract_json(args):
     grobid_client = GrobidClient(host_url=args.grobid_host)
     wayback_client = WaybackClient()
@@ -113,6 +120,14 @@ def main():
     )
     subparsers = parser.add_subparsers()
 
+    sub_single = subparsers.add_parser("single")
+    sub_single.set_defaults(func=run_single)
+    sub_single.add_argument(
+        "pdf_file",
+        help="path to PDF file to process",
+        type=argparse.FileType("rb"),
+    )
+
     sub_extract_json = subparsers.add_parser(
         "extract-json",
         help="for each JSON line with CDX info, fetches PDF and does GROBID extraction",
```
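For context, a minimal standalone sketch of the flow the new `single` subcommand wraps: read a local PDF, send it to GROBID via `GrobidClient.process_fulltext`, attach the parsed metadata, and print the result as JSON. This is only an illustration, not part of the commit; the wrapper name `process_single_pdf` and the default host URL are assumptions, and `GrobidClient` is assumed to be importable from the `sandcrawler` package (the script above relies on `from sandcrawler import *`).

```python
# Sketch only: mirrors the run_single() logic from the diff above.
# Assumptions: a GROBID service is reachable at http://localhost:8070,
# and GrobidClient is exported by the sandcrawler package.
import json
import sys

from sandcrawler import GrobidClient


def process_single_pdf(path: str, grobid_host: str = "http://localhost:8070") -> None:
    # process_single_pdf is a hypothetical helper name, not part of grobid_tool.py
    client = GrobidClient(host_url=grobid_host)
    with open(path, "rb") as f:
        resp = client.process_fulltext(blob=f.read())
    # attach parsed metadata alongside the raw GROBID response, as run_single() does
    resp["_metadata"] = client.metadata(resp)
    print(json.dumps(resp, sort_keys=True))


if __name__ == "__main__":
    process_single_pdf(sys.argv[1])
```

With the commit applied, invoking the tool would look something like `./grobid_tool.py --grobid-host http://localhost:8070 single paper.pdf`; the `--grobid-host` flag name is inferred from `args.grobid_host` in the diff and may differ.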