aboutsummaryrefslogtreecommitdiffstats
path: root/python/grobid_tool.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/grobid_tool.py')
-rwxr-xr-x python/grobid_tool.py 15
1 files changed, 15 insertions, 0 deletions
diff --git a/python/grobid_tool.py b/python/grobid_tool.py
index 782bc13..029cbf1 100755
--- a/python/grobid_tool.py
+++ b/python/grobid_tool.py
@@ -18,6 +18,13 @@ from sandcrawler import *
from sandcrawler.grobid import CrossrefRefsWorker
+def run_single(args):
+ grobid_client = GrobidClient(host_url=args.grobid_host)
+ resp = grobid_client.process_fulltext(blob=args.pdf_file.read())
+ resp["_metadata"] = grobid_client.metadata(resp)
+ print(json.dumps(resp, sort_keys=True))
+
+
def run_extract_json(args):
grobid_client = GrobidClient(host_url=args.grobid_host)
wayback_client = WaybackClient()
@@ -113,6 +120,14 @@ def main():
)
subparsers = parser.add_subparsers()
+ sub_single = subparsers.add_parser("single")
+ sub_single.set_defaults(func=run_single)
+ sub_single.add_argument(
+ "pdf_file",
+ help="path to PDF file to process",
+ type=argparse.FileType("rb"),
+ )
+
sub_extract_json = subparsers.add_parser(
"extract-json",
help="for each JSON line with CDX info, fetches PDF and does GROBID extraction",