From 1c538a10ac1f6e59ac56a6fbb9d583957e05f9f1 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Sun, 22 Dec 2019 20:36:54 -0800 Subject: commit grobid_tool transform mode Had some stale code on aitio with this change I forgot to commit. Oops! --- python/grobid_tool.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/python/grobid_tool.py b/python/grobid_tool.py index a8d5120..ad7841d 100755 --- a/python/grobid_tool.py +++ b/python/grobid_tool.py @@ -10,9 +10,11 @@ Example of large parallel run, locally: """ import sys +import json import argparse import datetime +from grobid2json import teixml2json from sandcrawler import * @@ -49,6 +51,22 @@ def run_extract_zipfile(args): pusher = ZipfilePusher(worker, args.zip_file) pusher.run() +def run_transform(args): + grobid_client = GrobidClient() + for line in args.json_file: + if not line.strip(): + continue + line = json.loads(line) + if args.metadata_only: + out = grobid_client.metadata(line) + else: + out = teixml2json(line['tei_xml']) + if out: + if 'source' in line: + out['source'] = line['source'] + print(json.dumps(out)) + + def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) @@ -90,6 +108,15 @@ def main(): help="zipfile with PDFs to extract", type=str) + sub_transform = subparsers.add_parser('transform') + sub_transform.set_defaults(func=run_transform) + sub_transform.add_argument('--metadata-only', + action='store_true', + help="Only pass through bibliographic metadata, not fulltext") + sub_transform.add_argument('json_file', + help="convert TEI-XML to JSON. Input is JSON lines with tei_xml field", + type=argparse.FileType('r')) + args = parser.parse_args() if not args.__dict__.get("func"): print("tell me what to do!") -- cgit v1.2.3