diff options
Diffstat (limited to 'python/grobid_tool.py')
-rwxr-xr-x | python/grobid_tool.py | 27 |
1 files changed, 27 insertions, 0 deletions
diff --git a/python/grobid_tool.py b/python/grobid_tool.py
index a8d5120..ad7841d 100755
--- a/python/grobid_tool.py
+++ b/python/grobid_tool.py
@@ -10,9 +10,11 @@ Example of large parallel run, locally:
 """
 
 import sys
+import json
 import argparse
 import datetime
 
+from grobid2json import teixml2json
 from sandcrawler import *
 
 
@@ -49,6 +51,22 @@ def run_extract_zipfile(args):
     pusher = ZipfilePusher(worker, args.zip_file)
     pusher.run()
 
+def run_transform(args):
+    grobid_client = GrobidClient()
+    for line in args.json_file:
+        if not line.strip():
+            continue
+        line = json.loads(line)
+        if args.metadata_only:
+            out = grobid_client.metadata(line)
+        else:
+            out = teixml2json(line['tei_xml'])
+        if out:
+            if 'source' in line:
+                out['source'] = line['source']
+            print(json.dumps(out))
+
+
 def main():
     parser = argparse.ArgumentParser(
         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
@@ -90,6 +108,15 @@ def main():
         help="zipfile with PDFs to extract",
         type=str)
 
+    sub_transform = subparsers.add_parser('transform')
+    sub_transform.set_defaults(func=run_transform)
+    sub_transform.add_argument('--metadata-only',
+        action='store_true',
+        help="Only pass through bibliographic metadata, not fulltext")
+    sub_transform.add_argument('json_file',
+        help="convert TEI-XML to JSON. Input is JSON lines with tei_xml field",
+        type=argparse.FileType('r'))
+
     args = parser.parse_args()
     if not args.__dict__.get("func"):
         print("tell me what to do!")