diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-12-22 20:36:54 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-12-22 20:36:56 -0800 |
commit | 1c538a10ac1f6e59ac56a6fbb9d583957e05f9f1 (patch) | |
tree | dabd84b5f03b5f8666f4e0d571b0c1e16cf4bb1e /python | |
parent | 28de71e714c1f5d70adcfd3213dc2433a701a430 (diff) | |
download | sandcrawler-1c538a10ac1f6e59ac56a6fbb9d583957e05f9f1.tar.gz sandcrawler-1c538a10ac1f6e59ac56a6fbb9d583957e05f9f1.zip |
commit grobid_tool transform mode
Had some stale code on aitio with this change I forgot to commit. Oops!
Diffstat (limited to 'python')
-rwxr-xr-x | python/grobid_tool.py | 27 |
1 files changed, 27 insertions, 0 deletions
```diff
diff --git a/python/grobid_tool.py b/python/grobid_tool.py
index a8d5120..ad7841d 100755
--- a/python/grobid_tool.py
+++ b/python/grobid_tool.py
@@ -10,9 +10,11 @@ Example of large parallel run, locally:
 """
 
 import sys
+import json
 import argparse
 import datetime
 
+from grobid2json import teixml2json
 from sandcrawler import *
 
 
@@ -49,6 +51,22 @@ def run_extract_zipfile(args):
     pusher = ZipfilePusher(worker, args.zip_file)
     pusher.run()
 
+def run_transform(args):
+    grobid_client = GrobidClient()
+    for line in args.json_file:
+        if not line.strip():
+            continue
+        line = json.loads(line)
+        if args.metadata_only:
+            out = grobid_client.metadata(line)
+        else:
+            out = teixml2json(line['tei_xml'])
+        if out:
+            if 'source' in line:
+                out['source'] = line['source']
+            print(json.dumps(out))
+
+
 def main():
     parser = argparse.ArgumentParser(
         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
@@ -90,6 +108,15 @@ def main():
         help="zipfile with PDFs to extract",
         type=str)
 
+    sub_transform = subparsers.add_parser('transform')
+    sub_transform.set_defaults(func=run_transform)
+    sub_transform.add_argument('--metadata-only',
+        action='store_true',
+        help="Only pass through bibliographic metadata, not fulltext")
+    sub_transform.add_argument('json_file',
+        help="convert TEI-XML to JSON. Input is JSON lines with tei_xml field",
+        type=argparse.FileType('r'))
+
     args = parser.parse_args()
     if not args.__dict__.get("func"):
         print("tell me what to do!")
```