about | summary | refs | log | tree | commit | diff | stats
path: root/python/grobid_tool.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2019-12-22 20:36:54 -0800
committer Bryan Newbold <bnewbold@archive.org> 2019-12-22 20:36:56 -0800
commit 1c538a10ac1f6e59ac56a6fbb9d583957e05f9f1 (patch)
tree dabd84b5f03b5f8666f4e0d571b0c1e16cf4bb1e /python/grobid_tool.py
parent 28de71e714c1f5d70adcfd3213dc2433a701a430 (diff)
download sandcrawler-1c538a10ac1f6e59ac56a6fbb9d583957e05f9f1.tar.gz
download sandcrawler-1c538a10ac1f6e59ac56a6fbb9d583957e05f9f1.zip
commit grobid_tool transform mode
Had some stale code on aitio with this change I forgot to commit. Oops!
Diffstat (limited to 'python/grobid_tool.py')
-rwxr-xr-x python/grobid_tool.py 27
1 files changed, 27 insertions, 0 deletions
diff --git a/python/grobid_tool.py b/python/grobid_tool.py
index a8d5120..ad7841d 100755
--- a/python/grobid_tool.py
+++ b/python/grobid_tool.py
@@ -10,9 +10,11 @@ Example of large parallel run, locally:
"""
import sys
+import json
import argparse
import datetime
+from grobid2json import teixml2json
from sandcrawler import *
@@ -49,6 +51,22 @@ def run_extract_zipfile(args):
pusher = ZipfilePusher(worker, args.zip_file)
pusher.run()
+def run_transform(args):
+    """
+    Convert JSON-lines records carrying GROBID TEI-XML into JSON output.
+
+    Reads args.json_file one line at a time, skipping blank lines. Each
+    remaining line is parsed as a JSON object. When args.metadata_only is
+    set the record is handed to GrobidClient.metadata(); otherwise the
+    record's 'tei_xml' field is passed through teixml2json().
+
+    A falsy result prints nothing. For any other result, the input
+    record's 'source' key (when present) is copied onto it, and the result
+    is printed to stdout as a single JSON line.
+    """
+    client = GrobidClient()
+    for raw in args.json_file:
+        # Whitespace-only lines carry no record
+        if not raw.strip():
+            continue
+        record = json.loads(raw)
+        if args.metadata_only:
+            result = client.metadata(record)
+        else:
+            result = teixml2json(record['tei_xml'])
+        # Nothing to emit for this record
+        if not result:
+            continue
+        if 'source' in record:
+            result['source'] = record['source']
+        print(json.dumps(result))
+
+
def main():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
@@ -90,6 +108,15 @@ def main():
help="zipfile with PDFs to extract",
type=str)
+ sub_transform = subparsers.add_parser('transform')
+ sub_transform.set_defaults(func=run_transform)
+ sub_transform.add_argument('--metadata-only',
+ action='store_true',
+ help="Only pass through bibliographic metadata, not fulltext")
+ sub_transform.add_argument('json_file',
+ help="convert TEI-XML to JSON. Input is JSON lines with tei_xml field",
+ type=argparse.FileType('r'))
+
args = parser.parse_args()
if not args.__dict__.get("func"):
print("tell me what to do!")