diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-04-05 18:51:08 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-04-05 18:51:08 -0700 |
commit | 5db075beaa55b2d619798154c06c2df625346972 (patch) | |
tree | 38897e99a26b751d3e93b1a2f5308ea6fa05eabb /mapreduce/grobid2json.py | |
parent | 77577da13afe07b5177452122f4cee77e3357b4e (diff) | |
download | sandcrawler-5db075beaa55b2d619798154c06c2df625346972.tar.gz sandcrawler-5db075beaa55b2d619798154c06c2df625346972.zip |
progress on extractor
Diffstat (limited to 'mapreduce/grobid2json.py')
-rwxr-xr-x | mapreduce/grobid2json.py | 21 |
1 files changed, 15 insertions, 6 deletions
```diff
diff --git a/mapreduce/grobid2json.py b/mapreduce/grobid2json.py
index daf9387..cc6eb2c 100755
--- a/mapreduce/grobid2json.py
+++ b/mapreduce/grobid2json.py
@@ -20,6 +20,7 @@ Prints JSON to stdout, errors to stderr
 """
 
 import os
+import io
 import sys
 import json
 import argparse
@@ -73,11 +74,18 @@ def biblio_info(elem):
     return ref
 
 
-def do_tei(path, encumbered=True):
+def do_tei(content, encumbered=True):
 
-    info = dict(filename=os.path.basename(path))
+    if type(content) == str:
+        content = io.StringIO(content)
+    elif type(content) == bytes:
+        content = io.BytesIO(content)
 
-    tree = ET.parse(path)
+    info = dict()
+
+    #print(content)
+    #print(content.getvalue())
+    tree = ET.parse(content)
     tei = tree.getroot()
 
     header = tei.find('.//{%s}teiHeader' % ns)
@@ -109,7 +117,7 @@ def do_tei(path, encumbered=True):
     return info
 
 
-def main():
+def main(): # pragma no cover
     parser = argparse.ArgumentParser(
         description="GROBID TEI XML to JSON",
         usage="%(prog)s [options] <teifile>...")
@@ -121,9 +129,10 @@ def main():
     args = parser.parse_args()
 
     for filename in args.teifiles:
+        content = open(filename, 'r')
         print(json.dumps(
-            do_tei(filename,
+            do_tei(content,
                 encumbered=(not args.no_encumbered))))
 
-if __name__=='__main__':
+if __name__=='__main__': # pragma no cover
     main()
```