diff options
Diffstat (limited to 'mapreduce/grobid2json.py')
-rwxr-xr-x | mapreduce/grobid2json.py | 21 |
1 files changed, 15 insertions, 6 deletions
diff --git a/mapreduce/grobid2json.py b/mapreduce/grobid2json.py
index daf9387..cc6eb2c 100755
--- a/mapreduce/grobid2json.py
+++ b/mapreduce/grobid2json.py
@@ -20,6 +20,7 @@ Prints JSON to stdout, errors to stderr
 """
 
 import os
+import io
 import sys
 import json
 import argparse
@@ -73,11 +74,18 @@ def biblio_info(elem):
     return ref
 
 
-def do_tei(path, encumbered=True):
+def do_tei(content, encumbered=True):
 
-    info = dict(filename=os.path.basename(path))
+    if type(content) == str:
+        content = io.StringIO(content)
+    elif type(content) == bytes:
+        content = io.BytesIO(content)
 
-    tree = ET.parse(path)
+    info = dict()
+
+    #print(content)
+    #print(content.getvalue())
+    tree = ET.parse(content)
     tei = tree.getroot()
 
     header = tei.find('.//{%s}teiHeader' % ns)
@@ -109,7 +117,7 @@ def do_tei(path, encumbered=True):
     return info
 
 
-def main():
+def main(): # pragma no cover
     parser = argparse.ArgumentParser(
         description="GROBID TEI XML to JSON",
         usage="%(prog)s [options] <teifile>...")
@@ -121,9 +129,10 @@ def main():
     args = parser.parse_args()
 
     for filename in args.teifiles:
+        content = open(filename, 'r')
         print(json.dumps(
-            do_tei(filename,
+            do_tei(content,
                 encumbered=(not args.no_encumbered))))
 
-if __name__=='__main__':
+if __name__=='__main__': # pragma no cover
     main()