aboutsummaryrefslogtreecommitdiffstats
path: root/mapreduce/grobid2json.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-04-05 18:51:08 -0700
committer Bryan Newbold <bnewbold@archive.org> 2018-04-05 18:51:08 -0700
commit 5db075beaa55b2d619798154c06c2df625346972 (patch)
tree 38897e99a26b751d3e93b1a2f5308ea6fa05eabb /mapreduce/grobid2json.py
parent 77577da13afe07b5177452122f4cee77e3357b4e (diff)
download sandcrawler-5db075beaa55b2d619798154c06c2df625346972.tar.gz
sandcrawler-5db075beaa55b2d619798154c06c2df625346972.zip
progress on extractor
Diffstat (limited to 'mapreduce/grobid2json.py')
-rwxr-xr-x mapreduce/grobid2json.py 21
1 files changed, 15 insertions, 6 deletions
diff --git a/mapreduce/grobid2json.py b/mapreduce/grobid2json.py
index daf9387..cc6eb2c 100755
--- a/mapreduce/grobid2json.py
+++ b/mapreduce/grobid2json.py
@@ -20,6 +20,7 @@ Prints JSON to stdout, errors to stderr
"""
import os
+import io
import sys
import json
import argparse
@@ -73,11 +74,18 @@ def biblio_info(elem):
return ref
-def do_tei(path, encumbered=True):
+def do_tei(content, encumbered=True):
- info = dict(filename=os.path.basename(path))
+ if type(content) == str:
+ content = io.StringIO(content)
+ elif type(content) == bytes:
+ content = io.BytesIO(content)
- tree = ET.parse(path)
+ info = dict()
+
+ #print(content)
+ #print(content.getvalue())
+ tree = ET.parse(content)
tei = tree.getroot()
header = tei.find('.//{%s}teiHeader' % ns)
@@ -109,7 +117,7 @@ def do_tei(path, encumbered=True):
return info
-def main():
+def main(): # pragma no cover
parser = argparse.ArgumentParser(
description="GROBID TEI XML to JSON",
usage="%(prog)s [options] <teifile>...")
@@ -121,9 +129,10 @@ def main():
args = parser.parse_args()
for filename in args.teifiles:
+ content = open(filename, 'r')
print(json.dumps(
- do_tei(filename,
+ do_tei(content,
encumbered=(not args.no_encumbered))))
-if __name__=='__main__':
+if __name__=='__main__': # pragma no cover
main()