small grobid2json test

author: Bryan Newbold <bnewbold@archive.org> 2018-04-06 12:39:49 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2018-04-06 12:39:49 -0700
commit: 114c6b611148d2ff499bcea302eee0eca00df647 (patch)
tree: df929050d3aa9484f78e5c1807bc951ce1e85512 /mapreduce/grobid2json.py
parent: e68d43e2369eed7ddf288be8c8f2edd0a85974e1 (diff)
download: sandcrawler-114c6b611148d2ff499bcea302eee0eca00df647.tar.gz sandcrawler-114c6b611148d2ff499bcea302eee0eca00df647.zip
1 file changed, 10 insertions, 2 deletions
diff --git a/mapreduce/grobid2json.py b/mapreduce/grobid2json.py
index cc6eb2c..52a3125 100755
--- a/mapreduce/grobid2json.py
+++ b/mapreduce/grobid2json.py
@@ -1,6 +1,10 @@
 #!/usr/bin/env python3
 
 """
+NB: adapted to work as a library for PDF extraction. Will probably be
+re-written eventually to be correct, complete, and robust; this is just a
+first iteration.
+
 This script tries to extract everything from a GROBID TEI XML fulltext dump:
 
 - header metadata
@@ -38,6 +42,8 @@ def journal_info(elem):
     journal = dict()
     journal['name'] = elem.findtext('.//{%s}monogr/{%s}title' % (ns, ns))
     journal['publisher'] = elem.findtext('.//{%s}publicationStmt/{%s}publisher' % (ns, ns))
+    if journal['publisher'] == '':
+        journal['publisher'] = None
     journal['issn'] = elem.findtext('.//{%s}idno[@type="ISSN"]' % ns)
     journal['eissn'] = elem.findtext('.//{%s}idno[@type="eISSN"]' % ns)
     journal['volume'] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
@@ -59,6 +65,8 @@ def biblio_info(elem):
             ref['title'] = other_title
     ref['authors'] = all_authors(elem)
     ref['publisher'] = elem.findtext('.//{%s}publicationStmt/{%s}publisher' % (ns, ns))
+    if ref['publisher'] == '':
+        ref['publisher'] = None
     date = elem.find('.//{%s}date[@type="published"]' % ns)
     ref['date'] = (date != None) and date.attrib.get('when')
     ref['volume'] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
@@ -74,7 +82,7 @@ def biblio_info(elem):
     return ref
 
 
-def do_tei(content, encumbered=True):
+def teixml2json(content, encumbered=True):
 
     if type(content) == str:
         content = io.StringIO(content)
@@ -131,7 +139,7 @@ def main():   # pragma no cover
     for filename in args.teifiles:
         content = open(filename, 'r')
         print(json.dumps(
-            do_tei(content,
+            teixml2json(content,
                encumbered=(not args.no_encumbered))))
 
 if __name__=='__main__':   # pragma no cover
author	Bryan Newbold <bnewbold@archive.org>	2018-04-06 12:39:49 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2018-04-06 12:39:49 -0700
commit	114c6b611148d2ff499bcea302eee0eca00df647 (patch)
tree	df929050d3aa9484f78e5c1807bc951ce1e85512 /mapreduce/grobid2json.py
parent	e68d43e2369eed7ddf288be8c8f2edd0a85974e1 (diff)
download	sandcrawler-114c6b611148d2ff499bcea302eee0eca00df647.tar.gz sandcrawler-114c6b611148d2ff499bcea302eee0eca00df647.zip