diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-04-10 17:26:40 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-04-10 19:13:53 -0700 |
commit | 60f29aaa1efd90628c6a6dc503e23d694e0389ce (patch) | |
tree | 9f226c523386d50470b6ce4896dc6034be35301d /mapreduce/grobid2json.py | |
parent | c62cf313b4d3e3991aa45c09c39b1b92df2893b4 (diff) | |
download | sandcrawler-60f29aaa1efd90628c6a6dc503e23d694e0389ce.tar.gz sandcrawler-60f29aaa1efd90628c6a6dc503e23d694e0389ce.zip |
grobid2json test fixes
Diffstat (limited to 'mapreduce/grobid2json.py')
-rwxr-xr-x | mapreduce/grobid2json.py | 2 |
1 file changed, 2 insertions, 0 deletions
diff --git a/mapreduce/grobid2json.py b/mapreduce/grobid2json.py
index c1ff1f1..ca460f8 100755
--- a/mapreduce/grobid2json.py
+++ b/mapreduce/grobid2json.py
@@ -95,6 +95,8 @@ def teixml2json(content, encumbered=True):
     tei = tree.getroot()
     header = tei.find('.//{%s}teiHeader' % ns)
+    if header is None:
+        raise ValueError("XML does not look like TEI format")
     info['title'] = header.findtext('.//{%s}analytic/{%s}title' % (ns, ns))
     info['authors'] = all_authors(header.find('.//{%s}sourceDesc/{%s}biblStruct' % (ns, ns)))
     info['journal'] = journal_info(header)