about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2018-04-10 17:26:40 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2018-04-10 19:13:53 -0700
commit: 60f29aaa1efd90628c6a6dc503e23d694e0389ce (patch)
tree: 9f226c523386d50470b6ce4896dc6034be35301d
parent: c62cf313b4d3e3991aa45c09c39b1b92df2893b4 (diff)
download: sandcrawler-60f29aaa1efd90628c6a6dc503e23d694e0389ce.tar.gz
sandcrawler-60f29aaa1efd90628c6a6dc503e23d694e0389ce.zip
grobid2json test fixes
-rwxr-xr-x  mapreduce/grobid2json.py  2
-rw-r--r--  mapreduce/tests/test_grobid2json.py  2
2 files changed, 3 insertions, 1 deletion
diff --git a/mapreduce/grobid2json.py b/mapreduce/grobid2json.py
index c1ff1f1..ca460f8 100755
--- a/mapreduce/grobid2json.py
+++ b/mapreduce/grobid2json.py
@@ -95,6 +95,8 @@ def teixml2json(content, encumbered=True):
tei = tree.getroot()
header = tei.find('.//{%s}teiHeader' % ns)
+ if header is None:
+ raise ValueError("XML does not look like TEI format")
info['title'] = header.findtext('.//{%s}analytic/{%s}title' % (ns, ns))
info['authors'] = all_authors(header.find('.//{%s}sourceDesc/{%s}biblStruct' % (ns, ns)))
info['journal'] = journal_info(header)
diff --git a/mapreduce/tests/test_grobid2json.py b/mapreduce/tests/test_grobid2json.py
index 1562006..8497b10 100644
--- a/mapreduce/tests/test_grobid2json.py
+++ b/mapreduce/tests/test_grobid2json.py
@@ -18,5 +18,5 @@ def test_invalid_xml():
with pytest.raises(xml.etree.ElementTree.ParseError):
teixml2json("this is not XML")
- with pytest.raises(xml.etree.ElementTree.ParseError):
+ with pytest.raises(ValueError):
teixml2json("<xml></xml>")