aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2019-11-13 20:54:30 -0800
committer Bryan Newbold <bnewbold@archive.org> 2019-11-13 20:54:30 -0800
commit 5e169ce051884d2ebbdcbfde7cdc1d2b2efc4f74 (patch)
tree 99287be62661a4ea46b1235edafe1d87a56ce3e2 /python
parent 889928e49c90776b0203d6612b6d229f9bb7725e (diff)
download sandcrawler-5e169ce051884d2ebbdcbfde7cdc1d2b2efc4f74.tar.gz
sandcrawler-5e169ce051884d2ebbdcbfde7cdc1d2b2efc4f74.zip
grobid2json: make lang detection flexible
Diffstat (limited to 'python')
-rwxr-xr-x python/grobid2json.py 3
1 files changed, 2 insertions, 1 deletions
diff --git a/python/grobid2json.py b/python/grobid2json.py
index 1f7270c..75fdcba 100755
--- a/python/grobid2json.py
+++ b/python/grobid2json.py
@@ -146,7 +146,8 @@ def teixml2json(content, encumbered=True):
text = tei.find('.//{%s}text' % (ns))
#print(text.attrib)
- info['language_code'] = text.attrib['{%s}lang' % xml_ns] # xml:lang
+ if text.attrib.get('{%s}lang' % xml_ns):
+ info['language_code'] = text.attrib['{%s}lang' % xml_ns] # xml:lang
if encumbered:
el = tei.find('.//{%s}profileDesc/{%s}abstract' % (ns, ns))