diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-11-13 20:54:30 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-11-13 20:54:30 -0800 |
commit | 5e169ce051884d2ebbdcbfde7cdc1d2b2efc4f74 (patch) | |
tree | 99287be62661a4ea46b1235edafe1d87a56ce3e2 | |
parent | 889928e49c90776b0203d6612b6d229f9bb7725e (diff) | |
download | sandcrawler-5e169ce051884d2ebbdcbfde7cdc1d2b2efc4f74.tar.gz sandcrawler-5e169ce051884d2ebbdcbfde7cdc1d2b2efc4f74.zip |
grobid2json: make lang detection flexible
-rwxr-xr-x | python/grobid2json.py | 3 |
1 files changed, 2 insertions, 1 deletions
```diff
diff --git a/python/grobid2json.py b/python/grobid2json.py
index 1f7270c..75fdcba 100755
--- a/python/grobid2json.py
+++ b/python/grobid2json.py
@@ -146,7 +146,8 @@ def teixml2json(content, encumbered=True):
 
     text = tei.find('.//{%s}text' % (ns))
     #print(text.attrib)
-    info['language_code'] = text.attrib['{%s}lang' % xml_ns] # xml:lang
+    if text.attrib.get('{%s}lang' % xml_ns):
+        info['language_code'] = text.attrib['{%s}lang' % xml_ns] # xml:lang
 
     if encumbered:
         el = tei.find('.//{%s}profileDesc/{%s}abstract' % (ns, ns))
```