diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-10-04 14:09:59 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-10-04 14:09:59 -0700 |
commit | ee797ddc0a1377423cfe1939634e6d019eecea9e (patch) | |
tree | 6e8d343354573b1dbf41514d8da21cc0e71b8dfa /python/grobid2json.py | |
parent | 324e368c174b95e62a99b7a1f80c5e884d8053c8 (diff) | |
download | sandcrawler-ee797ddc0a1377423cfe1939634e6d019eecea9e.tar.gz sandcrawler-ee797ddc0a1377423cfe1939634e6d019eecea9e.zip |
grobid2json: language_code
Diffstat (limited to 'python/grobid2json.py')
-rwxr-xr-x | python/grobid2json.py | 5 |
1 files changed, 5 insertions, 0 deletions
```diff
diff --git a/python/grobid2json.py b/python/grobid2json.py
index e5af9d3..ae31486 100755
--- a/python/grobid2json.py
+++ b/python/grobid2json.py
@@ -28,6 +28,7 @@ import json
 import argparse
 import xml.etree.ElementTree as ET
 
+xml_ns = "http://www.w3.org/XML/1998/namespace"
 ns = "http://www.tei-c.org/ns/1.0"
 
 def all_authors(elem):
@@ -143,6 +144,10 @@ def teixml2json(content, encumbered=True):
         refs.append(ref)
     info['citations'] = refs
 
+    text = tei.find('.//{%s}text' % (ns))
+    print(text.attrib)
+    info['language_code'] = text.attrib['{%s}lang' % xml_ns] # xml:lang
+
     if encumbered:
         el = tei.find('.//{%s}profileDesc/{%s}abstract' % (ns, ns))
         info['abstract'] = (el or None) and " ".join(el.itertext()).strip()
```