diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-10-04 14:09:59 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-10-04 14:09:59 -0700 |
commit | ee797ddc0a1377423cfe1939634e6d019eecea9e (patch) | |
tree | 6e8d343354573b1dbf41514d8da21cc0e71b8dfa | |
parent | 324e368c174b95e62a99b7a1f80c5e884d8053c8 (diff) | |
download | sandcrawler-ee797ddc0a1377423cfe1939634e6d019eecea9e.tar.gz sandcrawler-ee797ddc0a1377423cfe1939634e6d019eecea9e.zip |
grobid2json: language_code
-rwxr-xr-x | python/grobid2json.py | 5 | ||||
-rw-r--r-- | python/tests/files/small.json | 3 |
2 files changed, 7 insertions, 1 deletion
diff --git a/python/grobid2json.py b/python/grobid2json.py
index e5af9d3..ae31486 100755
--- a/python/grobid2json.py
+++ b/python/grobid2json.py
@@ -28,6 +28,7 @@ import json
 import argparse
 import xml.etree.ElementTree as ET
 
+xml_ns = "http://www.w3.org/XML/1998/namespace"
 ns = "http://www.tei-c.org/ns/1.0"
 
 def all_authors(elem):
@@ -143,6 +144,10 @@ def teixml2json(content, encumbered=True):
         refs.append(ref)
     info['citations'] = refs
 
+    text = tei.find('.//{%s}text' % (ns))
+    print(text.attrib)
+    info['language_code'] = text.attrib['{%s}lang' % xml_ns] # xml:lang
+
     if encumbered:
         el = tei.find('.//{%s}profileDesc/{%s}abstract' % (ns, ns))
         info['abstract'] = (el or None) and " ".join(el.itertext()).strip()
diff --git a/python/tests/files/small.json b/python/tests/files/small.json
index eb93d9e..35bf62d 100644
--- a/python/tests/files/small.json
+++ b/python/tests/files/small.json
@@ -56,5 +56,6 @@
   "annex": null,
   "fatcat_release": null,
   "grobid_timestamp": "2018-04-02T00:31+0000",
-  "grobid_version": "0.5.1-SNAPSHOT"
+  "grobid_version": "0.5.1-SNAPSHOT",
+  "language_code": "en"
 } |