about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2019-10-04 14:09:59 -0700
committer Bryan Newbold <bnewbold@archive.org> 2019-10-04 14:09:59 -0700
commit ee797ddc0a1377423cfe1939634e6d019eecea9e (patch)
tree 6e8d343354573b1dbf41514d8da21cc0e71b8dfa
parent 324e368c174b95e62a99b7a1f80c5e884d8053c8 (diff)
download sandcrawler-ee797ddc0a1377423cfe1939634e6d019eecea9e.tar.gz
sandcrawler-ee797ddc0a1377423cfe1939634e6d019eecea9e.zip
grobid2json: language_code
-rwxr-xr-x python/grobid2json.py 5
-rw-r--r-- python/tests/files/small.json 3
2 files changed, 7 insertions, 1 deletions
diff --git a/python/grobid2json.py b/python/grobid2json.py
index e5af9d3..ae31486 100755
--- a/python/grobid2json.py
+++ b/python/grobid2json.py
@@ -28,6 +28,7 @@ import json
import argparse
import xml.etree.ElementTree as ET
+xml_ns = "http://www.w3.org/XML/1998/namespace"
ns = "http://www.tei-c.org/ns/1.0"
def all_authors(elem):
@@ -143,6 +144,10 @@ def teixml2json(content, encumbered=True):
refs.append(ref)
info['citations'] = refs
+ text = tei.find('.//{%s}text' % (ns))
+ print(text.attrib)
+ info['language_code'] = text.attrib['{%s}lang' % xml_ns] # xml:lang
+
if encumbered:
el = tei.find('.//{%s}profileDesc/{%s}abstract' % (ns, ns))
info['abstract'] = (el or None) and " ".join(el.itertext()).strip()
diff --git a/python/tests/files/small.json b/python/tests/files/small.json
index eb93d9e..35bf62d 100644
--- a/python/tests/files/small.json
+++ b/python/tests/files/small.json
@@ -56,5 +56,6 @@
"annex": null,
"fatcat_release": null,
"grobid_timestamp": "2018-04-02T00:31+0000",
- "grobid_version": "0.5.1-SNAPSHOT"
+ "grobid_version": "0.5.1-SNAPSHOT",
+ "language_code": "en"
}