about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rwxr-xr-x python/grobid2json.py 5
-rw-r--r-- python/tests/files/small.json 3
2 files changed, 7 insertions, 1 deletion
diff --git a/python/grobid2json.py b/python/grobid2json.py
index e5af9d3..ae31486 100755
--- a/python/grobid2json.py
+++ b/python/grobid2json.py
@@ -28,6 +28,7 @@ import json
import argparse
import xml.etree.ElementTree as ET
+xml_ns = "http://www.w3.org/XML/1998/namespace"
ns = "http://www.tei-c.org/ns/1.0"
def all_authors(elem):
@@ -143,6 +144,10 @@ def teixml2json(content, encumbered=True):
refs.append(ref)
info['citations'] = refs
+ text = tei.find('.//{%s}text' % (ns))
+ print(text.attrib)
+ info['language_code'] = text.attrib['{%s}lang' % xml_ns] # xml:lang
+
if encumbered:
el = tei.find('.//{%s}profileDesc/{%s}abstract' % (ns, ns))
info['abstract'] = (el or None) and " ".join(el.itertext()).strip()
diff --git a/python/tests/files/small.json b/python/tests/files/small.json
index eb93d9e..35bf62d 100644
--- a/python/tests/files/small.json
+++ b/python/tests/files/small.json
@@ -56,5 +56,6 @@
"annex": null,
"fatcat_release": null,
"grobid_timestamp": "2018-04-02T00:31+0000",
- "grobid_version": "0.5.1-SNAPSHOT"
+ "grobid_version": "0.5.1-SNAPSHOT",
+ "language_code": "en"
}