From ee797ddc0a1377423cfe1939634e6d019eecea9e Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 4 Oct 2019 14:09:59 -0700
Subject: grobid2json: language_code

---
 python/grobid2json.py         | 5 +++++
 python/tests/files/small.json | 3 ++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/python/grobid2json.py b/python/grobid2json.py
index e5af9d3..ae31486 100755
--- a/python/grobid2json.py
+++ b/python/grobid2json.py
@@ -28,6 +28,7 @@ import json
 import argparse
 import xml.etree.ElementTree as ET
 
+xml_ns = "http://www.w3.org/XML/1998/namespace"
 ns = "http://www.tei-c.org/ns/1.0"
 
 def all_authors(elem):
@@ -143,6 +144,10 @@ def teixml2json(content, encumbered=True):
         refs.append(ref)
     info['citations'] = refs
 
+    text = tei.find('.//{%s}text' % (ns))
+    print(text.attrib)
+    info['language_code'] = text.attrib['{%s}lang' % xml_ns] # xml:lang
+
     if encumbered:
         el = tei.find('.//{%s}profileDesc/{%s}abstract' % (ns, ns))
         info['abstract'] = (el or None) and " ".join(el.itertext()).strip()
diff --git a/python/tests/files/small.json b/python/tests/files/small.json
index eb93d9e..35bf62d 100644
--- a/python/tests/files/small.json
+++ b/python/tests/files/small.json
@@ -56,5 +56,6 @@
     "annex": null,
     "fatcat_release": null,
     "grobid_timestamp": "2018-04-02T00:31+0000",
-    "grobid_version": "0.5.1-SNAPSHOT"
+    "grobid_version": "0.5.1-SNAPSHOT",
+    "language_code": "en"
 }
-- 
cgit v1.2.3