about | summary | refs | log | tree | commit | diff | stats
path: root/grobid_tei_xml/parse.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2021-10-21 19:59:04 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-10-21 19:59:04 -0700
commit: 45deea74f80d1e8deed6076f2a93d711d16a3a83 (patch)
tree: 3e94769cdb5e1bdeb9c5c985a561e5b7a880be83 /grobid_tei_xml/parse.py
parent: 2bf52b0622005ed8a7c51e59faa9873600d9cb5f (diff)
download: grobid_tei_xml-45deea74f80d1e8deed6076f2a93d711d16a3a83.tar.gz
download: grobid_tei_xml-45deea74f80d1e8deed6076f2a93d711d16a3a83.zip
some docs and prep for including MD5 in output
Diffstat (limited to 'grobid_tei_xml/parse.py')
-rwxr-xr-x grobid_tei_xml/parse.py 2
1 files changed, 2 insertions, 0 deletions
diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py
index 32c5d0f..bbe383f 100755
--- a/grobid_tei_xml/parse.py
+++ b/grobid_tei_xml/parse.py
@@ -173,6 +173,7 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
grobid_version=application_tag.attrib["version"].strip(),
grobid_timestamp=application_tag.attrib["when"].strip(),
header=_parse_header(header),
+ # TODO: pdf_md5=header.findtext(f'.//{{{ns}}}idno[@type="MD5"]') or None,
)
refs = []
@@ -183,6 +184,7 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
refs.append(ref)
doc.citations = refs
+
text = tei.find(f".//{{{ns}}}text")
# print(text.attrib)
if text and text.attrib.get(f"{{{xml_ns}}}lang"):