diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-21 19:59:04 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-21 19:59:04 -0700 |
commit | 45deea74f80d1e8deed6076f2a93d711d16a3a83 (patch) | |
tree | 3e94769cdb5e1bdeb9c5c985a561e5b7a880be83 /grobid_tei_xml/parse.py | |
parent | 2bf52b0622005ed8a7c51e59faa9873600d9cb5f (diff) | |
download | grobid_tei_xml-45deea74f80d1e8deed6076f2a93d711d16a3a83.tar.gz grobid_tei_xml-45deea74f80d1e8deed6076f2a93d711d16a3a83.zip |
some docs and prep for including MD5 in output
Diffstat (limited to 'grobid_tei_xml/parse.py')
-rwxr-xr-x | grobid_tei_xml/parse.py | 2 |
1 files changed, 2 insertions, 0 deletions
```diff
diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py
index 32c5d0f..bbe383f 100755
--- a/grobid_tei_xml/parse.py
+++ b/grobid_tei_xml/parse.py
@@ -173,6 +173,7 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
         grobid_version=application_tag.attrib["version"].strip(),
         grobid_timestamp=application_tag.attrib["when"].strip(),
         header=_parse_header(header),
+        # TODO: pdf_md5=header.findtext(f'.//{{{ns}}}idno[@type="MD5"]') or None,
     )
 
     refs = []
@@ -183,6 +184,7 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
         refs.append(ref)
     doc.citations = refs
 
+
     text = tei.find(f".//{{{ns}}}text")
     # print(text.attrib)
     if text and text.attrib.get(f"{{{xml_ns}}}lang"):
```