diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-21 19:59:04 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-21 19:59:04 -0700 |
commit | 45deea74f80d1e8deed6076f2a93d711d16a3a83 (patch) | |
tree | 3e94769cdb5e1bdeb9c5c985a561e5b7a880be83 | |
parent | 2bf52b0622005ed8a7c51e59faa9873600d9cb5f (diff) | |
download | grobid_tei_xml-45deea74f80d1e8deed6076f2a93d711d16a3a83.tar.gz grobid_tei_xml-45deea74f80d1e8deed6076f2a93d711d16a3a83.zip |
some docs and prep for including MD5 in output
-rwxr-xr-x | grobid_tei_xml/parse.py | 2 | ||||
-rw-r--r-- | grobid_tei_xml/types.py | 18 |
2 files changed, 20 insertions, 0 deletions
diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py index 32c5d0f..bbe383f 100755 --- a/grobid_tei_xml/parse.py +++ b/grobid_tei_xml/parse.py @@ -173,6 +173,7 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument: grobid_version=application_tag.attrib["version"].strip(), grobid_timestamp=application_tag.attrib["when"].strip(), header=_parse_header(header), + # TODO: pdf_md5=header.findtext(f'.//{{{ns}}}idno[@type="MD5"]') or None, ) refs = [] @@ -183,6 +184,7 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument: refs.append(ref) doc.citations = refs + text = tei.find(f".//{{{ns}}}text") # print(text.attrib) if text and text.attrib.get(f"{{{xml_ns}}}lang"): diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py index aabe424..e6718c1 100644 --- a/grobid_tei_xml/types.py +++ b/grobid_tei_xml/types.py @@ -90,8 +90,26 @@ class GrobidDocument: annex: Optional[str] = None def to_dict(self) -> dict: + """ + Returns a dict version of this object which has no 'None' fields + (recursively), and is appropriate for serializing to JSON with + json.dumps(). + + If you did want all the fields, you could use dataclasses.asdict() + directly on this object. + """ return _simplify_dict(asdict(self)) + def remove_encumbered(self) -> None: + """ + This helper function removes fields from this object which might raise + copyright concerns. + """ + self.abstract = None + self.body = None + self.acknowledgement = None + self.annex = None + def _simplify_dict(d: dict) -> dict: """ |