diff options
-rwxr-xr-x | grobid_tei_xml/parse.py | 2 | ||||
-rw-r--r-- | grobid_tei_xml/types.py | 18 |
2 files changed, 20 insertions, 0 deletions
diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py index 32c5d0f..bbe383f 100755 --- a/grobid_tei_xml/parse.py +++ b/grobid_tei_xml/parse.py @@ -173,6 +173,7 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument: grobid_version=application_tag.attrib["version"].strip(), grobid_timestamp=application_tag.attrib["when"].strip(), header=_parse_header(header), + # TODO: pdf_md5=header.findtext(f'.//{{{ns}}}idno[@type="MD5"]') or None, ) refs = [] @@ -183,6 +184,7 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument: refs.append(ref) doc.citations = refs + text = tei.find(f".//{{{ns}}}text") # print(text.attrib) if text and text.attrib.get(f"{{{xml_ns}}}lang"): diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py index aabe424..e6718c1 100644 --- a/grobid_tei_xml/types.py +++ b/grobid_tei_xml/types.py @@ -90,8 +90,26 @@ class GrobidDocument: annex: Optional[str] = None def to_dict(self) -> dict: + """ + Returns a dict version of this object which has no 'None' fields + (recursively), and is appropriate for serializing to JSON with + json.dumps(). + + If you did want all the fields, you could use dataclasses.asdict() + directly on this object. + """ return _simplify_dict(asdict(self)) + def remove_encumbered(self) -> None: + """ + This helper function removes fields from this object which might raise + copyright concerns. + """ + self.abstract = None + self.body = None + self.acknowledgement = None + self.annex = None + def _simplify_dict(d: dict) -> dict: """ |