From 45deea74f80d1e8deed6076f2a93d711d16a3a83 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 21 Oct 2021 19:59:04 -0700 Subject: some docs and prep for including MD5 in output --- grobid_tei_xml/parse.py | 2 ++ grobid_tei_xml/types.py | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py index 32c5d0f..bbe383f 100755 --- a/grobid_tei_xml/parse.py +++ b/grobid_tei_xml/parse.py @@ -173,6 +173,7 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument: grobid_version=application_tag.attrib["version"].strip(), grobid_timestamp=application_tag.attrib["when"].strip(), header=_parse_header(header), + # TODO: pdf_md5=header.findtext(f'.//{{{ns}}}idno[@type="MD5"]') or None, ) refs = [] @@ -183,6 +184,7 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument: refs.append(ref) doc.citations = refs + text = tei.find(f".//{{{ns}}}text") # print(text.attrib) if text and text.attrib.get(f"{{{xml_ns}}}lang"): diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py index aabe424..e6718c1 100644 --- a/grobid_tei_xml/types.py +++ b/grobid_tei_xml/types.py @@ -90,8 +90,26 @@ class GrobidDocument: annex: Optional[str] = None def to_dict(self) -> dict: + """ + Returns a dict version of this object which has no 'None' fields + (recursively), and is appropriate for serializing to JSON with + json.dumps(). + + If you did want all the fields, you could use dataclasses.asdict() + directly on this object. + """ return _simplify_dict(asdict(self)) + def remove_encumbered(self) -> None: + """ + This helper function removes fields from this object which might raise + copyright concerns. + """ + self.abstract = None + self.body = None + self.acknowledgement = None + self.annex = None + def _simplify_dict(d: dict) -> dict: """ -- cgit v1.2.3