summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-10-21 19:59:04 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-10-21 19:59:04 -0700
commit 45deea74f80d1e8deed6076f2a93d711d16a3a83 (patch)
tree 3e94769cdb5e1bdeb9c5c985a561e5b7a880be83
parent 2bf52b0622005ed8a7c51e59faa9873600d9cb5f (diff)
download grobid_tei_xml-45deea74f80d1e8deed6076f2a93d711d16a3a83.tar.gz
grobid_tei_xml-45deea74f80d1e8deed6076f2a93d711d16a3a83.zip
some docs and prep for including MD5 in output
-rwxr-xr-x grobid_tei_xml/parse.py 2
-rw-r--r-- grobid_tei_xml/types.py 18
2 files changed, 20 insertions, 0 deletions
diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py
index 32c5d0f..bbe383f 100755
--- a/grobid_tei_xml/parse.py
+++ b/grobid_tei_xml/parse.py
@@ -173,6 +173,7 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
grobid_version=application_tag.attrib["version"].strip(),
grobid_timestamp=application_tag.attrib["when"].strip(),
header=_parse_header(header),
+ # TODO: pdf_md5=header.findtext(f'.//{{{ns}}}idno[@type="MD5"]') or None,
)
refs = []
@@ -183,6 +184,7 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
refs.append(ref)
doc.citations = refs
+
text = tei.find(f".//{{{ns}}}text")
# print(text.attrib)
if text and text.attrib.get(f"{{{xml_ns}}}lang"):
diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py
index aabe424..e6718c1 100644
--- a/grobid_tei_xml/types.py
+++ b/grobid_tei_xml/types.py
@@ -90,8 +90,26 @@ class GrobidDocument:
annex: Optional[str] = None
def to_dict(self) -> dict:
+ """
+ Returns a dict version of this object which has no 'None' fields
+ (recursively), and is appropriate for serializing to JSON with
+ json.dumps().
+
+ If you did want all the fields, you could use dataclasses.asdict()
+ directly on this object.
+ """
return _simplify_dict(asdict(self))
+ def remove_encumbered(self) -> None:
+ """
+ This helper function removes fields from this object which might raise
+ copyright concerns.
+ """
+ self.abstract = None
+ self.body = None
+ self.acknowledgement = None
+ self.annex = None
+
def _simplify_dict(d: dict) -> dict:
"""