2 files changed, 20 insertions, 0 deletions
diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py
index 32c5d0f..bbe383f 100755
--- a/grobid_tei_xml/parse.py
+++ b/grobid_tei_xml/parse.py
@@ -173,6 +173,7 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
         grobid_version=application_tag.attrib["version"].strip(),
         grobid_timestamp=application_tag.attrib["when"].strip(),
         header=_parse_header(header),
+        # TODO: pdf_md5=header.findtext(f'.//{{{ns}}}idno[@type="MD5"]') or None,
     )
 
     refs = []
@@ -183,6 +184,7 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
         refs.append(ref)
     doc.citations = refs
 
+
     text = tei.find(f".//{{{ns}}}text")
     # print(text.attrib)
     if text and text.attrib.get(f"{{{xml_ns}}}lang"):
diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py
index aabe424..e6718c1 100644
--- a/grobid_tei_xml/types.py
+++ b/grobid_tei_xml/types.py
@@ -90,8 +90,26 @@ class GrobidDocument:
     annex: Optional[str] = None
 
     def to_dict(self) -> dict:
+        """
+        Returns a dict version of this object which has no 'None' fields
+        (recursively), and is appropriate for serializing to JSON with
+        json.dumps().
+
+        If you did want all the fields, you could use dataclasses.asdict()
+        directly on this object.
+        """
         return _simplify_dict(asdict(self))
 
+    def remove_encumbered(self) -> None:
+        """
+        This helper function removes fields from this object which might raise
+        copyright concerns.
+        """
+        self.abstract = None
+        self.body = None
+        self.acknowledgement = None
+        self.annex = None
+
 
 def _simplify_dict(d: dict) -> dict:
     """