to_legacy_dict() helper, and start adding some new fields

author: Bryan Newbold <bnewbold@archive.org> 2021-10-22 13:45:47 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-10-22 13:45:47 -0700
commit: ff673bc6be7098efb5a6297d990955761bffc7e6 (patch)
tree: 91f36e66bc98c002eb18053b89c2f917523bf4e8
parent: 8cd413e2ad07bae6bf3ae940d7c4b94b4be274fa (diff)
download: grobid_tei_xml-ff673bc6be7098efb5a6297d990955761bffc7e6.tar.gz
grobid_tei_xml-ff673bc6be7098efb5a6297d990955761bffc7e6.zip
5 files changed, 30 insertions, 35 deletions
diff --git a/grobid_tei_xml/grobid2json.py b/grobid_tei_xml/grobid2json.py
index 7f455af..8946ab8 100644
--- a/grobid_tei_xml/grobid2json.py
+++ b/grobid_tei_xml/grobid2json.py
@@ -83,9 +83,9 @@ def journal_info(elem: ET.Element) -> Dict[str, Any]:
     journal["eissn"] = elem.findtext('.//{%s}idno[@type="eISSN"]' % ns)
     journal["volume"] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
     journal["issue"] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns)
-    keys = list(journal.keys())
 
     # remove empty/null keys
+    keys = list(journal.keys())
     for k in keys:
         if not journal[k]:
             journal.pop(k)
@@ -140,6 +140,12 @@ def biblio_info(elem: ET.Element, ns: str = ns) -> Dict[str, Any]:
             ref["url"] = ref["url"].split(">")[0]
     else:
         ref["url"] = None
+
+    # remove empty/null keys
+    keys = list(ref.keys())
+    for k in keys:
+        if ref[k] is None:
+            ref.pop(k)
     return ref
 
 
diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py
index 029fa85..284ceff 100755
--- a/grobid_tei_xml/parse.py
+++ b/grobid_tei_xml/parse.py
@@ -161,7 +161,7 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
         grobid_version=application_tag.attrib["version"].strip(),
         grobid_timestamp=application_tag.attrib["when"].strip(),
         header=_parse_header(header),
-        # TODO: pdf_md5=header.findtext(f'.//{{{ns}}}idno[@type="MD5"]') or None,
+        pdf_md5=header.findtext(f'.//{{{ns}}}idno[@type="MD5"]') or None,
     )
 
     refs = []
diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py
index b86e1a4..9894bf5 100644
--- a/grobid_tei_xml/types.py
+++ b/grobid_tei_xml/types.py
@@ -72,7 +72,7 @@ class GrobidHeader:
     title: Optional[str] = None
     date: Optional[str] = None
     doi: Optional[str] = None
-    # TODO: note: Optional[str]
+    note: Optional[str] = None
     journal: Optional[GrobidJournal] = None
 
 
@@ -80,8 +80,8 @@ class GrobidHeader:
 class GrobidDocument:
     grobid_version: str
     grobid_timestamp: str
-    # TODO: pdf_md5: Optional[str]
     header: GrobidHeader
+    pdf_md5: Optional[str] = None
     citations: Optional[List[GrobidCitation]] = None
     language_code: Optional[str] = None
     abstract: Optional[str] = None
@@ -100,6 +100,22 @@ class GrobidDocument:
         """
         return _simplify_dict(asdict(self))
 
+    def to_legacy_dict(self) -> dict:
+        """
+        Returns a dict in the old "grobid2json" format: header fields are
+        moved to top-level, and 'post_code' is renamed to 'postCode'.
+        """
+        d = self.to_dict()
+        d.update(d.pop('header', {}))
+        d.pop('note', None)  # 'note' and 'pdf_md5' are dropped from output
+        d.pop('pdf_md5', None)
+        # 'authors' may be missing (e.g. no 'header'); avoid a KeyError
+        for a in d.get('authors') or []:
+            addr = (a.get('affiliation') or {}).get('address')
+            if addr and addr.get('post_code'):
+                addr['postCode'] = addr.pop('post_code')
+        return d
+
     def remove_encumbered(self) -> None:
         """
         This helper function removes fields from this object which might raise
diff --git a/tests/files/small.json b/tests/files/small.json
index aa0da78..9dc8fba 100644
--- a/tests/files/small.json
+++ b/tests/files/small.json
@@ -27,34 +27,18 @@
       "date": "2001",
       "id": "b0",
       "index": 0,
-      "issue": null,
       "journal": "Letters in the Alphabet",
-      "publisher": null,
       "title": "Everything is Wonderful",
-      "url": null,
       "volume": "20",
-      "unstructured": null,
-      "arxiv_id": null,
-      "doi": null,
-      "pages": "1-11",
-      "pmcid": null,
-      "pmid": null
+      "pages": "1-11"
     },
     { "authors": [],
       "date": "2011-03-28",
       "id": "b1",
       "index": 1,
-      "issue": null,
       "journal": "The Dictionary",
-      "publisher": null,
       "title": "All about Facts",
-      "url": null,
-      "volume": "14",
-      "unstructured": null,
-      "arxiv_id": null,
-      "doi": null,
-      "pmcid": null,
-      "pmid": null
+      "volume": "14"
     }
   ],
   "abstract": "Everything you ever wanted to know about nothing",
diff --git a/tests/test_parse.py b/tests/test_parse.py
index 30b2926..825b561 100644
--- a/tests/test_parse.py
+++ b/tests/test_parse.py
@@ -75,25 +75,14 @@ def test_small_xml() -> None:
     assert doc == expected
 
 
-def test_small_xml_json() -> None:
+def test_small_xml_legacy() -> None:
 
     with open('tests/files/small.xml', 'r') as f:
         tei_xml = f.read()
     with open('tests/files/small.json', 'r') as f:
         json_form = json.loads(f.read())
 
-    d = parse_document_xml(tei_xml).to_dict()
-
-    # munge back to the old JSON format
-    d.update(d.pop('header'))
-    addr = d['authors'][0]['affiliation']['address']
-    addr['postCode'] = addr.pop('post_code')
-
-    # remove nulls from old JSON
-    for c in json_form['citations']:
-        for k in list(c.keys()):
-            if c[k] is None:
-                c.pop(k)
+    d = parse_document_xml(tei_xml).to_legacy_dict()
 
     assert d == json_form
author	Bryan Newbold <bnewbold@archive.org>	2021-10-22 13:45:47 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2021-10-22 13:45:47 -0700
commit	ff673bc6be7098efb5a6297d990955761bffc7e6 (patch)
tree	91f36e66bc98c002eb18053b89c2f917523bf4e8
parent	8cd413e2ad07bae6bf3ae940d7c4b94b4be274fa (diff)
download	grobid_tei_xml-ff673bc6be7098efb5a6297d990955761bffc7e6.tar.gz grobid_tei_xml-ff673bc6be7098efb5a6297d990955761bffc7e6.zip