diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-22 13:45:47 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-22 13:45:47 -0700 |
commit | ff673bc6be7098efb5a6297d990955761bffc7e6 (patch) | |
tree | 91f36e66bc98c002eb18053b89c2f917523bf4e8 | |
parent | 8cd413e2ad07bae6bf3ae940d7c4b94b4be274fa (diff) | |
download | grobid_tei_xml-ff673bc6be7098efb5a6297d990955761bffc7e6.tar.gz grobid_tei_xml-ff673bc6be7098efb5a6297d990955761bffc7e6.zip |
to_legacy_dict() helper, and start adding some new fields
-rw-r--r-- | grobid_tei_xml/grobid2json.py | 8 | ||||
-rwxr-xr-x | grobid_tei_xml/parse.py | 2 | ||||
-rw-r--r-- | grobid_tei_xml/types.py | 20 | ||||
-rw-r--r-- | tests/files/small.json | 20 | ||||
-rw-r--r-- | tests/test_parse.py | 15 |
5 files changed, 30 insertions, 35 deletions
diff --git a/grobid_tei_xml/grobid2json.py b/grobid_tei_xml/grobid2json.py index 7f455af..8946ab8 100644 --- a/grobid_tei_xml/grobid2json.py +++ b/grobid_tei_xml/grobid2json.py @@ -83,9 +83,9 @@ def journal_info(elem: ET.Element) -> Dict[str, Any]: journal["eissn"] = elem.findtext('.//{%s}idno[@type="eISSN"]' % ns) journal["volume"] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns) journal["issue"] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns) - keys = list(journal.keys()) # remove empty/null keys + keys = list(journal.keys()) for k in keys: if not journal[k]: journal.pop(k) @@ -140,6 +140,12 @@ def biblio_info(elem: ET.Element, ns: str = ns) -> Dict[str, Any]: ref["url"] = ref["url"].split(">")[0] else: ref["url"] = None + + # remove empty/null keys + keys = list(ref.keys()) + for k in keys: + if ref[k] is None: + ref.pop(k) return ref diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py index 029fa85..284ceff 100755 --- a/grobid_tei_xml/parse.py +++ b/grobid_tei_xml/parse.py @@ -161,7 +161,7 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument: grobid_version=application_tag.attrib["version"].strip(), grobid_timestamp=application_tag.attrib["when"].strip(), header=_parse_header(header), - # TODO: pdf_md5=header.findtext(f'.//{{{ns}}}idno[@type="MD5"]') or None, + pdf_md5=header.findtext(f'.//{{{ns}}}idno[@type="MD5"]') or None, ) refs = [] diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py index b86e1a4..9894bf5 100644 --- a/grobid_tei_xml/types.py +++ b/grobid_tei_xml/types.py @@ -72,7 +72,7 @@ class GrobidHeader: title: Optional[str] = None date: Optional[str] = None doi: Optional[str] = None - # TODO: note: Optional[str] + note: Optional[str] = None journal: Optional[GrobidJournal] = None @@ -80,8 +80,8 @@ class GrobidHeader: class GrobidDocument: grobid_version: str grobid_timestamp: str - # TODO: pdf_md5: Optional[str] header: GrobidHeader + pdf_md5: Optional[str] = None citations: Optional[List[GrobidCitation]] = None language_code: Optional[str] = None abstract: Optional[str] = None @@ -100,6 +100,22 @@ class GrobidDocument: """ return _simplify_dict(asdict(self)) + def to_legacy_dict(self) -> dict: + """ + Returns a dict in the old "grobid2json" format. + """ + d = self.to_dict() + + # all header fields at top-level + d.update(d.pop('header', {})) + d.pop('note', None) + d.pop('pdf_md5', None) + for a in d['authors']: + addr = a.get('affiliation', {}).get('address') + if addr and addr.get('post_code'): + addr['postCode'] = addr.pop('post_code') + return d + def remove_encumbered(self) -> None: """ This helper function removes fields from this object which might raise diff --git a/tests/files/small.json b/tests/files/small.json index aa0da78..9dc8fba 100644 --- a/tests/files/small.json +++ b/tests/files/small.json @@ -27,34 +27,18 @@ "date": "2001", "id": "b0", "index": 0, - "issue": null, "journal": "Letters in the Alphabet", - "publisher": null, "title": "Everything is Wonderful", - "url": null, "volume": "20", - "unstructured": null, - "arxiv_id": null, - "doi": null, - "pages": "1-11", - "pmcid": null, - "pmid": null + "pages": "1-11" }, { "authors": [], "date": "2011-03-28", "id": "b1", "index": 1, - "issue": null, "journal": "The Dictionary", - "publisher": null, "title": "All about Facts", - "url": null, - "volume": "14", - "unstructured": null, - "arxiv_id": null, - "doi": null, - "pmcid": null, - "pmid": null + "volume": "14" } ], "abstract": "Everything you ever wanted to know about nothing", diff --git a/tests/test_parse.py b/tests/test_parse.py index 30b2926..825b561 100644 --- a/tests/test_parse.py +++ b/tests/test_parse.py @@ -75,25 +75,14 @@ def test_small_xml() -> None: assert doc == expected -def test_small_xml_json() -> None: +def test_small_xml_legacy() -> None: with open('tests/files/small.xml', 'r') as f: tei_xml = f.read() with open('tests/files/small.json', 'r') as f: json_form = json.loads(f.read()) - d = parse_document_xml(tei_xml).to_dict() - - # munge back to the old JSON format - d.update(d.pop('header')) - addr = d['authors'][0]['affiliation']['address'] - addr['postCode'] = addr.pop('post_code') - - # remove nulls from old JSON - for c in json_form['citations']: - for k in list(c.keys()): - if c[k] is None: - c.pop(k) + d = parse_document_xml(tei_xml).to_legacy_dict() assert d == json_form |