diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-22 13:45:47 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-22 13:45:47 -0700 |
commit | ff673bc6be7098efb5a6297d990955761bffc7e6 (patch) | |
tree | 91f36e66bc98c002eb18053b89c2f917523bf4e8 /tests | |
parent | 8cd413e2ad07bae6bf3ae940d7c4b94b4be274fa (diff) | |
download | grobid_tei_xml-ff673bc6be7098efb5a6297d990955761bffc7e6.tar.gz grobid_tei_xml-ff673bc6be7098efb5a6297d990955761bffc7e6.zip |
to_legacy_dict() helper, and start adding some new fields
Diffstat (limited to 'tests')
-rw-r--r-- | tests/files/small.json | 20 | ||||
-rw-r--r-- | tests/test_parse.py | 15 |
2 files changed, 4 insertions, 31 deletions
```diff
diff --git a/tests/files/small.json b/tests/files/small.json
index aa0da78..9dc8fba 100644
--- a/tests/files/small.json
+++ b/tests/files/small.json
@@ -27,34 +27,18 @@
       "date": "2001",
       "id": "b0",
       "index": 0,
-      "issue": null,
       "journal": "Letters in the Alphabet",
-      "publisher": null,
       "title": "Everything is Wonderful",
-      "url": null,
       "volume": "20",
-      "unstructured": null,
-      "arxiv_id": null,
-      "doi": null,
-      "pages": "1-11",
-      "pmcid": null,
-      "pmid": null
+      "pages": "1-11"
     },
     {
       "authors": [],
       "date": "2011-03-28",
       "id": "b1",
       "index": 1,
-      "issue": null,
       "journal": "The Dictionary",
-      "publisher": null,
       "title": "All about Facts",
-      "url": null,
-      "volume": "14",
-      "unstructured": null,
-      "arxiv_id": null,
-      "doi": null,
-      "pmcid": null,
-      "pmid": null
+      "volume": "14"
     }
   ],
   "abstract": "Everything you ever wanted to know about nothing",
diff --git a/tests/test_parse.py b/tests/test_parse.py
index 30b2926..825b561 100644
--- a/tests/test_parse.py
+++ b/tests/test_parse.py
@@ -75,25 +75,14 @@ def test_small_xml() -> None:
     assert doc == expected
 
 
-def test_small_xml_json() -> None:
+def test_small_xml_legacy() -> None:
 
     with open('tests/files/small.xml', 'r') as f:
         tei_xml = f.read()
     with open('tests/files/small.json', 'r') as f:
         json_form = json.loads(f.read())
 
-    d = parse_document_xml(tei_xml).to_dict()
-
-    # munge back to the old JSON format
-    d.update(d.pop('header'))
-    addr = d['authors'][0]['affiliation']['address']
-    addr['postCode'] = addr.pop('post_code')
-
-    # remove nulls from old JSON
-    for c in json_form['citations']:
-        for k in list(c.keys()):
-            if c[k] is None:
-                c.pop(k)
+    d = parse_document_xml(tei_xml).to_legacy_dict()
 
     assert d == json_form
 
```