about | summary | refs | log | tree | commit | diff | stats
path: root/tests
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-10-22 13:45:47 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-10-22 13:45:47 -0700
commit ff673bc6be7098efb5a6297d990955761bffc7e6 (patch)
tree 91f36e66bc98c002eb18053b89c2f917523bf4e8 /tests
parent 8cd413e2ad07bae6bf3ae940d7c4b94b4be274fa (diff)
download grobid_tei_xml-ff673bc6be7098efb5a6297d990955761bffc7e6.tar.gz
download grobid_tei_xml-ff673bc6be7098efb5a6297d990955761bffc7e6.zip
to_legacy_dict() helper, and start adding some new fields
Diffstat (limited to 'tests')
-rw-r--r-- tests/files/small.json 20
-rw-r--r-- tests/test_parse.py 15
2 files changed, 4 insertions, 31 deletions
diff --git a/tests/files/small.json b/tests/files/small.json
index aa0da78..9dc8fba 100644
--- a/tests/files/small.json
+++ b/tests/files/small.json
@@ -27,34 +27,18 @@
"date": "2001",
"id": "b0",
"index": 0,
- "issue": null,
"journal": "Letters in the Alphabet",
- "publisher": null,
"title": "Everything is Wonderful",
- "url": null,
"volume": "20",
- "unstructured": null,
- "arxiv_id": null,
- "doi": null,
- "pages": "1-11",
- "pmcid": null,
- "pmid": null
+ "pages": "1-11"
},
{ "authors": [],
"date": "2011-03-28",
"id": "b1",
"index": 1,
- "issue": null,
"journal": "The Dictionary",
- "publisher": null,
"title": "All about Facts",
- "url": null,
- "volume": "14",
- "unstructured": null,
- "arxiv_id": null,
- "doi": null,
- "pmcid": null,
- "pmid": null
+ "volume": "14"
}
],
"abstract": "Everything you ever wanted to know about nothing",
diff --git a/tests/test_parse.py b/tests/test_parse.py
index 30b2926..825b561 100644
--- a/tests/test_parse.py
+++ b/tests/test_parse.py
@@ -75,25 +75,14 @@ def test_small_xml() -> None:
assert doc == expected
-def test_small_xml_json() -> None:
+def test_small_xml_legacy() -> None:
with open('tests/files/small.xml', 'r') as f:
tei_xml = f.read()
with open('tests/files/small.json', 'r') as f:
json_form = json.loads(f.read())
- d = parse_document_xml(tei_xml).to_dict()
-
- # munge back to the old JSON format
- d.update(d.pop('header'))
- addr = d['authors'][0]['affiliation']['address']
- addr['postCode'] = addr.pop('post_code')
-
- # remove nulls from old JSON
- for c in json_form['citations']:
- for k in list(c.keys()):
- if c[k] is None:
- c.pop(k)
+ d = parse_document_xml(tei_xml).to_legacy_dict()
assert d == json_form