"""Tests for grobid_tei_xml document and citation parsing.

Each test feeds GROBID TEI-XML (from ``tests/files`` or an inline literal)
to ``parse_document_xml`` / ``parse_citations_xml`` and compares the result
with hand-written expectations.
"""
# Provenance: contents of tests/test_parse.py as added by commit
# 2bf52b0622005ed8a7c51e59faa9873600d9cb5f ("more progress", Bryan Newbold,
# 2021-10-21), recovered from a whitespace-collapsed rendering of the patch.
#
# NOTE(review): whitespace inside string literals was recovered from that
# collapsed rendering; confirm the literals against the upstream file.

import json
# Import the submodule explicitly: a bare ``import xml`` does not guarantee
# that ``xml.etree.ElementTree`` is loaded when ParseError is looked up below.
import xml.etree.ElementTree
from pathlib import Path

import pytest

# ``teixml2json`` is not exercised below; the import is kept from the
# original file.
from grobid_tei_xml import (  # noqa: F401
    GrobidCitation,
    GrobidDocument,
    parse_citations_xml,
    parse_document_xml,
    teixml2json,
)
from grobid_tei_xml.types import (
    GrobidAddress,
    GrobidAffiliation,
    GrobidAuthor,
    GrobidHeader,
    GrobidJournal,
)

# Fixtures live in ``tests/files``, next to this module; anchoring on
# ``__file__`` lets the tests run from any working directory.
FIXTURE_DIR = Path(__file__).resolve().parent / "files"


def _read_fixture(name: str) -> str:
    """Return the text of the fixture file *name*, decoded as UTF-8."""
    return (FIXTURE_DIR / name).read_text(encoding="utf-8")


def test_small_xml() -> None:
    """Parsing ``small.xml`` yields exactly the expected GrobidDocument."""
    doc = parse_document_xml(_read_fixture("small.xml"))

    expected = GrobidDocument(
        grobid_version="0.5.1-SNAPSHOT",
        grobid_timestamp="2018-04-02T00:31+0000",
        language_code="en",
        header=GrobidHeader(
            title="Dummy Example File",
            authors=[
                GrobidAuthor(
                    name="Brewster Kahle",
                    given_name="Brewster",
                    surname="Kahle",
                    affiliation=GrobidAffiliation(
                        # The misspelling is part of the expected value.
                        department="Faculty ofAgricultrial Engineering",
                        laboratory="Plant Physiology Laboratory",
                        institution="Technion-Israel Institute of Technology",
                        address=GrobidAddress(
                            post_code="32000",
                            settlement="Haifa",
                            country="Israel",
                        ),
                    ),
                ),
                GrobidAuthor(
                    name="J Doe",
                    given_name="J",
                    surname="Doe",
                ),
            ],
            journal=GrobidJournal(
                name="Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
            ),
            date="2000",
        ),
        abstract="Everything you ever wanted to know about nothing",
        body="Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.",
        citations=[
            GrobidCitation(
                index=0,
                id="b0",
                authors=[
                    GrobidAuthor(
                        name="A Seaperson",
                        given_name="A",
                        surname="Seaperson",
                    ),
                ],
                date="2001",
                journal="Letters in the Alphabet",
                title="Everything is Wonderful",
                volume="20",
                pages="1-11",
            ),
            GrobidCitation(
                index=1,
                id="b1",
                authors=[],
                date="2011-03-28",
                journal="The Dictionary",
                title="All about Facts",
                volume="14",
            ),
        ],
    )

    assert doc == expected


def test_small_xml_json() -> None:
    """``to_dict()`` output, munged to the old layout, equals ``small.json``."""
    json_form = json.loads(_read_fixture("small.json"))

    d = parse_document_xml(_read_fixture("small.xml")).to_dict()

    # munge back to the old JSON format: header fields were top-level and
    # the address key was camelCase
    d.update(d.pop("header"))
    addr = d["authors"][0]["affiliation"]["address"]
    addr["postCode"] = addr.pop("post_code")

    # remove nulls from old JSON
    json_form["citations"] = [
        {key: value for key, value in citation.items() if value is not None}
        for citation in json_form["citations"]
    ]

    assert d == json_form


def test_invalid_xml() -> None:
    """Non-XML text raises ParseError; an empty document raises ValueError."""
    with pytest.raises(xml.etree.ElementTree.ParseError):
        parse_document_xml("this is not XML")
    with pytest.raises(xml.etree.ElementTree.ParseError):
        parse_citations_xml("this is not XML")
    with pytest.raises(ValueError):
        parse_document_xml("")


def test_example_grobid_tei_xml() -> None:
    """Spot-check the header title and citation ``b12`` of a full document."""
    doc = parse_document_xml(_read_fixture("example_grobid.tei.xml"))

    assert (
        doc.header.title
        == "Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network"
    )

    ref = next(c for c in doc.citations if c.id == "b12")
    assert ref.authors[0].name == "K Tasa"
    assert ref.authors[0].given_name == "K"
    assert ref.authors[0].surname == "Tasa"
    assert ref.journal == "Quality Management in Health Care"
    assert ref.title == "Using patient feedback for quality improvement"
    assert ref.date == "1996"
    assert ref.pages == "206-225"
    assert ref.volume == "8"
    assert (
        ref.unstructured
        == "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19."
    )


def test_single_citations_xml() -> None:
    """A lone ``biblStruct`` parses into one citation with the expected fields."""
    # NOTE(review): the markup of this literal was lost in the recovered
    # patch (only the text nodes survived). The element structure below is
    # rebuilt from those text nodes and from the assertions that follow
    # (middle initial, date, page range), using GROBID's TEI conventions.
    # Attribute spellings and namespace placement are assumptions; confirm
    # against the upstream fixture.
    citation_xml = """
<biblStruct xmlns="http://www.tei-c.org/ns/1.0">
    <analytic>
        <title level="a" type="main">Mesh migration following abdominal hernia repair: a comprehensive review</title>
        <author>
            <persName>
                <forename type="first">H</forename>
                <forename type="middle">B</forename>
                <surname>Cunningham</surname>
            </persName>
        </author>
        <author>
            <persName>
                <forename type="first">J</forename>
                <forename type="middle">J</forename>
                <surname>Weis</surname>
            </persName>
        </author>
        <author>
            <persName>
                <forename type="first">L</forename>
                <forename type="middle">R</forename>
                <surname>Taveras</surname>
            </persName>
        </author>
        <author>
            <persName>
                <forename type="first">S</forename>
                <surname>Huerta</surname>
            </persName>
        </author>
        <idno type="DOI">10.1007/s10029-019-01898-9</idno>
        <idno type="PMID">30701369</idno>
    </analytic>
    <monogr>
        <title level="j">Hernia</title>
        <imprint>
            <biblScope unit="volume">23</biblScope>
            <biblScope unit="issue">2</biblScope>
            <biblScope unit="page" from="235" to="243" />
            <date type="published" when="2019-01-30" />
        </imprint>
    </monogr>
</biblStruct>"""

    d = parse_citations_xml(citation_xml)[0]
    assert d.title == "Mesh migration following abdominal hernia repair: a comprehensive review"
    assert d.authors[2].given_name == "L"
    assert d.authors[2].surname == "Taveras"
    assert d.authors[2].name == "L R Taveras"
    assert d.doi == "10.1007/s10029-019-01898-9"
    assert d.pmid == "30701369"
    assert d.date == "2019-01-30"
    assert d.pages == "235-243"
    assert d.volume == "23"
    assert d.issue == "2"
    assert d.journal == "Hernia"


def test_citation_list_xml() -> None:
    """A citation-list document yields ten citations; check one title."""
    citations = parse_citations_xml(_read_fixture("example_citation_list.xml"))

    assert len(citations) == 10
    assert citations[7].title == "Global Hunger Index: The Challenge of Hidden Hunger"