diff options
Diffstat (limited to 'tests/test_parse.py')
-rw-r--r-- | tests/test_parse.py | 213 |
1 files changed, 213 insertions, 0 deletions
diff --git a/tests/test_parse.py b/tests/test_parse.py new file mode 100644 index 0000000..e79d41d --- /dev/null +++ b/tests/test_parse.py @@ -0,0 +1,213 @@ +import xml +import json +import pytest + +from grobid_tei_xml import teixml2json, parse_document_xml, parse_citations_xml, GrobidDocument, GrobidCitation +from grobid_tei_xml.types import * + + +def test_small_xml(): + + with open('tests/files/small.xml', 'r') as f: + tei_xml = f.read() + + doc = parse_document_xml(tei_xml) + expected = GrobidDocument( + grobid_version='0.5.1-SNAPSHOT', + grobid_timestamp='2018-04-02T00:31+0000', + language_code='en', + header=GrobidHeader( + title="Dummy Example File", + authors=[ + GrobidAuthor( + name="Brewster Kahle", + given_name="Brewster", + surname="Kahle", + affiliation=GrobidAffiliation( + department="Faculty ofAgricultrial Engineering", + laboratory="Plant Physiology Laboratory", + institution="Technion-Israel Institute of Technology", + address=GrobidAddress( + post_code="32000", + settlement="Haifa", + country="Israel", + ), + )), + GrobidAuthor( + name="J Doe", + given_name="J", + surname="Doe", + ), + ], + journal=GrobidJournal( + name= + "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678", + ), + date="2000", + ), + abstract="Everything you ever wanted to know about nothing", + body= + "Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.", + citations=[ + GrobidCitation( + index=0, + id="b0", + authors=[ + GrobidAuthor(name="A Seaperson", + given_name="A", + surname="Seaperson") + ], + date="2001", + journal="Letters in the Alphabet", + title="Everything is Wonderful", + volume="20", + pages="1-11", + ), + GrobidCitation( + index=1, + id="b1", + authors=[], + date="2011-03-28", + journal="The Dictionary", + title="All about Facts", + volume="14", + ), + ], + ) + + assert doc == expected + + +def test_small_xml_json(): + + with open('tests/files/small.xml', 'r') as f: + tei_xml = f.read() + with open('tests/files/small.json', 'r') as f: + json_form = json.loads(f.read()) + + d = parse_document_xml(tei_xml).to_dict() + + # munge back to the old JSON format + d.update(d.pop('header')) + addr = d['authors'][0]['affiliation']['address'] + addr['postCode'] = addr.pop('post_code') + + # remove nulls from old JSON + for c in json_form['citations']: + for k in list(c.keys()): + if c[k] == None: + c.pop(k) + + assert d == json_form + + +def test_invalid_xml(): + + with pytest.raises(xml.etree.ElementTree.ParseError): + parse_document_xml("this is not XML") + with pytest.raises(xml.etree.ElementTree.ParseError): + parse_citations_xml("this is not XML") + with pytest.raises(ValueError): + parse_document_xml("<xml></xml>") + + +def test_example_grobid_tei_xml() -> None: + + with open("tests/files/example_grobid.tei.xml", "r") as f: + blob = f.read() + + doc = parse_document_xml(blob) + + assert ( + doc.header.title == + "Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network" + ) + + ref = [c for c in doc.citations if c.id == "b12"][0] + assert ref.authors[0].name == "K Tasa" + assert ref.authors[0].given_name == "K" + assert ref.authors[0].surname == "Tasa" + assert ref.journal == "Quality Management in Health Care" + assert ref.title == "Using patient feedback for quality improvement" + assert ref.date == "1996" + assert ref.pages == "206-225" + assert ref.volume == "8" + assert ( + ref.unstructured == + "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19." + ) + + +def test_single_citations_xml(): + citation_xml = """ +<biblStruct > + <analytic> + <title level="a" type="main">Mesh migration following abdominal hernia repair: a comprehensive review</title> + <author> + <persName + xmlns="http://www.tei-c.org/ns/1.0"> + <forename type="first">H</forename> + <forename type="middle">B</forename> + <surname>Cunningham</surname> + </persName> + </author> + <author> + <persName + xmlns="http://www.tei-c.org/ns/1.0"> + <forename type="first">J</forename> + <forename type="middle">J</forename> + <surname>Weis</surname> + </persName> + </author> + <author> + <persName + xmlns="http://www.tei-c.org/ns/1.0"> + <forename type="first">L</forename> + <forename type="middle">R</forename> + <surname>Taveras</surname> + </persName> + </author> + <author> + <persName + xmlns="http://www.tei-c.org/ns/1.0"> + <forename type="first">S</forename> + <surname>Huerta</surname> + </persName> + </author> + <idno type="DOI">10.1007/s10029-019-01898-9</idno> + <idno type="PMID">30701369</idno> + </analytic> + <monogr> + <title level="j">Hernia</title> + <imprint> + <biblScope unit="volume">23</biblScope> + <biblScope unit="issue">2</biblScope> + <biblScope unit="page" from="235" to="243" /> + <date type="published" when="2019-01-30" /> + </imprint> + </monogr> +</biblStruct>""" + + d = parse_citations_xml(citation_xml)[0] + assert d.title == "Mesh migration following abdominal hernia repair: a comprehensive review" + assert d.authors[2].given_name == "L" + assert d.authors[2].surname == "Taveras" + assert d.authors[2].name == "L R Taveras" + assert d.doi == "10.1007/s10029-019-01898-9" + assert d.pmid == "30701369" + assert d.date == "2019-01-30" + assert d.pages == "235-243" + assert d.volume == "23" + assert d.issue == "2" + assert d.journal == "Hernia" + + +def test_citation_list_xml(): + + with open('tests/files/example_citation_list.xml', 'r') as f: + tei_xml = f.read() + + citations = parse_citations_xml(tei_xml) + assert len(citations) == 10 + assert citations[ + 7].title == "Global Hunger Index: The Challenge of Hidden Hunger" |