diff options
Diffstat (limited to 'tests/test_parse.py')
| -rw-r--r-- | tests/test_parse.py | 213 | 
1 file changed, 213 insertions, 0 deletions
"""Tests for the grobid_tei_xml parsers (``tests/test_parse.py``).

Covers ``parse_document_xml`` and ``parse_citations_xml`` against the XML/JSON
fixtures under ``tests/files/`` as well as an inline ``<biblStruct>`` snippet.
"""
import json
# Import the submodule explicitly: a bare ``import xml`` does not load
# ``xml.etree.ElementTree``, so ``xml.etree.ElementTree.ParseError`` would only
# resolve if some other module happened to import it first.
import xml.etree.ElementTree

import pytest

from grobid_tei_xml import teixml2json, parse_document_xml, parse_citations_xml, GrobidDocument, GrobidCitation
from grobid_tei_xml.types import *


def _read_text(path: str) -> str:
    """Return the contents of the fixture at *path*, decoded as UTF-8."""
    # Explicit encoding so the result does not depend on the platform locale.
    with open(path, "r", encoding="utf-8") as f:
        return f.read()


def test_small_xml() -> None:
    """Parsing ``small.xml`` yields exactly the expected ``GrobidDocument``."""
    tei_xml = _read_text("tests/files/small.xml")

    doc = parse_document_xml(tei_xml)
    expected = GrobidDocument(
        grobid_version="0.5.1-SNAPSHOT",
        grobid_timestamp="2018-04-02T00:31+0000",
        language_code="en",
        header=GrobidHeader(
            title="Dummy Example File",
            authors=[
                GrobidAuthor(
                    name="Brewster Kahle",
                    given_name="Brewster",
                    surname="Kahle",
                    affiliation=GrobidAffiliation(
                        department="Faculty ofAgricultrial Engineering",
                        laboratory="Plant Physiology Laboratory",
                        institution="Technion-Israel Institute of Technology",
                        address=GrobidAddress(
                            post_code="32000",
                            settlement="Haifa",
                            country="Israel",
                        ),
                    ),
                ),
                GrobidAuthor(
                    name="J Doe",
                    given_name="J",
                    surname="Doe",
                ),
            ],
            journal=GrobidJournal(
                name="Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
            ),
            date="2000",
        ),
        abstract="Everything you ever wanted to know about nothing",
        body="Introduction \nEverything starts somewhere, as somebody [1]  once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.",
        citations=[
            GrobidCitation(
                index=0,
                id="b0",
                authors=[
                    GrobidAuthor(
                        name="A Seaperson",
                        given_name="A",
                        surname="Seaperson",
                    ),
                ],
                date="2001",
                journal="Letters in the Alphabet",
                title="Everything is Wonderful",
                volume="20",
                pages="1-11",
            ),
            GrobidCitation(
                index=1,
                id="b1",
                authors=[],
                date="2011-03-28",
                journal="The Dictionary",
                title="All about Facts",
                volume="14",
            ),
        ],
    )

    assert doc == expected


def test_small_xml_json() -> None:
    """``to_dict()`` output, reshaped to the old layout, matches ``small.json``."""
    tei_xml = _read_text("tests/files/small.xml")
    with open("tests/files/small.json", "r", encoding="utf-8") as f:
        json_form = json.load(f)

    d = parse_document_xml(tei_xml).to_dict()

    # munge back to the old JSON format: header fields are merged into the top
    # level and the address key is renamed from snake_case to camelCase
    d.update(d.pop("header"))
    addr = d["authors"][0]["affiliation"]["address"]
    addr["postCode"] = addr.pop("post_code")

    # remove nulls from old JSON
    json_form["citations"] = [
        {key: value for key, value in citation.items() if value is not None}
        for citation in json_form["citations"]
    ]

    assert d == json_form


def test_invalid_xml() -> None:
    """Non-XML input raises ``ParseError``; an unexpected root raises ``ValueError``."""
    with pytest.raises(xml.etree.ElementTree.ParseError):
        parse_document_xml("this is not XML")
    with pytest.raises(xml.etree.ElementTree.ParseError):
        parse_citations_xml("this is not XML")
    with pytest.raises(ValueError):
        parse_document_xml("<xml></xml>")


def test_example_grobid_tei_xml() -> None:
    """Spot-check the header title and citation ``b12`` of the example TEI file."""
    blob = _read_text("tests/files/example_grobid.tei.xml")

    doc = parse_document_xml(blob)

    assert (
        doc.header.title ==
        "Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network"
    )

    # First citation whose id is "b12"; the test fails if there is none.
    ref = next(c for c in doc.citations if c.id == "b12")
    assert ref.authors[0].name == "K Tasa"
    assert ref.authors[0].given_name == "K"
    assert ref.authors[0].surname == "Tasa"
    assert ref.journal == "Quality Management in Health Care"
    assert ref.title == "Using patient feedback for quality improvement"
    assert ref.date == "1996"
    assert ref.pages == "206-225"
    assert ref.volume == "8"
    assert (
        ref.unstructured ==
        "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19."
    )


def test_single_citations_xml() -> None:
    """A lone ``<biblStruct>`` element parses into one fully populated citation."""
    citation_xml = """
<biblStruct >
    <analytic>
        <title level="a" type="main">Mesh migration following abdominal hernia repair: a comprehensive review</title>
        <author>
            <persName
                xmlns="http://www.tei-c.org/ns/1.0">
                <forename type="first">H</forename>
                <forename type="middle">B</forename>
                <surname>Cunningham</surname>
            </persName>
        </author>
        <author>
            <persName
                xmlns="http://www.tei-c.org/ns/1.0">
                <forename type="first">J</forename>
                <forename type="middle">J</forename>
                <surname>Weis</surname>
            </persName>
        </author>
        <author>
            <persName
                xmlns="http://www.tei-c.org/ns/1.0">
                <forename type="first">L</forename>
                <forename type="middle">R</forename>
                <surname>Taveras</surname>
            </persName>
        </author>
        <author>
            <persName
                xmlns="http://www.tei-c.org/ns/1.0">
                <forename type="first">S</forename>
                <surname>Huerta</surname>
            </persName>
        </author>
        <idno type="DOI">10.1007/s10029-019-01898-9</idno>
        <idno type="PMID">30701369</idno>
    </analytic>
    <monogr>
        <title level="j">Hernia</title>
        <imprint>
            <biblScope unit="volume">23</biblScope>
            <biblScope unit="issue">2</biblScope>
            <biblScope unit="page" from="235" to="243" />
            <date type="published" when="2019-01-30" />
        </imprint>
    </monogr>
</biblStruct>"""

    d = parse_citations_xml(citation_xml)[0]
    assert d.title == "Mesh migration following abdominal hernia repair: a comprehensive review"
    assert d.authors[2].given_name == "L"
    assert d.authors[2].surname == "Taveras"
    assert d.authors[2].name == "L R Taveras"
    assert d.doi == "10.1007/s10029-019-01898-9"
    assert d.pmid == "30701369"
    assert d.date == "2019-01-30"
    assert d.pages == "235-243"
    assert d.volume == "23"
    assert d.issue == "2"
    assert d.journal == "Hernia"


def test_citation_list_xml() -> None:
    """The citation-list fixture yields ten citations; check the eighth title."""
    tei_xml = _read_text("tests/files/example_citation_list.xml")

    citations = parse_citations_xml(tei_xml)
    assert len(citations) == 10
    assert citations[7].title == "Global Hunger Index: The Challenge of Hidden Hunger"
