summaryrefslogtreecommitdiffstats
path: root/tests/test_parse.py
blob: e79d41dd2e46df65ec7cb2cf17e5dde86aa8587e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
import xml
import json
import pytest

from grobid_tei_xml import teixml2json, parse_document_xml, parse_citations_xml, GrobidDocument, GrobidCitation
from grobid_tei_xml.types import *


def test_small_xml():
    """Parse ``tests/files/small.xml`` and compare the result, using ``==``,
    against a hand-written expected ``GrobidDocument``."""

    # Pass the encoding explicitly: otherwise open() uses the locale's
    # preferred encoding and the test outcome depends on the host platform.
    with open('tests/files/small.xml', 'r', encoding='utf-8') as f:
        tei_xml = f.read()

    doc = parse_document_xml(tei_xml)
    expected = GrobidDocument(
        grobid_version='0.5.1-SNAPSHOT',
        grobid_timestamp='2018-04-02T00:31+0000',
        language_code='en',
        header=GrobidHeader(
            title="Dummy Example File",
            authors=[
                GrobidAuthor(
                    name="Brewster Kahle",
                    given_name="Brewster",
                    surname="Kahle",
                    affiliation=GrobidAffiliation(
                        department="Faculty ofAgricultrial Engineering",
                        laboratory="Plant Physiology Laboratory",
                        institution="Technion-Israel Institute of Technology",
                        address=GrobidAddress(
                            post_code="32000",
                            settlement="Haifa",
                            country="Israel",
                        ),
                    )),
                GrobidAuthor(
                    name="J Doe",
                    given_name="J",
                    surname="Doe",
                ),
            ],
            journal=GrobidJournal(
                name=
                "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
            ),
            date="2000",
        ),
        abstract="Everything you ever wanted to know about nothing",
        body=
        "Introduction \nEverything starts somewhere, as somebody [1]  once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.",
        citations=[
            GrobidCitation(
                index=0,
                id="b0",
                authors=[
                    GrobidAuthor(name="A Seaperson",
                                 given_name="A",
                                 surname="Seaperson")
                ],
                date="2001",
                journal="Letters in the Alphabet",
                title="Everything is Wonderful",
                volume="20",
                pages="1-11",
            ),
            GrobidCitation(
                index=1,
                id="b1",
                authors=[],
                date="2011-03-28",
                journal="The Dictionary",
                title="All about Facts",
                volume="14",
            ),
        ],
    )

    assert doc == expected


def test_small_xml_json():
    """Check that the parsed small fixture, converted with ``to_dict()`` and
    reshaped into the old JSON layout, equals ``tests/files/small.json``."""

    # Explicit encodings keep the test independent of the host locale.
    with open('tests/files/small.xml', 'r', encoding='utf-8') as f:
        tei_xml = f.read()
    with open('tests/files/small.json', 'r', encoding='utf-8') as f:
        json_form = json.load(f)

    d = parse_document_xml(tei_xml).to_dict()

    # munge back to the old JSON format: the 'header' entries are merged into
    # the top-level dict and the address key 'post_code' becomes 'postCode'
    d.update(d.pop('header'))
    addr = d['authors'][0]['affiliation']['address']
    addr['postCode'] = addr.pop('post_code')

    # remove nulls from old JSON (collect the keys first so the dict is not
    # mutated while it is being iterated)
    for c in json_form['citations']:
        null_keys = [k for k, v in c.items() if v is None]
        for k in null_keys:
            del c[k]

    assert d == json_form


def test_invalid_xml():
    """Non-XML text must raise ``ParseError`` from both parsers, and the
    well-formed ``<xml></xml>`` input must raise ``ValueError`` from
    ``parse_document_xml``."""

    # Import the submodule explicitly. A bare ``import xml`` does not load
    # ``xml.etree.ElementTree``; the attribute chain only resolved when some
    # other module happened to have imported the submodule already.
    from xml.etree.ElementTree import ParseError

    with pytest.raises(ParseError):
        parse_document_xml("this is not XML")
    with pytest.raises(ParseError):
        parse_citations_xml("this is not XML")
    with pytest.raises(ValueError):
        parse_document_xml("<xml></xml>")


def test_example_grobid_tei_xml() -> None:
    """Spot-check the header title and the fields of citation ``b12`` parsed
    from ``tests/files/example_grobid.tei.xml``."""

    # Pass the encoding explicitly so decoding does not depend on the
    # locale's preferred encoding.
    with open("tests/files/example_grobid.tei.xml", "r", encoding="utf-8") as f:
        blob = f.read()

    doc = parse_document_xml(blob)

    assert (
        doc.header.title ==
        "Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network"
    )

    # Select the citation by its id rather than by list position.
    ref = [c for c in doc.citations if c.id == "b12"][0]
    assert ref.authors[0].name == "K Tasa"
    assert ref.authors[0].given_name == "K"
    assert ref.authors[0].surname == "Tasa"
    assert ref.journal == "Quality Management in Health Care"
    assert ref.title == "Using patient feedback for quality improvement"
    assert ref.date == "1996"
    assert ref.pages == "206-225"
    assert ref.volume == "8"
    assert (
        ref.unstructured ==
        "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19."
    )


def test_single_citations_xml():
    """Parse one inline ``<biblStruct>`` snippet and verify the fields of the
    first citation returned by ``parse_citations_xml``."""
    bibl_struct_xml = """
<biblStruct >
    <analytic>
        <title level="a" type="main">Mesh migration following abdominal hernia repair: a comprehensive review</title>
        <author>
            <persName
                xmlns="http://www.tei-c.org/ns/1.0">
                <forename type="first">H</forename>
                <forename type="middle">B</forename>
                <surname>Cunningham</surname>
            </persName>
        </author>
        <author>
            <persName
                xmlns="http://www.tei-c.org/ns/1.0">
                <forename type="first">J</forename>
                <forename type="middle">J</forename>
                <surname>Weis</surname>
            </persName>
        </author>
        <author>
            <persName
                xmlns="http://www.tei-c.org/ns/1.0">
                <forename type="first">L</forename>
                <forename type="middle">R</forename>
                <surname>Taveras</surname>
            </persName>
        </author>
        <author>
            <persName
                xmlns="http://www.tei-c.org/ns/1.0">
                <forename type="first">S</forename>
                <surname>Huerta</surname>
            </persName>
        </author>
        <idno type="DOI">10.1007/s10029-019-01898-9</idno>
        <idno type="PMID">30701369</idno>
    </analytic>
    <monogr>
        <title level="j">Hernia</title>
        <imprint>
            <biblScope unit="volume">23</biblScope>
            <biblScope unit="issue">2</biblScope>
            <biblScope unit="page" from="235" to="243" />
            <date type="published" when="2019-01-30" />
        </imprint>
    </monogr>
</biblStruct>"""

    parsed = parse_citations_xml(bibl_struct_xml)
    citation = parsed[0]

    # Article-level title.
    assert citation.title == "Mesh migration following abdominal hernia repair: a comprehensive review"

    # Third <author> element in the snippet (index 2).
    third_author = citation.authors[2]
    assert third_author.given_name == "L"
    assert third_author.surname == "Taveras"
    assert third_author.name == "L R Taveras"

    # Identifiers taken from the <idno> elements.
    assert citation.doi == "10.1007/s10029-019-01898-9"
    assert citation.pmid == "30701369"

    # Values taken from the <monogr>/<imprint> section.
    assert citation.date == "2019-01-30"
    assert citation.pages == "235-243"
    assert citation.volume == "23"
    assert citation.issue == "2"
    assert citation.journal == "Hernia"


def test_citation_list_xml():
    """Parse ``tests/files/example_citation_list.xml`` and check the number of
    citations returned and the title of the one at index 7."""

    # Pass the encoding explicitly so decoding does not depend on the
    # locale's preferred encoding.
    with open('tests/files/example_citation_list.xml', 'r', encoding='utf-8') as f:
        tei_xml = f.read()

    citations = parse_citations_xml(tei_xml)
    assert len(citations) == 10
    assert citations[7].title == "Global Hunger Index: The Challenge of Hidden Hunger"