summaryrefslogtreecommitdiffstats
path: root/tests/test_parse.py
blob: 30b292659cb456d57e1f5ee9a5a40d2abda9b070 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
import io
import json
import xml
import xml.etree.ElementTree

import pytest

from grobid_tei_xml import (GrobidCitation, GrobidDocument, parse_citations_xml,
                            parse_document_xml)
from grobid_tei_xml.types import *


def test_small_xml() -> None:
    """Parse the small TEI fixture and compare against a hand-built document.

    The whole parsed ``GrobidDocument`` (header, abstract, body and both
    citations) is compared for equality with the expected object below.
    """

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale default encoding.
    with open('tests/files/small.xml', 'r', encoding='utf-8') as f:
        tei_xml = f.read()

    doc = parse_document_xml(tei_xml)
    # Whitespace (including the double space after "[1]") is significant here.
    expected_body = """Introduction \nEverything starts somewhere, as somebody [1]  once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED."""
    expected = GrobidDocument(
        grobid_version='0.5.1-SNAPSHOT',
        grobid_timestamp='2018-04-02T00:31+0000',
        language_code='en',
        header=GrobidHeader(
            title="Dummy Example File",
            authors=[
                GrobidAuthor(name="Brewster Kahle",
                             given_name="Brewster",
                             surname="Kahle",
                             affiliation=GrobidAffiliation(
                                 department="Faculty ofAgricultrial Engineering",
                                 laboratory="Plant Physiology Laboratory",
                                 institution="Technion-Israel Institute of Technology",
                                 address=GrobidAddress(
                                     post_code="32000",
                                     settlement="Haifa",
                                     country="Israel",
                                 ),
                             )),
                GrobidAuthor(
                    name="J Doe",
                    given_name="J",
                    surname="Doe",
                ),
            ],
            journal=GrobidJournal(
                name="Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678", ),
            date="2000",
        ),
        abstract="Everything you ever wanted to know about nothing",
        body=expected_body,
        citations=[
            GrobidCitation(
                index=0,
                id="b0",
                authors=[GrobidAuthor(name="A Seaperson", given_name="A", surname="Seaperson")],
                date="2001",
                journal="Letters in the Alphabet",
                title="Everything is Wonderful",
                volume="20",
                pages="1-11",
            ),
            GrobidCitation(
                index=1,
                id="b1",
                authors=[],
                date="2011-03-28",
                journal="The Dictionary",
                title="All about Facts",
                volume="14",
            ),
        ],
    )

    assert doc == expected


def test_small_xml_json() -> None:
    """Check that ``to_dict()`` output matches the stored legacy JSON fixture.

    The dict produced from the parsed document is reshaped to the older flat
    JSON layout before comparison, and ``None`` entries are dropped from the
    fixture's citations.
    """

    # Both fixtures are decoded explicitly as UTF-8 so the test does not
    # depend on the platform's locale default encoding.
    with open('tests/files/small.xml', 'r', encoding='utf-8') as f:
        tei_xml = f.read()
    with open('tests/files/small.json', 'r', encoding='utf-8') as f:
        json_form = json.load(f)

    d = parse_document_xml(tei_xml).to_dict()

    # munge back to the old JSON format: header fields are hoisted to the top
    # level and the address key uses camelCase
    d.update(d.pop('header'))
    addr = d['authors'][0]['affiliation']['address']
    addr['postCode'] = addr.pop('post_code')

    # remove nulls from old JSON
    for c in json_form['citations']:
        # collect keys first; the dict must not change size while iterating
        for k in [k for k, v in c.items() if v is None]:
            del c[k]

    assert d == json_form


def test_invalid_xml() -> None:
    """Verify the exception raised for each kind of unusable input."""

    not_xml = "this is not XML"
    parse_error = xml.etree.ElementTree.ParseError

    # Plain text that is not XML is rejected by both entry points.
    for parser in (parse_document_xml, parse_citations_xml):
        with pytest.raises(parse_error):
            parser(not_xml)

    # Well-formed XML with an unexpected root is reported as a ValueError.
    with pytest.raises(ValueError):
        parse_document_xml("<xml></xml>")

    # An input of an unsupported Python type is reported as a TypeError.
    with pytest.raises(TypeError):
        parse_document_xml(123)  # type: ignore


def test_bytes() -> None:
    """Check that raw bytes and a binary stream both parse without raising."""

    with open('tests/files/small.xml', 'rb') as handle:
        raw = handle.read()

    # First the bytes object itself, then the same content wrapped in BytesIO.
    for source in (raw, io.BytesIO(raw)):
        parse_document_xml(source)  # type: ignore


def test_elementtree() -> None:
    """Check that an already-parsed ElementTree is accepted without raising."""

    with open('tests/files/small.xml', 'rb') as handle:
        stream = io.BytesIO(handle.read())

    tree = xml.etree.ElementTree.parse(stream)
    parse_document_xml(tree)  # type: ignore


def test_example_grobid_tei_xml() -> None:
    """Parse the larger example TEI fixture and spot-check selected fields.

    Checks the document title and every asserted field of the citation whose
    id is ``"b12"``.
    """

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale default encoding.
    with open("tests/files/example_grobid.tei.xml", "r", encoding="utf-8") as f:
        blob = f.read()

    doc = parse_document_xml(blob)

    assert doc.header.title == \
        """Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network"""

    # `doc.citations` may be None, hence the `or []` fallback.
    ref = [c for c in doc.citations or [] if c.id == "b12"][0]
    assert ref.authors[0].name == "K Tasa"
    assert ref.authors[0].given_name == "K"
    assert ref.authors[0].surname == "Tasa"
    assert ref.journal == "Quality Management in Health Care"
    assert ref.title == "Using patient feedback for quality improvement"
    assert ref.date == "1996"
    assert ref.pages == "206-225"
    assert ref.volume == "8"
    # The raw reference string is kept verbatim, including the line-break
    # hyphenation ("qua- lity") and its own page range.
    assert ref.unstructured == \
        """Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19."""


def test_single_citations_xml() -> None:
    """Parse one inline ``biblStruct`` snippet and check the extracted fields."""
    citation_xml = """
<biblStruct >
    <analytic>
        <title level="a" type="main">Mesh migration following abdominal hernia repair: a comprehensive review</title>
        <author>
            <persName
                xmlns="http://www.tei-c.org/ns/1.0">
                <forename type="first">H</forename>
                <forename type="middle">B</forename>
                <surname>Cunningham</surname>
            </persName>
        </author>
        <author>
            <persName
                xmlns="http://www.tei-c.org/ns/1.0">
                <forename type="first">J</forename>
                <forename type="middle">J</forename>
                <surname>Weis</surname>
            </persName>
        </author>
        <author>
            <persName
                xmlns="http://www.tei-c.org/ns/1.0">
                <forename type="first">L</forename>
                <forename type="middle">R</forename>
                <surname>Taveras</surname>
            </persName>
        </author>
        <author>
            <persName
                xmlns="http://www.tei-c.org/ns/1.0">
                <forename type="first">S</forename>
                <surname>Huerta</surname>
            </persName>
        </author>
        <idno type="DOI">10.1007/s10029-019-01898-9</idno>
        <idno type="PMID">30701369</idno>
    </analytic>
    <monogr>
        <title level="j">Hernia</title>
        <imprint>
            <biblScope unit="volume">23</biblScope>
            <biblScope unit="issue">2</biblScope>
            <biblScope unit="page" from="235" to="243" />
            <date type="published" when="2019-01-30" />
        </imprint>
    </monogr>
</biblStruct>"""

    # Only the first parsed citation is inspected.
    parsed = parse_citations_xml(citation_xml)
    citation = parsed[0]

    assert citation.title == """Mesh migration following abdominal hernia repair: a comprehensive review"""

    # Third author: given name, surname, and the full name with middle initial.
    assert citation.authors[2].given_name == "L"
    assert citation.authors[2].surname == "Taveras"
    assert citation.authors[2].name == "L R Taveras"

    # Identifiers
    assert citation.doi == "10.1007/s10029-019-01898-9"
    assert citation.pmid == "30701369"

    # Publication details
    assert citation.date == "2019-01-30"
    assert citation.pages == "235-243"
    assert citation.volume == "23"
    assert citation.issue == "2"
    assert citation.journal == "Hernia"


def test_citation_list_xml() -> None:
    """Parse the citation-list fixture and check its size and one title.

    Expects ten citations and checks the title of the entry at index 7.
    """

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale default encoding.
    with open('tests/files/example_citation_list.xml', 'r', encoding='utf-8') as f:
        tei_xml = f.read()

    citations = parse_citations_xml(tei_xml)
    assert len(citations) == 10
    assert citations[7].title == "Global Hunger Index: The Challenge of Hidden Hunger"