summaryrefslogtreecommitdiffstats
path: root/tests/test_parse.py
blob: 976d1b162e70f4294d3d82773d36ea1ef7ea739d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
import io
import json
import xml
import xml.etree.ElementTree

import pytest

from grobid_tei_xml import GrobidBiblio, GrobidDocument, parse_citations_xml, parse_document_xml
from grobid_tei_xml.types import *


def test_small_xml() -> None:
    """Parse the small fixture and compare it to a hand-built document.

    The expected ``GrobidDocument`` below spells out every field this test
    expects ``parse_document_xml`` to populate for ``tests/files/small.xml``.
    The check is a single equality comparison on the whole object.
    """

    # Decode explicitly so the test does not depend on the platform's default
    # locale encoding.
    # NOTE(review): assumes the fixture is stored as UTF-8 -- confirm.
    with open('tests/files/small.xml', 'r', encoding='utf-8') as f:
        tei_xml = f.read()

    doc = parse_document_xml(tei_xml)
    expected_body = """Introduction \nEverything starts somewhere, as somebody [1]  once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED."""
    expected = GrobidDocument(
        grobid_version='0.5.1-SNAPSHOT',
        grobid_timestamp='2018-04-02T00:31+0000',
        language_code='en',
        header=GrobidBiblio(
            title="Dummy Example File",
            authors=[
                GrobidAuthor(full_name="Brewster Kahle",
                             given_name="Brewster",
                             surname="Kahle",
                             affiliation=GrobidAffiliation(
                                 department="Faculty ofAgricultrial Engineering",
                                 laboratory="Plant Physiology Laboratory",
                                 institution="Technion-Israel Institute of Technology",
                                 address=GrobidAddress(
                                     post_code="32000",
                                     settlement="Haifa",
                                     country="Israel",
                                 ),
                             )),
                GrobidAuthor(
                    full_name="J Doe",
                    given_name="J",
                    surname="Doe",
                ),
            ],
            book_title="Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
            date="2000",
        ),
        abstract="Everything you ever wanted to know about nothing",
        body=expected_body,
        citations=[
            GrobidBiblio(
                index=0,
                id="b0",
                # the lone initial is expected under middle_name here, not
                # given_name
                authors=[
                    GrobidAuthor(full_name="A Seaperson", middle_name="A", surname="Seaperson")
                ],
                date="2001",
                journal="Letters in the Alphabet",
                title="Everything is Wonderful",
                volume="20",
                pages="1-11",
                first_page="1",
                last_page="11",
            ),
            GrobidBiblio(
                index=1,
                id="b1",
                authors=[],
                date="2011-03-28",
                journal="The Dictionary",
                title="All about Facts",
                volume="14",
                note="author signed copy",
            ),
        ],
    )

    assert doc == expected


def test_small_xml_legacy() -> None:
    """Check ``to_legacy_dict()`` for the small fixture against stored JSON.

    The parsed document is converted with ``to_legacy_dict()`` and must equal
    the contents of ``tests/files/small.json``.
    """

    # Decode both fixtures explicitly so the test does not depend on the
    # platform's default locale encoding.
    # NOTE(review): assumes both fixtures are stored as UTF-8 -- confirm.
    with open('tests/files/small.xml', 'r', encoding='utf-8') as f:
        tei_xml = f.read()
    with open('tests/files/small.json', 'r', encoding='utf-8') as f:
        json_form = json.load(f)

    d = parse_document_xml(tei_xml).to_legacy_dict()

    assert d == json_form


def test_invalid_xml() -> None:
    """Verify the exception types raised for unusable parser input.

    - text that is not XML: ``ParseError`` from both parsers
    - the string ``<xml></xml>``: ``ValueError`` from the document parser
    - a non-string/non-bytes value (an ``int``): ``TypeError``
    """
    parse_error = xml.etree.ElementTree.ParseError
    not_xml = "this is not XML"

    # both entry points are expected to surface the same parse failure
    for parser in (parse_document_xml, parse_citations_xml):
        with pytest.raises(parse_error):
            parser(not_xml)

    with pytest.raises(ValueError):
        parse_document_xml("<xml></xml>")

    with pytest.raises(TypeError):
        parse_document_xml(123)  # type: ignore


def test_bytes() -> None:
    """Smoke-test the document parser with bytes and with a binary stream.

    Nothing is asserted about the result; the test passes as long as neither
    call raises.
    """
    with open('tests/files/small.xml', 'rb') as fixture:
        raw_xml = fixture.read()

    # raw bytes first ...
    parse_document_xml(raw_xml)

    # ... then the same payload wrapped in an in-memory binary file object
    stream = io.BytesIO(raw_xml)
    parse_document_xml(stream)  # type: ignore


def test_elementtree() -> None:
    """Smoke-test the document parser with an already-parsed ``ElementTree``.

    Nothing is asserted about the result; the test passes as long as the
    call does not raise.
    """
    with open('tests/files/small.xml', 'rb') as fixture:
        raw_xml = fixture.read()

    # build the tree with the standard library, then hand it to the parser
    stream = io.BytesIO(raw_xml)
    tree = xml.etree.ElementTree.parse(stream)
    parse_document_xml(tree)  # type: ignore


def test_example_grobid_tei_xml() -> None:
    """Parse a full example document and spot-check the header and one citation.

    Checks the header title, then looks up the citation with id ``b12`` and
    verifies its first author, journal, title, date, pages, volume and raw
    (unstructured) citation string.
    """

    # Decode explicitly so the test does not depend on the platform's default
    # locale encoding.
    # NOTE(review): assumes the fixture is stored as UTF-8 -- confirm.
    with open("tests/files/example_grobid.tei.xml", "r", encoding="utf-8") as f:
        blob = f.read()

    doc = parse_document_xml(blob)

    assert doc.header.title == \
        """Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network"""

    # `doc.citations or []` guards against a missing citation list; the [0]
    # then fails loudly (IndexError) if no citation has id "b12"
    ref = [c for c in doc.citations or [] if c.id == "b12"][0]
    assert ref.authors[0].full_name == "K Tasa"
    assert ref.authors[0].given_name == "K"
    assert ref.authors[0].surname == "Tasa"
    assert ref.journal == "Quality Management in Health Care"
    assert ref.title == "Using patient feedback for quality improvement"
    assert ref.date == "1996"
    assert ref.pages == "206-225"
    assert ref.volume == "8"
    # the raw citation string is compared verbatim, including the split word
    # "qua- lity"
    assert ref.unstructured == \
        """Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19."""


def test_single_citations_xml() -> None:
    """Parse one inline ``<biblStruct>`` and verify the extracted fields.

    The XML is embedded in the test; the first parsed citation is checked
    for title, third author's name parts, identifiers, date, page range,
    volume, issue and journal.
    """
    raw_biblstruct = """
<biblStruct >
    <analytic>
        <title level="a" type="main">Mesh migration following abdominal hernia repair: a comprehensive review</title>
        <author>
            <persName
                xmlns="http://www.tei-c.org/ns/1.0">
                <forename type="first">H</forename>
                <forename type="middle">B</forename>
                <surname>Cunningham</surname>
            </persName>
        </author>
        <author>
            <persName
                xmlns="http://www.tei-c.org/ns/1.0">
                <forename type="first">J</forename>
                <forename type="middle">J</forename>
                <surname>Weis</surname>
            </persName>
        </author>
        <author>
            <persName
                xmlns="http://www.tei-c.org/ns/1.0">
                <forename type="first">L</forename>
                <forename type="middle">R</forename>
                <surname>Taveras</surname>
            </persName>
        </author>
        <author>
            <persName
                xmlns="http://www.tei-c.org/ns/1.0">
                <forename type="first">S</forename>
                <surname>Huerta</surname>
            </persName>
        </author>
        <idno type="DOI">10.1007/s10029-019-01898-9</idno>
        <idno type="PMID">30701369</idno>
    </analytic>
    <monogr>
        <title level="j">Hernia</title>
        <imprint>
            <biblScope unit="volume">23</biblScope>
            <biblScope unit="issue">2</biblScope>
            <biblScope unit="page" from="235" to="243" />
            <date type="published" when="2019-01-30" />
        </imprint>
    </monogr>
</biblStruct>"""

    parsed = parse_citations_xml(raw_biblstruct)
    citation = parsed[0]

    assert citation.title == "Mesh migration following abdominal hernia repair: a comprehensive review"

    # third <author> element: first forename "L", middle forename "R"
    assert citation.authors[2].given_name == "L"
    assert citation.authors[2].middle_name == "R"
    assert citation.authors[2].surname == "Taveras"
    assert citation.authors[2].full_name == "L R Taveras"

    # identifiers and publication date
    assert citation.doi == "10.1007/s10029-019-01898-9"
    assert citation.pmid == "30701369"
    assert citation.date == "2019-01-30"

    # page range is exposed both combined and as separate endpoints
    assert citation.pages == "235-243"
    assert citation.first_page == "235"
    assert citation.last_page == "243"

    # container information
    assert citation.volume == "23"
    assert citation.issue == "2"
    assert citation.journal == "Hernia"


def test_citation_list_xml() -> None:
    """Parse a list of citations and spot-check fields on several entries.

    The fixture is expected to yield 13 citations; individual entries are
    checked for notes, author/editor surnames, titles, identifiers, series
    title, publisher, book title and institution.
    """

    # The fixture holds non-ASCII text (see the "Uhlířová" assertion below),
    # so decode it explicitly instead of relying on the platform's default
    # locale encoding.
    # NOTE(review): assumes the fixture is stored as UTF-8 -- confirm.
    with open('tests/files/example_citation_list.xml', 'r', encoding='utf-8') as f:
        tei_xml = f.read()

    citations = parse_citations_xml(tei_xml)
    assert len(citations) == 13

    assert citations[3].note == "The Research Handbook on International Environmental Law"
    assert citations[3].authors[0].surname == "Uhlířová"
    assert citations[3].authors[1].surname == "Drumbl"
    assert citations[3].editors
    assert citations[3].editors[0].surname == "Fitzmaurice"
    # TODO: multiple persName under a single <editor> (https://github.com/kermitt2/grobid/issues/845)
    # assert citations[3].editors[1].surname == "Brus"

    assert citations[4].authors[0].surname == "Sleytr"
    assert citations[4].authors[0].middle_name == "B"

    assert citations[7].title == "Global Hunger Index: The Challenge of Hidden Hunger"

    assert citations[10].doi == "10.1093/eurheartj/ehi890"
    assert citations[10].url is None

    assert citations[11].title == "Devices, Measurements and Properties"
    assert citations[11].series_title == "Handbook of Optics"
    assert citations[11].publisher == "McGRAW-HILL"

    # bind the last entry once so the long expected strings are not wrapped
    # in the middle of a subscript
    last_citation = citations[12]
    assert last_citation.title == "Implications of abandoned shoreline features above Glacial Lake Duluth levels along the north shore of the Superior Basin in the vicinity of the Brule River"
    assert last_citation.book_title == "Paper presented at the 13th Biennial Meeting of the American Quaternary Association"
    assert last_citation.institution == "University of Minnesota"


def test_grobid_070_document() -> None:
    """Parse a more recent GROBID v0.7.0 document and spot-check the output.

    Checks the document-level GROBID metadata and PDF MD5, then selected
    fields of citations 6, 3, 18 and 29, and finally calls the dict
    conversion methods on every citation to make sure they run.
    """
    # more recent GROBID v0.7.0 output

    # Decode explicitly so the test does not depend on the platform's default
    # locale encoding.
    # NOTE(review): assumes the fixture is stored as UTF-8 -- confirm.
    with open('tests/files/example_grobid_plos.tei.xml', 'r', encoding='utf-8') as f:
        tei_xml = f.read()

    doc = parse_document_xml(tei_xml)
    assert doc.grobid_timestamp == "2021-10-23T03:05+0000"
    assert doc.grobid_version == "0.7.0-SNAPSHOT"
    assert doc.pdf_md5 == "4F10689DEB84756CE82C8015951A22E5"

    # a truthy citation list is required before indexing into it below
    assert doc.citations
    cite_b6 = doc.citations[6]
    assert cite_b6.id == "b6"
    assert cite_b6.journal == "OR. Hydrobiol"
    # note that this was not parsed well by GROBID
    assert cite_b6.institution == "Crater Lake National Park"
    assert cite_b6.date == "2007"
    assert cite_b6.volume == "574"
    assert cite_b6.issue == "1"

    cite_b3 = doc.citations[3]
    assert cite_b3.url == "http://unesdoc.unesco.org/ulis/"
    assert cite_b3.title == "Requirements for Global Implementation of the Strategic Plan for Coastal GOOS"
    assert cite_b3.authors
    assert cite_b3.authors[0].surname == "Ioc-Unesco"
    assert cite_b3.date == "2012"

    cite_b18 = doc.citations[18]
    assert cite_b18.note == "TriOS GmbH [Internet"
    assert cite_b18.date == "2017-01-05"

    cite_b29 = doc.citations[29]
    assert cite_b29.note == "PhD dissertation"

    # run these methods over some more examples; the return values are
    # discarded, so this only checks that the conversions do not raise
    for c in doc.citations:
        c.to_csl_dict()
        c.to_dict()
        c.to_legacy_dict()