aboutsummaryrefslogtreecommitdiffstats
path: root/tests/test_parse.py
blob: 9d8f4ff954cb04eacbcdef352848886da628e801 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
import io
import json
import xml
import xml.etree.ElementTree

import pytest

from grobid_tei_xml import (
    GrobidBiblio,
    GrobidDocument,
    parse_citation_list_xml,
    parse_citation_xml,
    parse_citations_xml,
    parse_document_xml,
)
from grobid_tei_xml.types import *


def test_small_xml() -> None:
    """Parse the small fixture and compare it to a fully spelled-out document.

    The result of ``parse_document_xml`` on ``tests/files/small.xml`` must
    equal a hand-built ``GrobidDocument`` (header, abstract, body and two
    citations).
    """
    # Explicit encoding: without it ``open`` falls back to the locale's
    # preferred encoding, which is not UTF-8 on every platform.
    with open("tests/files/small.xml", "r", encoding="utf-8") as f:
        tei_xml = f.read()

    doc = parse_document_xml(tei_xml)
    expected_body = """Introduction \nEverything starts somewhere, as somebody [1]  once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED."""
    expected = GrobidDocument(
        grobid_version="0.5.1-SNAPSHOT",
        grobid_timestamp="2018-04-02T00:31+0000",
        language_code="en",
        header=GrobidBiblio(
            title="Dummy Example File",
            authors=[
                GrobidAuthor(
                    full_name="Brewster Kahle",
                    given_name="Brewster",
                    surname="Kahle",
                    affiliation=GrobidAffiliation(
                        department="Faculty ofAgricultrial Engineering",
                        laboratory="Plant Physiology Laboratory",
                        institution="Technion-Israel Institute of Technology",
                        address=GrobidAddress(
                            post_code="32000",
                            settlement="Haifa",
                            country="Israel",
                        ),
                    ),
                ),
                GrobidAuthor(
                    full_name="J Doe",
                    given_name="J",
                    surname="Doe",
                ),
            ],
            book_title="Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
            date="2000",
        ),
        abstract="Everything you ever wanted to know about nothing",
        body=expected_body,
        citations=[
            GrobidBiblio(
                index=0,
                id="b0",
                authors=[
                    GrobidAuthor(full_name="A Seaperson", middle_name="A", surname="Seaperson")
                ],
                date="2001",
                journal="Letters in the Alphabet",
                title="Everything is Wonderful",
                volume="20",
                pages="1-11",
                first_page="1",
                last_page="11",
            ),
            GrobidBiblio(
                index=1,
                id="b1",
                authors=[],
                date="2011-03-28",
                journal="The Dictionary",
                title="All about Facts",
                volume="14",
                note="author signed copy",
            ),
        ],
    )

    assert doc == expected


def test_small_xml_legacy() -> None:
    """Check ``to_legacy_dict()`` output against the stored JSON fixture."""
    # Both fixtures are opened with an explicit encoding so the test does not
    # depend on the locale's default text encoding.
    with open("tests/files/small.xml", "r", encoding="utf-8") as f:
        tei_xml = f.read()
    with open("tests/files/small.json", "r", encoding="utf-8") as f:
        json_form = json.load(f)

    d = parse_document_xml(tei_xml).to_legacy_dict()

    assert d == json_form


def test_invalid_xml() -> None:
    """Verify the exception type raised for each kind of bad input.

    Non-XML text must raise ``ParseError`` from both the document and the
    citations parser; an XML document with an unexpected root must raise
    ``ValueError``; a non-string, non-bytes argument must raise ``TypeError``.
    """
    parse_error = xml.etree.ElementTree.ParseError
    garbage = "this is not XML"

    with pytest.raises(parse_error):
        parse_document_xml(garbage)

    with pytest.raises(parse_error):
        parse_citations_xml(garbage)

    with pytest.raises(ValueError):
        parse_document_xml("<xml></xml>")

    with pytest.raises(TypeError):
        parse_document_xml(123)  # type: ignore


def test_bytes() -> None:
    """Smoke test: the document parser accepts bytes and a binary stream.

    Nothing is asserted about the result; the test passes when neither call
    raises.
    """
    with open("tests/files/small.xml", "rb") as fixture:
        raw = fixture.read()

    # raw bytes first, then the same payload wrapped in a file-like object
    parse_document_xml(raw)
    stream = io.BytesIO(raw)
    parse_document_xml(stream)  # type: ignore


def test_elementtree() -> None:
    """Smoke test: the document parser accepts an already-parsed ElementTree.

    Nothing is asserted about the result; the test passes when the call does
    not raise.
    """
    with open("tests/files/small.xml", "rb") as fixture:
        raw = fixture.read()

    tree = xml.etree.ElementTree.parse(io.BytesIO(raw))
    parse_document_xml(tree)  # type: ignore


def test_example_grobid_tei_xml() -> None:
    """Parse a full example document and spot-check the header and one citation.

    Checks the header title, then looks up the citation with id ``b12`` and
    verifies its first author, journal, title, date, pages, volume and raw
    (unstructured) text.
    """
    # Explicit encoding keeps the test independent of the locale default.
    with open("tests/files/document/example.tei.xml", "r", encoding="utf-8") as f:
        blob = f.read()

    doc = parse_document_xml(blob)

    assert (
        doc.header.title
        == """Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network"""
    )

    # Fail with a readable message, rather than a bare IndexError, when the
    # citation is missing from the parsed output.
    matches = [c for c in doc.citations or [] if c.id == "b12"]
    assert matches, "citation b12 not found in parsed document"
    ref = matches[0]
    assert ref.authors[0].full_name == "K Tasa"
    assert ref.authors[0].given_name == "K"
    assert ref.authors[0].surname == "Tasa"
    assert ref.journal == "Quality Management in Health Care"
    assert ref.title == "Using patient feedback for quality improvement"
    assert ref.date == "1996"
    assert ref.pages == "206-225"
    assert ref.volume == "8"
    assert (
        ref.unstructured
        == """Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19."""
    )


def test_single_citations_xml() -> None:
    """Parse one inline ``<biblStruct>`` and verify each extracted field.

    The same XML is then passed to ``parse_citations_xml``; its first entry
    must agree with the single-citation result on title and authors.
    """
    citation_xml = """
<biblStruct >
    <analytic>
        <title level="a" type="main">Mesh migration following abdominal hernia repair: a comprehensive review</title>
        <author>
            <persName
                xmlns="http://www.tei-c.org/ns/1.0">
                <forename type="first">H</forename>
                <forename type="middle">B</forename>
                <surname>Cunningham</surname>
            </persName>
        </author>
        <author>
            <persName
                xmlns="http://www.tei-c.org/ns/1.0">
                <forename type="first">J</forename>
                <forename type="middle">J</forename>
                <surname>Weis</surname>
            </persName>
        </author>
        <author>
            <persName
                xmlns="http://www.tei-c.org/ns/1.0">
                <forename type="first">L</forename>
                <forename type="middle">R</forename>
                <surname>Taveras</surname>
            </persName>
        </author>
        <author>
            <persName
                xmlns="http://www.tei-c.org/ns/1.0">
                <forename type="first">S</forename>
                <surname>Huerta</surname>
            </persName>
        </author>
        <idno type="DOI">10.1007/s10029-019-01898-9</idno>
        <idno type="PMID">30701369</idno>
    </analytic>
    <monogr>
        <title level="j">Hernia</title>
        <imprint>
            <biblScope unit="volume">23</biblScope>
            <biblScope unit="issue">2</biblScope>
            <biblScope unit="page" from="235" to="243" />
            <date type="published" when="2019-01-30" />
        </imprint>
    </monogr>
</biblStruct>"""

    single = parse_citation_xml(citation_xml)
    assert single

    expected_title = (
        """Mesh migration following abdominal hernia repair: a comprehensive review"""
    )
    assert single.title == expected_title

    # the third author carries both a first and a middle forename
    third_author = single.authors[2]
    assert third_author.given_name == "L"
    assert third_author.middle_name == "R"
    assert third_author.surname == "Taveras"
    assert third_author.full_name == "L R Taveras"

    # identifiers
    assert single.doi == "10.1007/s10029-019-01898-9"
    assert single.pmid == "30701369"

    # imprint details
    assert single.date == "2019-01-30"
    assert single.pages == "235-243"
    assert single.first_page == "235"
    assert single.last_page == "243"
    assert single.volume == "23"
    assert single.issue == "2"
    assert single.journal == "Hernia"

    first_of_list = parse_citations_xml(citation_xml)[0]
    assert single.title == first_of_list.title
    assert single.authors == first_of_list.authors


def test_citation_list_xml() -> None:
    """Parse a citation-list fixture and spot-check several entries.

    Also checks that the older ``parse_citations_xml`` entry point returns
    the same result as ``parse_citation_list_xml``.
    """
    # The fixture contains non-ASCII names (see the "Uhlířová" assertion
    # below), so decode it explicitly as UTF-8 instead of relying on the
    # locale's default encoding.
    with open("tests/files/citation_list/example.tei.xml", "r", encoding="utf-8") as f:
        tei_xml = f.read()

    citations = parse_citation_list_xml(tei_xml)

    # verify that old function still works
    assert citations == parse_citations_xml(tei_xml)

    assert len(citations) == 13

    assert citations[3].note == "The Research Handbook on International Environmental Law"
    assert citations[3].authors[0].surname == "Uhlířová"
    assert citations[3].authors[1].surname == "Drumbl"
    assert citations[3].editors
    assert citations[3].editors[0].surname == "Fitzmaurice"
    # TODO: multiple persName under a single <editor> (https://github.com/kermitt2/grobid/issues/845)
    # assert citations[3].editors[1].surname == "Brus"

    assert citations[4].authors[0].surname == "Sleytr"
    assert citations[4].authors[0].middle_name == "B"

    assert citations[7].title == "Global Hunger Index: The Challenge of Hidden Hunger"

    assert citations[10].doi == "10.1093/eurheartj/ehi890"
    assert citations[10].url is None

    assert citations[11].title == "Devices, Measurements and Properties"
    assert citations[11].series_title == "Handbook of Optics"
    assert citations[11].publisher == "McGRAW-HILL"

    assert (
        citations[12].title
        == "Implications of abandoned shoreline features above Glacial Lake Duluth levels along the north shore of the Superior Basin in the vicinity of the Brule River"
    )
    assert (
        citations[12].book_title
        == "Paper presented at the 13th Biennial Meeting of the American Quaternary Association"
    )
    assert citations[12].institution == "University of Minnesota"


def test_grobid_070_document() -> None:
    """Parse a GROBID v0.7.0 document and spot-check metadata and citations.

    After the field checks, every citation is run through ``to_csl_dict``,
    ``to_dict`` and ``to_legacy_dict`` to make sure none of them raises.
    """
    # more recent GROBID v0.7.0 output

    # Explicit encoding keeps the test independent of the locale default.
    with open("tests/files/document/plos.tei.xml", "r", encoding="utf-8") as f:
        tei_xml = f.read()

    doc = parse_document_xml(tei_xml)
    assert doc.grobid_timestamp == "2021-10-23T03:05+0000"
    assert doc.grobid_version == "0.7.0-SNAPSHOT"
    assert doc.pdf_md5 == "4F10689DEB84756CE82C8015951A22E5"

    assert doc.citations
    cite_b6 = doc.citations[6]
    assert cite_b6.id == "b6"
    assert cite_b6.journal == "OR. Hydrobiol"
    # note that this was not parsed well by GROBID
    assert cite_b6.institution == "Crater Lake National Park"
    assert cite_b6.date == "2007"
    assert cite_b6.volume == "574"
    assert cite_b6.issue == "1"

    cite_b3 = doc.citations[3]
    assert cite_b3.url == "http://unesdoc.unesco.org/ulis/"
    assert (
        cite_b3.title
        == "Requirements for Global Implementation of the Strategic Plan for Coastal GOOS"
    )
    assert cite_b3.authors
    assert cite_b3.authors[0].surname == "Ioc-Unesco"
    assert cite_b3.date == "2012"

    cite_b18 = doc.citations[18]
    assert cite_b18.note == "TriOS GmbH [Internet"
    assert cite_b18.date == "2017-01-05"

    cite_b29 = doc.citations[29]
    assert cite_b29.note == "PhD dissertation"

    # run these methods over some more examples
    for c in doc.citations:
        c.to_csl_dict()
        c.to_dict()
        c.to_legacy_dict()


def test_empty_citations() -> None:
    """Check how empty and near-empty citations are handled.

    ``parse_citation_xml`` must return ``None`` for both fixtures, while
    ``parse_citation_list_xml`` must still yield an entry at index 0 (with
    ``unstructured`` set only for the fixture that carries raw text).
    """
    # Explicit encoding keeps the test independent of the locale default.
    with open("tests/files/citation/empty_unstructured.tei.xml", "r", encoding="utf-8") as f:
        mostly_empty_xml = f.read()

    with open("tests/files/citation/empty.tei.xml", "r", encoding="utf-8") as f:
        empty_xml = f.read()

    assert parse_citation_xml(empty_xml) is None
    assert parse_citation_xml(mostly_empty_xml) is None

    d = parse_citation_list_xml(empty_xml)
    assert d
    assert d[0].index == 0
    assert d[0].unstructured is None

    d2 = parse_citation_list_xml(mostly_empty_xml)
    assert d2
    assert d2[0].index == 0
    assert d2[0].unstructured == "blah"


def test_citation_emdash() -> None:
    """Check that a non-ASCII dash survives parsing from both bytes and str.

    The same fixture is read once in binary mode and once in text mode; both
    forms must produce the same ``unstructured`` text and page fields.
    """
    with open("tests/files/citation/emdash.tei.xml", "rb") as f:
        tei_xml_bytes = f.read()
    # The bytes assertion below shows the fixture is UTF-8 encoded, so decode
    # the text-mode read explicitly instead of using the locale default.
    with open("tests/files/citation/emdash.tei.xml", "r", encoding="utf-8") as f2:
        tei_xml_str = f2.read()

    # the dash is U+2013 (EN DASH), despite "emdash" in the fixture name
    unstructured = "Goodman, N. (1972). A world of individuals. In Problems and projects (pp. 155–172). Bobs-Merrill company."
    assert unstructured[70:81] == "pp. 155\u2013172"
    assert "pp. 155\u2013172".encode("utf-8") in tei_xml_bytes
    assert "pp. 155\u2013172" in tei_xml_str

    ref_bytes = parse_citation_xml(tei_xml_bytes)
    assert ref_bytes
    assert ref_bytes.unstructured == unstructured
    assert ref_bytes.first_page == "155"
    assert ref_bytes.pages == "155-172"

    ref_str = parse_citation_xml(tei_xml_str)
    assert ref_str
    assert ref_str.unstructured == unstructured
    assert ref_str.first_page == "155"
    assert ref_str.pages == "155-172"


def test_citation_list_utf8() -> None:
    """Check non-ASCII handling in the citation-list parser for bytes and str.

    The first citation parsed from either input form must carry the same
    ``unstructured`` text (including the U+2013 dash) and page fields.
    """
    with open("tests/files/citation_list/emdash.tei.xml", "rb") as f:
        tei_xml_bytes = f.read()
    # Decode explicitly as UTF-8; the locale default is not UTF-8 on every
    # platform, and the expected text below contains a non-ASCII character.
    with open("tests/files/citation_list/emdash.tei.xml", "r", encoding="utf-8") as f2:
        tei_xml_str = f2.read()

    unstructured = "Goodman, N. (1972). A world of individuals. In Problems and projects (pp. 155–172). Bobs-Merrill company."
    assert unstructured[70:81] == "pp. 155\u2013172"

    ref_bytes = parse_citation_list_xml(tei_xml_bytes)[0]
    assert ref_bytes
    assert ref_bytes.unstructured == unstructured
    assert ref_bytes.first_page == "155"
    assert ref_bytes.pages == "155-172"

    ref_str = parse_citation_list_xml(tei_xml_str)[0]
    assert ref_str
    assert ref_str.unstructured == unstructured
    assert ref_str.first_page == "155"
    assert ref_str.pages == "155-172"


def test_citation_multiple_editors() -> None:
    """Check a citation with three editors alongside a single author.

    The fixture name indicates a single ``<editor>`` holding several
    ``persName`` entries; the parser must return three separate editors.
    """
    # The expected book title below contains U+2019, so decode the fixture
    # explicitly as UTF-8 instead of relying on the locale default.
    with open(
        "tests/files/citation/single_editor_multiple_persname.grobid070.tei.xml",
        "r",
        encoding="utf-8",
    ) as f:
        tei_xml = f.read()

    ref = parse_citation_xml(tei_xml)
    assert ref
    assert ref.title == "Uterine cancer"
    assert len(ref.authors) == 1
    assert ref.authors[0].full_name == "J R Lurain"
    assert ref.authors[0].middle_name == "R"
    assert ref.authors[0].surname == "Lurain"
    assert ref.editors
    assert len(ref.editors) == 3
    assert ref.editors[0].full_name == "J S Berek"
    assert ref.editors[1].full_name == "E Y Adashi"
    assert ref.editors[2].full_name == "P A Hillard"
    assert ref.book_title == "Novak’s gynecology"
    assert ref.publisher == "Williams and Wilkins"
    assert ref.date == "1996"
    assert ref.note == "12th ed. Baltimore"


def test_author_email() -> None:
    """Check author email and affiliation extraction from a document header.

    Also verifies document-level metadata and that the body, acknowledgement
    and annex are ``None`` and the citation list is empty for this fixture.
    """
    # Explicit encoding keeps the test independent of the locale default.
    with open("tests/files/document/author_email.tei.xml", "r", encoding="utf-8") as f:
        tei_xml = f.read()

    doc = parse_document_xml(tei_xml)
    biblio = doc.header
    assert biblio
    assert biblio.title == "Task-Based Intelligent Retrieval and Recommendation"
    assert biblio.authors
    assert biblio.authors[0].given_name == "Chirag"
    assert biblio.authors[0].surname == "Shah"
    assert biblio.authors[0].email == "redacted@example.com"
    assert biblio.authors[0].affiliation
    assert biblio.authors[0].affiliation.institution == "University of Washington"
    assert biblio.authors[0].affiliation.address
    assert biblio.authors[0].affiliation.address.settlement == "Seattle"
    assert biblio.authors[0].affiliation.address.country == "USA"

    assert doc.pdf_md5 == "6C18173427FE3FAD756BB2F4F7665855"
    assert doc.grobid_version == "0.7.1-SNAPSHOT"
    assert doc.grobid_timestamp == "2021-11-02T09:03+0000"
    assert doc.language_code == "en"
    assert doc.abstract
    # only the first 50 characters of the abstract are compared
    assert doc.abstract[:50] == "While the act of looking for information happens within a"[:50]
    assert doc.citations == []
    assert doc.body is None
    assert doc.acknowledgement is None
    assert doc.annex is None