summaryrefslogtreecommitdiffstats
path: root/tests/test_grobid2json.py
blob: ed5d996978a2a99406b83199f5df40d1b4481422 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import xml
import json
import pytest

from grobid_tei_xml import teixml2json, parse_document_xml, GrobidDocument, GrobidCitation
from grobid_tei_xml.types import *


def test_teixml2json_small_xml():
    """Compare both conversion paths for ``small.xml`` against ``small.json``.

    The dict returned by ``teixml2json`` and the dict produced by
    ``parse_document_xml(...).to_dict()`` must each equal the JSON fixture.
    """
    with open('tests/files/small.json', 'r') as json_file:
        expected = json.load(json_file)
    with open('tests/files/small.xml', 'r') as xml_file:
        xml_text = xml_file.read()

    legacy_dict = teixml2json(xml_text)
    assert legacy_dict == expected

    document_dict = parse_document_xml(xml_text).to_dict()
    assert document_dict == expected

def test_small_xml():
    """Parse ``small.xml`` and compare against a hand-built ``GrobidDocument``.

    This test previously shared its name with the ``teixml2json`` test defined
    above it in this module; the later definition replaced the earlier one at
    import time, so only one of the two was ever collected by pytest. It has a
    distinct name so that both tests run.
    """
    with open('tests/files/small.xml', 'r') as f:
        tei_xml = f.read()

    doc = parse_document_xml(tei_xml)
    # String values below mirror the fixture file verbatim (including its
    # spelling), so they must not be "corrected" here.
    expected = GrobidDocument(
        grobid_version='0.5.1-SNAPSHOT',
        grobid_timestamp='2018-04-02T00:31+0000',
        language_code='en',
        header=GrobidHeader(
            title="Dummy Example File",
            authors=[
                GrobidAuthor(
                    name="Brewster Kahle",
                    given_name="Brewster",
                    surname="Kahle",
                    affiliation=GrobidAffiliation(
                        department="Faculty ofAgricultrial Engineering",
                        laboratory="Plant Physiology Laboratory",
                        institution="Technion-Israel Institute of Technology",
                        address=GrobidAddress(
                            post_code="32000",
                            settlement="Haifa",
                            country="Israel",
                        ),
                    )
                ),
                GrobidAuthor(
                    name="J Doe",
                    given_name="J",
                    surname="Doe",
                ),
            ],
            journal=GrobidJournal(
                name="Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
            ),
            date="2000",
        ),
        abstract="Everything you ever wanted to know about nothing",
        body="Introduction \nEverything starts somewhere, as somebody [1]  once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.",
        citations=[
            GrobidCitation(
                index=0,
                id="b0",
                authors=[
                    GrobidAuthor(
                        name="A Seaperson",
                        given_name="A",
                        surname="Seaperson"
                    )
                ],
                date="2001",
                journal="Letters in the Alphabet",
                title="Everything is Wonderful",
                volume="20",
                pages="1-11",
            ),
            GrobidCitation(
                index=1,
                id="b1",
                authors=[],
                date="2011-03-28",
                journal="The Dictionary",
                title="All about Facts",
                volume="14",
            ),
        ],
    )

    assert doc == expected


def test_invalid_xml():
    """Check that ``teixml2json`` raises on input it cannot convert.

    Text that is not XML must raise ``xml.etree.ElementTree.ParseError``;
    the well-formed document ``<xml></xml>`` must raise ``ValueError``.
    """
    # Import the submodule explicitly: a bare ``import xml`` does not load
    # ``xml.etree.ElementTree``, so the attribute lookup below would otherwise
    # succeed only if some other module had already imported it.
    import xml.etree.ElementTree

    with pytest.raises(xml.etree.ElementTree.ParseError):
        teixml2json("this is not XML")
    with pytest.raises(ValueError):
        teixml2json("<xml></xml>")


def test_grobid_teixml2json() -> None:
    """Spot-check ``teixml2json`` output for the example GROBID TEI-XML file.

    Verifies the document title and the fields of the citation whose id is
    ``"b12"``.
    """
    with open("tests/files/example_grobid.tei.xml", "r") as tei_file:
        tei_text = tei_file.read()

    # NOTE(review): the meaning of the second positional argument is not
    # visible from this file -- confirm against teixml2json's signature.
    result = teixml2json(tei_text, True)

    expected_title = "Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network"
    assert result["title"] == expected_title

    matching = [cite for cite in result["citations"] if cite["id"] == "b12"]
    citation = matching[0]

    first_author = citation["authors"][0]
    assert first_author == {
        "given_name": "K",
        "name": "K Tasa",
        "surname": "Tasa"
    }

    # Scalar citation fields, compared in the same order as listed here.
    expected_fields = (
        ("journal", "Quality Management in Health Care"),
        ("title", "Using patient feedback for quality improvement"),
        ("date", "1996"),
        ("pages", "206-225"),
        ("volume", "8"),
        (
            "unstructured",
            "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19.",
        ),
    )
    for field, wanted in expected_fields:
        assert citation[field] == wanted