import xml
import json
import pytest
from grobid_tei_xml import teixml2json, parse_document_xml, parse_citations_xml, GrobidDocument, GrobidCitation
from grobid_tei_xml.types import *
def test_small_xml():
    """Parse ``tests/files/small.xml`` and compare it to a hand-built document.

    The expected ``GrobidDocument`` is assembled piece by piece (address,
    affiliation, authors, journal, header, citations) and then compared to
    the parser output with ``==``.
    """
    with open('tests/files/small.xml', 'r') as xml_file:
        raw_tei = xml_file.read()
    doc = parse_document_xml(raw_tei)

    # First author: the only one with an affiliation and nested address.
    kahle_address = GrobidAddress(
        post_code="32000",
        settlement="Haifa",
        country="Israel",
    )
    kahle_affiliation = GrobidAffiliation(
        department="Faculty ofAgricultrial Engineering",
        laboratory="Plant Physiology Laboratory",
        institution="Technion-Israel Institute of Technology",
        address=kahle_address,
    )
    kahle = GrobidAuthor(
        name="Brewster Kahle",
        given_name="Brewster",
        surname="Kahle",
        affiliation=kahle_affiliation,
    )
    doe = GrobidAuthor(
        name="J Doe",
        given_name="J",
        surname="Doe",
    )

    journal = GrobidJournal(
        name="Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
    )
    header = GrobidHeader(
        title="Dummy Example File",
        authors=[kahle, doe],
        journal=journal,
        date="2000",
    )

    # Two bibliography entries; the second one has no authors.
    seaperson = GrobidAuthor(name="A Seaperson", given_name="A", surname="Seaperson")
    first_citation = GrobidCitation(
        index=0,
        id="b0",
        authors=[seaperson],
        date="2001",
        journal="Letters in the Alphabet",
        title="Everything is Wonderful",
        volume="20",
        pages="1-11",
    )
    second_citation = GrobidCitation(
        index=1,
        id="b1",
        authors=[],
        date="2011-03-28",
        journal="The Dictionary",
        title="All about Facts",
        volume="14",
    )

    expected = GrobidDocument(
        grobid_version='0.5.1-SNAPSHOT',
        grobid_timestamp='2018-04-02T00:31+0000',
        language_code='en',
        header=header,
        abstract="Everything you ever wanted to know about nothing",
        body="Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.",
        citations=[first_citation, second_citation],
    )

    assert doc == expected
def test_small_xml_json():
    """Check the parsed document's dict form against the legacy JSON fixture.

    The result of ``to_dict()`` is reshaped to the older layout stored in
    ``tests/files/small.json``: the ``header`` entries are merged into the
    top level, and the first author's address key ``post_code`` is renamed
    to ``postCode``. Citation fields whose value is null are dropped from
    the fixture before the comparison.
    """
    with open('tests/files/small.xml', 'r') as f:
        tei_xml = f.read()
    with open('tests/files/small.json', 'r') as f:
        json_form = json.load(f)

    d = parse_document_xml(tei_xml).to_dict()

    # munge back to the old JSON format
    d.update(d.pop('header'))
    addr = d['authors'][0]['affiliation']['address']
    addr['postCode'] = addr.pop('post_code')

    # remove nulls from old JSON (identity check: only real ``None`` values
    # are dropped, never values that merely compare equal to it)
    json_form['citations'] = [
        {k: v for k, v in c.items() if v is not None}
        for c in json_form['citations']
    ]

    assert d == json_form
def test_invalid_xml():
    """Check the exceptions raised for inputs that are not XML.

    Non-XML text must raise ``xml.etree.ElementTree.ParseError`` from both
    ``parse_document_xml`` and ``parse_citations_xml``; an empty string
    passed to ``parse_document_xml`` must raise ``ValueError``.
    """
    # Import the submodule explicitly. A bare ``import xml`` does not load
    # ``xml.etree.ElementTree``; the dotted lookup only succeeds when some
    # other module happens to have imported the submodule already.
    from xml.etree.ElementTree import ParseError

    with pytest.raises(ParseError):
        parse_document_xml("this is not XML")
    with pytest.raises(ParseError):
        parse_citations_xml("this is not XML")
    with pytest.raises(ValueError):
        parse_document_xml("")
def test_example_grobid_tei_xml() -> None:
    """Parse ``tests/files/example_grobid.tei.xml`` and spot-check the result.

    Verifies the header title, then picks the citation whose id is ``b12``
    and checks its first author, journal, title, date, pages, volume and
    unstructured text.
    """
    with open("tests/files/example_grobid.tei.xml", "r") as tei_file:
        tei_text = tei_file.read()
    doc = parse_document_xml(tei_text)

    expected_title = "Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network"
    assert doc.header.title == expected_title

    # Select the reference by id rather than by position in the list.
    matching = [cite for cite in doc.citations if cite.id == "b12"]
    ref = matching[0]

    first_author = ref.authors[0]
    assert first_author.name == "K Tasa"
    assert first_author.given_name == "K"
    assert first_author.surname == "Tasa"

    assert ref.journal == "Quality Management in Health Care"
    assert ref.title == "Using patient feedback for quality improvement"
    assert ref.date == "1996"
    assert ref.pages == "206-225"
    assert ref.volume == "8"

    expected_unstructured = "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19."
    assert ref.unstructured == expected_unstructured
def test_single_citations_xml():
    """Parse an inline citation fixture and check the first parsed citation.

    Checks the title, the third author's name parts, DOI, PMID, date, pages,
    volume, issue and journal of the first item returned by
    ``parse_citations_xml``.
    """
    # NOTE(review): as seen here the fixture contains no XML tags, and the
    # asserted date ("2019-01-30") and pages ("235-243") do not occur in it.
    # Presumably the markup was lost from this literal -- confirm against the
    # upstream fixture before relying on this test.
    citation_xml = """
Mesh migration following abdominal hernia repair: a comprehensive review
H
B
Cunningham
J
J
Weis
L
R
Taveras
S
Huerta
10.1007/s10029-019-01898-9
30701369
Hernia
23
2
"""
    parsed = parse_citations_xml(citation_xml)
    d = parsed[0]

    assert d.title == "Mesh migration following abdominal hernia repair: a comprehensive review"

    # Third author has both a first and a middle initial.
    third_author = d.authors[2]
    assert third_author.given_name == "L"
    assert third_author.surname == "Taveras"
    assert third_author.name == "L R Taveras"

    assert d.doi == "10.1007/s10029-019-01898-9"
    assert d.pmid == "30701369"
    assert d.date == "2019-01-30"
    assert d.pages == "235-243"
    assert d.volume == "23"
    assert d.issue == "2"
    assert d.journal == "Hernia"
def test_citation_list_xml():
    """Parse ``tests/files/example_citation_list.xml`` as a citation list.

    Expects ten citations and checks the title of the one at index 7.
    """
    with open('tests/files/example_citation_list.xml', 'r') as xml_file:
        raw_xml = xml_file.read()

    citations = parse_citations_xml(raw_xml)

    assert len(citations) == 10
    eighth = citations[7]
    assert eighth.title == "Global Hunger Index: The Challenge of Hidden Hunger"