import io
import json
import xml
import xml.etree.ElementTree
import pytest
from grobid_tei_xml import (
GrobidBiblio,
GrobidDocument,
parse_citation_list_xml,
parse_citation_xml,
parse_citations_xml,
parse_document_xml,
)
from grobid_tei_xml.types import *
def test_small_xml() -> None:
    """Parse the small fixture and compare it to a hand-built expected document.

    The expected ``GrobidDocument`` spells out the header, abstract, body text
    and both citations, and a single equality assertion compares it with the
    parsed result.
    """
    # Decode explicitly as UTF-8; without ``encoding`` the decoding would
    # depend on the locale of the machine running the tests.
    with open("tests/files/small.xml", "r", encoding="utf-8") as f:
        tei_xml = f.read()

    doc = parse_document_xml(tei_xml)

    expected_body = """Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED."""
    expected = GrobidDocument(
        grobid_version="0.5.1-SNAPSHOT",
        grobid_timestamp="2018-04-02T00:31+0000",
        language_code="en",
        header=GrobidBiblio(
            title="Dummy Example File",
            authors=[
                GrobidAuthor(
                    full_name="Brewster Kahle",
                    given_name="Brewster",
                    surname="Kahle",
                    affiliation=GrobidAffiliation(
                        department="Faculty ofAgricultrial Engineering",
                        laboratory="Plant Physiology Laboratory",
                        institution="Technion-Israel Institute of Technology",
                        address=GrobidAddress(
                            post_code="32000",
                            settlement="Haifa",
                            country="Israel",
                        ),
                    ),
                ),
                GrobidAuthor(
                    full_name="J Doe",
                    given_name="J",
                    surname="Doe",
                ),
            ],
            book_title="Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
            date="2000",
        ),
        abstract="Everything you ever wanted to know about nothing",
        body=expected_body,
        citations=[
            GrobidBiblio(
                index=0,
                id="b0",
                authors=[
                    GrobidAuthor(
                        full_name="A Seaperson",
                        middle_name="A",
                        surname="Seaperson",
                    )
                ],
                date="2001",
                journal="Letters in the Alphabet",
                title="Everything is Wonderful",
                volume="20",
                pages="1-11",
                first_page="1",
                last_page="11",
            ),
            GrobidBiblio(
                index=1,
                id="b1",
                authors=[],
                date="2011-03-28",
                journal="The Dictionary",
                title="All about Facts",
                volume="14",
                note="author signed copy",
            ),
        ],
    )

    assert doc == expected
def test_small_xml_legacy() -> None:
    """Check the legacy dict form of the small fixture against the stored JSON."""
    # Both fixtures are decoded explicitly as UTF-8 so the comparison does not
    # depend on the locale's default encoding.
    with open("tests/files/small.xml", "r", encoding="utf-8") as f:
        tei_xml = f.read()
    with open("tests/files/small.json", "r", encoding="utf-8") as f:
        json_form = json.load(f)

    d = parse_document_xml(tei_xml).to_legacy_dict()
    assert d == json_form
def test_invalid_xml() -> None:
    """Check which exception each kind of bad input raises.

    Text that is not XML must raise ``ParseError`` from both the document and
    the citations parser; for the document parser an empty string must raise
    ``ValueError`` and an integer ``TypeError``.
    """
    parse_error = xml.etree.ElementTree.ParseError
    not_xml = "this is not XML"

    with pytest.raises(parse_error):
        parse_document_xml(not_xml)

    with pytest.raises(parse_error):
        parse_citations_xml(not_xml)

    with pytest.raises(ValueError):
        parse_document_xml("")

    # Deliberately the wrong argument type, hence the type-checker override.
    with pytest.raises(TypeError):
        parse_document_xml(123)  # type: ignore
def test_bytes() -> None:
    """Smoke test: raw bytes and a binary stream are both accepted as input."""
    with open("tests/files/small.xml", "rb") as fixture:
        raw = fixture.read()

    # No assertions on the result: the test passes if neither call raises.
    parse_document_xml(raw)

    stream = io.BytesIO(raw)
    parse_document_xml(stream)  # type: ignore
def test_elementtree() -> None:
    """Smoke test: an already-parsed ``ElementTree`` is accepted as input."""
    with open("tests/files/small.xml", "rb") as fixture:
        raw = fixture.read()

    # Parse with the standard library first, then hand the tree over.
    tree = xml.etree.ElementTree.parse(io.BytesIO(raw))
    parse_document_xml(tree)  # type: ignore
def test_example_grobid_tei_xml() -> None:
    """Parse the example document and spot-check its title and citation ``b12``."""
    # Decode explicitly as UTF-8 instead of relying on the locale default.
    with open("tests/files/document/example.tei.xml", "r", encoding="utf-8") as f:
        blob = f.read()

    doc = parse_document_xml(blob)

    assert (
        doc.header.title
        == """Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network"""
    )

    # ``doc.citations`` may be None, so fall back to an empty list; a missing
    # "b12" entry then fails here with IndexError.
    ref = [c for c in doc.citations or [] if c.id == "b12"][0]
    assert ref.authors[0].full_name == "K Tasa"
    assert ref.authors[0].given_name == "K"
    assert ref.authors[0].surname == "Tasa"
    assert ref.journal == "Quality Management in Health Care"
    assert ref.title == "Using patient feedback for quality improvement"
    assert ref.date == "1996"
    assert ref.pages == "206-225"
    assert ref.volume == "8"
    assert (
        ref.unstructured
        == """Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19."""
    )
def test_single_citations_xml() -> None:
    """Parse one inline citation, check its fields, then cross-check the list parser.

    The same input is fed to ``parse_citation_xml`` and ``parse_citations_xml``;
    the first entry returned by the latter must agree on title and authors.
    """
    # NOTE(review): as it appears here the literal holds only text values, with
    # no XML markup, while the assertions below expect a date and a page range
    # that do not occur in it -- confirm the literal has not lost its tags.
    citation_xml = """
Mesh migration following abdominal hernia repair: a comprehensive review
H
B
Cunningham
J
J
Weis
L
R
Taveras
S
Huerta
10.1007/s10029-019-01898-9
30701369
Hernia
23
2
"""

    single = parse_citation_xml(citation_xml)
    assert single
    assert (
        single.title
        == """Mesh migration following abdominal hernia repair: a comprehensive review"""
    )

    # Third author: given, middle and family name plus the combined form.
    third_author = single.authors[2]
    assert third_author.given_name == "L"
    assert third_author.middle_name == "R"
    assert third_author.surname == "Taveras"
    assert third_author.full_name == "L R Taveras"

    assert single.doi == "10.1007/s10029-019-01898-9"
    assert single.pmid == "30701369"
    assert single.date == "2019-01-30"
    assert single.pages == "235-243"
    assert single.first_page == "235"
    assert single.last_page == "243"
    assert single.volume == "23"
    assert single.issue == "2"
    assert single.journal == "Hernia"

    from_list = parse_citations_xml(citation_xml)[0]
    assert single.title == from_list.title
    assert single.authors == from_list.authors
def test_citation_list_xml() -> None:
    """Parse the citation-list fixture and spot-check several of its 13 entries."""
    # The expectations below include a non-ASCII surname, so the fixture must
    # be decoded as UTF-8 regardless of the locale's default encoding.
    with open("tests/files/citation_list/example.tei.xml", "r", encoding="utf-8") as f:
        tei_xml = f.read()

    citations = parse_citation_list_xml(tei_xml)

    # verify that old function still works
    assert citations == parse_citations_xml(tei_xml)

    assert len(citations) == 13

    assert citations[3].note == "The Research Handbook on International Environmental Law"
    assert citations[3].authors[0].surname == "Uhlířová"
    assert citations[3].authors[1].surname == "Drumbl"
    assert citations[3].editors
    assert citations[3].editors[0].surname == "Fitzmaurice"
    # TODO: multiple persName under a single editor element
    # (https://github.com/kermitt2/grobid/issues/845)
    # assert citations[3].editors[1].surname == "Brus"

    assert citations[4].authors[0].surname == "Sleytr"
    assert citations[4].authors[0].middle_name == "B"

    assert citations[7].title == "Global Hunger Index: The Challenge of Hidden Hunger"

    assert citations[10].doi == "10.1093/eurheartj/ehi890"
    assert citations[10].url is None

    assert citations[11].title == "Devices, Measurements and Properties"
    assert citations[11].series_title == "Handbook of Optics"
    assert citations[11].publisher == "McGRAW-HILL"

    assert (
        citations[12].title
        == "Implications of abandoned shoreline features above Glacial Lake Duluth levels along the north shore of the Superior Basin in the vicinity of the Brule River"
    )
    assert (
        citations[12].book_title
        == "Paper presented at the 13th Biennial Meeting of the American Quaternary Association"
    )
    assert citations[12].institution == "University of Minnesota"
def test_grobid_070_document() -> None:
    """Parse a GROBID v0.7.0 document and spot-check metadata and citations.

    Also calls the three dict-conversion methods on every citation; those
    calls carry no assertions and only need to complete without raising.
    """
    # more recent GROBID v0.7.0 output
    # Decode explicitly as UTF-8 instead of relying on the locale default.
    with open("tests/files/document/plos.tei.xml", "r", encoding="utf-8") as f:
        tei_xml = f.read()

    doc = parse_document_xml(tei_xml)
    assert doc.grobid_timestamp == "2021-10-23T03:05+0000"
    assert doc.grobid_version == "0.7.0-SNAPSHOT"
    assert doc.pdf_md5 == "4F10689DEB84756CE82C8015951A22E5"
    assert doc.citations

    cite_b6 = doc.citations[6]
    assert cite_b6.id == "b6"
    assert cite_b6.journal == "OR. Hydrobiol"
    # note that this was not parsed well by GROBID
    assert cite_b6.institution == "Crater Lake National Park"
    assert cite_b6.date == "2007"
    assert cite_b6.volume == "574"
    assert cite_b6.issue == "1"

    cite_b3 = doc.citations[3]
    assert cite_b3.url == "http://unesdoc.unesco.org/ulis/"
    assert (
        cite_b3.title
        == "Requirements for Global Implementation of the Strategic Plan for Coastal GOOS"
    )
    assert cite_b3.authors
    assert cite_b3.authors[0].surname == "Ioc-Unesco"
    assert cite_b3.date == "2012"

    cite_b18 = doc.citations[18]
    assert cite_b18.note == "TriOS GmbH [Internet"
    assert cite_b18.date == "2017-01-05"

    cite_b29 = doc.citations[29]
    assert cite_b29.note == "PhD dissertation"

    # run these methods over some more examples
    for c in doc.citations:
        c.to_csl_dict()
        c.to_dict()
        c.to_legacy_dict()
def test_empty_citations() -> None:
    """Check how empty and nearly-empty citations are handled by both parsers.

    ``parse_citation_xml`` must return ``None`` for both fixtures, while
    ``parse_citation_list_xml`` still yields an entry with index 0, whose
    ``unstructured`` is set only for the "mostly empty" fixture.
    """
    # Decode both fixtures explicitly as UTF-8 instead of relying on the
    # locale default.
    with open(
        "tests/files/citation/empty_unstructured.tei.xml", "r", encoding="utf-8"
    ) as f:
        mostly_empty_xml = f.read()
    with open("tests/files/citation/empty.tei.xml", "r", encoding="utf-8") as f:
        empty_xml = f.read()

    assert parse_citation_xml(empty_xml) is None
    assert parse_citation_xml(mostly_empty_xml) is None

    d = parse_citation_list_xml(empty_xml)
    assert d
    assert d[0].index == 0
    assert d[0].unstructured is None

    d2 = parse_citation_list_xml(mostly_empty_xml)
    assert d2
    assert d2[0].index == 0
    assert d2[0].unstructured == "blah"
def test_citation_emdash() -> None:
    """Check that a non-ASCII dash survives parsing from both bytes and str input.

    The raw text keeps the U+2013 character in ``unstructured``, while the
    structured ``pages`` value uses a plain ASCII hyphen.
    """
    with open("tests/files/citation/emdash.tei.xml", "rb") as f:
        tei_xml_bytes = f.read()
    # Decode explicitly as UTF-8: the assertions below compare against a
    # literal containing U+2013, so a locale-dependent decode could break them.
    with open("tests/files/citation/emdash.tei.xml", "r", encoding="utf-8") as f2:
        tei_xml_str = f2.read()

    # that dash is U+2013 (EN DASH), despite the "emdash" fixture name
    unstructured = "Goodman, N. (1972). A world of individuals. In Problems and projects (pp. 155–172). Bobs-Merrill company."
    assert unstructured[70:81] == "pp. 155\u2013172"

    # Sanity-check that both forms of the fixture really contain the dash.
    assert "pp. 155\u2013172".encode("utf-8") in tei_xml_bytes
    assert "pp. 155\u2013172" in tei_xml_str

    ref_bytes = parse_citation_xml(tei_xml_bytes)
    assert ref_bytes
    assert ref_bytes.unstructured == unstructured
    assert ref_bytes.first_page == "155"
    assert ref_bytes.pages == "155-172"

    ref_str = parse_citation_xml(tei_xml_str)
    assert ref_str
    assert ref_str.unstructured == unstructured
    assert ref_str.first_page == "155"
    assert ref_str.pages == "155-172"
def test_citation_list_utf8() -> None:
    """Check that the list parser preserves a U+2013 dash for bytes and str input."""
    with open("tests/files/citation_list/emdash.tei.xml", "rb") as f:
        tei_xml_bytes = f.read()
    # Decode explicitly as UTF-8: the expected text contains U+2013, so a
    # locale-dependent decode could corrupt it or fail outright.
    with open("tests/files/citation_list/emdash.tei.xml", "r", encoding="utf-8") as f2:
        tei_xml_str = f2.read()

    unstructured = "Goodman, N. (1972). A world of individuals. In Problems and projects (pp. 155–172). Bobs-Merrill company."
    assert unstructured[70:81] == "pp. 155\u2013172"

    ref_bytes = parse_citation_list_xml(tei_xml_bytes)[0]
    assert ref_bytes
    assert ref_bytes.unstructured == unstructured
    assert ref_bytes.first_page == "155"
    assert ref_bytes.pages == "155-172"

    ref_str = parse_citation_list_xml(tei_xml_str)[0]
    assert ref_str
    assert ref_str.unstructured == unstructured
    assert ref_str.first_page == "155"
    assert ref_str.pages == "155-172"
def test_citation_multiple_editors() -> None:
    """Check that one citation yields a single author and three separate editors."""
    # The expected book title contains a non-ASCII apostrophe (U+2019), so the
    # fixture is decoded explicitly as UTF-8 rather than with the locale default.
    with open(
        "tests/files/citation/single_editor_multiple_persname.grobid070.tei.xml",
        "r",
        encoding="utf-8",
    ) as f:
        tei_xml = f.read()

    ref = parse_citation_xml(tei_xml)
    assert ref
    assert ref.title == "Uterine cancer"

    assert len(ref.authors) == 1
    assert ref.authors[0].full_name == "J R Lurain"
    assert ref.authors[0].middle_name == "R"
    assert ref.authors[0].surname == "Lurain"

    assert ref.editors
    assert len(ref.editors) == 3
    assert ref.editors[0].full_name == "J S Berek"
    assert ref.editors[1].full_name == "E Y Adashi"
    assert ref.editors[2].full_name == "P A Hillard"

    assert ref.book_title == "Novak’s gynecology"
    assert ref.publisher == "Williams and Wilkins"
    assert ref.date == "1996"
    assert ref.note == "12th ed. Baltimore"
def test_author_email() -> None:
    """Parse a header-only document and check the first author's email and affiliation.

    Also checks the document-level metadata and that the sections absent from
    this fixture come back as an empty citation list or ``None``.
    """
    # Decode explicitly as UTF-8 instead of relying on the locale default.
    with open("tests/files/document/author_email.tei.xml", "r", encoding="utf-8") as f:
        tei_xml = f.read()

    doc = parse_document_xml(tei_xml)
    biblio = doc.header
    assert biblio
    assert biblio.title == "Task-Based Intelligent Retrieval and Recommendation"

    assert biblio.authors
    assert biblio.authors[0].given_name == "Chirag"
    assert biblio.authors[0].surname == "Shah"
    assert biblio.authors[0].email == "redacted@example.com"
    assert biblio.authors[0].affiliation
    assert biblio.authors[0].affiliation.institution == "University of Washington"
    assert biblio.authors[0].affiliation.address
    assert biblio.authors[0].affiliation.address.settlement == "Seattle"
    assert biblio.authors[0].affiliation.address.country == "USA"

    assert doc.pdf_md5 == "6C18173427FE3FAD756BB2F4F7665855"
    assert doc.grobid_version == "0.7.1-SNAPSHOT"
    assert doc.grobid_timestamp == "2021-11-02T09:03+0000"
    assert doc.language_code == "en"

    assert doc.abstract
    # Only the first 50 characters of the abstract are compared.
    assert doc.abstract[:50] == "While the act of looking for information happens within a"[:50]

    assert doc.citations == []
    assert doc.body is None
    assert doc.acknowledgement is None
    assert doc.annex is None
def test_citation_bare_editor() -> None:
    """Check that an editor is still extracted from the ``editor_no_persname`` fixture.

    The citation has no title and no authors; exactly one editor is expected.
    """
    # Decode explicitly as UTF-8 instead of relying on the locale default.
    with open(
        "tests/files/citation/editor_no_persname.tei.xml", "r", encoding="utf-8"
    ) as f:
        tei_xml = f.read()

    ref = parse_citation_xml(tei_xml)
    assert ref
    assert ref.title is None
    assert ref.journal == "Clinical Gynecologic Endocrinology and Infertility"
    assert ref.publisher == "Williams and Wilkins"
    assert ref.date == "1994"

    assert ref.authors == []
    assert ref.editors
    assert len(ref.editors) == 1
    assert ref.editors[0].full_name == "Mitchell C"
def test_citation_proceedings() -> None:
    """Parse a conference-proceedings citation and check title, venue, pages and authors."""
    # Decode explicitly as UTF-8 instead of relying on the locale default.
    with open(
        "tests/files/citation/citation_proceedings.tei.xml", "r", encoding="utf-8"
    ) as f:
        tei_xml = f.read()

    ref = parse_citation_xml(tei_xml)
    assert ref
    assert ref.title == "Sim-to-Real Transfer of Robotic Control with Dynamics Randomization"
    assert (
        ref.book_title
        == "Proceedings - IEEE International Conference on Robotics and Automation"
    )
    # TODO: ref.meeting: IEEE International Conference on Robotics and Automation
    assert ref.publisher is None
    # 2018. 2018
    # NOTE(review): the comment above is kept verbatim; what it refers to is
    # not evident from this test -- confirm against the fixture.
    assert ref.pages == "3803-3810"
    assert ref.date == "2018"

    assert ref.authors
    assert ref.authors[0].given_name == "Marcin"
    assert ref.authors[3].full_name == "Abbeel"
    assert ref.editors is None