"""Tests for the public parsing entry points of ``grobid_tei_xml``."""

import io
import json
import xml
import xml.etree.ElementTree

import pytest

from grobid_tei_xml import (
    GrobidBiblio,
    GrobidDocument,
    parse_citation_list_xml,
    parse_citation_xml,
    parse_citations_xml,
    parse_document_xml,
)

# GrobidAuthor, GrobidAffiliation and GrobidAddress are used below and are not
# imported by name anywhere else, so they presumably come from this star import.
from grobid_tei_xml.types import *


def _read_fixture(path: str) -> str:
    """Return the full contents of the text file at *path*, decoded as UTF-8.

    The encoding is passed explicitly so the tests do not depend on the
    platform's locale encoding; some assertions below compare non-ASCII
    text (e.g. the surname "Uhlířová").
    NOTE(review): assumes every text fixture is UTF-8 encoded -- confirm.
    """
    with open(path, "r", encoding="utf-8") as f:
        return f.read()


def test_small_xml() -> None:
    """Parsing ``small.xml`` yields exactly the expected ``GrobidDocument``."""
    tei_xml = _read_fixture("tests/files/small.xml")

    doc = parse_document_xml(tei_xml)
    expected_body = """Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED."""
    expected = GrobidDocument(
        grobid_version="0.5.1-SNAPSHOT",
        grobid_timestamp="2018-04-02T00:31+0000",
        language_code="en",
        header=GrobidBiblio(
            title="Dummy Example File",
            authors=[
                GrobidAuthor(
                    full_name="Brewster Kahle",
                    given_name="Brewster",
                    surname="Kahle",
                    affiliation=GrobidAffiliation(
                        department="Faculty ofAgricultrial Engineering",
                        laboratory="Plant Physiology Laboratory",
                        institution="Technion-Israel Institute of Technology",
                        address=GrobidAddress(
                            post_code="32000",
                            settlement="Haifa",
                            country="Israel",
                        ),
                    ),
                ),
                GrobidAuthor(
                    full_name="J Doe",
                    given_name="J",
                    surname="Doe",
                ),
            ],
            book_title="Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
            date="2000",
        ),
        abstract="Everything you ever wanted to know about nothing",
        body=expected_body,
        citations=[
            GrobidBiblio(
                index=0,
                id="b0",
                authors=[
                    GrobidAuthor(full_name="A Seaperson", middle_name="A", surname="Seaperson")
                ],
                date="2001",
                journal="Letters in the Alphabet",
                title="Everything is Wonderful",
                volume="20",
                pages="1-11",
                first_page="1",
                last_page="11",
            ),
            GrobidBiblio(
                index=1,
                id="b1",
                authors=[],
                date="2011-03-28",
                journal="The Dictionary",
                title="All about Facts",
                volume="14",
                note="author signed copy",
            ),
        ],
    )

    assert doc == expected


def test_small_xml_legacy() -> None:
    """``to_legacy_dict()`` for ``small.xml`` equals the stored ``small.json``."""
    tei_xml = _read_fixture("tests/files/small.xml")
    json_form = json.loads(_read_fixture("tests/files/small.json"))

    d = parse_document_xml(tei_xml).to_legacy_dict()

    assert d == json_form


def test_invalid_xml() -> None:
    """Bad inputs raise: non-XML text, the empty string, and a non-string."""
    with pytest.raises(xml.etree.ElementTree.ParseError):
        parse_document_xml("this is not XML")
    with pytest.raises(xml.etree.ElementTree.ParseError):
        parse_citations_xml("this is not XML")
    with pytest.raises(ValueError):
        parse_document_xml("")
    with pytest.raises(TypeError):
        parse_document_xml(123)  # type: ignore


def test_bytes() -> None:
    """``parse_document_xml`` runs on raw bytes and on a ``BytesIO`` wrapper.

    Only checks that the calls complete; the results are not inspected.
    """
    with open("tests/files/small.xml", "rb") as f:
        tei_xml = f.read()

    parse_document_xml(tei_xml)
    parse_document_xml(io.BytesIO(tei_xml))  # type: ignore


def test_elementtree() -> None:
    """``parse_document_xml`` runs on an already-parsed ``ElementTree``.

    Only checks that the call completes; the result is not inspected.
    """
    with open("tests/files/small.xml", "rb") as f:
        tei_xml = f.read()

    parse_document_xml(xml.etree.ElementTree.parse(io.BytesIO(tei_xml)))  # type: ignore


def test_example_grobid_tei_xml() -> None:
    """Spot-check the header title and citation ``b12`` of a full example."""
    blob = _read_fixture("tests/files/example_grobid.tei.xml")

    doc = parse_document_xml(blob)

    assert (
        doc.header.title
        == """Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network"""
    )

    # First citation whose id is "b12"; the test fails if there is none.
    ref = next(c for c in doc.citations or [] if c.id == "b12")
    assert ref.authors[0].full_name == "K Tasa"
    assert ref.authors[0].given_name == "K"
    assert ref.authors[0].surname == "Tasa"
    assert ref.journal == "Quality Management in Health Care"
    assert ref.title == "Using patient feedback for quality improvement"
    assert ref.date == "1996"
    assert ref.pages == "206-225"
    assert ref.volume == "8"
    assert (
        ref.unstructured
        == """Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19."""
    )


def test_single_citations_xml() -> None:
    """Parse one citation and compare it with the list-parser's first item."""
    # NOTE(review): this literal contains no XML markup, yet the assertions
    # below expect structured fields (e.g. the date and the page range) that
    # do not appear anywhere in its text. Presumably the TEI tags/attributes
    # were lost when this file was extracted -- restore the literal from the
    # upstream fixture and confirm before relying on this test.
    citation_xml = """ Mesh migration following abdominal hernia repair: a comprehensive review H B Cunningham J J Weis L R Taveras S Huerta 10.1007/s10029-019-01898-9 30701369 Hernia 23 2 """

    d = parse_citation_xml(citation_xml)
    assert d
    assert (
        d.title
        == """Mesh migration following abdominal hernia repair: a comprehensive review"""
    )
    assert d.authors[2].given_name == "L"
    assert d.authors[2].middle_name == "R"
    assert d.authors[2].surname == "Taveras"
    assert d.authors[2].full_name == "L R Taveras"
    assert d.doi == "10.1007/s10029-019-01898-9"
    assert d.pmid == "30701369"
    assert d.date == "2019-01-30"
    assert d.pages == "235-243"
    assert d.first_page == "235"
    assert d.last_page == "243"
    assert d.volume == "23"
    assert d.issue == "2"
    assert d.journal == "Hernia"

    d2 = parse_citations_xml(citation_xml)[0]
    assert d.title == d2.title
    assert d.authors == d2.authors


def test_citation_list_xml() -> None:
    """Spot-check selected fields of a parsed 13-entry citation list."""
    tei_xml = _read_fixture("tests/files/example_citation_list.xml")

    citations = parse_citation_list_xml(tei_xml)

    # verify that old function still works
    assert citations == parse_citations_xml(tei_xml)

    assert len(citations) == 13

    assert citations[3].note == "The Research Handbook on International Environmental Law"
    assert citations[3].authors[0].surname == "Uhlířová"
    assert citations[3].authors[1].surname == "Drumbl"
    assert citations[3].editors
    assert citations[3].editors[0].surname == "Fitzmaurice"

    # TODO: multiple persName under a single element, presumably <editor>
    # (https://github.com/kermitt2/grobid/issues/845)
    # assert citations[3].editors[1].surname == "Brus"

    assert citations[4].authors[0].surname == "Sleytr"
    assert citations[4].authors[0].middle_name == "B"

    assert citations[7].title == "Global Hunger Index: The Challenge of Hidden Hunger"

    assert citations[10].doi == "10.1093/eurheartj/ehi890"
    assert citations[10].url is None

    assert citations[11].title == "Devices, Measurements and Properties"
    assert citations[11].series_title == "Handbook of Optics"
    assert citations[11].publisher == "McGRAW-HILL"

    assert (
        citations[12].title
        == "Implications of abandoned shoreline features above Glacial Lake Duluth levels along the north shore of the Superior Basin in the vicinity of the Brule River"
    )
    assert (
        citations[12].book_title
        == "Paper presented at the 13th Biennial Meeting of the American Quaternary Association"
    )
    assert citations[12].institution == "University of Minnesota"


def test_grobid_070_document() -> None:
    """Spot-check metadata and several citations of a GROBID v0.7.0 example."""
    # more recent GROBID v0.7.0 output
    tei_xml = _read_fixture("tests/files/example_grobid_plos.tei.xml")

    doc = parse_document_xml(tei_xml)

    assert doc.grobid_timestamp == "2021-10-23T03:05+0000"
    assert doc.grobid_version == "0.7.0-SNAPSHOT"
    assert doc.pdf_md5 == "4F10689DEB84756CE82C8015951A22E5"

    assert doc.citations
    cite_b6 = doc.citations[6]
    assert cite_b6.id == "b6"
    assert cite_b6.journal == "OR. Hydrobiol"
    # note that this was not parsed well by GROBID
    assert cite_b6.institution == "Crater Lake National Park"
    assert cite_b6.date == "2007"
    assert cite_b6.volume == "574"
    assert cite_b6.issue == "1"

    cite_b3 = doc.citations[3]
    assert cite_b3.url == "http://unesdoc.unesco.org/ulis/"
    assert (
        cite_b3.title
        == "Requirements for Global Implementation of the Strategic Plan for Coastal GOOS"
    )
    assert cite_b3.authors
    assert cite_b3.authors[0].surname == "Ioc-Unesco"
    assert cite_b3.date == "2012"

    cite_b18 = doc.citations[18]
    assert cite_b18.note == "TriOS GmbH [Internet"
    assert cite_b18.date == "2017-01-05"

    cite_b29 = doc.citations[29]
    assert cite_b29.note == "PhD dissertation"

    # run these methods over some more examples
    for c in doc.citations:
        c.to_csl_dict()
        c.to_dict()
        c.to_legacy_dict()


def test_empty_citations() -> None:
    """Empty citations parse to ``None`` singly but still appear in list form."""
    mostly_empty_xml = _read_fixture("tests/files/empty_citation_unstructured.tei.xml")
    empty_xml = _read_fixture("tests/files/empty_citation.tei.xml")

    assert parse_citation_xml(empty_xml) is None
    assert parse_citation_xml(mostly_empty_xml) is None

    d = parse_citation_list_xml(empty_xml)
    assert d
    assert d[0].index == 0
    assert d[0].unstructured is None

    d2 = parse_citation_list_xml(mostly_empty_xml)
    assert d2
    assert d2[0].index == 0
    assert d2[0].unstructured == "blah"