"""Tests for the ``grobid_tei_xml`` parsing entry points.

Fixture paths are relative (``tests/files/...``), so the tests read them
relative to the current working directory.
"""

import io
import json
import xml
import xml.etree.ElementTree
from pathlib import Path

import pytest

from grobid_tei_xml import (
    GrobidBiblio,
    GrobidDocument,
    parse_citation_list_xml,
    parse_citation_xml,
    parse_citations_xml,
    parse_document_xml,
)
from grobid_tei_xml.types import GrobidAddress, GrobidAffiliation, GrobidAuthor


def _read_text(path: str) -> str:
    """Return the contents of the fixture at *path*, decoded as UTF-8.

    The encoding is explicit because several tests compare non-ASCII text
    (e.g. U+2013, "Uhlířová"); relying on the locale default would make the
    results platform dependent.
    """
    return Path(path).read_text(encoding="utf-8")


def _read_bytes(path: str) -> bytes:
    """Return the raw, undecoded contents of the fixture at *path*."""
    return Path(path).read_bytes()


def test_small_xml() -> None:
    """Parse ``small.xml`` and compare against a fully spelled-out document."""
    tei_xml = _read_text("tests/files/small.xml")

    doc = parse_document_xml(tei_xml)
    expected_body = """Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED."""
    expected = GrobidDocument(
        grobid_version="0.5.1-SNAPSHOT",
        grobid_timestamp="2018-04-02T00:31+0000",
        language_code="en",
        header=GrobidBiblio(
            title="Dummy Example File",
            authors=[
                GrobidAuthor(
                    full_name="Brewster Kahle",
                    given_name="Brewster",
                    surname="Kahle",
                    affiliation=GrobidAffiliation(
                        department="Faculty ofAgricultrial Engineering",
                        laboratory="Plant Physiology Laboratory",
                        institution="Technion-Israel Institute of Technology",
                        address=GrobidAddress(
                            post_code="32000",
                            settlement="Haifa",
                            country="Israel",
                        ),
                    ),
                ),
                GrobidAuthor(
                    full_name="J Doe",
                    given_name="J",
                    surname="Doe",
                ),
            ],
            book_title="Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
            date="2000",
        ),
        abstract="Everything you ever wanted to know about nothing",
        body=expected_body,
        citations=[
            GrobidBiblio(
                index=0,
                id="b0",
                authors=[
                    GrobidAuthor(
                        full_name="A Seaperson",
                        middle_name="A",
                        surname="Seaperson",
                    )
                ],
                date="2001",
                journal="Letters in the Alphabet",
                title="Everything is Wonderful",
                volume="20",
                pages="1-11",
                first_page="1",
                last_page="11",
            ),
            GrobidBiblio(
                index=1,
                id="b1",
                authors=[],
                date="2011-03-28",
                journal="The Dictionary",
                title="All about Facts",
                volume="14",
                note="author signed copy",
            ),
        ],
    )

    assert doc == expected


def test_small_xml_legacy() -> None:
    """``to_legacy_dict()`` of ``small.xml`` must equal the stored JSON form."""
    tei_xml = _read_text("tests/files/small.xml")
    json_form = json.loads(_read_text("tests/files/small.json"))

    d = parse_document_xml(tei_xml).to_legacy_dict()

    assert d == json_form


def test_invalid_xml() -> None:
    """Malformed, empty and non-string inputs raise the expected exceptions."""
    with pytest.raises(xml.etree.ElementTree.ParseError):
        parse_document_xml("this is not XML")
    with pytest.raises(xml.etree.ElementTree.ParseError):
        parse_citations_xml("this is not XML")
    with pytest.raises(ValueError):
        parse_document_xml("")
    with pytest.raises(TypeError):
        parse_document_xml(123)  # type: ignore


def test_bytes() -> None:
    """``parse_document_xml`` accepts raw bytes and a binary file object."""
    tei_xml = _read_bytes("tests/files/small.xml")

    parse_document_xml(tei_xml)
    parse_document_xml(io.BytesIO(tei_xml))  # type: ignore


def test_elementtree() -> None:
    """``parse_document_xml`` accepts an already-parsed ``ElementTree``."""
    tei_xml = _read_bytes("tests/files/small.xml")

    parse_document_xml(xml.etree.ElementTree.parse(io.BytesIO(tei_xml)))  # type: ignore


def test_example_grobid_tei_xml() -> None:
    """Spot-check the header title and citation ``b12`` of the example file."""
    blob = _read_text("tests/files/document/example.tei.xml")

    doc = parse_document_xml(blob)

    assert (
        doc.header.title
        == """Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network"""
    )

    # ``doc.citations`` may be None, hence the ``or []`` fallback.
    ref = next(c for c in doc.citations or [] if c.id == "b12")
    assert ref.authors[0].full_name == "K Tasa"
    assert ref.authors[0].given_name == "K"
    assert ref.authors[0].surname == "Tasa"
    assert ref.journal == "Quality Management in Health Care"
    assert ref.title == "Using patient feedback for quality improvement"
    assert ref.date == "1996"
    assert ref.pages == "206-225"
    assert ref.volume == "8"
    assert (
        ref.unstructured
        == """Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19."""
    )


def test_single_citations_xml() -> None:
    """Parse one inline ``<biblStruct>`` and check every extracted field."""
    # NOTE(review): the markup below was reconstructed from the text content
    # and the assertions in this test (the element tags were missing from the
    # reviewed copy of this file). Confirm it against real GROBID
    # single-citation output before relying on it.
    citation_xml = """
<biblStruct >
    <analytic>
        <title level="a" type="main">Mesh migration following abdominal hernia repair: a comprehensive review</title>
        <author>
            <persName
                xmlns="http://www.tei-c.org/ns/1.0">
                <forename type="first">H</forename>
                <forename type="middle">B</forename>
                <surname>Cunningham</surname>
            </persName>
        </author>
        <author>
            <persName
                xmlns="http://www.tei-c.org/ns/1.0">
                <forename type="first">J</forename>
                <forename type="middle">J</forename>
                <surname>Weis</surname>
            </persName>
        </author>
        <author>
            <persName
                xmlns="http://www.tei-c.org/ns/1.0">
                <forename type="first">L</forename>
                <forename type="middle">R</forename>
                <surname>Taveras</surname>
            </persName>
        </author>
        <author>
            <persName
                xmlns="http://www.tei-c.org/ns/1.0">
                <forename type="first">S</forename>
                <surname>Huerta</surname>
            </persName>
        </author>
        <idno type="DOI">10.1007/s10029-019-01898-9</idno>
        <idno type="PMID">30701369</idno>
    </analytic>
    <monogr>
        <title level="j">Hernia</title>
        <imprint>
            <biblScope unit="volume">23</biblScope>
            <biblScope unit="issue">2</biblScope>
            <biblScope unit="page" from="235" to="243" />
            <date type="published" when="2019-01-30" />
        </imprint>
    </monogr>
</biblStruct>"""

    d = parse_citation_xml(citation_xml)
    assert d
    assert (
        d.title
        == """Mesh migration following abdominal hernia repair: a comprehensive review"""
    )
    assert d.authors[2].given_name == "L"
    assert d.authors[2].middle_name == "R"
    assert d.authors[2].surname == "Taveras"
    assert d.authors[2].full_name == "L R Taveras"
    assert d.doi == "10.1007/s10029-019-01898-9"
    assert d.pmid == "30701369"
    assert d.date == "2019-01-30"
    assert d.pages == "235-243"
    assert d.first_page == "235"
    assert d.last_page == "243"
    assert d.volume == "23"
    assert d.issue == "2"
    assert d.journal == "Hernia"

    # The list-returning parser must agree with the single-citation parser.
    d2 = parse_citations_xml(citation_xml)[0]
    assert d.title == d2.title
    assert d.authors == d2.authors


def test_citation_list_xml() -> None:
    """Parse a 13-entry citation list and spot-check individual entries."""
    tei_xml = _read_text("tests/files/citation_list/example.tei.xml")

    citations = parse_citation_list_xml(tei_xml)

    # verify that old function still works
    assert citations == parse_citations_xml(tei_xml)

    assert len(citations) == 13

    assert citations[3].note == "The Research Handbook on International Environmental Law"
    assert citations[3].authors[0].surname == "Uhlířová"
    assert citations[3].authors[1].surname == "Drumbl"
    assert citations[3].editors
    assert citations[3].editors[0].surname == "Fitzmaurice"

    # TODO: multiple persName under a single <editor> (https://github.com/kermitt2/grobid/issues/845)
    # assert citations[3].editors[1].surname == "Brus"

    assert citations[4].authors[0].surname == "Sleytr"
    assert citations[4].authors[0].middle_name == "B"

    assert citations[7].title == "Global Hunger Index: The Challenge of Hidden Hunger"

    assert citations[10].doi == "10.1093/eurheartj/ehi890"
    assert citations[10].url is None

    assert citations[11].title == "Devices, Measurements and Properties"
    assert citations[11].series_title == "Handbook of Optics"
    assert citations[11].publisher == "McGRAW-HILL"

    assert (
        citations[12].title
        == "Implications of abandoned shoreline features above Glacial Lake Duluth levels along the north shore of the Superior Basin in the vicinity of the Brule River"
    )
    assert (
        citations[12].book_title
        == "Paper presented at the 13th Biennial Meeting of the American Quaternary Association"
    )
    assert citations[12].institution == "University of Minnesota"


def test_grobid_070_document() -> None:
    """Check document metadata and selected citations of ``plos.tei.xml``."""
    # more recent GROBID v0.7.0 output
    tei_xml = _read_text("tests/files/document/plos.tei.xml")

    doc = parse_document_xml(tei_xml)
    assert doc.grobid_timestamp == "2021-10-23T03:05+0000"
    assert doc.grobid_version == "0.7.0-SNAPSHOT"
    assert doc.pdf_md5 == "4F10689DEB84756CE82C8015951A22E5"

    assert doc.citations
    cite_b6 = doc.citations[6]
    assert cite_b6.id == "b6"
    assert cite_b6.journal == "OR. Hydrobiol"
    # note that this was not parsed well by GROBID
    assert cite_b6.institution == "Crater Lake National Park"
    assert cite_b6.date == "2007"
    assert cite_b6.volume == "574"
    assert cite_b6.issue == "1"

    cite_b3 = doc.citations[3]
    assert cite_b3.url == "http://unesdoc.unesco.org/ulis/"
    assert (
        cite_b3.title
        == "Requirements for Global Implementation of the Strategic Plan for Coastal GOOS"
    )
    assert cite_b3.authors
    assert cite_b3.authors[0].surname == "Ioc-Unesco"
    assert cite_b3.date == "2012"

    cite_b18 = doc.citations[18]
    assert cite_b18.note == "TriOS GmbH [Internet"
    assert cite_b18.date == "2017-01-05"

    cite_b29 = doc.citations[29]
    assert cite_b29.note == "PhD dissertation"

    # run these methods over some more examples
    for c in doc.citations:
        c.to_csl_dict()
        c.to_dict()
        c.to_legacy_dict()


def test_empty_citations() -> None:
    """Empty citations parse to ``None`` singly but still yield a list entry."""
    mostly_empty_xml = _read_text("tests/files/citation/empty_unstructured.tei.xml")
    empty_xml = _read_text("tests/files/citation/empty.tei.xml")

    assert parse_citation_xml(empty_xml) is None
    assert parse_citation_xml(mostly_empty_xml) is None

    d = parse_citation_list_xml(empty_xml)
    assert d
    assert d[0].index == 0
    assert d[0].unstructured is None

    d2 = parse_citation_list_xml(mostly_empty_xml)
    assert d2
    assert d2[0].index == 0
    assert d2[0].unstructured == "blah"


def test_citation_emdash() -> None:
    """A citation with a U+2013 dash parses identically from bytes and str."""
    tei_xml_bytes = _read_bytes("tests/files/citation/emdash.tei.xml")
    tei_xml_str = _read_text("tests/files/citation/emdash.tei.xml")

    # The dash in "155–172" is U+2013 (EN DASH); the fixture and test names
    # refer to it as an "emdash".
    unstructured = "Goodman, N. (1972). A world of individuals. In Problems and projects (pp. 155–172). Bobs-Merrill company."
    assert unstructured[70:81] == "pp. 155\u2013172"

    # Sanity-check that both views of the fixture really contain the dash.
    assert "pp. 155\u2013172".encode("utf-8") in tei_xml_bytes
    assert "pp. 155\u2013172" in tei_xml_str

    ref_bytes = parse_citation_xml(tei_xml_bytes)
    assert ref_bytes
    assert ref_bytes.unstructured == unstructured
    assert ref_bytes.first_page == "155"
    assert ref_bytes.pages == "155-172"

    ref_str = parse_citation_xml(tei_xml_str)
    assert ref_str
    assert ref_str.unstructured == unstructured
    assert ref_str.first_page == "155"
    assert ref_str.pages == "155-172"


def test_citation_list_utf8() -> None:
    """A citation list with a U+2013 dash parses identically from bytes and str."""
    tei_xml_bytes = _read_bytes("tests/files/citation_list/emdash.tei.xml")
    tei_xml_str = _read_text("tests/files/citation_list/emdash.tei.xml")

    unstructured = "Goodman, N. (1972). A world of individuals. In Problems and projects (pp. 155–172). Bobs-Merrill company."
    assert unstructured[70:81] == "pp. 155\u2013172"

    ref_bytes = parse_citation_list_xml(tei_xml_bytes)[0]
    assert ref_bytes
    assert ref_bytes.unstructured == unstructured
    assert ref_bytes.first_page == "155"
    assert ref_bytes.pages == "155-172"

    ref_str = parse_citation_list_xml(tei_xml_str)[0]
    assert ref_str
    assert ref_str.unstructured == unstructured
    assert ref_str.first_page == "155"
    assert ref_str.pages == "155-172"


def test_citation_multiple_editors() -> None:
    """A citation fixture with several editor names yields three editors."""
    tei_xml = _read_text(
        "tests/files/citation/single_editor_multiple_persname.grobid070.tei.xml"
    )

    ref = parse_citation_xml(tei_xml)
    assert ref
    assert ref.title == "Uterine cancer"
    assert len(ref.authors) == 1
    assert ref.authors[0].full_name == "J R Lurain"
    assert ref.authors[0].middle_name == "R"
    assert ref.authors[0].surname == "Lurain"
    assert ref.editors
    assert len(ref.editors) == 3
    assert ref.editors[0].full_name == "J S Berek"
    assert ref.editors[1].full_name == "E Y Adashi"
    assert ref.editors[2].full_name == "P A Hillard"
    assert ref.book_title == "Novak’s gynecology"
    assert ref.publisher == "Williams and Wilkins"
    assert ref.date == "1996"
    assert ref.note == "12th ed. Baltimore"


def test_author_email() -> None:
    """Header author email and affiliation are extracted; body parts are absent."""
    tei_xml = _read_text("tests/files/document/author_email.tei.xml")

    doc = parse_document_xml(tei_xml)
    biblio = doc.header
    assert biblio
    assert biblio.title == "Task-Based Intelligent Retrieval and Recommendation"
    assert biblio.authors
    assert biblio.authors[0].given_name == "Chirag"
    assert biblio.authors[0].surname == "Shah"
    assert biblio.authors[0].email == "redacted@example.com"
    assert biblio.authors[0].affiliation
    assert biblio.authors[0].affiliation.institution == "University of Washington"
    assert biblio.authors[0].affiliation.address
    assert biblio.authors[0].affiliation.address.settlement == "Seattle"
    assert biblio.authors[0].affiliation.address.country == "USA"

    assert doc.pdf_md5 == "6C18173427FE3FAD756BB2F4F7665855"
    assert doc.grobid_version == "0.7.1-SNAPSHOT"
    assert doc.grobid_timestamp == "2021-11-02T09:03+0000"
    assert doc.language_code == "en"
    assert doc.abstract
    # Only the first 50 characters of the abstract are compared.
    assert doc.abstract[:50] == "While the act of looking for information happens within a"[:50]
    assert doc.citations == []
    assert doc.body is None
    assert doc.acknowledgement is None
    assert doc.annex is None