Mesh migration following abdominal hernia repair: a comprehensive review

import io
import json
import xml
import xml.etree.ElementTree

import pytest

from grobid_tei_xml import (
    GrobidBiblio,
    GrobidDocument,
    parse_citation_list_xml,
    parse_citation_xml,
    parse_citations_xml,
    parse_document_xml,
)
from grobid_tei_xml.types import *


def test_small_xml() -> None:
    """Parse ``small.xml`` and compare it with a fully spelled-out expected document."""
    with open("tests/files/small.xml", "r") as handle:
        raw_xml = handle.read()
    parsed = parse_document_xml(raw_xml)

    # First author carries a complete affiliation, including a postal address.
    technion = GrobidAffiliation(
        department="Faculty ofAgricultrial Engineering",
        laboratory="Plant Physiology Laboratory",
        institution="Technion-Israel Institute of Technology",
        address=GrobidAddress(
            post_code="32000",
            settlement="Haifa",
            country="Israel",
        ),
    )
    kahle = GrobidAuthor(
        full_name="Brewster Kahle",
        given_name="Brewster",
        surname="Kahle",
        affiliation=technion,
    )
    doe = GrobidAuthor(
        full_name="J Doe",
        given_name="J",
        surname="Doe",
    )
    expected_header = GrobidBiblio(
        title="Dummy Example File",
        authors=[kahle, doe],
        book_title="Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
        date="2000",
    )

    # The two bibliography entries expected from the fixture, in order.
    first_ref = GrobidBiblio(
        index=0,
        id="b0",
        authors=[
            GrobidAuthor(full_name="A Seaperson", middle_name="A", surname="Seaperson")
        ],
        date="2001",
        journal="Letters in the Alphabet",
        title="Everything is Wonderful",
        volume="20",
        pages="1-11",
        first_page="1",
        last_page="11",
    )
    second_ref = GrobidBiblio(
        index=1,
        id="b1",
        authors=[],
        date="2011-03-28",
        journal="The Dictionary",
        title="All about Facts",
        volume="14",
        note="author signed copy",
    )

    body_text = """Introduction \nEverything starts somewhere, as somebody [1]  once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED."""
    wanted = GrobidDocument(
        grobid_version="0.5.1-SNAPSHOT",
        grobid_timestamp="2018-04-02T00:31+0000",
        language_code="en",
        header=expected_header,
        abstract="Everything you ever wanted to know about nothing",
        body=body_text,
        citations=[first_ref, second_ref],
    )

    assert parsed == wanted


def test_small_xml_legacy() -> None:
    """Check that the legacy dict built from ``small.xml`` equals the ``small.json`` fixture."""
    with open("tests/files/small.xml", "r") as xml_file:
        raw_xml = xml_file.read()
    with open("tests/files/small.json", "r") as json_file:
        reference = json.load(json_file)

    legacy = parse_document_xml(raw_xml).to_legacy_dict()

    assert legacy == reference


def test_invalid_xml() -> None:
    """Check the exception raised for each kind of unusable input."""
    parse_error = xml.etree.ElementTree.ParseError

    # Text that is not XML at all is expected to fail in the XML parser,
    # for both the document and the citations entry points.
    for parser in (parse_document_xml, parse_citations_xml):
        with pytest.raises(parse_error):
            parser("this is not XML")

    # Well-formed XML that is not the expected document: ValueError.
    with pytest.raises(ValueError):
        parse_document_xml("<xml></xml>")

    # An argument of an unsupported type: TypeError.
    with pytest.raises(TypeError):
        parse_document_xml(123)  # type: ignore


def test_bytes() -> None:
    """Smoke test: the document parser accepts raw bytes and a binary stream."""
    with open("tests/files/small.xml", "rb") as handle:
        payload = handle.read()

    # No assertions on the result; the calls only need to complete without raising.
    parse_document_xml(payload)
    stream = io.BytesIO(payload)
    parse_document_xml(stream)  # type: ignore


def test_elementtree() -> None:
    """Smoke test: the document parser accepts an already-parsed ElementTree."""
    with open("tests/files/small.xml", "rb") as handle:
        payload = handle.read()

    tree = xml.etree.ElementTree.parse(io.BytesIO(payload))
    parse_document_xml(tree)  # type: ignore


def test_example_grobid_tei_xml() -> None:
    """Spot-check the header title and reference ``b12`` of the example TEI file."""
    with open("tests/files/example_grobid.tei.xml", "r") as handle:
        raw_xml = handle.read()
    parsed = parse_document_xml(raw_xml)

    wanted_title = """Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network"""
    assert parsed.header.title == wanted_title

    # Select the reference by its id; a missing citation list is treated as empty.
    matches = [cite for cite in parsed.citations or [] if cite.id == "b12"]
    ref = matches[0]

    lead_author = ref.authors[0]
    assert lead_author.full_name == "K Tasa"
    assert lead_author.given_name == "K"
    assert lead_author.surname == "Tasa"

    assert ref.journal == "Quality Management in Health Care"
    assert ref.title == "Using patient feedback for quality improvement"
    assert ref.date == "1996"
    assert ref.pages == "206-225"
    assert ref.volume == "8"

    wanted_raw = """Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19."""
    assert ref.unstructured == wanted_raw


def test_single_citations_xml() -> None:
    """Parse one inline ``<biblStruct>`` and verify its fields via both citation parsers."""
    citation_xml = """
<biblStruct >
    <analytic>
        <title level="a" type="main">Mesh migration following abdominal hernia repair: a comprehensive review</title>
        <author>
            <persName
                xmlns="http://www.tei-c.org/ns/1.0">
                <forename type="first">H</forename>
                <forename type="middle">B</forename>
                <surname>Cunningham</surname>
            </persName>
        </author>
        <author>
            <persName
                xmlns="http://www.tei-c.org/ns/1.0">
                <forename type="first">J</forename>
                <forename type="middle">J</forename>
                <surname>Weis</surname>
            </persName>
        </author>
        <author>
            <persName
                xmlns="http://www.tei-c.org/ns/1.0">
                <forename type="first">L</forename>
                <forename type="middle">R</forename>
                <surname>Taveras</surname>
            </persName>
        </author>
        <author>
            <persName
                xmlns="http://www.tei-c.org/ns/1.0">
                <forename type="first">S</forename>
                <surname>Huerta</surname>
            </persName>
        </author>
        <idno type="DOI">10.1007/s10029-019-01898-9</idno>
        <idno type="PMID">30701369</idno>
    </analytic>
    <monogr>
        <title level="j">Hernia</title>
        <imprint>
            <biblScope unit="volume">23</biblScope>
            <biblScope unit="issue">2</biblScope>
            <biblScope unit="page" from="235" to="243" />
            <date type="published" when="2019-01-30" />
        </imprint>
    </monogr>
</biblStruct>"""

    citation = parse_citation_xml(citation_xml)
    assert citation
    assert (
        citation.title
        == """Mesh migration following abdominal hernia repair: a comprehensive review"""
    )

    # The third author has both a first and a middle forename.
    third_author = citation.authors[2]
    assert third_author.given_name == "L"
    assert third_author.middle_name == "R"
    assert third_author.surname == "Taveras"
    assert third_author.full_name == "L R Taveras"

    # Remaining scalar fields, checked attribute by attribute.
    wanted_fields = {
        "doi": "10.1007/s10029-019-01898-9",
        "pmid": "30701369",
        "date": "2019-01-30",
        "pages": "235-243",
        "first_page": "235",
        "last_page": "243",
        "volume": "23",
        "issue": "2",
        "journal": "Hernia",
    }
    for field_name, wanted in wanted_fields.items():
        assert getattr(citation, field_name) == wanted

    # The list-returning parser must agree with the single-citation parser.
    from_list = parse_citations_xml(citation_xml)[0]
    assert citation.title == from_list.title
    assert citation.authors == from_list.authors


def test_citation_list_xml() -> None:
    """Parse the example citation list and spot-check individual entries.

    The fixture is read explicitly as UTF-8: the assertions below compare
    against a non-ASCII surname, so decoding with the platform's default
    locale encoding could corrupt the text (or fail) on non-UTF-8 systems.
    """
    with open("tests/files/example_citation_list.xml", "r", encoding="utf-8") as f:
        tei_xml = f.read()

    citations = parse_citation_list_xml(tei_xml)

    # verify that old function still works
    assert citations == parse_citations_xml(tei_xml)

    assert len(citations) == 13

    assert citations[3].note == "The Research Handbook on International Environmental Law"
    assert citations[3].authors[0].surname == "Uhlířová"
    assert citations[3].authors[1].surname == "Drumbl"
    assert citations[3].editors
    assert citations[3].editors[0].surname == "Fitzmaurice"
    # TODO: multiple persName under a single <editor> (https://github.com/kermitt2/grobid/issues/845)
    # assert citations[3].editors[1].surname == "Brus"

    assert citations[4].authors[0].surname == "Sleytr"
    assert citations[4].authors[0].middle_name == "B"

    assert citations[7].title == "Global Hunger Index: The Challenge of Hidden Hunger"

    # This entry has a DOI but is expected to carry no URL.
    assert citations[10].doi == "10.1093/eurheartj/ehi890"
    assert citations[10].url is None

    assert citations[11].title == "Devices, Measurements and Properties"
    assert citations[11].series_title == "Handbook of Optics"
    assert citations[11].publisher == "McGRAW-HILL"

    assert (
        citations[12].title
        == "Implications of abandoned shoreline features above Glacial Lake Duluth levels along the north shore of the Superior Basin in the vicinity of the Brule River"
    )
    assert (
        citations[12].book_title
        == "Paper presented at the 13th Biennial Meeting of the American Quaternary Association"
    )
    assert citations[12].institution == "University of Minnesota"


def test_grobid_070_document() -> None:
    """Spot-check document metadata and selected citations from more recent GROBID v0.7.0 output."""
    with open("tests/files/example_grobid_plos.tei.xml", "r") as handle:
        raw_xml = handle.read()
    parsed = parse_document_xml(raw_xml)

    assert parsed.grobid_timestamp == "2021-10-23T03:05+0000"
    assert parsed.grobid_version == "0.7.0-SNAPSHOT"
    assert parsed.pdf_md5 == "4F10689DEB84756CE82C8015951A22E5"

    assert parsed.citations

    ref_b6 = parsed.citations[6]
    assert ref_b6.id == "b6"
    assert ref_b6.journal == "OR. Hydrobiol"
    # note that this was not parsed well by GROBID
    assert ref_b6.institution == "Crater Lake National Park"
    assert ref_b6.date == "2007"
    assert ref_b6.volume == "574"
    assert ref_b6.issue == "1"

    ref_b3 = parsed.citations[3]
    assert ref_b3.url == "http://unesdoc.unesco.org/ulis/"
    wanted_b3_title = (
        "Requirements for Global Implementation of the Strategic Plan for Coastal GOOS"
    )
    assert ref_b3.title == wanted_b3_title
    assert ref_b3.authors
    assert ref_b3.authors[0].surname == "Ioc-Unesco"
    assert ref_b3.date == "2012"

    ref_b18 = parsed.citations[18]
    assert ref_b18.note == "TriOS GmbH [Internet"
    assert ref_b18.date == "2017-01-05"

    ref_b29 = parsed.citations[29]
    assert ref_b29.note == "PhD dissertation"

    # run these methods over some more examples
    for cite in parsed.citations:
        for method_name in ("to_csl_dict", "to_dict", "to_legacy_dict"):
            getattr(cite, method_name)()


def test_empty_citations() -> None:
    """Check how the single-citation and list parsers treat (nearly) empty citations."""
    with open("tests/files/empty_citation.tei.xml", "r") as handle:
        blank_xml = handle.read()
    with open("tests/files/empty_citation_unstructured.tei.xml", "r") as handle:
        nearly_blank_xml = handle.read()

    # The single-citation parser is expected to return None for both fixtures.
    for xml_text in (blank_xml, nearly_blank_xml):
        assert parse_citation_xml(xml_text) is None

    # The list parser is expected to still return a first entry with index 0.
    blank_list = parse_citation_list_xml(blank_xml)
    assert blank_list
    assert blank_list[0].index == 0
    assert blank_list[0].unstructured is None

    nearly_blank_list = parse_citation_list_xml(nearly_blank_xml)
    assert nearly_blank_list
    assert nearly_blank_list[0].index == 0
    assert nearly_blank_list[0].unstructured == "blah"