"""Tests for parsing GROBID TEI-XML documents and citation lists."""

import json

# Import the submodule explicitly: the tests reference
# ``xml.etree.ElementTree.ParseError`` and a bare ``import xml`` does not load
# ``xml.etree`` on its own.
import xml.etree.ElementTree

import pytest

from grobid_tei_xml import teixml2json, parse_document_xml, parse_citations_xml, GrobidDocument, GrobidCitation
from grobid_tei_xml.types import *


def test_small_xml() -> None:
    """Parse ``tests/files/small.xml`` and compare against a hand-built document."""
    with open("tests/files/small.xml", "r") as f:
        tei_xml = f.read()

    doc = parse_document_xml(tei_xml)

    expected_header = GrobidHeader(
        title="Dummy Example File",
        authors=[
            GrobidAuthor(
                name="Brewster Kahle",
                given_name="Brewster",
                surname="Kahle",
                affiliation=GrobidAffiliation(
                    department="Faculty ofAgricultrial Engineering",
                    laboratory="Plant Physiology Laboratory",
                    institution="Technion-Israel Institute of Technology",
                    address=GrobidAddress(
                        post_code="32000",
                        settlement="Haifa",
                        country="Israel",
                    ),
                ),
            ),
            GrobidAuthor(
                name="J Doe",
                given_name="J",
                surname="Doe",
            ),
        ],
        journal=GrobidJournal(
            name="Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
        ),
        date="2000",
    )

    expected_citations = [
        GrobidCitation(
            index=0,
            id="b0",
            authors=[
                GrobidAuthor(name="A Seaperson", given_name="A", surname="Seaperson"),
            ],
            date="2001",
            journal="Letters in the Alphabet",
            title="Everything is Wonderful",
            volume="20",
            pages="1-11",
        ),
        GrobidCitation(
            index=1,
            id="b1",
            authors=[],
            date="2011-03-28",
            journal="The Dictionary",
            title="All about Facts",
            volume="14",
        ),
    ]

    expected = GrobidDocument(
        grobid_version="0.5.1-SNAPSHOT",
        grobid_timestamp="2018-04-02T00:31+0000",
        language_code="en",
        header=expected_header,
        abstract="Everything you ever wanted to know about nothing",
        body=(
            "Introduction \nEverything starts somewhere, as somebody [1] once said. "
            "\n\n In Depth \n Meat \nYou know, for kids. "
            # NOTE(review): the text this was restored from breaks mid-literal
            # right here. If the original file carries a line-separator
            # character at this position it must be put back -- confirm
            # against the fixture.
            "\n Potatos \nQED."
        ),
        citations=expected_citations,
    )

    assert doc == expected


def test_small_xml_json() -> None:
    """Check that ``to_dict()`` output can be reshaped into the legacy JSON form.

    The parsed document is flattened (header fields hoisted to the top level,
    ``post_code`` renamed to ``postCode``) and compared with
    ``tests/files/small.json`` after null-valued citation keys are dropped
    from the latter.
    """
    with open("tests/files/small.xml", "r") as f:
        tei_xml = f.read()
    with open("tests/files/small.json", "r") as f:
        json_form = json.load(f)

    d = parse_document_xml(tei_xml).to_dict()

    # munge back to the old JSON format
    d.update(d.pop("header"))
    addr = d["authors"][0]["affiliation"]["address"]
    addr["postCode"] = addr.pop("post_code")

    # remove nulls from old JSON; collect the keys first so the dict is not
    # mutated while it is being iterated
    for citation in json_form["citations"]:
        null_keys = [key for key, value in citation.items() if value is None]
        for key in null_keys:
            del citation[key]

    assert d == json_form


def test_invalid_xml() -> None:
    """Malformed input raises ``ParseError``; empty input raises ``ValueError``."""
    with pytest.raises(xml.etree.ElementTree.ParseError):
        parse_document_xml("this is not XML")

    with pytest.raises(xml.etree.ElementTree.ParseError):
        parse_citations_xml("this is not XML")

    with pytest.raises(ValueError):
        parse_document_xml("")


def test_example_grobid_tei_xml() -> None:
    """Spot-check the header title and citation ``b12`` of the example document."""
    with open("tests/files/example_grobid.tei.xml", "r") as f:
        blob = f.read()

    doc = parse_document_xml(blob)

    assert (
        doc.header.title
        == "Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network"
    )

    ref = [c for c in doc.citations if c.id == "b12"][0]
    assert ref.authors[0].name == "K Tasa"
    assert ref.authors[0].given_name == "K"
    assert ref.authors[0].surname == "Tasa"
    assert ref.journal == "Quality Management in Health Care"
    assert ref.title == "Using patient feedback for quality improvement"
    assert ref.date == "1996"
    assert ref.pages == "206-225"
    assert ref.volume == "8"
    assert ref.unstructured == (
        "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. "
        # NOTE(review): the text this was restored from breaks mid-literal
        # right here. If the original file carries a line-separator character
        # at this position it must be put back -- confirm against the fixture.
        "Quality Management in Health Care 1996;8:206-19."
    )


def test_single_citations_xml() -> None:
    """Parse one inline citation and check its extracted fields."""
    # NOTE(review): as received, this literal contains no XML markup, yet the
    # assertions below expect a date ("2019-01-30") and a page range
    # ("235-243") that do not appear in the visible text. The markup was
    # presumably stripped before review -- restore it from version control.
    citation_xml = """ Mesh migration following abdominal hernia repair: a comprehensive review H B Cunningham J J Weis L R Taveras S Huerta 10.1007/s10029-019-01898-9 30701369 Hernia 23 2 """

    d = parse_citations_xml(citation_xml)[0]

    assert d.title == "Mesh migration following abdominal hernia repair: a comprehensive review"
    assert d.authors[2].given_name == "L"
    assert d.authors[2].surname == "Taveras"
    assert d.authors[2].name == "L R Taveras"
    assert d.doi == "10.1007/s10029-019-01898-9"
    assert d.pmid == "30701369"
    assert d.date == "2019-01-30"
    assert d.pages == "235-243"
    assert d.volume == "23"
    assert d.issue == "2"
    assert d.journal == "Hernia"


def test_citation_list_xml() -> None:
    """Parse the example citation list and check its length and one title."""
    with open("tests/files/example_citation_list.xml", "r") as f:
        tei_xml = f.read()

    citations = parse_citations_xml(tei_xml)

    assert len(citations) == 10
    assert citations[7].title == "Global Hunger Index: The Challenge of Hidden Hunger"