"""Tests for converting GROBID TEI-XML into documents and JSON-style dicts."""

import json
# Import the submodule explicitly: a bare ``import xml`` does not load
# ``xml.etree.ElementTree``, so ``xml.etree.ElementTree.ParseError`` would only
# resolve if some other module happened to import it first.
import xml.etree.ElementTree

import pytest

from grobid_tei_xml import teixml2json, parse_document_xml, GrobidDocument, GrobidCitation
# Supplies GrobidHeader, GrobidAuthor, GrobidAffiliation, GrobidAddress and
# GrobidJournal used below.
from grobid_tei_xml.types import *


def _read_fixture(path: str) -> str:
    """Return the full text content of the fixture file at *path*."""
    with open(path, 'r') as f:
        return f.read()


def test_teixml2json_small_xml() -> None:
    """Both conversion paths for small.xml must equal the small.json fixture."""
    tei_xml = _read_fixture('tests/files/small.xml')
    json_form = json.loads(_read_fixture('tests/files/small.json'))

    assert teixml2json(tei_xml) == json_form
    assert parse_document_xml(tei_xml).to_dict() == json_form


# This test previously shared the name ``test_teixml2json_small_xml`` with the
# test above; the later definition rebound the name, so the JSON comparison was
# never collected. It has a distinct name so that both tests run.
def test_parse_document_small_xml() -> None:
    """parse_document_xml on small.xml must equal the expected GrobidDocument."""
    tei_xml = _read_fixture('tests/files/small.xml')

    doc = parse_document_xml(tei_xml)

    expected = GrobidDocument(
        grobid_version='0.5.1-SNAPSHOT',
        grobid_timestamp='2018-04-02T00:31+0000',
        language_code='en',
        header=GrobidHeader(
            title="Dummy Example File",
            authors=[
                GrobidAuthor(
                    name="Brewster Kahle",
                    given_name="Brewster",
                    surname="Kahle",
                    affiliation=GrobidAffiliation(
                        department="Faculty ofAgricultrial Engineering",
                        laboratory="Plant Physiology Laboratory",
                        institution="Technion-Israel Institute of Technology",
                        address=GrobidAddress(
                            post_code="32000",
                            settlement="Haifa",
                            country="Israel",
                        ),
                    ),
                ),
                GrobidAuthor(
                    name="J Doe",
                    given_name="J",
                    surname="Doe",
                ),
            ],
            journal=GrobidJournal(
                name="Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
            ),
            date="2000",
        ),
        abstract="Everything you ever wanted to know about nothing",
        # Adjacent literals are concatenated at compile time into one string.
        body=(
            "Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n"
            " In Depth \n Meat \nYou know, for kids. \n Potatos \nQED."
        ),
        citations=[
            GrobidCitation(
                index=0,
                id="b0",
                authors=[
                    GrobidAuthor(
                        name="A Seaperson",
                        given_name="A",
                        surname="Seaperson",
                    ),
                ],
                date="2001",
                journal="Letters in the Alphabet",
                title="Everything is Wonderful",
                volume="20",
                pages="1-11",
            ),
            GrobidCitation(
                index=1,
                id="b1",
                authors=[],
                date="2011-03-28",
                journal="The Dictionary",
                title="All about Facts",
                volume="14",
            ),
        ],
    )

    assert doc == expected


def test_invalid_xml() -> None:
    """Non-XML text must raise ParseError; an empty string must raise ValueError."""
    with pytest.raises(xml.etree.ElementTree.ParseError):
        teixml2json("this is not XML")

    with pytest.raises(ValueError):
        teixml2json("")


def test_grobid_teixml2json() -> None:
    """Spot-check the title and citation ``b12`` of the example GROBID output."""
    blob = _read_fixture("tests/files/example_grobid.tei.xml")

    # NOTE(review): the positional True presumably enables extra (encumbered)
    # fields such as "unstructured" -- confirm against teixml2json's signature.
    obj = teixml2json(blob, True)

    assert (
        obj["title"]
        == "Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network"
    )

    # Select the citation by id; indexing [0] fails the test if it is absent.
    ref = [c for c in obj["citations"] if c["id"] == "b12"][0]

    assert ref["authors"][0] == {
        "given_name": "K",
        "name": "K Tasa",
        "surname": "Tasa",
    }
    assert ref["journal"] == "Quality Management in Health Care"
    assert ref["title"] == "Using patient feedback for quality improvement"
    assert ref["date"] == "1996"
    assert ref["pages"] == "206-225"
    assert ref["volume"] == "8"
    assert (
        ref["unstructured"]
        == "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19."
    )