import json
# Import the submodule explicitly: a bare ``import xml`` does not load
# ``xml.etree.ElementTree``, so ``xml.etree.ElementTree.ParseError`` would only
# resolve if some other module had already imported it as a side effect.
import xml.etree.ElementTree

import pytest

from grobid_tei_xml import teixml2json, parse_document_xml, GrobidDocument, GrobidCitation
from grobid_tei_xml.grobid2json import transform_grobid_ref_xml


def _read_fixture(path: str) -> str:
    """Return the full text of the fixture file at *path*.

    The file is decoded as UTF-8 explicitly so the tests do not depend on the
    platform's default locale encoding.
    """
    with open(path, "r", encoding="utf-8") as f:
        return f.read()


def test_small_xml() -> None:
    """Converting the small TEI-XML fixture yields the stored JSON form."""
    tei_xml = _read_fixture('tests/files/small.xml')
    json_form = json.loads(_read_fixture('tests/files/small.json'))

    assert teixml2json(tei_xml) == json_form


def test_invalid_xml() -> None:
    """Non-XML input raises ParseError; empty input raises ValueError."""
    with pytest.raises(xml.etree.ElementTree.ParseError):
        teixml2json("this is not XML")
    with pytest.raises(ValueError):
        teixml2json("")


def test_grobid_teixml2json() -> None:
    """Spot-check the document title and one citation of the example fixture."""
    blob = _read_fixture("tests/files/example_grobid.tei.xml")

    obj = teixml2json(blob, True)

    assert (
        obj["title"]
        == "Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network"
    )

    # Pick out the single citation with id "b12" and check its fields.
    ref = [c for c in obj["citations"] if c["id"] == "b12"][0]
    assert ref["authors"][0] == {
        "given_name": "K",
        "name": "K Tasa",
        "surname": "Tasa",
    }
    assert ref["journal"] == "Quality Management in Health Care"
    assert ref["title"] == "Using patient feedback for quality improvement"
    assert ref["date"] == "1996"
    assert ref["pages"] == "206-225"
    assert ref["volume"] == "8"
    assert (
        ref["unstructured"]
        == "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19."
    )


def test_transform_grobid_ref_xml() -> None:
    """Check the fields extracted from a single citation by transform_grobid_ref_xml."""
    # NOTE(review): as it appears here this literal contains no XML markup, and
    # the date ("2019-01-30") and pages ("235-243") asserted below do not occur
    # in it. Presumably the TEI tags/attributes were stripped somewhere along
    # the way -- confirm against the upstream fixture and restore the markup.
    citation_xml = """ Mesh migration following abdominal hernia repair: a comprehensive review H B Cunningham J J Weis L R Taveras S Huerta 10.1007/s10029-019-01898-9 30701369 Hernia 23 2 """

    d = transform_grobid_ref_xml(citation_xml)

    assert d['title'] == "Mesh migration following abdominal hernia repair: a comprehensive review"
    assert d['authors'][2]['given_name'] == "L"
    assert d['authors'][2]['surname'] == "Taveras"
    assert d['authors'][2]['name'] == "L R Taveras"
    assert d['doi'] == "10.1007/s10029-019-01898-9"
    assert d['pmid'] == "30701369"
    assert d['date'] == "2019-01-30"
    assert d['pages'] == "235-243"
    assert d['volume'] == "23"
    assert d['issue'] == "2"
    assert d['journal'] == "Hernia"