From 2bf52b0622005ed8a7c51e59faa9873600d9cb5f Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 21 Oct 2021 18:22:12 -0700 Subject: more progress --- tests/files/example_citation_list.xml | 278 ++++++++++++++++++++++++++++++++++ tests/test_grobid2json.py | 143 ++++++++--------- tests/test_grobid_unstructured.py | 68 --------- tests/test_parse.py | 213 ++++++++++++++++++++++++++ 4 files changed, 558 insertions(+), 144 deletions(-) create mode 100644 tests/files/example_citation_list.xml delete mode 100644 tests/test_grobid_unstructured.py create mode 100644 tests/test_parse.py (limited to 'tests') diff --git a/tests/files/example_citation_list.xml b/tests/files/example_citation_list.xml new file mode 100644 index 0000000..d640393 --- /dev/null +++ b/tests/files/example_citation_list.xml @@ -0,0 +1,278 @@ + + + + + + +
+ + + + E-commerce: the challenge for UK SMEs in the twenty-first century + + MQuayle + + + + International Journal of Operations and Production Management + + 22 + 10 + + + + + + + + + Evolution, challenges and path forward for low temperature combustion engines + + AKAgarwal + + + APSingh + + + RKMaurya + + + + Progress in Energy and Combustion Science + + 61 + + + + + + + + + Thrombotic complications of central venous cath- eters in cancer patients + + DJKutter + + + + Oncologist + + 9 + + + + + + + + + + KUhlířová + + + MDrumbl + + Actors and Law Making in International Environmental Law + + MFitzmaurice + MBrus + PMerkouris + +
Cheltenham
+ + Edward Elgar Publishing + + 50 + +
+ The Research Handbook on International Environmental Law +
+ + + + Self as- sembly protein systems: Microbial S-layers + + UBSleytr + + + MSára + + + DPum + + + BSchuster + + + PMessner + + + CSchäffer + + + + Biopolymers + A. Steinbüchel and S. Fahnestock + + 7 + + + Wiley-VCH + + + + + + + Self-illuminating quantum dot conjugates for in vivo imaging + + M-KSo + + + CXu + + + AMLoening + + + SSGambhir + + + JRao + + + + Nat Biotech + + 24 + + + + + + + + + Informed conditioning on clinical covariates increases power in case-control association studies + + NZaitlen + + + SLindström + + + BPasaniuc + + + MCornelis + + + GGenovese + + + SPollack + + + BIFreedman + + + + PLoS genetics + + 8 + 11 + e1003032 + + + + + + + + + KVon Grebmer + + + ASaltzman + + + EBirol + + + DWiesmann + + + NPrasai + + + SYin + + + YYohannes + + + PMenon + + + JThompson + + + ASonntag + + Global Hunger Index: The Challenge of Hidden Hunger + + + + + + + + + Isolation and characterization of serum-resistant strains ofPseudomonas aeruginosa derived from serum-sensitive parental strains + + NLSchiller + + + DRHackley + + + AMorrison + + + + Curr Microbiol + + 10 + + + + + + + + + Importin 7 and importin alpha/importin beta are nuclear import receptors for the glucocorticoid receptor + + NDFreedman + + + KRYamamoto + + + + Mol Biol Cell + + 15 + + + + + + +
+
+
+
+
+ diff --git a/tests/test_grobid2json.py b/tests/test_grobid2json.py index ed5d996..a1c975e 100644 --- a/tests/test_grobid2json.py +++ b/tests/test_grobid2json.py @@ -3,10 +3,10 @@ import json import pytest from grobid_tei_xml import teixml2json, parse_document_xml, GrobidDocument, GrobidCitation -from grobid_tei_xml.types import * +from grobid_tei_xml.grobid2json import transform_grobid_ref_xml -def test_teixml2json_small_xml(): +def test_small_xml(): with open('tests/files/small.xml', 'r') as f: tei_xml = f.read() @@ -15,80 +15,6 @@ def test_teixml2json_small_xml(): assert teixml2json(tei_xml) == json_form - assert parse_document_xml(tei_xml).to_dict() == json_form - -def test_teixml2json_small_xml(): - - with open('tests/files/small.xml', 'r') as f: - tei_xml = f.read() - - doc = parse_document_xml(tei_xml) - expected = GrobidDocument( - grobid_version='0.5.1-SNAPSHOT', - grobid_timestamp='2018-04-02T00:31+0000', - language_code='en', - header=GrobidHeader( - title="Dummy Example File", - authors=[ - GrobidAuthor( - name="Brewster Kahle", - given_name="Brewster", - surname="Kahle", - affiliation=GrobidAffiliation( - department="Faculty ofAgricultrial Engineering", - laboratory="Plant Physiology Laboratory", - institution="Technion-Israel Institute of Technology", - address=GrobidAddress( - post_code="32000", - settlement="Haifa", - country="Israel", - ), - ) - ), - GrobidAuthor( - name="J Doe", - given_name="J", - surname="Doe", - ), - ], - journal=GrobidJournal( - name="Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678", - ), - date="2000", - ), - abstract="Everything you ever wanted to know about nothing", - body="Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.", - citations=[ - GrobidCitation( - index=0, - id="b0", - authors=[ - GrobidAuthor( - name="A Seaperson", - given_name="A", - surname="Seaperson" - ) - ], - date="2001", - journal="Letters in the Alphabet", - title="Everything is Wonderful", - volume="20", - pages="1-11", - ), - GrobidCitation( - index=1, - id="b1", - authors=[], - date="2011-03-28", - journal="The Dictionary", - title="All about Facts", - volume="14", - ), - ], - ) - - assert doc == expected - def test_invalid_xml(): @@ -125,3 +51,68 @@ def test_grobid_teixml2json() -> None: ref["unstructured"] == "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19." ) + + +def test_transform_grobid_ref_xml(): + citation_xml = """ + + + Mesh migration following abdominal hernia repair: a comprehensive review + + + H + B + Cunningham + + + + + J + J + Weis + + + + + L + R + Taveras + + + + + S + Huerta + + + 10.1007/s10029-019-01898-9 + 30701369 + + + Hernia + + 23 + 2 + + + + +""" + + d = transform_grobid_ref_xml(citation_xml) + assert d[ + 'title'] == "Mesh migration following abdominal hernia repair: a comprehensive review" + assert d['authors'][2]['given_name'] == "L" + assert d['authors'][2]['surname'] == "Taveras" + assert d['authors'][2]['name'] == "L R Taveras" + assert d['doi'] == "10.1007/s10029-019-01898-9" + assert d['pmid'] == "30701369" + assert d['date'] == "2019-01-30" + assert d['pages'] == "235-243" + assert d['volume'] == "23" + assert d['issue'] == "2" + assert d['journal'] == "Hernia" diff --git a/tests/test_grobid_unstructured.py b/tests/test_grobid_unstructured.py deleted file mode 100644 index 91b7398..0000000 --- a/tests/test_grobid_unstructured.py +++ /dev/null @@ -1,68 +0,0 @@ -import pytest - -from grobid_tei_xml.grobid2json import transform_grobid_ref_xml - - -def test_transform_grobid_ref_xml(): - citation_xml = """ - - - Mesh migration following abdominal hernia repair: a comprehensive review - - - H - B - Cunningham - - - - - J - J - Weis - - - - - L - R - Taveras - - - - - S - Huerta - - - 10.1007/s10029-019-01898-9 - 30701369 - - - Hernia - - 23 - 2 - - - - -""" - - d = transform_grobid_ref_xml(citation_xml) - assert d[ - 'title'] == "Mesh migration following abdominal hernia repair: a comprehensive review" - assert d['authors'][2]['given_name'] == "L" - assert d['authors'][2]['surname'] == "Taveras" - assert d['authors'][2]['name'] == "L R Taveras" - assert d['doi'] == "10.1007/s10029-019-01898-9" - assert d['pmid'] == "30701369" - assert d['date'] == "2019-01-30" - assert d['pages'] == "235-243" - assert d['volume'] == "23" - assert d['issue'] == "2" - assert d['journal'] == "Hernia" diff --git a/tests/test_parse.py b/tests/test_parse.py new file mode 100644 index 0000000..e79d41d --- /dev/null +++ b/tests/test_parse.py @@ -0,0 +1,213 @@ +import xml +import json +import pytest + +from grobid_tei_xml import teixml2json, parse_document_xml, parse_citations_xml, GrobidDocument, GrobidCitation +from grobid_tei_xml.types import * + + +def test_small_xml(): + + with open('tests/files/small.xml', 'r') as f: + tei_xml = f.read() + + doc = parse_document_xml(tei_xml) + expected = GrobidDocument( + grobid_version='0.5.1-SNAPSHOT', + grobid_timestamp='2018-04-02T00:31+0000', + language_code='en', + header=GrobidHeader( + title="Dummy Example File", + authors=[ + GrobidAuthor( + name="Brewster Kahle", + given_name="Brewster", + surname="Kahle", + affiliation=GrobidAffiliation( + department="Faculty ofAgricultrial Engineering", + laboratory="Plant Physiology Laboratory", + institution="Technion-Israel Institute of Technology", + address=GrobidAddress( + post_code="32000", + settlement="Haifa", + country="Israel", + ), + )), + GrobidAuthor( + name="J Doe", + given_name="J", + surname="Doe", + ), + ], + journal=GrobidJournal( + name= + "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678", + ), + date="2000", + ), + abstract="Everything you ever wanted to know about nothing", + body= + "Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.", + citations=[ + GrobidCitation( + index=0, + id="b0", + authors=[ + GrobidAuthor(name="A Seaperson", + given_name="A", + surname="Seaperson") + ], + date="2001", + journal="Letters in the Alphabet", + title="Everything is Wonderful", + volume="20", + pages="1-11", + ), + GrobidCitation( + index=1, + id="b1", + authors=[], + date="2011-03-28", + journal="The Dictionary", + title="All about Facts", + volume="14", + ), + ], + ) + + assert doc == expected + + +def test_small_xml_json(): + + with open('tests/files/small.xml', 'r') as f: + tei_xml = f.read() + with open('tests/files/small.json', 'r') as f: + json_form = json.loads(f.read()) + + d = parse_document_xml(tei_xml).to_dict() + + # munge back to the old JSON format + d.update(d.pop('header')) + addr = d['authors'][0]['affiliation']['address'] + addr['postCode'] = addr.pop('post_code') + + # remove nulls from old JSON + for c in json_form['citations']: + for k in list(c.keys()): + if c[k] == None: + c.pop(k) + + assert d == json_form + + +def test_invalid_xml(): + + with pytest.raises(xml.etree.ElementTree.ParseError): + parse_document_xml("this is not XML") + with pytest.raises(xml.etree.ElementTree.ParseError): + parse_citations_xml("this is not XML") + with pytest.raises(ValueError): + parse_document_xml("") + + +def test_example_grobid_tei_xml() -> None: + + with open("tests/files/example_grobid.tei.xml", "r") as f: + blob = f.read() + + doc = parse_document_xml(blob) + + assert ( + doc.header.title == + "Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network" + ) + + ref = [c for c in doc.citations if c.id == "b12"][0] + assert ref.authors[0].name == "K Tasa" + assert ref.authors[0].given_name == "K" + assert ref.authors[0].surname == "Tasa" + assert ref.journal == "Quality Management in Health Care" + assert ref.title == "Using patient feedback for quality improvement" + assert ref.date == "1996" + assert ref.pages == "206-225" + assert ref.volume == "8" + assert ( + ref.unstructured == + "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19." + ) + + +def test_single_citations_xml(): + citation_xml = """ + + + Mesh migration following abdominal hernia repair: a comprehensive review + + + H + B + Cunningham + + + + + J + J + Weis + + + + + L + R + Taveras + + + + + S + Huerta + + + 10.1007/s10029-019-01898-9 + 30701369 + + + Hernia + + 23 + 2 + + + + +""" + + d = parse_citations_xml(citation_xml)[0] + assert d.title == "Mesh migration following abdominal hernia repair: a comprehensive review" + assert d.authors[2].given_name == "L" + assert d.authors[2].surname == "Taveras" + assert d.authors[2].name == "L R Taveras" + assert d.doi == "10.1007/s10029-019-01898-9" + assert d.pmid == "30701369" + assert d.date == "2019-01-30" + assert d.pages == "235-243" + assert d.volume == "23" + assert d.issue == "2" + assert d.journal == "Hernia" + + +def test_citation_list_xml(): + + with open('tests/files/example_citation_list.xml', 'r') as f: + tei_xml = f.read() + + citations = parse_citations_xml(tei_xml) + assert len(citations) == 10 + assert citations[ + 7].title == "Global Hunger Index: The Challenge of Hidden Hunger" -- cgit v1.2.3