diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-21 18:22:12 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-21 18:22:12 -0700 |
commit | 2bf52b0622005ed8a7c51e59faa9873600d9cb5f (patch) | |
tree | 6de17ab8a3f77053c4f61770011af4b7de2c4a17 /tests | |
parent | 8c09c866d81854ab06b85bee6c39124c7b2faf44 (diff) | |
download | grobid_tei_xml-2bf52b0622005ed8a7c51e59faa9873600d9cb5f.tar.gz grobid_tei_xml-2bf52b0622005ed8a7c51e59faa9873600d9cb5f.zip |
more progress
Diffstat (limited to 'tests')
-rw-r--r-- | tests/files/example_citation_list.xml | 278 | ||||
-rw-r--r-- | tests/test_grobid2json.py | 143 | ||||
-rw-r--r-- | tests/test_grobid_unstructured.py | 68 | ||||
-rw-r--r-- | tests/test_parse.py | 213 |
4 files changed, 558 insertions, 144 deletions
diff --git a/tests/files/example_citation_list.xml b/tests/files/example_citation_list.xml new file mode 100644 index 0000000..d640393 --- /dev/null +++ b/tests/files/example_citation_list.xml @@ -0,0 +1,278 @@ +<TEI xmlns="http://www.tei-c.org/ns/1.0" xmlns:xlink="http://www.w3.org/1999/xlink" + xmlns:mml="http://www.w3.org/1998/Math/MathML"> + <teiHeader/> + <text> + <front/> + <body/> + <back> + <div> + <listBibl> +<biblStruct xml:id="b0"> + <analytic> + <title level="a" type="main">E-commerce: the challenge for UK SMEs in the twenty-first century</title> + <author> + <persName><forename type="first">M</forename><surname>Quayle</surname></persName> + </author> + </analytic> + <monogr> + <title level="j">International Journal of Operations and Production Management</title> + <imprint> + <biblScope unit="volume">22</biblScope> + <biblScope unit="issue">10</biblScope> + <biblScope unit="page" from="1148" to="1161" /> + <date type="published" when="2002" /> + </imprint> + </monogr> +</biblStruct> + +<biblStruct xml:id="b1"> + <analytic> + <title level="a" type="main">Evolution, challenges and path forward for low temperature combustion engines</title> + <author> + <persName><forename type="first">A</forename><forename type="middle">K</forename><surname>Agarwal</surname></persName> + </author> + <author> + <persName><forename type="first">A</forename><forename type="middle">P</forename><surname>Singh</surname></persName> + </author> + <author> + <persName><forename type="first">R</forename><forename type="middle">K</forename><surname>Maurya</surname></persName> + </author> + </analytic> + <monogr> + <title level="j">Progress in Energy and Combustion Science</title> + <imprint> + <biblScope unit="volume">61</biblScope> + <biblScope unit="page" from="1" to="56" /> + <date type="published" when="2017" /> + </imprint> + </monogr> +</biblStruct> + +<biblStruct xml:id="b2"> + <analytic> + <title level="a" type="main">Thrombotic complications of central venous cath- eters in cancer patients</title> + <author> + <persName><forename type="first">D</forename><forename type="middle">J</forename><surname>Kutter</surname></persName> + </author> + </analytic> + <monogr> + <title level="j">Oncologist</title> + <imprint> + <biblScope unit="volume">9</biblScope> + <biblScope unit="page" from="207" to="216" /> + <date type="published" when="2004" /> + </imprint> + </monogr> +</biblStruct> + +<biblStruct xml:id="b3"> + <monogr> + <author> + <persName><forename type="first">K</forename><surname>Uhlířová</surname></persName> + </author> + <author> + <persName><forename type="first">M</forename><surname>Drumbl</surname></persName> + </author> + <title level="m">Actors and Law Making in International Environmental Law</title> + <editor> + <persName><forename type="first">M</forename><surname>Fitzmaurice</surname></persName> + <persName><forename type="first">M</forename><surname>Brus</surname></persName> + <persName><forename type="first">P</forename><surname>Merkouris</surname></persName> + </editor> + <meeting><address><addrLine>Cheltenham</addrLine></address></meeting> + <imprint> + <publisher>Edward Elgar Publishing</publisher> + <date type="published" when="2020" /> + <biblScope unit="volume">50</biblScope> + </imprint> + </monogr> + <note>The Research Handbook on International Environmental Law</note> +</biblStruct> + +<biblStruct xml:id="b4"> + <analytic> + <title level="a" type="main">Self as- sembly protein systems: Microbial S-layers</title> + <author> + <persName><forename type="first">U</forename><forename type="middle">B</forename><surname>Sleytr</surname></persName> + </author> + <author> + <persName><forename type="first">M</forename><surname>Sára</surname></persName> + </author> + <author> + <persName><forename type="first">D</forename><surname>Pum</surname></persName> + </author> + <author> + <persName><forename type="first">B</forename><surname>Schuster</surname></persName> + </author> + <author> + <persName><forename type="first">P</forename><surname>Messner</surname></persName> + </author> + <author> + <persName><forename type="first">C</forename><surname>Schäffer</surname></persName> + </author> + </analytic> + <monogr> + <title level="j">Biopolymers</title> + <editor>A. Steinbüchel and S. Fahnestock</editor> + <imprint> + <biblScope unit="volume">7</biblScope> + <biblScope unit="page" from="285" to="338" /> + <date type="published" when="2003" /> + <publisher>Wiley-VCH</publisher> + </imprint> + </monogr> +</biblStruct> + +<biblStruct xml:id="b5"> + <analytic> + <title level="a" type="main">Self-illuminating quantum dot conjugates for in vivo imaging</title> + <author> + <persName><forename type="first">M-K</forename><surname>So</surname></persName> + </author> + <author> + <persName><forename type="first">C</forename><surname>Xu</surname></persName> + </author> + <author> + <persName><forename type="first">A</forename><forename type="middle">M</forename><surname>Loening</surname></persName> + </author> + <author> + <persName><forename type="first">S</forename><forename type="middle">S</forename><surname>Gambhir</surname></persName> + </author> + <author> + <persName><forename type="first">J</forename><surname>Rao</surname></persName> + </author> + </analytic> + <monogr> + <title level="j">Nat Biotech</title> + <imprint> + <biblScope unit="volume">24</biblScope> + <biblScope unit="page" from="339" to="343" /> + <date type="published" when="2006" /> + </imprint> + </monogr> +</biblStruct> + +<biblStruct xml:id="b6"> + <analytic> + <title level="a" type="main">Informed conditioning on clinical covariates increases power in case-control association studies</title> + <author> + <persName><forename type="first">N</forename><surname>Zaitlen</surname></persName> + </author> + <author> + <persName><forename type="first">S</forename><surname>Lindström</surname></persName> + </author> + <author> + <persName><forename type="first">B</forename><surname>Pasaniuc</surname></persName> + </author> + <author> + <persName><forename type="first">M</forename><surname>Cornelis</surname></persName> + </author> + <author> + <persName><forename type="first">G</forename><surname>Genovese</surname></persName> + </author> + <author> + <persName><forename type="first">S</forename><surname>Pollack</surname></persName> + </author> + <author> + <persName><forename type="first">B</forename><forename type="middle">I</forename><surname>Freedman</surname></persName> + </author> + </analytic> + <monogr> + <title level="j">PLoS genetics</title> + <imprint> + <biblScope unit="volume">8</biblScope> + <biblScope unit="issue">11</biblScope> + <biblScope unit="page">e1003032</biblScope> + <date type="published" when="2012" /> + </imprint> + </monogr> +</biblStruct> + +<biblStruct xml:id="b7"> + <monogr> + <author> + <persName><forename type="first">K</forename><surname>Von Grebmer</surname></persName> + </author> + <author> + <persName><forename type="first">A</forename><surname>Saltzman</surname></persName> + </author> + <author> + <persName><forename type="first">E</forename><surname>Birol</surname></persName> + </author> + <author> + <persName><forename type="first">D</forename><surname>Wiesmann</surname></persName> + </author> + <author> + <persName><forename type="first">N</forename><surname>Prasai</surname></persName> + </author> + <author> + <persName><forename type="first">S</forename><surname>Yin</surname></persName> + </author> + <author> + <persName><forename type="first">Y</forename><surname>Yohannes</surname></persName> + </author> + <author> + <persName><forename type="first">P</forename><surname>Menon</surname></persName> + </author> + <author> + <persName><forename type="first">J</forename><surname>Thompson</surname></persName> + </author> + <author> + <persName><forename type="first">A</forename><surname>Sonntag</surname></persName> + </author> + <title level="m">Global Hunger Index: The Challenge of Hidden Hunger</title> + <imprint> + <date type="published" when="2014" /> + </imprint> + </monogr> +</biblStruct> + +<biblStruct xml:id="b8"> + <analytic> + <title level="a" type="main">Isolation and characterization of serum-resistant strains ofPseudomonas aeruginosa derived from serum-sensitive parental strains</title> + <author> + <persName><forename type="first">N</forename><forename type="middle">L</forename><surname>Schiller</surname></persName> + </author> + <author> + <persName><forename type="first">D</forename><forename type="middle">R</forename><surname>Hackley</surname></persName> + </author> + <author> + <persName><forename type="first">A</forename><surname>Morrison</surname></persName> + </author> + </analytic> + <monogr> + <title level="j">Curr Microbiol</title> + <imprint> + <biblScope unit="volume">10</biblScope> + <biblScope unit="page" from="185" to="190" /> + <date type="published" when="1984" /> + </imprint> + </monogr> +</biblStruct> + +<biblStruct xml:id="b9"> + <analytic> + <title level="a" type="main">Importin 7 and importin alpha/importin beta are nuclear import receptors for the glucocorticoid receptor</title> + <author> + <persName><forename type="first">N</forename><forename type="middle">D</forename><surname>Freedman</surname></persName> + </author> + <author> + <persName><forename type="first">K</forename><forename type="middle">R</forename><surname>Yamamoto</surname></persName> + </author> + </analytic> + <monogr> + <title level="j">Mol Biol Cell</title> + <imprint> + <biblScope unit="volume">15</biblScope> + <biblScope unit="page" from="2276" to="2286" /> + <date type="published" when="2004" /> + </imprint> + </monogr> +</biblStruct> + + </listBibl> + </div> + </back> + </text> +</TEI> + diff --git a/tests/test_grobid2json.py b/tests/test_grobid2json.py index ed5d996..a1c975e 100644 --- a/tests/test_grobid2json.py +++ b/tests/test_grobid2json.py @@ -3,10 +3,10 @@ import json import pytest from grobid_tei_xml import teixml2json, parse_document_xml, GrobidDocument, GrobidCitation -from grobid_tei_xml.types import * +from grobid_tei_xml.grobid2json import transform_grobid_ref_xml -def test_teixml2json_small_xml(): +def test_small_xml(): with open('tests/files/small.xml', 'r') as f: tei_xml = f.read() @@ -15,80 +15,6 @@ def test_teixml2json_small_xml(): assert teixml2json(tei_xml) == json_form - assert parse_document_xml(tei_xml).to_dict() == json_form - -def test_teixml2json_small_xml(): - - with open('tests/files/small.xml', 'r') as f: - tei_xml = f.read() - - doc = parse_document_xml(tei_xml) - expected = GrobidDocument( - grobid_version='0.5.1-SNAPSHOT', - grobid_timestamp='2018-04-02T00:31+0000', - language_code='en', - header=GrobidHeader( - title="Dummy Example File", - authors=[ - GrobidAuthor( - name="Brewster Kahle", - given_name="Brewster", - surname="Kahle", - affiliation=GrobidAffiliation( - department="Faculty ofAgricultrial Engineering", - laboratory="Plant Physiology Laboratory", - institution="Technion-Israel Institute of Technology", - address=GrobidAddress( - post_code="32000", - settlement="Haifa", - country="Israel", - ), - ) - ), - GrobidAuthor( - name="J Doe", - given_name="J", - surname="Doe", - ), - ], - journal=GrobidJournal( - name="Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678", - ), - date="2000", - ), - abstract="Everything you ever wanted to know about nothing", - body="Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.", - citations=[ - GrobidCitation( - index=0, - id="b0", - authors=[ - GrobidAuthor( - name="A Seaperson", - given_name="A", - surname="Seaperson" - ) - ], - date="2001", - journal="Letters in the Alphabet", - title="Everything is Wonderful", - volume="20", - pages="1-11", - ), - GrobidCitation( - index=1, - id="b1", - authors=[], - date="2011-03-28", - journal="The Dictionary", - title="All about Facts", - volume="14", - ), - ], - ) - - assert doc == expected - def test_invalid_xml(): @@ -125,3 +51,68 @@ def test_grobid_teixml2json() -> None: ref["unstructured"] == "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19." ) + + +def test_transform_grobid_ref_xml(): + citation_xml = """ +<biblStruct > + <analytic> + <title level="a" type="main">Mesh migration following abdominal hernia repair: a comprehensive review</title> + <author> + <persName + xmlns="http://www.tei-c.org/ns/1.0"> + <forename type="first">H</forename> + <forename type="middle">B</forename> + <surname>Cunningham</surname> + </persName> + </author> + <author> + <persName + xmlns="http://www.tei-c.org/ns/1.0"> + <forename type="first">J</forename> + <forename type="middle">J</forename> + <surname>Weis</surname> + </persName> + </author> + <author> + <persName + xmlns="http://www.tei-c.org/ns/1.0"> + <forename type="first">L</forename> + <forename type="middle">R</forename> + <surname>Taveras</surname> + </persName> + </author> + <author> + <persName + xmlns="http://www.tei-c.org/ns/1.0"> + <forename type="first">S</forename> + <surname>Huerta</surname> + </persName> + </author> + <idno type="DOI">10.1007/s10029-019-01898-9</idno> + <idno type="PMID">30701369</idno> + </analytic> + <monogr> + <title level="j">Hernia</title> + <imprint> + <biblScope unit="volume">23</biblScope> + <biblScope unit="issue">2</biblScope> + <biblScope unit="page" from="235" to="243" /> + <date type="published" when="2019-01-30" /> + </imprint> + </monogr> +</biblStruct>""" + + d = transform_grobid_ref_xml(citation_xml) + assert d[ + 'title'] == "Mesh migration following abdominal hernia repair: a comprehensive review" + assert d['authors'][2]['given_name'] == "L" + assert d['authors'][2]['surname'] == "Taveras" + assert d['authors'][2]['name'] == "L R Taveras" + assert d['doi'] == "10.1007/s10029-019-01898-9" + assert d['pmid'] == "30701369" + assert d['date'] == "2019-01-30" + assert d['pages'] == "235-243" + assert d['volume'] == "23" + assert d['issue'] == "2" + assert d['journal'] == "Hernia" diff --git a/tests/test_grobid_unstructured.py b/tests/test_grobid_unstructured.py deleted file mode 100644 index 91b7398..0000000 --- a/tests/test_grobid_unstructured.py +++ /dev/null @@ -1,68 +0,0 @@ -import pytest - -from grobid_tei_xml.grobid2json import transform_grobid_ref_xml - - -def test_transform_grobid_ref_xml(): - citation_xml = """ -<biblStruct > - <analytic> - <title level="a" type="main">Mesh migration following abdominal hernia repair: a comprehensive review</title> - <author> - <persName - xmlns="http://www.tei-c.org/ns/1.0"> - <forename type="first">H</forename> - <forename type="middle">B</forename> - <surname>Cunningham</surname> - </persName> - </author> - <author> - <persName - xmlns="http://www.tei-c.org/ns/1.0"> - <forename type="first">J</forename> - <forename type="middle">J</forename> - <surname>Weis</surname> - </persName> - </author> - <author> - <persName - xmlns="http://www.tei-c.org/ns/1.0"> - <forename type="first">L</forename> - <forename type="middle">R</forename> - <surname>Taveras</surname> - </persName> - </author> - <author> - <persName - xmlns="http://www.tei-c.org/ns/1.0"> - <forename type="first">S</forename> - <surname>Huerta</surname> - </persName> - </author> - <idno type="DOI">10.1007/s10029-019-01898-9</idno> - <idno type="PMID">30701369</idno> - </analytic> - <monogr> - <title level="j">Hernia</title> - <imprint> - <biblScope unit="volume">23</biblScope> - <biblScope unit="issue">2</biblScope> - <biblScope unit="page" from="235" to="243" /> - <date type="published" when="2019-01-30" /> - </imprint> - </monogr> -</biblStruct>""" - - d = transform_grobid_ref_xml(citation_xml) - assert d[ - 'title'] == "Mesh migration following abdominal hernia repair: a comprehensive review" - assert d['authors'][2]['given_name'] == "L" - assert d['authors'][2]['surname'] == "Taveras" - assert d['authors'][2]['name'] == "L R Taveras" - assert d['doi'] == "10.1007/s10029-019-01898-9" - assert d['pmid'] == "30701369" - assert d['date'] == "2019-01-30" - assert d['pages'] == "235-243" - assert d['volume'] == "23" - assert d['issue'] == "2" - assert d['journal'] == "Hernia" diff --git a/tests/test_parse.py b/tests/test_parse.py new file mode 100644 index 0000000..e79d41d --- /dev/null +++ b/tests/test_parse.py @@ -0,0 +1,213 @@ +import xml +import json +import pytest + +from grobid_tei_xml import teixml2json, parse_document_xml, parse_citations_xml, GrobidDocument, GrobidCitation +from grobid_tei_xml.types import * + + +def test_small_xml(): + + with open('tests/files/small.xml', 'r') as f: + tei_xml = f.read() + + doc = parse_document_xml(tei_xml) + expected = GrobidDocument( + grobid_version='0.5.1-SNAPSHOT', + grobid_timestamp='2018-04-02T00:31+0000', + language_code='en', + header=GrobidHeader( + title="Dummy Example File", + authors=[ + GrobidAuthor( + name="Brewster Kahle", + given_name="Brewster", + surname="Kahle", + affiliation=GrobidAffiliation( + department="Faculty ofAgricultrial Engineering", + laboratory="Plant Physiology Laboratory", + institution="Technion-Israel Institute of Technology", + address=GrobidAddress( + post_code="32000", + settlement="Haifa", + country="Israel", + ), + )), + GrobidAuthor( + name="J Doe", + given_name="J", + surname="Doe", + ), + ], + journal=GrobidJournal( + name= + "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678", + ), + date="2000", + ), + abstract="Everything you ever wanted to know about nothing", + body= + "Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.", + citations=[ + GrobidCitation( + index=0, + id="b0", + authors=[ + GrobidAuthor(name="A Seaperson", + given_name="A", + surname="Seaperson") + ], + date="2001", + journal="Letters in the Alphabet", + title="Everything is Wonderful", + volume="20", + pages="1-11", + ), + GrobidCitation( + index=1, + id="b1", + authors=[], + date="2011-03-28", + journal="The Dictionary", + title="All about Facts", + volume="14", + ), + ], + ) + + assert doc == expected + + +def test_small_xml_json(): + + with open('tests/files/small.xml', 'r') as f: + tei_xml = f.read() + with open('tests/files/small.json', 'r') as f: + json_form = json.loads(f.read()) + + d = parse_document_xml(tei_xml).to_dict() + + # munge back to the old JSON format + d.update(d.pop('header')) + addr = d['authors'][0]['affiliation']['address'] + addr['postCode'] = addr.pop('post_code') + + # remove nulls from old JSON + for c in json_form['citations']: + for k in list(c.keys()): + if c[k] == None: + c.pop(k) + + assert d == json_form + + +def test_invalid_xml(): + + with pytest.raises(xml.etree.ElementTree.ParseError): + parse_document_xml("this is not XML") + with pytest.raises(xml.etree.ElementTree.ParseError): + parse_citations_xml("this is not XML") + with pytest.raises(ValueError): + parse_document_xml("<xml></xml>") + + +def test_example_grobid_tei_xml() -> None: + + with open("tests/files/example_grobid.tei.xml", "r") as f: + blob = f.read() + + doc = parse_document_xml(blob) + + assert ( + doc.header.title == + "Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network" + ) + + ref = [c for c in doc.citations if c.id == "b12"][0] + assert ref.authors[0].name == "K Tasa" + assert ref.authors[0].given_name == "K" + assert ref.authors[0].surname == "Tasa" + assert ref.journal == "Quality Management in Health Care" + assert ref.title == "Using patient feedback for quality improvement" + assert ref.date == "1996" + assert ref.pages == "206-225" + assert ref.volume == "8" + assert ( + ref.unstructured == + "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19." + ) + + +def test_single_citations_xml(): + citation_xml = """ +<biblStruct > + <analytic> + <title level="a" type="main">Mesh migration following abdominal hernia repair: a comprehensive review</title> + <author> + <persName + xmlns="http://www.tei-c.org/ns/1.0"> + <forename type="first">H</forename> + <forename type="middle">B</forename> + <surname>Cunningham</surname> + </persName> + </author> + <author> + <persName + xmlns="http://www.tei-c.org/ns/1.0"> + <forename type="first">J</forename> + <forename type="middle">J</forename> + <surname>Weis</surname> + </persName> + </author> + <author> + <persName + xmlns="http://www.tei-c.org/ns/1.0"> + <forename type="first">L</forename> + <forename type="middle">R</forename> + <surname>Taveras</surname> + </persName> + </author> + <author> + <persName + xmlns="http://www.tei-c.org/ns/1.0"> + <forename type="first">S</forename> + <surname>Huerta</surname> + </persName> + </author> + <idno type="DOI">10.1007/s10029-019-01898-9</idno> + <idno type="PMID">30701369</idno> + </analytic> + <monogr> + <title level="j">Hernia</title> + <imprint> + <biblScope unit="volume">23</biblScope> + <biblScope unit="issue">2</biblScope> + <biblScope unit="page" from="235" to="243" /> + <date type="published" when="2019-01-30" /> + </imprint> + </monogr> +</biblStruct>""" + + d = parse_citations_xml(citation_xml)[0] + assert d.title == "Mesh migration following abdominal hernia repair: a comprehensive review" + assert d.authors[2].given_name == "L" + assert d.authors[2].surname == "Taveras" + assert d.authors[2].name == "L R Taveras" + assert d.doi == "10.1007/s10029-019-01898-9" + assert d.pmid == "30701369" + assert d.date == "2019-01-30" + assert d.pages == "235-243" + assert d.volume == "23" + assert d.issue == "2" + assert d.journal == "Hernia" + + +def test_citation_list_xml(): + + with open('tests/files/example_citation_list.xml', 'r') as f: + tei_xml = f.read() + + citations = parse_citations_xml(tei_xml) + assert len(citations) == 10 + assert citations[ + 7].title == "Global Hunger Index: The Challenge of Hidden Hunger" |