diff options
-rw-r--r-- | grobid_tei_xml/__init__.py | 4 | ||||
-rw-r--r-- | grobid_tei_xml/__main__.py | 18 | ||||
-rw-r--r-- | grobid_tei_xml/grobid2json.py | 1 | ||||
-rw-r--r-- | grobid_tei_xml/grobid_unstructured.py | 1 | ||||
-rwxr-xr-x | grobid_tei_xml/parse.py | 34 | ||||
-rw-r--r-- | grobid_tei_xml/types.py | 22 | ||||
-rw-r--r-- | tests/files/example_citation_list.xml | 278 | ||||
-rw-r--r-- | tests/test_grobid2json.py | 143 | ||||
-rw-r--r-- | tests/test_grobid_unstructured.py | 68 | ||||
-rw-r--r-- | tests/test_parse.py | 213 |
10 files changed, 619 insertions, 163 deletions
diff --git a/grobid_tei_xml/__init__.py b/grobid_tei_xml/__init__.py index bf8a133..d7d4ada 100644 --- a/grobid_tei_xml/__init__.py +++ b/grobid_tei_xml/__init__.py @@ -1,5 +1,5 @@ __version__ = "0.1.0" -from .types import GrobidDocument, GrobidCitation -from .parse import parse_document_xml, parse_citations_xml from .grobid2json import teixml2json +from .parse import parse_citations_xml, parse_document_xml +from .types import GrobidCitation, GrobidDocument diff --git a/grobid_tei_xml/__main__.py b/grobid_tei_xml/__main__.py index 489bd4e..2d10e84 100644 --- a/grobid_tei_xml/__main__.py +++ b/grobid_tei_xml/__main__.py @@ -1,5 +1,8 @@ +import argparse +import json + +from . import parse_document_xml -from .parse import parse_article def main() -> None: # pragma no cover parser = argparse.ArgumentParser( @@ -19,11 +22,14 @@ def main() -> None: # pragma no cover for filename in args.teifiles: content = open(filename, "r").read() - print( - json.dumps( - parse_article(content, encumbered=(not args.no_encumbered)), - sort_keys=True, - )) + doc = parse_document_xml(content) + if args.no_encumbered: + doc.body = None + doc.annex = None + doc.acknowledgements = None + doc.abstract = None + print(json.dumps(doc.to_dict(), sort_keys=True)) + if __name__ == "__main__": # pragma no cover main() diff --git a/grobid_tei_xml/grobid2json.py b/grobid_tei_xml/grobid2json.py index c005b31..3c56b19 100644 --- a/grobid_tei_xml/grobid2json.py +++ b/grobid_tei_xml/grobid2json.py @@ -215,6 +215,7 @@ def teixml2json(content: AnyStr, encumbered: bool = True) -> Dict[str, Any]: info.pop(k) return info + def transform_grobid_ref_xml(raw_xml: str) -> Optional[dict]: """ Parses GROBID XML for the case of a single reference/citation string (eg, diff --git a/grobid_tei_xml/grobid_unstructured.py b/grobid_tei_xml/grobid_unstructured.py index bdead05..cbf7322 100644 --- a/grobid_tei_xml/grobid_unstructured.py +++ b/grobid_tei_xml/grobid_unstructured.py @@ -14,4 +14,3 @@ import xml.etree.ElementTree as ET from typing import Optional from .parse import biblio_info - diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py index a239e4d..32c5d0f 100755 --- a/grobid_tei_xml/parse.py +++ b/grobid_tei_xml/parse.py @@ -1,4 +1,3 @@ - import io import json import xml.etree.ElementTree as ET @@ -20,9 +19,12 @@ def _string_to_tree(content: AnyStr) -> ET: elif isinstance(content, ET): return content else: - raise TypeError(f"expected XML as string or bytes, got: {type(content)}") + raise TypeError( + f"expected XML as string or bytes, got: {type(content)}") + -def _parse_authors(elem: Optional[ET.Element]) -> List[GrobidAffiliation]: +def _parse_authors(elem: Optional[ET.Element], + ns: str = ns) -> List[GrobidAffiliation]: if not elem: return [] names = [] @@ -64,6 +66,7 @@ def _parse_authors(elem: Optional[ET.Element]) -> List[GrobidAffiliation]: names.append(GrobidAuthor(**obj)) return names + def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation: ref: Dict[str, Any] = dict() ref["id"] = elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id") @@ -78,7 +81,7 @@ def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation: else: ref["journal"] = None ref["title"] = other_title - ref["authors"] = _parse_authors(elem) + ref["authors"] = _parse_authors(elem, ns=ns) ref["publisher"] = elem.findtext( f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") if not ref["publisher"]: @@ -117,6 +120,7 @@ def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation: ref["url"] = None return GrobidCitation(**ref) + def _parse_journal(elem: ET.Element, ns: str = ns) -> GrobidJournal: journal = dict() journal["name"] = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") @@ -131,6 +135,7 @@ def _parse_journal(elem: ET.Element, ns: str = ns) -> GrobidJournal: journal["abbrev"] = None return GrobidJournal(**journal) + def _parse_header(elem: ET.Element, ns: str = ns) -> GrobidHeader: header = elem info = dict() @@ -145,6 +150,7 @@ def _parse_header(elem: ET.Element, ns: str = ns) -> GrobidHeader: info["doi"] = info["doi"].lower() return GrobidHeader(**info) + def parse_document_xml(xml_text: AnyStr) -> GrobidDocument: """ Use this function to parse TEI-XML of a full document or header processed @@ -155,7 +161,6 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument: tree = _string_to_tree(xml_text) tei = tree.getroot() info = dict() - encumbered = True header = tei.find(f".//{{{ns}}}teiHeader") if header is None: @@ -188,17 +193,32 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument: el = tei.find(f".//{{{ns}}}text/{{{ns}}}body") doc.body = (el or None) and " ".join(el.itertext()).strip() el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="acknowledgement"]') - doc.acknowledgement = (el or None) and " ".join( - el.itertext()).strip() + doc.acknowledgement = (el or None) and " ".join(el.itertext()).strip() el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="annex"]') doc.annex = (el or None) and " ".join(el.itertext()).strip() return doc + def parse_citations_xml(xml_text: AnyStr) -> List[GrobidCitation]: """ Use this function to parse TEI-XML of one or more references. Eg, the output of '/api/processReferences' or '/api/processCitation'. """ + # XXX: this replacement shouldn't be needed? + xml_text = xml_text.replace('xmlns="http://www.tei-c.org/ns/1.0"', "") tree = _string_to_tree(xml_text) + root = tree.getroot() + + if root.tag == 'biblStruct': + ref = _parse_citation(root, ns='') + ref.index = 0 + return [ref] + + refs = [] + for (i, bs) in enumerate(tree.findall(f".//biblStruct")): + ref = _parse_citation(bs, ns='') + ref.index = i + refs.append(ref) + return refs diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py index 795d37f..aabe424 100644 --- a/grobid_tei_xml/types.py +++ b/grobid_tei_xml/types.py @@ -1,6 +1,5 @@ - +from dataclasses import asdict, dataclass from typing import Any, AnyStr, Dict, List, Optional -from dataclasses import dataclass @dataclass @@ -11,6 +10,7 @@ class GrobidAddress: country: Optional[str] = None country_code: Optional[str] = None + @dataclass class GrobidAffiliation: address: Optional[GrobidAddress] = None @@ -18,6 +18,7 @@ class GrobidAffiliation: department: Optional[str] = None laboratory: Optional[str] = None + @dataclass class GrobidAuthor: name: Optional[str] @@ -26,6 +27,7 @@ class GrobidAuthor: surname: Optional[str] = None affiliation: Optional[dict] = None + @dataclass class GrobidCitation: authors: List[GrobidAuthor] @@ -52,6 +54,7 @@ class GrobidCitation: def to_dict(self) -> dict: return _simplify_dict(asdict(self)) + @dataclass class GrobidJournal: name: Optional[str] = None @@ -62,6 +65,7 @@ class GrobidJournal: issn: Optional[str] = None eissn: Optional[str] = None + @dataclass class GrobidHeader: title: Optional[str] = None @@ -71,6 +75,7 @@ class GrobidHeader: #TODO: note: Optional[str] journal: Optional[GrobidJournal] = None + @dataclass class GrobidDocument: grobid_version: str @@ -87,10 +92,21 @@ class GrobidDocument: def to_dict(self) -> dict: return _simplify_dict(asdict(self)) + def _simplify_dict(d: dict) -> dict: + """ + Recursively remove empty dict values from a dict and all sub-lists and + sub-dicts. + """ + if d in [None, {}, '']: + return None for k in list(d.keys()): if isinstance(d[k], dict): d[k] = _simplify_dict(d[k]) - if d[k] in [None, [], {}, '']: + elif isinstance(d[k], list): + for i in range(len(d[k])): + if isinstance(d[k][i], dict): + d[k][i] = _simplify_dict(d[k][i]) + if d[k] in [None, {}, '']: d.pop(k) return d diff --git a/tests/files/example_citation_list.xml b/tests/files/example_citation_list.xml new file mode 100644 index 0000000..d640393 --- /dev/null +++ b/tests/files/example_citation_list.xml @@ -0,0 +1,278 @@ +<TEI xmlns="http://www.tei-c.org/ns/1.0" xmlns:xlink="http://www.w3.org/1999/xlink" + xmlns:mml="http://www.w3.org/1998/Math/MathML"> + <teiHeader/> + <text> + <front/> + <body/> + <back> + <div> + <listBibl> +<biblStruct xml:id="b0"> + <analytic> + <title level="a" type="main">E-commerce: the challenge for UK SMEs in the twenty-first century</title> + <author> + <persName><forename type="first">M</forename><surname>Quayle</surname></persName> + </author> + </analytic> + <monogr> + <title level="j">International Journal of Operations and Production Management</title> + <imprint> + <biblScope unit="volume">22</biblScope> + <biblScope unit="issue">10</biblScope> + <biblScope unit="page" from="1148" to="1161" /> + <date type="published" when="2002" /> + </imprint> + </monogr> +</biblStruct> + +<biblStruct xml:id="b1"> + <analytic> + <title level="a" type="main">Evolution, challenges and path forward for low temperature combustion engines</title> + <author> + <persName><forename type="first">A</forename><forename type="middle">K</forename><surname>Agarwal</surname></persName> + </author> + <author> + <persName><forename type="first">A</forename><forename type="middle">P</forename><surname>Singh</surname></persName> + </author> + <author> + <persName><forename type="first">R</forename><forename type="middle">K</forename><surname>Maurya</surname></persName> + </author> + </analytic> + <monogr> + <title level="j">Progress in Energy and Combustion Science</title> + <imprint> + <biblScope unit="volume">61</biblScope> + <biblScope unit="page" from="1" to="56" /> + <date type="published" when="2017" /> + </imprint> + </monogr> +</biblStruct> + +<biblStruct xml:id="b2"> + <analytic> + <title level="a" type="main">Thrombotic complications of central venous cath- eters in cancer patients</title> + <author> + <persName><forename type="first">D</forename><forename type="middle">J</forename><surname>Kutter</surname></persName> + </author> + </analytic> + <monogr> + <title level="j">Oncologist</title> + <imprint> + <biblScope unit="volume">9</biblScope> + <biblScope unit="page" from="207" to="216" /> + <date type="published" when="2004" /> + </imprint> + </monogr> +</biblStruct> + +<biblStruct xml:id="b3"> + <monogr> + <author> + <persName><forename type="first">K</forename><surname>Uhlířová</surname></persName> + </author> + <author> + <persName><forename type="first">M</forename><surname>Drumbl</surname></persName> + </author> + <title level="m">Actors and Law Making in International Environmental Law</title> + <editor> + <persName><forename type="first">M</forename><surname>Fitzmaurice</surname></persName> + <persName><forename type="first">M</forename><surname>Brus</surname></persName> + <persName><forename type="first">P</forename><surname>Merkouris</surname></persName> + </editor> + <meeting><address><addrLine>Cheltenham</addrLine></address></meeting> + <imprint> + <publisher>Edward Elgar Publishing</publisher> + <date type="published" when="2020" /> + <biblScope unit="volume">50</biblScope> + </imprint> + </monogr> + <note>The Research Handbook on International Environmental Law</note> +</biblStruct> + +<biblStruct xml:id="b4"> + <analytic> + <title level="a" type="main">Self as- sembly protein systems: Microbial S-layers</title> + <author> + <persName><forename type="first">U</forename><forename type="middle">B</forename><surname>Sleytr</surname></persName> + </author> + <author> + <persName><forename type="first">M</forename><surname>Sára</surname></persName> + </author> + <author> + <persName><forename type="first">D</forename><surname>Pum</surname></persName> + </author> + <author> + <persName><forename type="first">B</forename><surname>Schuster</surname></persName> + </author> + <author> + <persName><forename type="first">P</forename><surname>Messner</surname></persName> + </author> + <author> + <persName><forename type="first">C</forename><surname>Schäffer</surname></persName> + </author> + </analytic> + <monogr> + <title level="j">Biopolymers</title> + <editor>A. Steinbüchel and S. Fahnestock</editor> + <imprint> + <biblScope unit="volume">7</biblScope> + <biblScope unit="page" from="285" to="338" /> + <date type="published" when="2003" /> + <publisher>Wiley-VCH</publisher> + </imprint> + </monogr> +</biblStruct> + +<biblStruct xml:id="b5"> + <analytic> + <title level="a" type="main">Self-illuminating quantum dot conjugates for in vivo imaging</title> + <author> + <persName><forename type="first">M-K</forename><surname>So</surname></persName> + </author> + <author> + <persName><forename type="first">C</forename><surname>Xu</surname></persName> + </author> + <author> + <persName><forename type="first">A</forename><forename type="middle">M</forename><surname>Loening</surname></persName> + </author> + <author> + <persName><forename type="first">S</forename><forename type="middle">S</forename><surname>Gambhir</surname></persName> + </author> + <author> + <persName><forename type="first">J</forename><surname>Rao</surname></persName> + </author> + </analytic> + <monogr> + <title level="j">Nat Biotech</title> + <imprint> + <biblScope unit="volume">24</biblScope> + <biblScope unit="page" from="339" to="343" /> + <date type="published" when="2006" /> + </imprint> + </monogr> +</biblStruct> + +<biblStruct xml:id="b6"> + <analytic> + <title level="a" type="main">Informed conditioning on clinical covariates increases power in case-control association studies</title> + <author> + <persName><forename type="first">N</forename><surname>Zaitlen</surname></persName> + </author> + <author> + <persName><forename type="first">S</forename><surname>Lindström</surname></persName> + </author> + <author> + <persName><forename type="first">B</forename><surname>Pasaniuc</surname></persName> + </author> + <author> + <persName><forename type="first">M</forename><surname>Cornelis</surname></persName> + </author> + <author> + <persName><forename type="first">G</forename><surname>Genovese</surname></persName> + </author> + <author> + <persName><forename type="first">S</forename><surname>Pollack</surname></persName> + </author> + <author> + <persName><forename type="first">B</forename><forename type="middle">I</forename><surname>Freedman</surname></persName> + </author> + </analytic> + <monogr> + <title level="j">PLoS genetics</title> + <imprint> + <biblScope unit="volume">8</biblScope> + <biblScope unit="issue">11</biblScope> + <biblScope unit="page">e1003032</biblScope> + <date type="published" when="2012" /> + </imprint> + </monogr> +</biblStruct> + +<biblStruct xml:id="b7"> + <monogr> + <author> + <persName><forename type="first">K</forename><surname>Von Grebmer</surname></persName> + </author> + <author> + <persName><forename type="first">A</forename><surname>Saltzman</surname></persName> + </author> + <author> + <persName><forename type="first">E</forename><surname>Birol</surname></persName> + </author> + <author> + <persName><forename type="first">D</forename><surname>Wiesmann</surname></persName> + </author> + <author> + <persName><forename type="first">N</forename><surname>Prasai</surname></persName> + </author> + <author> + <persName><forename type="first">S</forename><surname>Yin</surname></persName> + </author> + <author> + <persName><forename type="first">Y</forename><surname>Yohannes</surname></persName> + </author> + <author> + <persName><forename type="first">P</forename><surname>Menon</surname></persName> + </author> + <author> + <persName><forename type="first">J</forename><surname>Thompson</surname></persName> + </author> + <author> + <persName><forename type="first">A</forename><surname>Sonntag</surname></persName> + </author> + <title level="m">Global Hunger Index: The Challenge of Hidden Hunger</title> + <imprint> + <date type="published" when="2014" /> + </imprint> + </monogr> +</biblStruct> + +<biblStruct xml:id="b8"> + <analytic> + <title level="a" type="main">Isolation and characterization of serum-resistant strains ofPseudomonas aeruginosa derived from serum-sensitive parental strains</title> + <author> + <persName><forename type="first">N</forename><forename type="middle">L</forename><surname>Schiller</surname></persName> + </author> + <author> + <persName><forename type="first">D</forename><forename type="middle">R</forename><surname>Hackley</surname></persName> + </author> + <author> + <persName><forename type="first">A</forename><surname>Morrison</surname></persName> + </author> + </analytic> + <monogr> + <title level="j">Curr Microbiol</title> + <imprint> + <biblScope unit="volume">10</biblScope> + <biblScope unit="page" from="185" to="190" /> + <date type="published" when="1984" /> + </imprint> + </monogr> +</biblStruct> + +<biblStruct xml:id="b9"> + <analytic> + <title level="a" type="main">Importin 7 and importin alpha/importin beta are nuclear import receptors for the glucocorticoid receptor</title> + <author> + <persName><forename type="first">N</forename><forename type="middle">D</forename><surname>Freedman</surname></persName> + </author> + <author> + <persName><forename type="first">K</forename><forename type="middle">R</forename><surname>Yamamoto</surname></persName> + </author> + </analytic> + <monogr> + <title level="j">Mol Biol Cell</title> + <imprint> + <biblScope unit="volume">15</biblScope> + <biblScope unit="page" from="2276" to="2286" /> + <date type="published" when="2004" /> + </imprint> + </monogr> +</biblStruct> + + </listBibl> + </div> + </back> + </text> +</TEI> + diff --git a/tests/test_grobid2json.py b/tests/test_grobid2json.py index ed5d996..a1c975e 100644 --- a/tests/test_grobid2json.py +++ b/tests/test_grobid2json.py @@ -3,10 +3,10 @@ import json import pytest from grobid_tei_xml import teixml2json, parse_document_xml, GrobidDocument, GrobidCitation -from grobid_tei_xml.types import * +from grobid_tei_xml.grobid2json import transform_grobid_ref_xml -def test_teixml2json_small_xml(): +def test_small_xml(): with open('tests/files/small.xml', 'r') as f: tei_xml = f.read() @@ -15,80 +15,6 @@ def test_teixml2json_small_xml(): assert teixml2json(tei_xml) == json_form - assert parse_document_xml(tei_xml).to_dict() == json_form - -def test_teixml2json_small_xml(): - - with open('tests/files/small.xml', 'r') as f: - tei_xml = f.read() - - doc = parse_document_xml(tei_xml) - expected = GrobidDocument( - grobid_version='0.5.1-SNAPSHOT', - grobid_timestamp='2018-04-02T00:31+0000', - language_code='en', - header=GrobidHeader( - title="Dummy Example File", - authors=[ - GrobidAuthor( - name="Brewster Kahle", - given_name="Brewster", - surname="Kahle", - affiliation=GrobidAffiliation( - department="Faculty ofAgricultrial Engineering", - laboratory="Plant Physiology Laboratory", - institution="Technion-Israel Institute of Technology", - address=GrobidAddress( - post_code="32000", - settlement="Haifa", - country="Israel", - ), - ) - ), - GrobidAuthor( - name="J Doe", - given_name="J", - surname="Doe", - ), - ], - journal=GrobidJournal( - name="Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678", - ), - date="2000", - ), - abstract="Everything you ever wanted to know about nothing", - body="Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.", - citations=[ - GrobidCitation( - index=0, - id="b0", - authors=[ - GrobidAuthor( - name="A Seaperson", - given_name="A", - surname="Seaperson" - ) - ], - date="2001", - journal="Letters in the Alphabet", - title="Everything is Wonderful", - volume="20", - pages="1-11", - ), - GrobidCitation( - index=1, - id="b1", - authors=[], - date="2011-03-28", - journal="The Dictionary", - title="All about Facts", - volume="14", - ), - ], - ) - - assert doc == expected - def test_invalid_xml(): @@ -125,3 +51,68 @@ def test_grobid_teixml2json() -> None: ref["unstructured"] == "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19." ) + + +def test_transform_grobid_ref_xml(): + citation_xml = """ +<biblStruct > + <analytic> + <title level="a" type="main">Mesh migration following abdominal hernia repair: a comprehensive review</title> + <author> + <persName + xmlns="http://www.tei-c.org/ns/1.0"> + <forename type="first">H</forename> + <forename type="middle">B</forename> + <surname>Cunningham</surname> + </persName> + </author> + <author> + <persName + xmlns="http://www.tei-c.org/ns/1.0"> + <forename type="first">J</forename> + <forename type="middle">J</forename> + <surname>Weis</surname> + </persName> + </author> + <author> + <persName + xmlns="http://www.tei-c.org/ns/1.0"> + <forename type="first">L</forename> + <forename type="middle">R</forename> + <surname>Taveras</surname> + </persName> + </author> + <author> + <persName + xmlns="http://www.tei-c.org/ns/1.0"> + <forename type="first">S</forename> + <surname>Huerta</surname> + </persName> + </author> + <idno type="DOI">10.1007/s10029-019-01898-9</idno> + <idno type="PMID">30701369</idno> + </analytic> + <monogr> + <title level="j">Hernia</title> + <imprint> + <biblScope unit="volume">23</biblScope> + <biblScope unit="issue">2</biblScope> + <biblScope unit="page" from="235" to="243" /> + <date type="published" when="2019-01-30" /> + </imprint> + </monogr> +</biblStruct>""" + + d = transform_grobid_ref_xml(citation_xml) + assert d[ + 'title'] == "Mesh migration following abdominal hernia repair: a comprehensive review" + assert d['authors'][2]['given_name'] == "L" + assert d['authors'][2]['surname'] == "Taveras" + assert d['authors'][2]['name'] == "L R Taveras" + assert d['doi'] == "10.1007/s10029-019-01898-9" + assert d['pmid'] == "30701369" + assert d['date'] == "2019-01-30" + assert d['pages'] == "235-243" + assert d['volume'] == "23" + assert d['issue'] == "2" + assert d['journal'] == "Hernia" diff --git a/tests/test_grobid_unstructured.py b/tests/test_grobid_unstructured.py deleted file mode 100644 index 91b7398..0000000 --- a/tests/test_grobid_unstructured.py +++ /dev/null @@ -1,68 +0,0 @@ -import pytest - -from grobid_tei_xml.grobid2json import transform_grobid_ref_xml - - -def test_transform_grobid_ref_xml(): - citation_xml = """ -<biblStruct > - <analytic> - <title level="a" type="main">Mesh migration following abdominal hernia repair: a comprehensive review</title> - <author> - <persName - xmlns="http://www.tei-c.org/ns/1.0"> - <forename type="first">H</forename> - <forename type="middle">B</forename> - <surname>Cunningham</surname> - </persName> - </author> - <author> - <persName - xmlns="http://www.tei-c.org/ns/1.0"> - <forename type="first">J</forename> - <forename type="middle">J</forename> - <surname>Weis</surname> - </persName> - </author> - <author> - <persName - xmlns="http://www.tei-c.org/ns/1.0"> - <forename type="first">L</forename> - <forename type="middle">R</forename> - <surname>Taveras</surname> - </persName> - </author> - <author> - <persName - xmlns="http://www.tei-c.org/ns/1.0"> - <forename type="first">S</forename> - <surname>Huerta</surname> - </persName> - </author> - <idno type="DOI">10.1007/s10029-019-01898-9</idno> - <idno type="PMID">30701369</idno> - </analytic> - <monogr> - <title level="j">Hernia</title> - <imprint> - <biblScope unit="volume">23</biblScope> - <biblScope unit="issue">2</biblScope> - <biblScope unit="page" from="235" to="243" /> - <date type="published" when="2019-01-30" /> - </imprint> - </monogr> -</biblStruct>""" - - d = transform_grobid_ref_xml(citation_xml) - assert d[ - 'title'] == "Mesh migration following abdominal hernia repair: a comprehensive review" - assert d['authors'][2]['given_name'] == "L" - assert d['authors'][2]['surname'] == "Taveras" - assert d['authors'][2]['name'] == "L R Taveras" - assert d['doi'] == "10.1007/s10029-019-01898-9" - assert d['pmid'] == "30701369" - assert d['date'] == "2019-01-30" - assert d['pages'] == "235-243" - assert d['volume'] == "23" - assert d['issue'] == "2" - assert d['journal'] == "Hernia" diff --git a/tests/test_parse.py b/tests/test_parse.py new file mode 100644 index 0000000..e79d41d --- /dev/null +++ b/tests/test_parse.py @@ -0,0 +1,213 @@ +import xml +import json +import pytest + +from grobid_tei_xml import teixml2json, parse_document_xml, parse_citations_xml, GrobidDocument, GrobidCitation +from grobid_tei_xml.types import * + + +def test_small_xml(): + + with open('tests/files/small.xml', 'r') as f: + tei_xml = f.read() + + doc = parse_document_xml(tei_xml) + expected = GrobidDocument( + grobid_version='0.5.1-SNAPSHOT', + grobid_timestamp='2018-04-02T00:31+0000', + language_code='en', + header=GrobidHeader( + title="Dummy Example File", + authors=[ + GrobidAuthor( + name="Brewster Kahle", + given_name="Brewster", + surname="Kahle", + affiliation=GrobidAffiliation( + department="Faculty ofAgricultrial Engineering", + laboratory="Plant Physiology Laboratory", + institution="Technion-Israel Institute of Technology", + address=GrobidAddress( + post_code="32000", + settlement="Haifa", + country="Israel", + ), + )), + GrobidAuthor( + name="J Doe", + given_name="J", + surname="Doe", + ), + ], + journal=GrobidJournal( + name= + "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678", + ), + date="2000", + ), + abstract="Everything you ever wanted to know about nothing", + body= + "Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.", + citations=[ + GrobidCitation( + index=0, + id="b0", + authors=[ + GrobidAuthor(name="A Seaperson", + given_name="A", + surname="Seaperson") + ], + date="2001", + journal="Letters in the Alphabet", + title="Everything is Wonderful", + volume="20", + pages="1-11", + ), + GrobidCitation( + index=1, + id="b1", + authors=[], + date="2011-03-28", + journal="The Dictionary", + title="All about Facts", + volume="14", + ), + ], + ) + + assert doc == expected + + +def test_small_xml_json(): + + with open('tests/files/small.xml', 'r') as f: + tei_xml = f.read() + with open('tests/files/small.json', 'r') as f: + json_form = json.loads(f.read()) + + d = parse_document_xml(tei_xml).to_dict() + + # munge back to the old JSON format + d.update(d.pop('header')) + addr = d['authors'][0]['affiliation']['address'] + addr['postCode'] = addr.pop('post_code') + + # remove nulls from old JSON + for c in json_form['citations']: + for k in list(c.keys()): + if c[k] == None: + c.pop(k) + + assert d == json_form + + +def test_invalid_xml(): + + with pytest.raises(xml.etree.ElementTree.ParseError): + parse_document_xml("this is not XML") + with pytest.raises(xml.etree.ElementTree.ParseError): + parse_citations_xml("this is not XML") + with pytest.raises(ValueError): + parse_document_xml("<xml></xml>") + + +def test_example_grobid_tei_xml() -> None: + + with open("tests/files/example_grobid.tei.xml", "r") as f: + blob = f.read() + + doc = parse_document_xml(blob) + + assert ( + doc.header.title == + "Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network" + ) + + ref = [c for c in doc.citations if c.id == "b12"][0] + assert ref.authors[0].name == "K Tasa" + assert ref.authors[0].given_name == "K" + assert ref.authors[0].surname == "Tasa" + assert ref.journal == "Quality Management in Health Care" + assert ref.title == "Using patient feedback for quality improvement" + assert ref.date == "1996" + assert ref.pages == "206-225" + assert ref.volume == "8" + assert ( + ref.unstructured == + "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19." + ) + + +def test_single_citations_xml(): + citation_xml = """ +<biblStruct > + <analytic> + <title level="a" type="main">Mesh migration following abdominal hernia repair: a comprehensive review</title> + <author> + <persName + xmlns="http://www.tei-c.org/ns/1.0"> + <forename type="first">H</forename> + <forename type="middle">B</forename> + <surname>Cunningham</surname> + </persName> + </author> + <author> + <persName + xmlns="http://www.tei-c.org/ns/1.0"> + <forename type="first">J</forename> + <forename type="middle">J</forename> + <surname>Weis</surname> + </persName> + </author> + <author> + <persName + xmlns="http://www.tei-c.org/ns/1.0"> + <forename type="first">L</forename> + <forename type="middle">R</forename> + <surname>Taveras</surname> + </persName> + </author> + <author> + <persName + xmlns="http://www.tei-c.org/ns/1.0"> + <forename type="first">S</forename> + <surname>Huerta</surname> + </persName> + </author> + <idno type="DOI">10.1007/s10029-019-01898-9</idno> + <idno type="PMID">30701369</idno> + </analytic> + <monogr> + <title level="j">Hernia</title> + <imprint> + <biblScope unit="volume">23</biblScope> + <biblScope unit="issue">2</biblScope> + <biblScope unit="page" from="235" to="243" /> + <date type="published" when="2019-01-30" /> + </imprint> + </monogr> +</biblStruct>""" + + d = parse_citations_xml(citation_xml)[0] + assert d.title == "Mesh migration following abdominal hernia repair: a comprehensive review" + assert d.authors[2].given_name == "L" + assert d.authors[2].surname == "Taveras" + assert d.authors[2].name == "L R Taveras" + assert d.doi == "10.1007/s10029-019-01898-9" + assert d.pmid == "30701369" + assert d.date == "2019-01-30" + assert d.pages == "235-243" + assert d.volume == "23" + assert d.issue == "2" + assert d.journal == "Hernia" + + +def test_citation_list_xml(): + + with open('tests/files/example_citation_list.xml', 'r') as f: + tei_xml = f.read() + + citations = parse_citations_xml(tei_xml) + assert len(citations) == 10 + assert citations[ + 7].title == "Global Hunger Index: The Challenge of Hidden Hunger" |