From 2bf52b0622005ed8a7c51e59faa9873600d9cb5f Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 21 Oct 2021 18:22:12 -0700 Subject: more progress --- grobid_tei_xml/__init__.py | 4 +- grobid_tei_xml/__main__.py | 18 ++- grobid_tei_xml/grobid2json.py | 1 + grobid_tei_xml/grobid_unstructured.py | 1 - grobid_tei_xml/parse.py | 34 ++++- grobid_tei_xml/types.py | 22 ++- tests/files/example_citation_list.xml | 278 ++++++++++++++++++++++++++++++++++ tests/test_grobid2json.py | 143 ++++++++--------- tests/test_grobid_unstructured.py | 68 --------- tests/test_parse.py | 213 ++++++++++++++++++++++++++ 10 files changed, 619 insertions(+), 163 deletions(-) create mode 100644 tests/files/example_citation_list.xml delete mode 100644 tests/test_grobid_unstructured.py create mode 100644 tests/test_parse.py diff --git a/grobid_tei_xml/__init__.py b/grobid_tei_xml/__init__.py index bf8a133..d7d4ada 100644 --- a/grobid_tei_xml/__init__.py +++ b/grobid_tei_xml/__init__.py @@ -1,5 +1,5 @@ __version__ = "0.1.0" -from .types import GrobidDocument, GrobidCitation -from .parse import parse_document_xml, parse_citations_xml from .grobid2json import teixml2json +from .parse import parse_citations_xml, parse_document_xml +from .types import GrobidCitation, GrobidDocument diff --git a/grobid_tei_xml/__main__.py b/grobid_tei_xml/__main__.py index 489bd4e..2d10e84 100644 --- a/grobid_tei_xml/__main__.py +++ b/grobid_tei_xml/__main__.py @@ -1,5 +1,8 @@ +import argparse +import json + +from . import parse_document_xml -from .parse import parse_article def main() -> None: # pragma no cover parser = argparse.ArgumentParser( @@ -19,11 +22,14 @@ def main() -> None: # pragma no cover for filename in args.teifiles: content = open(filename, "r").read() - print( - json.dumps( - parse_article(content, encumbered=(not args.no_encumbered)), - sort_keys=True, - )) + doc = parse_document_xml(content) + if args.no_encumbered: + doc.body = None + doc.annex = None + doc.acknowledgements = None + doc.abstract = None + print(json.dumps(doc.to_dict(), sort_keys=True)) + if __name__ == "__main__": # pragma no cover main() diff --git a/grobid_tei_xml/grobid2json.py b/grobid_tei_xml/grobid2json.py index c005b31..3c56b19 100644 --- a/grobid_tei_xml/grobid2json.py +++ b/grobid_tei_xml/grobid2json.py @@ -215,6 +215,7 @@ def teixml2json(content: AnyStr, encumbered: bool = True) -> Dict[str, Any]: info.pop(k) return info + def transform_grobid_ref_xml(raw_xml: str) -> Optional[dict]: """ Parses GROBID XML for the case of a single reference/citation string (eg, diff --git a/grobid_tei_xml/grobid_unstructured.py b/grobid_tei_xml/grobid_unstructured.py index bdead05..cbf7322 100644 --- a/grobid_tei_xml/grobid_unstructured.py +++ b/grobid_tei_xml/grobid_unstructured.py @@ -14,4 +14,3 @@ import xml.etree.ElementTree as ET from typing import Optional from .parse import biblio_info - diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py index a239e4d..32c5d0f 100755 --- a/grobid_tei_xml/parse.py +++ b/grobid_tei_xml/parse.py @@ -1,4 +1,3 @@ - import io import json import xml.etree.ElementTree as ET @@ -20,9 +19,12 @@ def _string_to_tree(content: AnyStr) -> ET: elif isinstance(content, ET): return content else: - raise TypeError(f"expected XML as string or bytes, got: {type(content)}") + raise TypeError( + f"expected XML as string or bytes, got: {type(content)}") + -def _parse_authors(elem: Optional[ET.Element]) -> List[GrobidAffiliation]: +def _parse_authors(elem: Optional[ET.Element], + ns: str = ns) -> List[GrobidAffiliation]: if not elem: return [] names = [] @@ -64,6 +66,7 @@ def _parse_authors(elem: Optional[ET.Element]) -> List[GrobidAffiliation]: names.append(GrobidAuthor(**obj)) return names + def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation: ref: Dict[str, Any] = dict() ref["id"] = elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id") @@ -78,7 +81,7 @@ def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation: else: ref["journal"] = None ref["title"] = other_title - ref["authors"] = _parse_authors(elem) + ref["authors"] = _parse_authors(elem, ns=ns) ref["publisher"] = elem.findtext( f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") if not ref["publisher"]: @@ -117,6 +120,7 @@ def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation: ref["url"] = None return GrobidCitation(**ref) + def _parse_journal(elem: ET.Element, ns: str = ns) -> GrobidJournal: journal = dict() journal["name"] = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") @@ -131,6 +135,7 @@ def _parse_journal(elem: ET.Element, ns: str = ns) -> GrobidJournal: journal["abbrev"] = None return GrobidJournal(**journal) + def _parse_header(elem: ET.Element, ns: str = ns) -> GrobidHeader: header = elem info = dict() @@ -145,6 +150,7 @@ def _parse_header(elem: ET.Element, ns: str = ns) -> GrobidHeader: info["doi"] = info["doi"].lower() return GrobidHeader(**info) + def parse_document_xml(xml_text: AnyStr) -> GrobidDocument: """ Use this function to parse TEI-XML of a full document or header processed @@ -155,7 +161,6 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument: tree = _string_to_tree(xml_text) tei = tree.getroot() info = dict() - encumbered = True header = tei.find(f".//{{{ns}}}teiHeader") if header is None: @@ -188,17 +193,32 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument: el = tei.find(f".//{{{ns}}}text/{{{ns}}}body") doc.body = (el or None) and " ".join(el.itertext()).strip() el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="acknowledgement"]') - doc.acknowledgement = (el or None) and " ".join( - el.itertext()).strip() + doc.acknowledgement = (el or None) and " ".join(el.itertext()).strip() el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="annex"]') doc.annex = (el or None) and " ".join(el.itertext()).strip() return doc + def parse_citations_xml(xml_text: AnyStr) -> List[GrobidCitation]: """ Use this function to parse TEI-XML of one or more references. Eg, the output of '/api/processReferences' or '/api/processCitation'. """ + # XXX: this replacement shouldn't be needed? + xml_text = xml_text.replace('xmlns="http://www.tei-c.org/ns/1.0"', "") tree = _string_to_tree(xml_text) + root = tree.getroot() + + if root.tag == 'biblStruct': + ref = _parse_citation(root, ns='') + ref.index = 0 + return [ref] + + refs = [] + for (i, bs) in enumerate(tree.findall(f".//biblStruct")): + ref = _parse_citation(bs, ns='') + ref.index = i + refs.append(ref) + return refs diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py index 795d37f..aabe424 100644 --- a/grobid_tei_xml/types.py +++ b/grobid_tei_xml/types.py @@ -1,6 +1,5 @@ - +from dataclasses import asdict, dataclass from typing import Any, AnyStr, Dict, List, Optional -from dataclasses import dataclass @dataclass @@ -11,6 +10,7 @@ class GrobidAddress: country: Optional[str] = None country_code: Optional[str] = None + @dataclass class GrobidAffiliation: address: Optional[GrobidAddress] = None @@ -18,6 +18,7 @@ class GrobidAffiliation: department: Optional[str] = None laboratory: Optional[str] = None + @dataclass class GrobidAuthor: name: Optional[str] @@ -26,6 +27,7 @@ class GrobidAuthor: surname: Optional[str] = None affiliation: Optional[dict] = None + @dataclass class GrobidCitation: authors: List[GrobidAuthor] @@ -52,6 +54,7 @@ class GrobidCitation: def to_dict(self) -> dict: return _simplify_dict(asdict(self)) + @dataclass class GrobidJournal: name: Optional[str] = None @@ -62,6 +65,7 @@ class GrobidJournal: issn: Optional[str] = None eissn: Optional[str] = None + @dataclass class GrobidHeader: title: Optional[str] = None @@ -71,6 +75,7 @@ class GrobidHeader: #TODO: note: Optional[str] journal: Optional[GrobidJournal] = None + @dataclass class GrobidDocument: grobid_version: str @@ -87,10 +92,21 @@ class GrobidDocument: def to_dict(self) -> dict: return _simplify_dict(asdict(self)) + def _simplify_dict(d: dict) -> dict: + """ + Recursively remove empty dict values from a dict and all sub-lists and + sub-dicts. + """ + if d in [None, {}, '']: + return None for k in list(d.keys()): if isinstance(d[k], dict): d[k] = _simplify_dict(d[k]) - if d[k] in [None, [], {}, '']: + elif isinstance(d[k], list): + for i in range(len(d[k])): + if isinstance(d[k][i], dict): + d[k][i] = _simplify_dict(d[k][i]) + if d[k] in [None, {}, '']: d.pop(k) return d diff --git a/tests/files/example_citation_list.xml b/tests/files/example_citation_list.xml new file mode 100644 index 0000000..d640393 --- /dev/null +++ b/tests/files/example_citation_list.xml @@ -0,0 +1,278 @@ + + + + + + +
+ + + + E-commerce: the challenge for UK SMEs in the twenty-first century + + MQuayle + + + + International Journal of Operations and Production Management + + 22 + 10 + + + + + + + + + Evolution, challenges and path forward for low temperature combustion engines + + AKAgarwal + + + APSingh + + + RKMaurya + + + + Progress in Energy and Combustion Science + + 61 + + + + + + + + + Thrombotic complications of central venous cath- eters in cancer patients + + DJKutter + + + + Oncologist + + 9 + + + + + + + + + + KUhlířová + + + MDrumbl + + Actors and Law Making in International Environmental Law + + MFitzmaurice + MBrus + PMerkouris + +
Cheltenham
+ + Edward Elgar Publishing + + 50 + +
+ The Research Handbook on International Environmental Law +
+ + + + Self as- sembly protein systems: Microbial S-layers + + UBSleytr + + + MSára + + + DPum + + + BSchuster + + + PMessner + + + CSchäffer + + + + Biopolymers + A. Steinbüchel and S. Fahnestock + + 7 + + + Wiley-VCH + + + + + + + Self-illuminating quantum dot conjugates for in vivo imaging + + M-KSo + + + CXu + + + AMLoening + + + SSGambhir + + + JRao + + + + Nat Biotech + + 24 + + + + + + + + + Informed conditioning on clinical covariates increases power in case-control association studies + + NZaitlen + + + SLindström + + + BPasaniuc + + + MCornelis + + + GGenovese + + + SPollack + + + BIFreedman + + + + PLoS genetics + + 8 + 11 + e1003032 + + + + + + + + + KVon Grebmer + + + ASaltzman + + + EBirol + + + DWiesmann + + + NPrasai + + + SYin + + + YYohannes + + + PMenon + + + JThompson + + + ASonntag + + Global Hunger Index: The Challenge of Hidden Hunger + + + + + + + + + Isolation and characterization of serum-resistant strains ofPseudomonas aeruginosa derived from serum-sensitive parental strains + + NLSchiller + + + DRHackley + + + AMorrison + + + + Curr Microbiol + + 10 + + + + + + + + + Importin 7 and importin alpha/importin beta are nuclear import receptors for the glucocorticoid receptor + + NDFreedman + + + KRYamamoto + + + + Mol Biol Cell + + 15 + + + + + + +
+
+
+
+
+ diff --git a/tests/test_grobid2json.py b/tests/test_grobid2json.py index ed5d996..a1c975e 100644 --- a/tests/test_grobid2json.py +++ b/tests/test_grobid2json.py @@ -3,10 +3,10 @@ import json import pytest from grobid_tei_xml import teixml2json, parse_document_xml, GrobidDocument, GrobidCitation -from grobid_tei_xml.types import * +from grobid_tei_xml.grobid2json import transform_grobid_ref_xml -def test_teixml2json_small_xml(): +def test_small_xml(): with open('tests/files/small.xml', 'r') as f: tei_xml = f.read() @@ -15,80 +15,6 @@ def test_teixml2json_small_xml(): assert teixml2json(tei_xml) == json_form - assert parse_document_xml(tei_xml).to_dict() == json_form - -def test_teixml2json_small_xml(): - - with open('tests/files/small.xml', 'r') as f: - tei_xml = f.read() - - doc = parse_document_xml(tei_xml) - expected = GrobidDocument( - grobid_version='0.5.1-SNAPSHOT', - grobid_timestamp='2018-04-02T00:31+0000', - language_code='en', - header=GrobidHeader( - title="Dummy Example File", - authors=[ - GrobidAuthor( - name="Brewster Kahle", - given_name="Brewster", - surname="Kahle", - affiliation=GrobidAffiliation( - department="Faculty ofAgricultrial Engineering", - laboratory="Plant Physiology Laboratory", - institution="Technion-Israel Institute of Technology", - address=GrobidAddress( - post_code="32000", - settlement="Haifa", - country="Israel", - ), - ) - ), - GrobidAuthor( - name="J Doe", - given_name="J", - surname="Doe", - ), - ], - journal=GrobidJournal( - name="Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678", - ), - date="2000", - ), - abstract="Everything you ever wanted to know about nothing", - body="Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.", - citations=[ - GrobidCitation( - index=0, - id="b0", - authors=[ - GrobidAuthor( - name="A Seaperson", - given_name="A", - surname="Seaperson" - ) - ], - date="2001", - journal="Letters in the Alphabet", - title="Everything is Wonderful", - volume="20", - pages="1-11", - ), - GrobidCitation( - index=1, - id="b1", - authors=[], - date="2011-03-28", - journal="The Dictionary", - title="All about Facts", - volume="14", - ), - ], - ) - - assert doc == expected - def test_invalid_xml(): @@ -125,3 +51,68 @@ def test_grobid_teixml2json() -> None: ref["unstructured"] == "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19." ) + + +def test_transform_grobid_ref_xml(): + citation_xml = """ + + + Mesh migration following abdominal hernia repair: a comprehensive review + + + H + B + Cunningham + + + + + J + J + Weis + + + + + L + R + Taveras + + + + + S + Huerta + + + 10.1007/s10029-019-01898-9 + 30701369 + + + Hernia + + 23 + 2 + + + + +""" + + d = transform_grobid_ref_xml(citation_xml) + assert d[ + 'title'] == "Mesh migration following abdominal hernia repair: a comprehensive review" + assert d['authors'][2]['given_name'] == "L" + assert d['authors'][2]['surname'] == "Taveras" + assert d['authors'][2]['name'] == "L R Taveras" + assert d['doi'] == "10.1007/s10029-019-01898-9" + assert d['pmid'] == "30701369" + assert d['date'] == "2019-01-30" + assert d['pages'] == "235-243" + assert d['volume'] == "23" + assert d['issue'] == "2" + assert d['journal'] == "Hernia" diff --git a/tests/test_grobid_unstructured.py b/tests/test_grobid_unstructured.py deleted file mode 100644 index 91b7398..0000000 --- a/tests/test_grobid_unstructured.py +++ /dev/null @@ -1,68 +0,0 @@ -import pytest - -from grobid_tei_xml.grobid2json import transform_grobid_ref_xml - - -def test_transform_grobid_ref_xml(): - citation_xml = """ - - - Mesh migration following abdominal hernia repair: a comprehensive review - - - H - B - Cunningham - - - - - J - J - Weis - - - - - L - R - Taveras - - - - - S - Huerta - - - 10.1007/s10029-019-01898-9 - 30701369 - - - Hernia - - 23 - 2 - - - - -""" - - d = transform_grobid_ref_xml(citation_xml) - assert d[ - 'title'] == "Mesh migration following abdominal hernia repair: a comprehensive review" - assert d['authors'][2]['given_name'] == "L" - assert d['authors'][2]['surname'] == "Taveras" - assert d['authors'][2]['name'] == "L R Taveras" - assert d['doi'] == "10.1007/s10029-019-01898-9" - assert d['pmid'] == "30701369" - assert d['date'] == "2019-01-30" - assert d['pages'] == "235-243" - assert d['volume'] == "23" - assert d['issue'] == "2" - assert d['journal'] == "Hernia" diff --git a/tests/test_parse.py b/tests/test_parse.py new file mode 100644 index 0000000..e79d41d --- /dev/null +++ b/tests/test_parse.py @@ -0,0 +1,213 @@ +import xml +import json +import pytest + +from grobid_tei_xml import teixml2json, parse_document_xml, parse_citations_xml, GrobidDocument, GrobidCitation +from grobid_tei_xml.types import * + + +def test_small_xml(): + + with open('tests/files/small.xml', 'r') as f: + tei_xml = f.read() + + doc = parse_document_xml(tei_xml) + expected = GrobidDocument( + grobid_version='0.5.1-SNAPSHOT', + grobid_timestamp='2018-04-02T00:31+0000', + language_code='en', + header=GrobidHeader( + title="Dummy Example File", + authors=[ + GrobidAuthor( + name="Brewster Kahle", + given_name="Brewster", + surname="Kahle", + affiliation=GrobidAffiliation( + department="Faculty ofAgricultrial Engineering", + laboratory="Plant Physiology Laboratory", + institution="Technion-Israel Institute of Technology", + address=GrobidAddress( + post_code="32000", + settlement="Haifa", + country="Israel", + ), + )), + GrobidAuthor( + name="J Doe", + given_name="J", + surname="Doe", + ), + ], + journal=GrobidJournal( + name= + "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678", + ), + date="2000", + ), + abstract="Everything you ever wanted to know about nothing", + body= + "Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.", + citations=[ + GrobidCitation( + index=0, + id="b0", + authors=[ + GrobidAuthor(name="A Seaperson", + given_name="A", + surname="Seaperson") + ], + date="2001", + journal="Letters in the Alphabet", + title="Everything is Wonderful", + volume="20", + pages="1-11", + ), + GrobidCitation( + index=1, + id="b1", + authors=[], + date="2011-03-28", + journal="The Dictionary", + title="All about Facts", + volume="14", + ), + ], + ) + + assert doc == expected + + +def test_small_xml_json(): + + with open('tests/files/small.xml', 'r') as f: + tei_xml = f.read() + with open('tests/files/small.json', 'r') as f: + json_form = json.loads(f.read()) + + d = parse_document_xml(tei_xml).to_dict() + + # munge back to the old JSON format + d.update(d.pop('header')) + addr = d['authors'][0]['affiliation']['address'] + addr['postCode'] = addr.pop('post_code') + + # remove nulls from old JSON + for c in json_form['citations']: + for k in list(c.keys()): + if c[k] == None: + c.pop(k) + + assert d == json_form + + +def test_invalid_xml(): + + with pytest.raises(xml.etree.ElementTree.ParseError): + parse_document_xml("this is not XML") + with pytest.raises(xml.etree.ElementTree.ParseError): + parse_citations_xml("this is not XML") + with pytest.raises(ValueError): + parse_document_xml("") + + +def test_example_grobid_tei_xml() -> None: + + with open("tests/files/example_grobid.tei.xml", "r") as f: + blob = f.read() + + doc = parse_document_xml(blob) + + assert ( + doc.header.title == + "Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network" + ) + + ref = [c for c in doc.citations if c.id == "b12"][0] + assert ref.authors[0].name == "K Tasa" + assert ref.authors[0].given_name == "K" + assert ref.authors[0].surname == "Tasa" + assert ref.journal == "Quality Management in Health Care" + assert ref.title == "Using patient feedback for quality improvement" + assert ref.date == "1996" + assert ref.pages == "206-225" + assert ref.volume == "8" + assert ( + ref.unstructured == + "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19." + ) + + +def test_single_citations_xml(): + citation_xml = """ + + + Mesh migration following abdominal hernia repair: a comprehensive review + + + H + B + Cunningham + + + + + J + J + Weis + + + + + L + R + Taveras + + + + + S + Huerta + + + 10.1007/s10029-019-01898-9 + 30701369 + + + Hernia + + 23 + 2 + + + + +""" + + d = parse_citations_xml(citation_xml)[0] + assert d.title == "Mesh migration following abdominal hernia repair: a comprehensive review" + assert d.authors[2].given_name == "L" + assert d.authors[2].surname == "Taveras" + assert d.authors[2].name == "L R Taveras" + assert d.doi == "10.1007/s10029-019-01898-9" + assert d.pmid == "30701369" + assert d.date == "2019-01-30" + assert d.pages == "235-243" + assert d.volume == "23" + assert d.issue == "2" + assert d.journal == "Hernia" + + +def test_citation_list_xml(): + + with open('tests/files/example_citation_list.xml', 'r') as f: + tei_xml = f.read() + + citations = parse_citations_xml(tei_xml) + assert len(citations) == 10 + assert citations[ + 7].title == "Global Hunger Index: The Challenge of Hidden Hunger" -- cgit v1.2.3