diff options
-rwxr-xr-x | fuzzycat/grobid2json.py | 3 | ||||
-rw-r--r-- | fuzzycat/grobid_unstructured.py | 126 | ||||
-rw-r--r-- | tests/test_grobid_unstructured.py | 130 |
3 files changed, 258 insertions, 1 deletions
diff --git a/fuzzycat/grobid2json.py b/fuzzycat/grobid2json.py
index 49f265a..c5aa0d2 100755
--- a/fuzzycat/grobid2json.py
+++ b/fuzzycat/grobid2json.py
@@ -26,7 +26,8 @@ def all_authors(elem: Optional[ET.Element], ns: str = ns) -> List[Dict[str, Any]
             continue
         given_name = pn.findtext("./{%s}forename" % ns) or None
         surname = pn.findtext("./{%s}surname" % ns) or None
-        full_name = " ".join(pn.itertext())
+        full_name = " ".join(pn.itertext()).strip()
+        full_name = " ".join(full_name.split())
         obj: Dict[str, Any] = dict(name=full_name)
         if given_name:
             obj["given_name"] = given_name
diff --git a/fuzzycat/grobid_unstructured.py b/fuzzycat/grobid_unstructured.py
new file mode 100644
index 0000000..4f09bce
--- /dev/null
+++ b/fuzzycat/grobid_unstructured.py
@@ -0,0 +1,126 @@
+"""
+Helper functions to parse an unstructured citation string using GROBID, then
+fuzzy match using the result.
+
+- try to parse string with GROBID REST API call
+- transform the GROBID XML response to a simple dict/struct
+
+TODO: more general versions which handle multiple reference strings in a batch?
+"""
+
+import io
+import sys
+import xml.etree.ElementTree as ET
+from typing import Any, Optional, Tuple
+
+import requests
+from fatcat_openapi_client import ReleaseContrib, ReleaseEntity, ReleaseExtIds
+
+from fuzzycat.config import settings
+from fuzzycat.grobid2json import biblio_info
+
+GROBID_API_BASE = settings.get("GROBID_API_BASE", "https://grobid.qa.fatcat.wiki")
+
+
+def grobid_api_process_citation(raw_citation: str,
+                                grobid_api_base: str = GROBID_API_BASE,
+                                timeout: float = 20.0) -> Optional[str]:
+    """
+    Process a single citation string using GROBID API, returning a TEI-XML response.
+
+    Raises python TimeoutError if there was a network or request timeout.
+
+    Raises a 'requests' error other unexpected failures (including network
+    connection failures)
+    """
+    try:
+        grobid_response = requests.post(
+            grobid_api_base + "/api/processCitation",
+            data={
+                "citations": raw_citation,
+                "consolidateCitations": 0,
+            },
+            timeout=timeout,
+        )
+    except requests.Timeout:
+        raise TimeoutError("GROBID request (HTTP POST) timeout")
+
+    if grobid_response.status_code == 204:
+        return None
+    elif grobid_response.status_code != 200:
+        print(f"GROBID request (HTTP POST) failed: {grobid_response}", file=sys.stderr)
+    grobid_response.raise_for_status()
+
+    return grobid_response.text or None
+
+
+def transform_grobid_ref_xml(raw_xml: str) -> Optional[dict]:
+    """
+    Parses GROBID XML for the case of a single reference/citation string (eg,
+    not a full/propper TEI-XML fulltext document), and returns a dict.
+    """
+    # first, remove any xmlns stuff, for consistent parsign
+    raw_xml = raw_xml.replace('xmlns="http://www.tei-c.org/ns/1.0"', "")
+    tree = ET.parse(io.StringIO(raw_xml))
+    root = tree.getroot()
+    ref = biblio_info(root, ns="")
+    if not any(ref.values()):
+        return None
+    return ref
+
+
+def grobid_ref_to_release(ref: dict) -> ReleaseEntity:
+    """
+    Takes the dict returned by transform_grobid_ref_xml() and returns a partial
+    ReleaseEntity object (for use with fuzzycat)
+    """
+    contribs = []
+    for author in ref.get("authors") or []:
+        contribs.append(
+            ReleaseContrib(
+                raw_name=author.get("name"),
+                given_name=author.get("given_name"),
+                surname=author.get("surname"),
+            ))
+    release = ReleaseEntity(
+        title=ref.get("title"),
+        contribs=contribs,
+        volume=ref.get("volume"),
+        issue=ref.get("issue"),
+        pages=ref.get("pages"),
+        ext_ids=ReleaseExtIds(
+            doi=ref.get("doi"),
+            pmid=ref.get("pmid"),
+            pmcid=ref.get("pmcid"),
+            arxiv=ref.get("arxiv_id"),
+        ),
+    )
+    if ref.get("journal"):
+        release.extra = {"container_name": ref.get("journal")}
+    if ref.get("date"):
+        if len(ref["date"]) >= 4 and ref["date"][0:4].isdigit():
+            release.release_year = int(ref["date"][0:4])
+        # TODO: try to parse 'date' into an ISO date format, and assign to release_date?
+    return release
+
+
+def grobid_parse_unstructured(raw_citation: str,
+                              grobid_api_base: str = GROBID_API_BASE,
+                              timeout: float = 20.0) -> Optional[ReleaseEntity]:
+    """
+    High-level wrapper to parse a raw citation string into a (partial) release
+    entity.
+
+    Returns None if it fails to parse.
+
+    Raises various exceptions on network or remote errors.
+    """
+    ref_xml = grobid_api_process_citation(raw_citation,
+                                          grobid_api_base=grobid_api_base,
+                                          timeout=timeout)
+    if not ref_xml:
+        return None
+    biblio_dict = transform_grobid_ref_xml(ref_xml)
+    if not biblio_dict:
+        return None
+    return grobid_ref_to_release(biblio_dict)
diff --git a/tests/test_grobid_unstructured.py b/tests/test_grobid_unstructured.py
new file mode 100644
index 0000000..dd69936
--- /dev/null
+++ b/tests/test_grobid_unstructured.py
@@ -0,0 +1,130 @@
+import pytest
+
+from fuzzycat.grobid_unstructured import grobid_api_process_citation, grobid_parse_unstructured, grobid_ref_to_release, transform_grobid_ref_xml
+
+
+def test_grobid_ref_to_release():
+
+    d = {
+        'title':
+        "some title",
+        'doi':
+        '10.1234/5678',
+        'journal':
+        'some journal',
+        'authors': [
+            {
+                'name': 'ahab sailor',
+                'given_name': 'ahab',
+                'surname': 'sailor'
+            },
+            {
+                'name': 'mary jane',
+                'given_name': 'mary',
+                'surname': 'jane'
+            },
+        ],
+    }
+    r = grobid_ref_to_release(d)
+    assert r.title == d['title']
+    assert r.ext_ids.doi == d['doi']
+    assert r.extra['container_name'] == d['journal']
+    assert r.contribs[0].surname == d['authors'][0]['surname']
+    assert r.contribs[1].raw_name == d['authors'][1]['name']
+
+
+def test_transform_grobid_ref_xml():
+    citation_xml = """
+<biblStruct >
+    <analytic>
+        <title level="a" type="main">Mesh migration following abdominal hernia repair: a comprehensive review</title>
+        <author>
+            <persName
+                xmlns="http://www.tei-c.org/ns/1.0">
+                <forename type="first">H</forename>
+                <forename type="middle">B</forename>
+                <surname>Cunningham</surname>
+            </persName>
+        </author>
+        <author>
+            <persName
+                xmlns="http://www.tei-c.org/ns/1.0">
+                <forename type="first">J</forename>
+                <forename type="middle">J</forename>
+                <surname>Weis</surname>
+            </persName>
+        </author>
+        <author>
+            <persName
+                xmlns="http://www.tei-c.org/ns/1.0">
+                <forename type="first">L</forename>
+                <forename type="middle">R</forename>
+                <surname>Taveras</surname>
+            </persName>
+        </author>
+        <author>
+            <persName
+                xmlns="http://www.tei-c.org/ns/1.0">
+                <forename type="first">S</forename>
+                <surname>Huerta</surname>
+            </persName>
+        </author>
+        <idno type="DOI">10.1007/s10029-019-01898-9</idno>
+        <idno type="PMID">30701369</idno>
+    </analytic>
+    <monogr>
+        <title level="j">Hernia</title>
+        <imprint>
+            <biblScope unit="volume">23</biblScope>
+            <biblScope unit="issue">2</biblScope>
+            <biblScope unit="page" from="235" to="243" />
+            <date type="published" when="2019-01-30" />
+        </imprint>
+    </monogr>
+</biblStruct>"""
+
+    d = transform_grobid_ref_xml(citation_xml)
+    assert d['title'] == "Mesh migration following abdominal hernia repair: a comprehensive review"
+    assert d['authors'][2]['given_name'] == "L"
+    assert d['authors'][2]['surname'] == "Taveras"
+    assert d['authors'][2]['name'] == "L R Taveras"
+    assert d['doi'] == "10.1007/s10029-019-01898-9"
+    assert d['pmid'] == "30701369"
+    assert d['date'] == "2019-01-30"
+    assert d['pages'] == "235-243"
+    assert d['volume'] == "23"
+    assert d['issue'] == "2"
+    assert d['journal'] == "Hernia"
+
+
+def test_grobid_parse_unstructured():
+    """
+    NOTE: this test makes live network requests to GROBID
+    """
+
+    r = grobid_parse_unstructured("blah")
+    assert r is None
+
+    r = grobid_parse_unstructured(
+        """Cunningham HB, Weis JJ, Taveras LR, Huerta S. Mesh migration following abdominal hernia repair: a comprehensive review. Hernia. 2019 Apr;23(2):235-243. doi: 10.1007/s10029-019-01898-9. Epub 2019 Jan 30. PMID: 30701369."""
+    )
+    assert r.title == "Mesh migration following abdominal hernia repair: a comprehensive review"
+    assert r.contribs[0].surname == "Cunningham"
+    assert r.contribs[1].surname == "Weis"
+    assert r.contribs[2].surname == "Taveras"
+    assert r.contribs[3].surname == "Huerta"
+    assert r.extra['container_name'] == "Hernia"
+    assert r.release_year == 2019
+    assert r.volume == "23"
+    assert r.issue == "2"
+    assert r.pages == "235-243"
+    assert r.ext_ids.doi == "10.1007/s10029-019-01898-9"
+    assert r.ext_ids.pmid == "30701369"
+
+
+def test_grobid_parse_unstructured_timeout():
+    """
+    NOTE: this test makes live network requests to GROBID
+    """
+    with pytest.raises(TimeoutError):
+        grobid_parse_unstructured("blah", timeout=0.000001)
|