From 567727e8606d2565098ddbcd63a1526aa44ff97f Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 14 Apr 2021 15:40:15 -0700 Subject: GROBID API unstructured citation parsing utility code --- fuzzycat/grobid2json.py | 3 +- fuzzycat/grobid_unstructured.py | 126 ++++++++++++++++++++++++++++++++++++ tests/test_grobid_unstructured.py | 130 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 258 insertions(+), 1 deletion(-) create mode 100644 fuzzycat/grobid_unstructured.py create mode 100644 tests/test_grobid_unstructured.py diff --git a/fuzzycat/grobid2json.py b/fuzzycat/grobid2json.py index 49f265a..c5aa0d2 100755 --- a/fuzzycat/grobid2json.py +++ b/fuzzycat/grobid2json.py @@ -26,7 +26,8 @@ def all_authors(elem: Optional[ET.Element], ns: str = ns) -> List[Dict[str, Any] continue given_name = pn.findtext("./{%s}forename" % ns) or None surname = pn.findtext("./{%s}surname" % ns) or None - full_name = " ".join(pn.itertext()) + full_name = " ".join(pn.itertext()).strip() + full_name = " ".join(full_name.split()) obj: Dict[str, Any] = dict(name=full_name) if given_name: obj["given_name"] = given_name diff --git a/fuzzycat/grobid_unstructured.py b/fuzzycat/grobid_unstructured.py new file mode 100644 index 0000000..4f09bce --- /dev/null +++ b/fuzzycat/grobid_unstructured.py @@ -0,0 +1,126 @@ +""" +Helper functions to parse an unstructured citation string using GROBID, then +fuzzy match using the result. + +- try to parse string with GROBID REST API call +- transform the GROBID XML response to a simple dict/struct + +TODO: more general versions which handle multiple reference strings in a batch? 
+""" + +import io +import sys +import xml.etree.ElementTree as ET +from typing import Any, Optional, Tuple + +import requests +from fatcat_openapi_client import ReleaseContrib, ReleaseEntity, ReleaseExtIds + +from fuzzycat.config import settings +from fuzzycat.grobid2json import biblio_info + +GROBID_API_BASE = settings.get("GROBID_API_BASE", "https://grobid.qa.fatcat.wiki") + + +def grobid_api_process_citation(raw_citation: str, + grobid_api_base: str = GROBID_API_BASE, + timeout: float = 20.0) -> Optional[str]: + """ + Process a single citation string using GROBID API, returning a TEI-XML response. + + Raises python TimeoutError if there was a network or request timeout. + + Raises a 'requests' error on other unexpected failures (including network + connection failures) + """ + try: + grobid_response = requests.post( + grobid_api_base + "/api/processCitation", + data={ + "citations": raw_citation, + "consolidateCitations": 0, + }, + timeout=timeout, + ) + except requests.Timeout: + raise TimeoutError("GROBID request (HTTP POST) timeout") + + if grobid_response.status_code == 204: + return None + elif grobid_response.status_code != 200: + print(f"GROBID request (HTTP POST) failed: {grobid_response}", file=sys.stderr) + grobid_response.raise_for_status() + + return grobid_response.text or None + + +def transform_grobid_ref_xml(raw_xml: str) -> Optional[dict]: + """ + Parses GROBID XML for the case of a single reference/citation string (e.g., + not a full/proper TEI-XML fulltext document), and returns a dict. 
+ """ + # first, remove any xmlns stuff, for consistent parsing + raw_xml = raw_xml.replace('xmlns="http://www.tei-c.org/ns/1.0"', "") + tree = ET.parse(io.StringIO(raw_xml)) + root = tree.getroot() + ref = biblio_info(root, ns="") + if not any(ref.values()): + return None + return ref + + +def grobid_ref_to_release(ref: dict) -> ReleaseEntity: + """ + Takes the dict returned by transform_grobid_ref_xml() and returns a partial + ReleaseEntity object (for use with fuzzycat) + """ + contribs = [] + for author in ref.get("authors") or []: + contribs.append( + ReleaseContrib( + raw_name=author.get("name"), + given_name=author.get("given_name"), + surname=author.get("surname"), + )) + release = ReleaseEntity( + title=ref.get("title"), + contribs=contribs, + volume=ref.get("volume"), + issue=ref.get("issue"), + pages=ref.get("pages"), + ext_ids=ReleaseExtIds( + doi=ref.get("doi"), + pmid=ref.get("pmid"), + pmcid=ref.get("pmcid"), + arxiv=ref.get("arxiv_id"), + ), + ) + if ref.get("journal"): + release.extra = {"container_name": ref.get("journal")} + if ref.get("date"): + if len(ref["date"]) >= 4 and ref["date"][0:4].isdigit(): + release.release_year = int(ref["date"][0:4]) + # TODO: try to parse 'date' into an ISO date format, and assign to release_date? + return release + + +def grobid_parse_unstructured(raw_citation: str, + grobid_api_base: str = GROBID_API_BASE, + timeout: float = 20.0) -> Optional[ReleaseEntity]: + """ + High-level wrapper to parse a raw citation string into a (partial) release + entity. + + Returns None if it fails to parse. + + Raises various exceptions on network or remote errors. 
+ """ + ref_xml = grobid_api_process_citation(raw_citation, + grobid_api_base=grobid_api_base, + timeout=timeout) + if not ref_xml: + return None + biblio_dict = transform_grobid_ref_xml(ref_xml) + if not biblio_dict: + return None + return grobid_ref_to_release(biblio_dict) diff --git a/tests/test_grobid_unstructured.py b/tests/test_grobid_unstructured.py new file mode 100644 index 0000000..dd69936 --- /dev/null +++ b/tests/test_grobid_unstructured.py @@ -0,0 +1,130 @@ +import pytest + +from fuzzycat.grobid_unstructured import grobid_api_process_citation, grobid_parse_unstructured, grobid_ref_to_release, transform_grobid_ref_xml + + +def test_grobid_ref_to_release(): + + d = { + 'title': + "some title", + 'doi': + '10.1234/5678', + 'journal': + 'some journal', + 'authors': [ + { + 'name': 'ahab sailor', + 'given_name': 'ahab', + 'surname': 'sailor' + }, + { + 'name': 'mary jane', + 'given_name': 'mary', + 'surname': 'jane' + }, + ], + } + r = grobid_ref_to_release(d) + assert r.title == d['title'] + assert r.ext_ids.doi == d['doi'] + assert r.extra['container_name'] == d['journal'] + assert r.contribs[0].surname == d['authors'][0]['surname'] + assert r.contribs[1].raw_name == d['authors'][1]['name'] + + +def test_transform_grobid_ref_xml(): + citation_xml = """ + + + Mesh migration following abdominal hernia repair: a comprehensive review + + + H + B + Cunningham + + + + + J + J + Weis + + + + + L + R + Taveras + + + + + S + Huerta + + + 10.1007/s10029-019-01898-9 + 30701369 + + + Hernia + + 23 + 2 + + + + +""" + + d = transform_grobid_ref_xml(citation_xml) + assert d['title'] == "Mesh migration following abdominal hernia repair: a comprehensive review" + assert d['authors'][2]['given_name'] == "L" + assert d['authors'][2]['surname'] == "Taveras" + assert d['authors'][2]['name'] == "L R Taveras" + assert d['doi'] == "10.1007/s10029-019-01898-9" + assert d['pmid'] == "30701369" + assert d['date'] == "2019-01-30" + assert d['pages'] == "235-243" + assert d['volume'] 
== "23" + assert d['issue'] == "2" + assert d['journal'] == "Hernia" + + +def test_grobid_parse_unstructured(): + """ + NOTE: this test makes live network requests to GROBID + """ + + r = grobid_parse_unstructured("blah") + assert r is None + + r = grobid_parse_unstructured( + """Cunningham HB, Weis JJ, Taveras LR, Huerta S. Mesh migration following abdominal hernia repair: a comprehensive review. Hernia. 2019 Apr;23(2):235-243. doi: 10.1007/s10029-019-01898-9. Epub 2019 Jan 30. PMID: 30701369.""" + ) + assert r.title == "Mesh migration following abdominal hernia repair: a comprehensive review" + assert r.contribs[0].surname == "Cunningham" + assert r.contribs[1].surname == "Weis" + assert r.contribs[2].surname == "Taveras" + assert r.contribs[3].surname == "Huerta" + assert r.extra['container_name'] == "Hernia" + assert r.release_year == 2019 + assert r.volume == "23" + assert r.issue == "2" + assert r.pages == "235-243" + assert r.ext_ids.doi == "10.1007/s10029-019-01898-9" + assert r.ext_ids.pmid == "30701369" + + +def test_grobid_parse_unstructured_timeout(): + """ + NOTE: this test makes live network requests to GROBID + """ + with pytest.raises(TimeoutError): + grobid_parse_unstructured("blah", timeout=0.000001) -- cgit v1.2.3