author     Bryan Newbold <bnewbold@archive.org>  2021-01-18 23:49:34 -0800
committer  Bryan Newbold <bnewbold@archive.org>  2021-01-19 19:49:04 -0800
commit     38665ebca0c6ff9fa3c9fd2c4d421c71055d8f2f (patch)
tree       3bbbf2802ec50e71a5d752984cd2aa667d311094 /fatcat_scholar/query_citation.py
parent     b938dc6ba8c354d483d96988dc057565db91ca38 (diff)
citation querying helpers (GROBID, fuzzycat, etc)
Diffstat (limited to 'fatcat_scholar/query_citation.py')
-rw-r--r--  fatcat_scholar/query_citation.py  175
1 file changed, 175 insertions, 0 deletions
diff --git a/fatcat_scholar/query_citation.py b/fatcat_scholar/query_citation.py
new file mode 100644
index 0000000..970c235
--- /dev/null
+++ b/fatcat_scholar/query_citation.py
@@ -0,0 +1,175 @@
+"""
+This file contains helpers to fuzzy match a raw citation string:
+
+- try to parse it with GROBID into structured form
+- transform the GROBID XML response to a simple dict/struct
+- run fuzzycat lookup
+
+Note that this chain hits several external services, and so should be wrapped
+in a timeout and a try/except! In the future, this should perhaps be async so
+it can run in parallel with the "regular" query.
+"""
+
+import io
+import sys
+from typing import Optional, Any, Tuple
+import xml.etree.ElementTree as ET
+
+import requests
+from fatcat_openapi_client import ReleaseEntity, ReleaseExtIds, ReleaseContrib
+from fatcat_scholar.api_entities import entity_to_dict
+from fuzzycat.matching import match_release_fuzzy
+import fuzzycat.common
+import fuzzycat.verify
+
+from fatcat_scholar.grobid2json import biblio_info
+
+
+def grobid_process_citation(raw: str) -> Optional[str]:
+ GROBID_URL = "https://grobid.qa.fatcat.wiki"
+ try:
+ grobid_response = requests.post(
+ GROBID_URL + "/api/processCitation",
+            data={"citations": raw, "consolidateCitations": 0},
+ timeout=180.0,
+ )
+ except requests.Timeout:
+ print("GROBID request (HTTP POST) timeout", file=sys.stderr)
+ return None
+ if grobid_response.status_code != 200:
+ print(f"GROBID request (HTTP POST) failed: {grobid_response}", file=sys.stderr)
+ return None
+ return grobid_response.text
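
Used on its own, this helper might be called like the following sketch (the
citation string is illustrative; on success, GROBID returns the parsed
reference as a TEI XML fragment in a plain string):

    tei_xml = grobid_process_citation(
        "Aho, A.V., Ullman, J.D. The Theory of Parsing, Translation, and "
        "Compiling. Prentice-Hall, 1972."
    )
    if tei_xml is not None:
        print(tei_xml)  # raw TEI XML; transform_grobid() below parses it
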
+
+
+def transform_grobid(raw_xml: str) -> Optional[dict]:
+    # first, strip the TEI xmlns declaration so parsed element tags are un-namespaced
+ raw_xml = raw_xml.replace('xmlns="http://www.tei-c.org/ns/1.0"', "")
+ tree = ET.parse(io.StringIO(raw_xml))
+ root = tree.getroot()
+ ref = biblio_info(root, ns="")
+ return ref
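
The returned dict feeds directly into ref_to_release() below; judging from the
fields consumed there, a parsed reference might look roughly like this (values
are illustrative, not actual GROBID output):

    ref = {
        "title": "The Theory of Parsing, Translation, and Compiling",
        "authors": [
            {"name": "A. V. Aho", "given_name": "A. V.", "surname": "Aho"},
            {"name": "J. D. Ullman", "given_name": "J. D.", "surname": "Ullman"},
        ],
        "journal": None,
        "date": "1972",
        "volume": None,
        "issue": None,
        "pages": None,
        "doi": None,
        "pmid": None,
        "pmcid": None,
        "arxiv_id": None,
    }
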
+
+
+def ref_to_release(ref: dict) -> ReleaseEntity:
+ contribs = []
+ for author in ref.get("authors") or []:
+ contribs.append(
+ ReleaseContrib(
+ raw_name=author.get("name"),
+ given_name=author.get("given_name"),
+ surname=author.get("surname"),
+ )
+ )
+ release = ReleaseEntity(
+ title=ref.get("title"),
+ contribs=contribs,
+ volume=ref.get("volume"),
+ issue=ref.get("issue"),
+ pages=ref.get("pages"),
+ ext_ids=ReleaseExtIds(
+ doi=ref.get("doi"),
+ pmid=ref.get("pmid"),
+ pmcid=ref.get("pmcid"),
+ arxiv=ref.get("arxiv_id"),
+ ),
+ )
+ if ref.get("journal"):
+ release.extra = {"container_name": ref.get("journal")}
+ if ref.get("date"):
+ if len(ref["date"]) == 4 and ref["date"].isdigit():
+ release.release_year = int(ref["date"])
+ return release
+
+
+def fuzzy_match(
+ release: ReleaseEntity, es_client: Any, api_client: Any
+) -> Optional[Tuple[str, str, ReleaseEntity]]:
+ """
+ This helper function uses fuzzycat (and elasticsearch) to look for
+ existing release entities with similar metadata.
+
+ Returns None if there was no match of any kind, or a single tuple
+ (status: str, reason: str, existing: ReleaseEntity) if there was a match.
+
+    The status string is the name of one of the fuzzycat.common.Status values,
+    with the "strongest match" first in this sort order:
+
+ - EXACT
+ - STRONG
+ - WEAK
+ - AMBIGUOUS
+
+    E.g., if there is any EXACT match, that is always returned; an AMBIGUOUS
+    result is only returned if all of the candidate matches were ambiguous.
+ """
+
+    # this map is used to establish the priority order of verified matches
+ STATUS_SORT = {
+ fuzzycat.common.Status.TODO: 0,
+ fuzzycat.common.Status.EXACT: 10,
+ fuzzycat.common.Status.STRONG: 20,
+ fuzzycat.common.Status.WEAK: 30,
+ fuzzycat.common.Status.AMBIGUOUS: 40,
+ fuzzycat.common.Status.DIFFERENT: 60,
+ }
+
+ # TODO: the size here is a first guess; what should it really be?
+ candidates = match_release_fuzzy(release, size=10, es=es_client)
+ if not candidates:
+ return None
+
+ release_dict = entity_to_dict(release, api_client=api_client.api_client)
+ verified = [
+ (
+ fuzzycat.verify.verify(
+ release_dict, entity_to_dict(c, api_client=api_client.api_client)
+ ),
+ c,
+ )
+ for c in candidates
+ ]
+
+    # choose the "closest" (strongest) match
+ closest = sorted(verified, key=lambda v: STATUS_SORT[v[0].status])[0]
+ if closest[0].status == fuzzycat.common.Status.DIFFERENT:
+ return None
+ elif closest[0].status == fuzzycat.common.Status.TODO:
+ raise NotImplementedError("fuzzycat verify hit a Status.TODO")
+ else:
+ return (closest[0].status.name, closest[0].reason.value, closest[1])
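
Per the module docstring, callers should wrap this whole chain in a timeout and
a try/except. A minimal sketch of such a wrapper (the function name
try_citation_match is hypothetical, not part of this module):

    def try_citation_match(
        raw: str, es_client: Any, api_client: Any
    ) -> Optional[Tuple[str, str, ReleaseEntity]]:
        # defensive wrapper: any failure in the external services (GROBID,
        # elasticsearch, fatcat API) degrades to "no match" instead of an error
        try:
            tei_xml = grobid_process_citation(raw)
            if not tei_xml:
                return None
            ref = transform_grobid(tei_xml)
            if not ref:
                return None
            return fuzzy_match(
                ref_to_release(ref), es_client=es_client, api_client=api_client
            )
        except Exception as e:
            print(f"citation match failed: {e}", file=sys.stderr)
            return None
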
+
+
+if __name__ == "__main__":
+ """
+    Demo showing how the above functions chain together.
+ """
+ import os
+ import elasticsearch
+ import fatcat_openapi_client
+
+ citation = sys.argv[1]
+ print("Sending to GROBID...")
+ resp = grobid_process_citation(citation)
+ print(resp)
+ if not resp:
+ sys.exit(0)
+ ref = transform_grobid(resp)
+ print(ref)
+ if not ref:
+ sys.exit(0)
+ release = ref_to_release(ref)
+ print(release)
+
+ es_backend = os.environ.get("ELASTICSEARCH_BACKEND", "https://search.fatcat.wiki")
+ es_client = elasticsearch.Elasticsearch(es_backend)
+ api_conf = fatcat_openapi_client.Configuration()
+ api_conf.host = os.environ.get("FATCAT_API_HOST", "https://api.fatcat.wiki/v0")
+ api_client = fatcat_openapi_client.DefaultApi(
+ fatcat_openapi_client.ApiClient(api_conf)
+ )
+ matches = fuzzy_match(release, es_client=es_client, api_client=api_client)
+ print(matches)
+ if not matches or matches[0] not in ("EXACT", "STRONG", "WEAK"):
+ sys.exit(0)
+ print(matches[2].work_id)
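
Assuming the package and its dependencies are installed, the demo can be run
directly from the command line with a raw citation string (illustrative):

    python -m fatcat_scholar.query_citation \
        "Aho, A.V., Ullman, J.D. The Theory of Parsing. Prentice-Hall, 1972."

The ELASTICSEARCH_BACKEND and FATCAT_API_HOST environment variables override
the default production endpoints, as read in the demo above.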