author     Bryan Newbold <bnewbold@archive.org>  2021-01-18 23:49:34 -0800
committer  Bryan Newbold <bnewbold@archive.org>  2021-01-19 19:49:04 -0800
commit     38665ebca0c6ff9fa3c9fd2c4d421c71055d8f2f (patch)
tree       3bbbf2802ec50e71a5d752984cd2aa667d311094 /fatcat_scholar/query_citation.py
parent     b938dc6ba8c354d483d96988dc057565db91ca38 (diff)
citation querying helpers (GROBID, fuzzycat, etc)
Diffstat (limited to 'fatcat_scholar/query_citation.py')
-rw-r--r--  fatcat_scholar/query_citation.py  175
1 file changed, 175 insertions, 0 deletions
diff --git a/fatcat_scholar/query_citation.py b/fatcat_scholar/query_citation.py
new file mode 100644
index 0000000..970c235
--- /dev/null
+++ b/fatcat_scholar/query_citation.py
@@ -0,0 +1,175 @@
+"""
+This file contains helpers to fuzzy match a raw citation string:
+
+- try to parse it with GROBID into structured form
+- transform the GROBID XML response to a simple dict/struct
+- run fuzzycat lookup
+
+Note that this chain hits several external services, and so should be wrapped
+in a timeout and a try/except! In the future, this should perhaps be async so
+it can run in parallel with the "regular" query.
+"""
+
+import io
+import sys
+from typing import Optional, Any, Tuple
+import xml.etree.ElementTree as ET
+
+import requests
+from fatcat_openapi_client import ReleaseEntity, ReleaseExtIds, ReleaseContrib
+from fatcat_scholar.api_entities import entity_to_dict
+from fuzzycat.matching import match_release_fuzzy
+import fuzzycat.common
+import fuzzycat.verify
+
+from fatcat_scholar.grobid2json import biblio_info
+
+
+def grobid_process_citation(raw: str) -> Optional[str]:
+ GROBID_URL = "https://grobid.qa.fatcat.wiki"
+ try:
+ grobid_response = requests.post(
+ GROBID_URL + "/api/processCitation",
+            data={"citations": raw, "consolidateCitations": 0},
+ timeout=180.0,
+ )
+ except requests.Timeout:
+ print("GROBID request (HTTP POST) timeout", file=sys.stderr)
+ return None
+ if grobid_response.status_code != 200:
+ print(f"GROBID request (HTTP POST) failed: {grobid_response}", file=sys.stderr)
+ return None
+ return grobid_response.text
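
Used on its own, this helper might be called like the following sketch (the
citation string is illustrative; on success, GROBID returns the parsed
reference as a TEI XML fragment in a plain string):

    tei_xml = grobid_process_citation(
        "Aho, A.V., Ullman, J.D. The Theory of Parsing, Translation, and "
        "Compiling. Prentice-Hall, 1972."
    )
    if tei_xml is not None:
        print(tei_xml)  # raw TEI XML; transform_grobid() below parses it
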
+
+
+def transform_grobid(raw_xml: str) -> Optional[dict]:
+    # first, strip the TEI xmlns declaration so parsed element tags are un-namespaced
+ raw_xml = raw_xml.replace('xmlns="http://www.tei-c.org/ns/1.0"', "")
+ tree = ET.parse(io.StringIO(raw_xml))
+ root = tree.getroot()
+ ref = biblio_info(root, ns="")
+ return ref
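
The returned dict feeds directly into ref_to_release() below; judging from the
fields consumed there, a parsed reference might look roughly like this (values
are illustrative, not actual GROBID output):

    ref = {
        "title": "The Theory of Parsing, Translation, and Compiling",
        "authors": [
            {"name": "A. V. Aho", "given_name": "A. V.", "surname": "Aho"},
            {"name": "J. D. Ullman", "given_name": "J. D.", "surname": "Ullman"},
        ],
        "journal": None,
        "date": "1972",
        "volume": None,
        "issue": None,
        "pages": None,
        "doi": None,
        "pmid": None,
        "pmcid": None,
        "arxiv_id": None,
    }
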
+
+
+def ref_to_release(ref: dict) -> ReleaseEntity:
+ contribs = []
+ for author in ref.get("authors") or []:
+ contribs.append(
+ ReleaseContrib(
+ raw_name=author.get("name"),
+ given_name=author.get("given_name"),
+ surname=author.get("surname"),
+ )
+ )
+ release = ReleaseEntity(
+ title=ref.get("title"),
+ contribs=contribs,
+ volume=ref.get("volume"),
+ issue=ref.get("issue"),
+ pages=ref.get("pages"),
+ ext_ids=ReleaseExtIds(
+ doi=ref.get("doi"),
+ pmid=ref.get("pmid"),
+ pmcid=ref.get("pmcid"),
+ arxiv=ref.get("arxiv_id"),
+ ),
+ )
+ if ref.get("journal"):
+ release.extra = {"container_name": ref.get("journal")}
+ if ref.get("date"):
+ if len(ref["date"]) == 4 and ref["date"].isdigit():
+ release.release_year = int(ref["date"])
+ return release
+
+
+def fuzzy_match(
+ release: ReleaseEntity, es_client: Any, api_client: Any
+) -> Optional[Tuple[str, str, ReleaseEntity]]:
+ """
+ This helper function uses fuzzycat (and elasticsearch) to look for
+ existing release entities with similar metadata.
+
+ Returns None if there was no match of any kind, or a single tuple
+ (status: str, reason: str, existing: ReleaseEntity) if there was a match.
+
+    The status string is the name of one of the fuzzycat.common.Status values,
+    with the "strongest match" first in this sort order:
+
+ - EXACT
+ - STRONG
+ - WEAK
+ - AMBIGUOUS
+
+    E.g., if there is any EXACT match, that is always returned; an AMBIGUOUS
+    result is only returned if all of the candidate matches were ambiguous.
+ """
+
+    # this map is used to establish the priority order of verified matches
+ STATUS_SORT = {
+ fuzzycat.common.Status.TODO: 0,
+ fuzzycat.common.Status.EXACT: 10,
+ fuzzycat.common.Status.STRONG: 20,
+ fuzzycat.common.Status.WEAK: 30,
+ fuzzycat.common.Status.AMBIGUOUS: 40,
+ fuzzycat.common.Status.DIFFERENT: 60,
+ }
+
+ # TODO: the size here is a first guess; what should it really be?
+ candidates = match_release_fuzzy(release, size=10, es=es_client)
+ if not candidates:
+ return None
+
+ release_dict = entity_to_dict(release, api_client=api_client.api_client)
+ verified = [
+ (
+ fuzzycat.verify.verify(
+ release_dict, entity_to_dict(c, api_client=api_client.api_client)
+ ),
+ c,
+ )
+ for c in candidates
+ ]
+
+    # choose the "closest" (strongest) match
+ closest = sorted(verified, key=lambda v: STATUS_SORT[v[0].status])[0]
+ if closest[0].status == fuzzycat.common.Status.DIFFERENT:
+ return None
+ elif closest[0].status == fuzzycat.common.Status.TODO:
+ raise NotImplementedError("fuzzycat verify hit a Status.TODO")
+ else:
+ return (closest[0].status.name, closest[0].reason.value, closest[1])
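
Per the module docstring, callers should wrap this whole chain in a timeout and
a try/except. A minimal sketch of such a wrapper (the function name
try_citation_match is hypothetical, not part of this module):

    def try_citation_match(
        raw: str, es_client: Any, api_client: Any
    ) -> Optional[Tuple[str, str, ReleaseEntity]]:
        # defensive wrapper: any failure in the external services (GROBID,
        # elasticsearch, fatcat API) degrades to "no match" instead of an error
        try:
            tei_xml = grobid_process_citation(raw)
            if not tei_xml:
                return None
            ref = transform_grobid(tei_xml)
            if not ref:
                return None
            return fuzzy_match(
                ref_to_release(ref), es_client=es_client, api_client=api_client
            )
        except Exception as e:
            print(f"citation match failed: {e}", file=sys.stderr)
            return None
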
+
+
+if __name__ == "__main__":
+ """
+    Demo showing how the above functions chain together.
+ """
+ import os
+ import elasticsearch
+ import fatcat_openapi_client
+
+ citation = sys.argv[1]
+ print("Sending to GROBID...")
+ resp = grobid_process_citation(citation)
+ print(resp)
+ if not resp:
+ sys.exit(0)
+ ref = transform_grobid(resp)
+ print(ref)
+ if not ref:
+ sys.exit(0)
+ release = ref_to_release(ref)
+ print(release)
+
+ es_backend = os.environ.get("ELASTICSEARCH_BACKEND", "https://search.fatcat.wiki")
+ es_client = elasticsearch.Elasticsearch(es_backend)
+ api_conf = fatcat_openapi_client.Configuration()
+ api_conf.host = os.environ.get("FATCAT_API_HOST", "https://api.fatcat.wiki/v0")
+ api_client = fatcat_openapi_client.DefaultApi(
+ fatcat_openapi_client.ApiClient(api_conf)
+ )
+ matches = fuzzy_match(release, es_client=es_client, api_client=api_client)
+ print(matches)
+ if not matches or matches[0] not in ("EXACT", "STRONG", "WEAK"):
+ sys.exit(0)
+ print(matches[2].work_id)
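
Assuming the package and its dependencies are installed, the demo can be run
directly from the command line with a raw citation string (illustrative):

    python -m fatcat_scholar.query_citation \
        "Aho, A.V., Ullman, J.D. The Theory of Parsing. Prentice-Hall, 1972."

The ELASTICSEARCH_BACKEND and FATCAT_API_HOST environment variables override
the default production endpoints, as read in the demo above.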