diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-01-19 00:55:05 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-01-19 19:49:04 -0800 |
commit | 59cc64a24e4331899e6b952cc7a8dedc1ec13547 (patch) | |
tree | f7ff589b279cbffbd4a142b736e5ef032ffef1d3 /fatcat_scholar/query_citation.py | |
parent | 9cc666cd7baba1dc2bfb9e553a2c021fbeb61c24 (diff) | |
download | fatcat-scholar-59cc64a24e4331899e6b952cc7a8dedc1ec13547.tar.gz fatcat-scholar-59cc64a24e4331899e6b952cc7a8dedc1ec13547.zip |
refactor citation lookups; add high-level helper
Diffstat (limited to 'fatcat_scholar/query_citation.py')
-rw-r--r-- | fatcat_scholar/query_citation.py | 37 |
1 file changed, 32 insertions, 5 deletions
diff --git a/fatcat_scholar/query_citation.py b/fatcat_scholar/query_citation.py index 970c235..c555fd4 100644 --- a/fatcat_scholar/query_citation.py +++ b/fatcat_scholar/query_citation.py @@ -25,13 +25,14 @@ import fuzzycat.verify from fatcat_scholar.grobid2json import biblio_info -def grobid_process_citation(raw: str) -> Optional[str]: - GROBID_URL = "https://grobid.qa.fatcat.wiki" +def grobid_process_citation( + raw: str, grobid_host: str = "https://grobid.qa.fatcat.wiki", timeout: float = 10.0 +) -> Optional[str]: try: grobid_response = requests.post( - GROBID_URL + "/api/processCitation", + grobid_host + "/api/processCitation", data={"citations": raw, "consolidateCitations": 0,}, - timeout=180.0, + timeout=timeout, ) except requests.Timeout: print("GROBID request (HTTP POST) timeout", file=sys.stderr) @@ -48,6 +49,8 @@ def transform_grobid(raw_xml: str) -> Optional[dict]: tree = ET.parse(io.StringIO(raw_xml)) root = tree.getroot() ref = biblio_info(root, ns="") + if not any(ref.values()): + return None return ref @@ -83,7 +86,7 @@ def ref_to_release(ref: dict) -> ReleaseEntity: def fuzzy_match( - release: ReleaseEntity, es_client: Any, api_client: Any + release: ReleaseEntity, es_client: Any, api_client: Any, timeout: float = 10.0 ) -> Optional[Tuple[str, str, ReleaseEntity]]: """ This helper function uses fuzzycat (and elasticsearch) to look for @@ -102,6 +105,8 @@ def fuzzy_match( Eg, if there is any EXACT match that is always returned; an AMBIGIOUS result is only returned if all the candidate matches were ambiguous. 
+ + TODO: actually do something with timeout """ # this map used to establish priority order of verified matches @@ -140,6 +145,28 @@ def fuzzy_match( return (closest[0].status.name, closest[0].reason.value, closest[1]) +def try_fuzzy_match( + citation: str, grobid_host: str, es_client: Any, fatcat_api_client: Any +) -> Optional[str]: + """ + All-in-one helper method + """ + resp = grobid_process_citation(citation, grobid_host=grobid_host, timeout=3.0) + if not resp: + return None + ref = transform_grobid(resp) + if not ref: + return None + release = ref_to_release(ref) + + matches = fuzzy_match( + release, es_client=es_client, api_client=fatcat_api_client, timeout=3.0 + ) + if not matches or matches[0] not in ("EXACT", "STRONG", "WEAK"): + return None + return f"work_{matches[2].work_id}" + + if __name__ == "__main__": """ Demo showing how to integrate the above functions together. |