From d104f8d0ba8eef5563555de82be66bbf17f961db Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Tue, 16 Nov 2021 21:13:46 +0100 Subject: complete migration away from match_release_fuzzy Instead, use `FuzzyReleaseMatcher.match`, which has approximately the same behavior. --- fuzzycat/__main__.py | 5 +- fuzzycat/matching.py | 162 --------------------------------------------------- fuzzycat/simple.py | 5 +- 3 files changed, 6 insertions(+), 166 deletions(-) (limited to 'fuzzycat') diff --git a/fuzzycat/__main__.py b/fuzzycat/__main__.py index 7792df6..d616efc 100644 --- a/fuzzycat/__main__.py +++ b/fuzzycat/__main__.py @@ -60,7 +60,7 @@ from fatcat_openapi_client import ReleaseEntity from fuzzycat.entities import entity_to_dict from fuzzycat.grobid_unstructured import grobid_parse_unstructured -from fuzzycat.matching import anything_to_entity, match_release_fuzzy +from fuzzycat.matching import FuzzyReleaseMatcher, anything_to_entity from fuzzycat.refs import RefsGroupVerifier from fuzzycat.simple import closest_fuzzy_release_match from fuzzycat.utils import random_idents_from_query, random_word @@ -143,7 +143,8 @@ def run_release_match(args): """ try: entity = anything_to_entity(args.value, ReleaseEntity) - result = match_release_fuzzy(entity, size=args.size, es=args.es_url) + matcher = FuzzyReleaseMatcher(es=args.es_url, size=args.size) + result = matcher.match(entity) except Exception as err: print("fuzzy match failed: {}".format(err), file=sys.stderr) else: diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py index c83e48c..2984d9a 100644 --- a/fuzzycat/matching.py +++ b/fuzzycat/matching.py @@ -460,168 +460,6 @@ class FuzzyReleaseMatcher: self.match_release_generic_fuzzy_contrib) -def match_release_fuzzy( - release: ReleaseEntity, - size: int = 5, - es: Optional[Union[str, Type[elasticsearch.client.Elasticsearch]]] = None, - api: DefaultApi = None, - index: str = "fatcat_release", -) -> List[ReleaseEntity]: - """ - Given a release entity, return a number 
similar release entities from - fatcat using Elasticsearch. - - TODO: rename "es" parameter to "es_client", which would be clearer - - This is deprecated, move to matcher class. - """ - assert isinstance(release, ReleaseEntity) - - if size is None or size == 0: - size = 10000 # or any large number - - if isinstance(es, str): - es = elasticsearch.Elasticsearch([es]) - if es is None: - es = elasticsearch.Elasticsearch() - if api is None: - api = public_api(FATCAT_API_URL) - - # > query cascade - # - # [x] 1 exact ids - # [ ] 2 exact title and exact contrib - # [ ] 3 exact title and fuzzy contrib - # [ ] 4 exact title - # [ ] 5 title w/o stopwords, fuzzy contrib - # [ ] 6 title w/o stopwords - # [ ] 7 fuzzy title and fuzzy contrib - # [ ] 8 fuzzy whole document - - # Try to match by external identifier. - # TODO: use api, ability to disable; benchmark - ext_ids = release.ext_ids - attrs = ( - "doi", - "wikidata_qid", - "isbn13", - "pmid", - "pmcid", - "core", - "arxiv", - "jstor", - "ark", - "mag", - "doaj", - "dblp", - "oai", - ) - for attr in attrs: - value = getattr(ext_ids, attr) - if not value: - continue - try: - r = api.lookup_release(**{attr: value}) - except fatcat_openapi_client.rest.ApiException as err: - if err.status in [404, 400]: - r = None - else: - raise err - if r: - return [r] - - if release.title is not None and release.contribs is not None: - names = " ".join([c.raw_name for c in release.contribs]) - query = { - "bool": { - "must": [ - { - "match": { - "title": { - "query": release.title, - "operator": "AND", - "fuzziness": "AUTO", - }, - } - }, - { - "match": { - "contrib_names": { - "query": names, - "operator": "AND", - "fuzziness": "AUTO", - } - } - }, - ], - }, - } - resp = es.search(index=index, body={"query": query, "size": size, "track_total_hits": True}) - if es_compat_hits_total(resp) > 0: - return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api) - - query = { - "bool": { - "should": [ - { - "match": { - 
"title": { - "query": release.title, - "operator": "AND", - "fuzziness": "AUTO", - }, - } - }, - { - "match": { - "contrib_names": { - "query": names, - "operator": "AND", - "fuzziness": "AUTO", - } - } - }, - ], - }, - } - resp = es.search(index=index, body={"query": query, "size": size, "track_total_hits": True}) - if es_compat_hits_total(resp) > 0: - return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api) - - # Note: If the title is short, we will get lots of results here; do we need - # to check for title length or result set length length or result set - # length here? - query = { - "match": { - "title": { - "query": release.title, - "operator": "AND", - } - } - } - resp = es.search(index=index, body={"query": query, "size": size, "track_total_hits": True}) - if es_compat_hits_total(resp) > 0: - return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api) - - # Get fuzzy. - # https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness - query = { - "match": { - "title": { - "query": release.title, - "operator": "AND", - "fuzziness": "AUTO", - } - } - } - resp = es.search(index=index, body={"query": query, "size": size, "track_total_hits": True}) - if es_compat_hits_total(resp) > 0: - return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api) - - # TODO: perform more queries on other fields. 
- return [] - - def public_api(host_uri): """ Note: unlike the authenticated variant, this helper might get called even diff --git a/fuzzycat/simple.py b/fuzzycat/simple.py index ff59ba2..c92b5ae 100644 --- a/fuzzycat/simple.py +++ b/fuzzycat/simple.py @@ -24,7 +24,7 @@ from fatcat_openapi_client import ReleaseContrib, ReleaseEntity, ReleaseExtIds from fuzzycat.common import Reason, Status from fuzzycat.entities import entity_to_dict from fuzzycat.grobid_unstructured import grobid_parse_unstructured -from fuzzycat.matching import match_release_fuzzy +from fuzzycat.matching import FuzzyReleaseMatcher from fuzzycat.utils import clean_doi from fuzzycat.verify import verify @@ -84,7 +84,8 @@ def close_fuzzy_release_matches(release: ReleaseEntity, result is only returned if all the candidate matches were ambiguous. """ - candidates = match_release_fuzzy(release, size=match_limit, es=es_client) + matcher = FuzzyReleaseMatcher(es=es_client, size=match_limit) + candidates = matcher.match(release) if not candidates: return None -- cgit v1.2.3