From d104f8d0ba8eef5563555de82be66bbf17f961db Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Tue, 16 Nov 2021 21:13:46 +0100 Subject: complete migration away from match_release_fuzzy Instead, use `FuzzyReleaseMatcher.match`, which has approximately the same behavior. --- fuzzycat/__main__.py | 5 +- fuzzycat/matching.py | 162 ------------------------------------------------- fuzzycat/simple.py | 5 +- tests/test_matching.py | 82 +------------------------ 4 files changed, 7 insertions(+), 247 deletions(-) diff --git a/fuzzycat/__main__.py b/fuzzycat/__main__.py index 7792df6..d616efc 100644 --- a/fuzzycat/__main__.py +++ b/fuzzycat/__main__.py @@ -60,7 +60,7 @@ from fatcat_openapi_client import ReleaseEntity from fuzzycat.entities import entity_to_dict from fuzzycat.grobid_unstructured import grobid_parse_unstructured -from fuzzycat.matching import anything_to_entity, match_release_fuzzy +from fuzzycat.matching import FuzzyReleaseMatcher, anything_to_entity from fuzzycat.refs import RefsGroupVerifier from fuzzycat.simple import closest_fuzzy_release_match from fuzzycat.utils import random_idents_from_query, random_word @@ -143,7 +143,8 @@ def run_release_match(args): """ try: entity = anything_to_entity(args.value, ReleaseEntity) - result = match_release_fuzzy(entity, size=args.size, es=args.es_url) + matcher = FuzzyReleaseMatcher(es=args.es_url, size=args.size) + result = matcher.match(entity) except Exception as err: print("fuzzy match failed: {}".format(err), file=sys.stderr) else: diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py index c83e48c..2984d9a 100644 --- a/fuzzycat/matching.py +++ b/fuzzycat/matching.py @@ -460,168 +460,6 @@ class FuzzyReleaseMatcher: self.match_release_generic_fuzzy_contrib) -def match_release_fuzzy( - release: ReleaseEntity, - size: int = 5, - es: Optional[Union[str, Type[elasticsearch.client.Elasticsearch]]] = None, - api: DefaultApi = None, - index: str = "fatcat_release", -) -> List[ReleaseEntity]: - """ - Given a 
release entity, return a number similar release entities from - fatcat using Elasticsearch. - - TODO: rename "es" parameter to "es_client", which would be clearer - - This is deprecated, move to matcher class. - """ - assert isinstance(release, ReleaseEntity) - - if size is None or size == 0: - size = 10000 # or any large number - - if isinstance(es, str): - es = elasticsearch.Elasticsearch([es]) - if es is None: - es = elasticsearch.Elasticsearch() - if api is None: - api = public_api(FATCAT_API_URL) - - # > query cascade - # - # [x] 1 exact ids - # [ ] 2 exact title and exact contrib - # [ ] 3 exact title and fuzzy contrib - # [ ] 4 exact title - # [ ] 5 title w/o stopwords, fuzzy contrib - # [ ] 6 title w/o stopwords - # [ ] 7 fuzzy title and fuzzy contrib - # [ ] 8 fuzzy whole document - - # Try to match by external identifier. - # TODO: use api, ability to disable; benchmark - ext_ids = release.ext_ids - attrs = ( - "doi", - "wikidata_qid", - "isbn13", - "pmid", - "pmcid", - "core", - "arxiv", - "jstor", - "ark", - "mag", - "doaj", - "dblp", - "oai", - ) - for attr in attrs: - value = getattr(ext_ids, attr) - if not value: - continue - try: - r = api.lookup_release(**{attr: value}) - except fatcat_openapi_client.rest.ApiException as err: - if err.status in [404, 400]: - r = None - else: - raise err - if r: - return [r] - - if release.title is not None and release.contribs is not None: - names = " ".join([c.raw_name for c in release.contribs]) - query = { - "bool": { - "must": [ - { - "match": { - "title": { - "query": release.title, - "operator": "AND", - "fuzziness": "AUTO", - }, - } - }, - { - "match": { - "contrib_names": { - "query": names, - "operator": "AND", - "fuzziness": "AUTO", - } - } - }, - ], - }, - } - resp = es.search(index=index, body={"query": query, "size": size, "track_total_hits": True}) - if es_compat_hits_total(resp) > 0: - return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api) - - query = { - "bool": { - 
"should": [ - { - "match": { - "title": { - "query": release.title, - "operator": "AND", - "fuzziness": "AUTO", - }, - } - }, - { - "match": { - "contrib_names": { - "query": names, - "operator": "AND", - "fuzziness": "AUTO", - } - } - }, - ], - }, - } - resp = es.search(index=index, body={"query": query, "size": size, "track_total_hits": True}) - if es_compat_hits_total(resp) > 0: - return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api) - - # Note: If the title is short, we will get lots of results here; do we need - # to check for title length or result set length length or result set - # length here? - query = { - "match": { - "title": { - "query": release.title, - "operator": "AND", - } - } - } - resp = es.search(index=index, body={"query": query, "size": size, "track_total_hits": True}) - if es_compat_hits_total(resp) > 0: - return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api) - - # Get fuzzy. - # https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness - query = { - "match": { - "title": { - "query": release.title, - "operator": "AND", - "fuzziness": "AUTO", - } - } - } - resp = es.search(index=index, body={"query": query, "size": size, "track_total_hits": True}) - if es_compat_hits_total(resp) > 0: - return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api) - - # TODO: perform more queries on other fields. 
- return [] - - def public_api(host_uri): """ Note: unlike the authenticated variant, this helper might get called even diff --git a/fuzzycat/simple.py b/fuzzycat/simple.py index ff59ba2..c92b5ae 100644 --- a/fuzzycat/simple.py +++ b/fuzzycat/simple.py @@ -24,7 +24,7 @@ from fatcat_openapi_client import ReleaseContrib, ReleaseEntity, ReleaseExtIds from fuzzycat.common import Reason, Status from fuzzycat.entities import entity_to_dict from fuzzycat.grobid_unstructured import grobid_parse_unstructured -from fuzzycat.matching import match_release_fuzzy +from fuzzycat.matching import FuzzyReleaseMatcher from fuzzycat.utils import clean_doi from fuzzycat.verify import verify @@ -84,7 +84,8 @@ def close_fuzzy_release_matches(release: ReleaseEntity, result is only returned if all the candidate matches were ambiguous. """ - candidates = match_release_fuzzy(release, size=match_limit, es=es_client) + matcher = FuzzyReleaseMatcher(es=es_client, size=match_limit) + candidates = matcher.match(release) if not candidates: return None diff --git a/tests/test_matching.py b/tests/test_matching.py index ca94c2a..a7754ee 100644 --- a/tests/test_matching.py +++ b/tests/test_matching.py @@ -8,12 +8,11 @@ import requests from fatcat_openapi_client import ReleaseEntity, ReleaseContrib from fuzzycat.entities import entity_from_dict, entity_from_json -from fuzzycat.matching import anything_to_entity, match_release_fuzzy, FuzzyReleaseMatcher +from fuzzycat.matching import anything_to_entity, FuzzyReleaseMatcher warnings.filterwarnings( "ignore") # InsecureRequestWarning: Unverified HTTPS request is being made to host ... 
-from fuzzycat.matching import anything_to_entity, match_release_fuzzy from fuzzycat.config import settings from fatcat_openapi_client import ReleaseEntity import pytest @@ -30,20 +29,6 @@ logger.setLevel(logging.DEBUG) FATCAT_SEARCH_URL = settings.get("FATCAT_SEARCH_URL", "https://search.fatcat.wiki:443") -def is_not_reachable(url, timeout=3): - return not is_reachable(url) - - -def is_reachable(url, timeout=3): - """ - Return true, if URL is reachable and returns HTTP 200. - """ - try: - return requests.get(url, verify=False, timeout=timeout).ok - except Exception: - return False - - def yaml_to_cases(klass, files="tests/files/fuzzy_release_match_release_exact_title_exact_contrib/*.yaml"): """ @@ -63,71 +48,6 @@ def es_client(): return elasticsearch.Elasticsearch([FATCAT_SEARCH_URL]) -# @pytest.mark.skipif( -# is_not_reachable(FATCAT_SEARCH_URL), -# reason="{} not reachable, use e.g. FUZZYCAT_FATCAT_SEARCH_URL=localhost:9200 to override". -# format(FATCAT_SEARCH_URL)) -def test_match_release_fuzzy(es_client, caplog): - """ - This test is tied to the current index contents, so if that changes, this - test may fail as well. - - Note: Deprecated. We want to get rid of this. - """ - cases = ( - ("wtv64ahbdzgwnan7rllwr3nurm", 1), - ("eqcgtpav3na5jh56o5vjsvb4ei", 1), - ) - for i, (ident, count) in enumerate(cases): - entity = anything_to_entity(ident, ReleaseEntity) - - result = match_release_fuzzy(entity, es=es_client) - logger.info("[{}] given {}, found {}".format(i, entity.title, len(result))) - assert len(result) == count - - # Partial data. 
- cases = ( - ({ - "title": "digital libraries", - "ext_ids": {} - }, 5), - ({ - "title": "unlikelytitle", - "ext_ids": {} - }, 0), - ({ - "title": "Imminent dystopia", - "ext_ids": {} - }, 2), - ({ - "title": "", - "contribs": [{ - "raw_name": "Aristoteles" - }], - "ext_ids": {} - }, 5), - # ({ - # "title": "Letter", - # "contribs": [{"raw_name": "Claudel"}], - # "ext_ids": {} - # }, 1), - # ({ - # "title": "The Future of Digital Scholarship", - # "contribs": [{ - # "raw_name": "Costantino Thanos" - # }], - # "ext_ids": {} - # }, 5), - ) - for i, (doc, count) in enumerate(cases): - entity = entity_from_dict(doc, ReleaseEntity) - result = match_release_fuzzy(entity, es=es_client) - with caplog.at_level(logging.INFO): - logging.info("[{}] given title '{}', found {}, {}".format(i, entity.title, len(result), - [v.title for v in result])) - assert len(result) == count, doc - - def test_matcher_match_release(es_client, caplog): cases = ( ("wtv64ahbdzgwnan7rllwr3nurm", 1), -- cgit v1.2.3