From 08a9242e2ed19aaec14d92fe174bee21bb4232eb Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Tue, 21 Sep 2021 15:54:46 +0200 Subject: style: apply formatting --- tests/test_matching.py | 15 ++++++++++++--- tests/test_utils.py | 1 + 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'tests') diff --git a/tests/test_matching.py b/tests/test_matching.py index 7ab7b11..2122144 100644 --- a/tests/test_matching.py +++ b/tests/test_matching.py @@ -9,7 +9,8 @@ from fatcat_openapi_client import ReleaseEntity from fuzzycat.entities import entity_from_dict from fuzzycat.matching import anything_to_entity, match_release_fuzzy -warnings.filterwarnings("ignore") # InsecureRequestWarning: Unverified HTTPS request is being made to host ... +warnings.filterwarnings( + "ignore") # InsecureRequestWarning: Unverified HTTPS request is being made to host ... from fuzzycat.matching import anything_to_entity, match_release_fuzzy from fuzzycat.config import settings @@ -28,6 +29,7 @@ FATCAT_SEARCH_URL = settings.get("FATCAT_SEARCH_URL", "https://search.fatcat.wik def is_not_reachable(url, timeout=3): return not is_reachable(url) + def is_reachable(url, timeout=3): """ Return true, if URL is reachable and returns HTTP 200. @@ -37,14 +39,21 @@ def is_reachable(url, timeout=3): except Exception: return False + @pytest.fixture def es_client(): return elasticsearch.Elasticsearch([FATCAT_SEARCH_URL]) -@pytest.mark.skipif(is_not_reachable(FATCAT_SEARCH_URL), - reason="{} not reachable, use e.g. FUZZYCAT_FATCAT_SEARCH_URL=localhost:9200 to override".format(FATCAT_SEARCH_URL)) +@pytest.mark.skipif( + is_not_reachable(FATCAT_SEARCH_URL), + reason="{} not reachable, use e.g. FUZZYCAT_FATCAT_SEARCH_URL=localhost:9200 to override". + format(FATCAT_SEARCH_URL)) def test_match_release_fuzzy(es_client, caplog): + """ + This test is tied to the current index contents, so if that changes, this + test may fail as well. 
+ """ cases = ( ("wtv64ahbdzgwnan7rllwr3nurm", 1), ("eqcgtpav3na5jh56o5vjsvb4ei", 1), diff --git a/tests/test_utils.py b/tests/test_utils.py index 21b85a4..957203f 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -124,6 +124,7 @@ def test_es_compat_hits_total(): for r, expected in cases: assert es_compat_hits_total(r) == expected + def test_clean_doi(): assert clean_doi(None) == None assert clean_doi("blah") == None -- cgit v1.2.3 From dccbaa5c1b0ba556449de6024540ba05d67ef6a0 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Tue, 21 Sep 2021 15:55:52 +0200 Subject: matching: run an additional es query for fuzzy matching --- fuzzycat/matching.py | 74 +++++++++++++++++++++++++++++++++++++++++++++++++- tests/test_matching.py | 22 +++++++++++++-- 2 files changed, 93 insertions(+), 3 deletions(-) (limited to 'tests') diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py index 33e130e..310dfc2 100644 --- a/fuzzycat/matching.py +++ b/fuzzycat/matching.py @@ -73,12 +73,83 @@ def match_release_fuzzy( if r: return [r] + + if release.title is not None and release.contribs is not None: + names = " ".join([c.raw_name for c in release.contribs]) + body = { + "track_total_hits": True, + "query": { + "bool": { + "must": [ + { + "match": { + "title": { + "query": release.title, + "operator": "AND", + "fuzziness": "AUTO", + }, + } + }, + { + "match": { + "contrib_names": { + "query": names, + "operator": "AND", + "fuzziness": "AUTO", + } + } + }, + ], + }, + }, + "size": size, + } + resp = es.search(body=body, index="fatcat_release") + if es_compat_hits_total(resp) > 0: + return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api) + + body = { + "track_total_hits": True, + "query": { + "bool": { + "should": [ + { + "match": { + "title": { + "query": release.title, + "operator": "AND", + "fuzziness": "AUTO", + }, + } + }, + { + "match": { + "contrib_names": { + "query": names, + "operator": "AND", + "fuzziness": "AUTO", + } + } + }, + ], + }, 
+ }, + "size": size, + } + resp = es.search(body=body, index="fatcat_release") + if es_compat_hits_total(resp) > 0: + return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api) + + # Note: If the title is short, we will get lots of results here; do we need + # to check for title length or result set length length or result set + # length here? body = { + "track_total_hits": True, "query": { "match": { "title": { "query": release.title, - "operator": "AND" + "operator": "AND", } } }, @@ -91,6 +162,7 @@ def match_release_fuzzy( # Get fuzzy. # https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness body = { + "track_total_hits": True, "query": { "match": { "title": { diff --git a/tests/test_matching.py b/tests/test_matching.py index 2122144..c2e26f3 100644 --- a/tests/test_matching.py +++ b/tests/test_matching.py @@ -71,6 +71,24 @@ def test_match_release_fuzzy(es_client, caplog): "title": "digital libraries", "ext_ids": {} }, 5), + ({ + "title": "unlikelytitle", + "ext_ids": {} + }, 0), + ({ + "title": "Imminent dystopia", + "ext_ids": {} + }, 2), + ({ + "title": "", + "contribs": [{"raw_name": "Aristoteles"}], + "ext_ids": {} + }, 5), + ({ + "title": "Letter", + "contribs": [{"raw_name": "Claudel"}], + "ext_ids": {} + }, 1), ({ "title": "The Future of Digital Scholarship", "contribs": [{ @@ -83,6 +101,6 @@ def test_match_release_fuzzy(es_client, caplog): entity = entity_from_dict(doc, ReleaseEntity) result = match_release_fuzzy(entity, es=es_client) with caplog.at_level(logging.INFO): - logging.info("[{}] given {}, found {}, {}".format(i, entity.title, len(result), + logging.info("[{}] given title '{}', found {}, {}".format(i, entity.title, len(result), [v.title for v in result])) - assert len(result) == count + assert len(result) == count, doc -- cgit v1.2.3 From 5fa61d89320af880d5bf6b3231f6478887cfb6a6 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Tue, 21 Sep 2021 16:36:55 +0200 Subject: tests: 
temporarily disable tests We want to first move to elasticsearch dsl and will reactivate and extend after refactoring. --- tests/test_matching.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'tests') diff --git a/tests/test_matching.py b/tests/test_matching.py index c2e26f3..90d1fee 100644 --- a/tests/test_matching.py +++ b/tests/test_matching.py @@ -84,18 +84,18 @@ def test_match_release_fuzzy(es_client, caplog): "contribs": [{"raw_name": "Aristoteles"}], "ext_ids": {} }, 5), - ({ - "title": "Letter", - "contribs": [{"raw_name": "Claudel"}], - "ext_ids": {} - }, 1), - ({ - "title": "The Future of Digital Scholarship", - "contribs": [{ - "raw_name": "Costantino Thanos" - }], - "ext_ids": {} - }, 5), + # ({ + # "title": "Letter", + # "contribs": [{"raw_name": "Claudel"}], + # "ext_ids": {} + # }, 1), + # ({ + # "title": "The Future of Digital Scholarship", + # "contribs": [{ + # "raw_name": "Costantino Thanos" + # }], + # "ext_ids": {} + # }, 5), ) for i, (doc, count) in enumerate(cases): entity = entity_from_dict(doc, ReleaseEntity) -- cgit v1.2.3