From dccbaa5c1b0ba556449de6024540ba05d67ef6a0 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Tue, 21 Sep 2021 15:55:52 +0200 Subject: matching: run an additional es query for fuzzy matching --- fuzzycat/matching.py | 74 +++++++++++++++++++++++++++++++++++++++++++++++++- tests/test_matching.py | 22 +++++++++++++-- 2 files changed, 93 insertions(+), 3 deletions(-) diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py index 33e130e..310dfc2 100644 --- a/fuzzycat/matching.py +++ b/fuzzycat/matching.py @@ -73,12 +73,83 @@ def match_release_fuzzy( if r: return [r] + + if release.title is not None and release.contribs is not None: + names = " ".join([c.raw_name for c in release.contribs]) + body = { + "track_total_hits": True, + "query": { + "bool": { + "must": [ + { + "match": { + "title": { + "query": release.title, + "operator": "AND", + "fuzziness": "AUTO", + }, + } + }, + { + "match": { + "contrib_names": { + "query": names, + "operator": "AND", + "fuzziness": "AUTO", + } + } + }, + ], + }, + }, + "size": size, + } + resp = es.search(body=body, index="fatcat_release") + if es_compat_hits_total(resp) > 0: + return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api) + + body = { + "track_total_hits": True, + "query": { + "bool": { + "should": [ + { + "match": { + "title": { + "query": release.title, + "operator": "AND", + "fuzziness": "AUTO", + }, + } + }, + { + "match": { + "contrib_names": { + "query": names, + "operator": "AND", + "fuzziness": "AUTO", + } + } + }, + ], + }, + }, + "size": size, + } + resp = es.search(body=body, index="fatcat_release") + if es_compat_hits_total(resp) > 0: + return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api) + + # Note: If the title is short, we will get lots of results here; do we need + # to check for the title length or the length of the result set + # here? 
body = { + "track_total_hits": True, "query": { "match": { "title": { "query": release.title, - "operator": "AND" + "operator": "AND", } } }, @@ -91,6 +162,7 @@ def match_release_fuzzy( # Get fuzzy. # https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness body = { + "track_total_hits": True, "query": { "match": { "title": { diff --git a/tests/test_matching.py b/tests/test_matching.py index 2122144..c2e26f3 100644 --- a/tests/test_matching.py +++ b/tests/test_matching.py @@ -71,6 +71,24 @@ def test_match_release_fuzzy(es_client, caplog): "title": "digital libraries", "ext_ids": {} }, 5), + ({ + "title": "unlikelytitle", + "ext_ids": {} + }, 0), + ({ + "title": "Imminent dystopia", + "ext_ids": {} + }, 2), + ({ + "title": "", + "contribs": [{"raw_name": "Aristoteles"}], + "ext_ids": {} + }, 5), + ({ + "title": "Letter", + "contribs": [{"raw_name": "Claudel"}], + "ext_ids": {} + }, 1), ({ "title": "The Future of Digital Scholarship", "contribs": [{ @@ -83,6 +101,6 @@ def test_match_release_fuzzy(es_client, caplog): entity = entity_from_dict(doc, ReleaseEntity) result = match_release_fuzzy(entity, es=es_client) with caplog.at_level(logging.INFO): - logging.info("[{}] given {}, found {}, {}".format(i, entity.title, len(result), + logging.info("[{}] given title '{}', found {}, {}".format(i, entity.title, len(result), [v.title for v in result])) - assert len(result) == count + assert len(result) == count, doc -- cgit v1.2.3