diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-09-21 15:55:52 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-09-21 15:55:52 +0200 |
commit | dccbaa5c1b0ba556449de6024540ba05d67ef6a0 (patch) | |
tree | 60876376084fbeb1ec541079bc458d9f7858370d /fuzzycat | |
parent | 6a224c316869ba2651094ad47e1d92e102524f85 (diff) | |
download | fuzzycat-dccbaa5c1b0ba556449de6024540ba05d67ef6a0.tar.gz fuzzycat-dccbaa5c1b0ba556449de6024540ba05d67ef6a0.zip |
matching: run an additional es query for fuzzy matching
Diffstat (limited to 'fuzzycat')
-rw-r--r-- | fuzzycat/matching.py | 74 |
1 files changed, 73 insertions, 1 deletions
diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py index 33e130e..310dfc2 100644 --- a/fuzzycat/matching.py +++ b/fuzzycat/matching.py @@ -73,12 +73,83 @@ def match_release_fuzzy( if r: return [r] + + if release.title is not None and release.contribs is not None: + names = " ".join([c.raw_name for c in release.contribs]) + body = { + "track_total_hits": True, + "query": { + "bool": { + "must": [ + { + "match": { + "title": { + "query": release.title, + "operator": "AND", + "fuzziness": "AUTO", + }, + } + }, + { + "match": { + "contrib_names": { + "query": names, + "operator": "AND", + "fuzziness": "AUTO", + } + } + }, + ], + }, + }, + "size": size, + } + resp = es.search(body=body, index="fatcat_release") + if es_compat_hits_total(resp) > 0: + return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api) + + body = { + "track_total_hits": True, + "query": { + "bool": { + "should": [ + { + "match": { + "title": { + "query": release.title, + "operator": "AND", + "fuzziness": "AUTO", + }, + } + }, + { + "match": { + "contrib_names": { + "query": names, + "operator": "AND", + "fuzziness": "AUTO", + } + } + }, + ], + }, + }, + "size": size, + } + resp = es.search(body=body, index="fatcat_release") + if es_compat_hits_total(resp) > 0: + return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api) + + # Note: If the title is short, we will get lots of results here; do we need + # to check for title length or result set length length or result set + # length here? body = { + "track_total_hits": True, "query": { "match": { "title": { "query": release.title, - "operator": "AND" + "operator": "AND", } } }, @@ -91,6 +162,7 @@ def match_release_fuzzy( # Get fuzzy. # https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness body = { + "track_total_hits": True, "query": { "match": { "title": { |