From 0c84af603894049dd8edd95da18d8990ab0516d1 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 5 Nov 2021 17:19:07 +0100 Subject: turn "match_release_fuzzy" into a class Goal of this refactoring was to make the matching process a bit more configurable by using a class and a cascade of queries. For a limited test set: `FuzzyReleaseMatcher.match` works the same as `match_release_fuzzy`. --- fuzzycat/matching.py | 559 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 482 insertions(+), 77 deletions(-) (limited to 'fuzzycat/matching.py') diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py index b358899..1531ac6 100644 --- a/fuzzycat/matching.py +++ b/fuzzycat/matching.py @@ -1,22 +1,430 @@ +import logging import os import re import sys -from typing import List, Optional, Type, Union +from typing import Any, List, Optional, Type, Union import elasticsearch import elasticsearch_dsl import fatcat_openapi_client import requests -from fatcat_openapi_client import ContainerEntity, DefaultApi, ReleaseEntity +from fatcat_openapi_client import (ContainerEntity, DefaultApi, ReleaseContrib, ReleaseEntity) from fatcat_openapi_client.rest import ApiException from fuzzycat.config import settings +from fuzzycat.contrib import (ContribListMatcher, FuzzyStringSimilarity, JaccardIndexThreshold, + Pipeline) from fuzzycat.entities import entity_from_dict, entity_from_json from fuzzycat.utils import es_compat_hits_total FATCAT_API_URL = settings.get("FATCAT_API_URL", "https://api.fatcat.wiki/v0") +class FuzzyReleaseMatcher: + """ + FuzzyReleaseMatcher tries to find similar items to a given release in + elasticsearch. Exact matches first, then fuzzy. + + In the best case, elasticsearch would automatically rank the most relevant + docs first, even with partial data. We still try to steer the matches by + using a query cascade. This is configurable. The last query should be a + generic one. 
+ + The goal here is to get a set of potential matches; verification has to + happen separately. + + TODO: + + Example case not yet working well ("Stuehrenberg" vs "Stührenberg"): + + >>> result = matcher.match(entity_from_dict({"title": "internet archive", + "contribs": [{"raw_name": + "Stührenberg"}], + "ext_ids": {}}, + ReleaseEntity)) + + > Should return: https://fatcat.wiki/release/pu7e7tbctna2foqyyxztfw3ufy, + https://fatcat.wiki/release/search?q=St%C3%BChrenberg+internet+archive&generic=1 + (not returning anything via frontend either) + + Make sure we can switch from function to class: + + * [ ] 5 test cases for both + + """ + def __init__(self, es="https://search.fatcat.wiki", api=None, index="fatcat_release", size=10): + if isinstance(es, str): + self.es = elasticsearch.Elasticsearch([es]) + else: + self.es = es if es else elasticsearch.Elasticsearch() + self.api = api if api else public_api(FATCAT_API_URL) + self.index = index + self.size = size + self.logger = logging.getLogger("fuzzy") + + def match_release_by_id(self, release, **kwargs) -> List[ReleaseEntity]: + """ + Check for exact matches by identifier. + """ + ext_ids = release.ext_ids + attrs = ( + "doi", + "pmid", + "wikidata_qid", + "core", + "pmcid", + "arxiv", + "dblp", + "doaj", + "jstor", + "isbn13", + "ark", + "mag", + "oai", + ) + for attr in attrs: + value = getattr(ext_ids, attr) + if not value: + continue + try: + r = self.api.lookup_release(**{attr: value}) + except fatcat_openapi_client.rest.ApiException as err: + if err.status in [404, 400]: + r = None + else: + raise err + if r: + return [r] + return [] + + def match_release_exact_title_exact_contrib(self, release): + """ + Match exact title and exact contrib names. Case insensitive, order of + contribs does not matter. 
+ """ + if release.title is None or release.contribs is None: + return [] + contrib_queries = [{ + "match": { + "contrib_names": { + "query": contrib.raw_name, + "operator": "AND", + } + } + } for contrib in release.contribs] + query = { + "bool": { + "must": [{ + "match": { + "title": { + "query": release.title, + "operator": "AND", + }, + } + }] + contrib_queries, + }, + } + result = [] + + resp = self.es.search(query=query, size=self.size, track_total_hits=True, index=self.index) + hits_total = es_compat_hits_total(resp) + if hits_total == 0: + return result + if hits_total > self.size: + self.logger.warn('more than {} hits: {}'.format(self.size, hits_total)) + + entities = response_to_entity_list(resp, + entity_type=ReleaseEntity, + size=self.size, + api=self.api) + + # Require overlap of contrib. + matcher = ContribListMatcher( + cmp=JaccardIndexThreshold(1.0), + pipeline=Pipeline([ + lambda contribs: set((c.raw_name.strip().lower() for c in contribs)), + ]), + ) + + for re in entities: + if re.title.strip().lower() != release.title.strip().lower(): + continue + if not matcher.compare(re.contribs, release.contribs): + continue + result.append(re) + return result + + def match_release_exact_title_partial_contrib(self, release): + """ + Allow for exact authors, but ok, if some are missing. 
+ """ + if release.title is None or release.contribs is None: + return [] + contrib_queries = [{ + "match": { + "contrib_names": { + "query": contrib.raw_name, + "operator": "AND", + } + } + } for contrib in release.contribs] + query = { + "bool": { + "must": [{ + "match": { + "title": { + "query": release.title, + "operator": "AND", + }, + } + }] + contrib_queries, + }, + } + result = [] + resp = self.es.search(query=query, size=self.size, track_total_hits=True, index=self.index) + if es_compat_hits_total(resp) == 0: + return result + if es_compat_hits_total(resp) > self.size: + raise NotImplementedError('result set too large: {}'.format(es)) + entities = response_to_entity_list(resp, + entity_type=ReleaseEntity, + size=self.size, + api=self.api) + + # Require at least half the contribs to be shared. + matcher = ContribListMatcher( + cmp=JaccardIndexThreshold(0.5), + pipeline=Pipeline([ + lambda contribs: set((c.raw_name.strip().lower() for c in contribs)), + ]), + ) + + for re in entities: + if re.title.strip().lower() != release.title.strip().lower(): + continue + if not matcher.compare(re.contribs, release.contribs): + continue + result.append(re) + return result + + def match_release_exact_title_fuzzy_contrib(self, release): + """ + Exact title but ok if authors differ (slightly). 
+ """ + if release.title is None or release.contribs is None: + return [] + contrib_tokens = [tok for c in release.contribs for tok in c.raw_name.split()] + contrib_queries = [{ + "match": { + "contrib_names": { + "query": token, + } + } + } for token in contrib_tokens] + query = { + "bool": { + "must": [{ + "match": { + "title": { + "query": release.title, + "operator": "AND", + }, + } + }] + contrib_queries, + }, + } + result = [] + resp = self.es.search(query=query, size=self.size, track_total_hits=True, index=self.index) + if es_compat_hits_total(resp) == 0: + return result + if es_compat_hits_total(resp) > self.size: + raise NotImplementedError('todo: scroll required for larger result sets: {}'.format(es)) + entities = response_to_entity_list(resp, + entity_type=ReleaseEntity, + size=self.size, + api=self.api) + + matcher = ContribListMatcher( + cmp=FuzzyStringSimilarity(min_ratio=60), + pipeline=Pipeline([ + lambda contribs: set((c.raw_name.strip().lower() for c in contribs)), + ]), + ) + + for re in entities: + if re.title.strip().lower() != release.title.strip().lower(): + continue + if not matcher.compare(re.contribs, release.contribs): + continue + result.append(re) + return result + + def match_release_exact_title(self, release): + """ + Exact title, but any author. For common titles, this will yield 100s or + 1000s of results. 
+ """ + if release.title is None: + return [] + query = { + "bool": { + "must": [{ + "match": { + "title": { + "query": release.title, + "operator": "AND", + }, + } + }], + }, + } + result = [] + resp = self.es.search(query=query, size=self.size, track_total_hits=True, index=self.index) + if es_compat_hits_total(resp) == 0: + return result + if es_compat_hits_total(resp) > self.size: + self.logger.warn('too many hits: {}'.format(es_compat_hits_total(resp))) + entities = response_to_entity_list(resp, + entity_type=ReleaseEntity, + size=self.size, + api=self.api) + for re in entities: + if re.title.strip().lower() != release.title.strip().lower(): + continue + result.append(re) + return result + + def match_release_fuzzy_title_fuzzy_contrib(self, release): + """ + Using elasticsearch fuzziness option (which is not that fuzzy). + """ + if release.title is None or release.contribs is None: + return [] + contrib_tokens = [tok for c in release.contribs for tok in c.raw_name.split()] + contrib_queries = [{ + "match": { + "contrib_names": { + "query": token, + } + } + } for token in contrib_tokens] + query = { + "bool": { + "must": [ + { + "match": { + "title": { + "query": release.title, + "operator": "AND", + "fuzziness": "AUTO", + }, + } + }, + ] + contrib_queries, + }, + } + result = [] + resp = self.es.search(query=query, size=self.size, track_total_hits=True, index=self.index) + if es_compat_hits_total(resp) == 0: + return result + if es_compat_hits_total(resp) > self.size: + raise ValueError('too many hits: {}'.format(es_compat_hits_total(resp))) + entities = response_to_entity_list(resp, + entity_type=ReleaseEntity, + size=self.size, + api=self.api) + return entities + + def match_release_generic(self, release): + """ + Final catch all variant via title. 
+ """ + if release.title is None: + return [] + query = { + "bool": { + "must": [ + { + "match": { + "title": { + "query": release.title, + "operator": "OR", + "fuzziness": "AUTO", + }, + } + }, + ], + }, + } + result = [] + resp = self.es.search(query=query, size=self.size, track_total_hits=True, index=self.index) + if es_compat_hits_total(resp) == 0: + return result + if es_compat_hits_total(resp) > self.size: + self.logger.warn('too many hits: {}'.format(es_compat_hits_total(resp))) + entities = response_to_entity_list(resp, + entity_type=ReleaseEntity, + size=self.size, + api=self.api) + return entities + + def match_release_generic_fuzzy_contrib(self, release): + """ + Only match contribs, if they exist. + """ + if release.contribs is None: + return [] + contrib_tokens = [tok for c in release.contribs for tok in c.raw_name.split()] + contrib_queries = [{ + "match": { + "contrib_names": { + "query": token, + } + } + } for token in contrib_tokens] + query = { + "bool": { + "must": contrib_queries, + }, + } + result = [] + resp = self.es.search(query=query, size=self.size, track_total_hits=True, index=self.index) + if es_compat_hits_total(resp) == 0: + return result + if es_compat_hits_total(resp) > self.size: + self.logger.warn('too many hits: {}'.format(es_compat_hits_total(resp))) + entities = response_to_entity_list(resp, + entity_type=ReleaseEntity, + size=self.size, + api=self.api) + return entities + + def match_cascade(self, release, *qs, **kwargs): + """ + Returns the result from the first query that returns a result. All query + functions need to be defined on this class (for now). + """ + for q in qs: + self.logger.debug("[cascade] {}".format(q)) + result = q(release, **kwargs) + if len(result) > 0: + return result + return [] + + def match(self, release: Optional[ReleaseEntity]) -> List[ReleaseEntity]: + """ + Match returns a list of match candidates given a release entity. 
+ """ + if not release: + return [] + return self.match_cascade( + release, self.match_release_by_id, self.match_release_exact_title_exact_contrib, + self.match_release_exact_title_partial_contrib, + self.match_release_exact_title_fuzzy_contrib, self.match_release_exact_title, + self.match_release_fuzzy_title_fuzzy_contrib, self.match_release_generic, + self.match_release_generic_fuzzy_contrib) + + def match_release_fuzzy( release: ReleaseEntity, size: int = 5, @@ -28,6 +436,8 @@ def match_release_fuzzy( fatcat using Elasticsearch. TODO: rename "es" parameter to "es_client", which would be clearer + + This is deprecated, move to matcher class. """ assert isinstance(release, ReleaseEntity) @@ -41,6 +451,17 @@ def match_release_fuzzy( if api is None: api = public_api(FATCAT_API_URL) + # > query cascade + # + # [x] 1 exact ids + # [ ] 2 exact title and exact contrib + # [ ] 3 exact title and fuzzy contrib + # [ ] 4 exact title + # [ ] 5 title w/o stopwords, fuzzy contrib + # [ ] 6 title w/o stopwords + # [ ] 7 fuzzy title and fuzzy contrib + # [ ] 8 fuzzy whole document + # Try to match by external identifier. 
# TODO: use api, ability to disable; benchmark ext_ids = release.ext_ids @@ -75,105 +496,89 @@ def match_release_fuzzy( if release.title is not None and release.contribs is not None: names = " ".join([c.raw_name for c in release.contribs]) - body = { - "track_total_hits": True, - "query": { - "bool": { - "must": [ - { - "match": { - "title": { - "query": release.title, - "operator": "AND", - "fuzziness": "AUTO", - }, - } - }, - { - "match": { - "contrib_names": { - "query": names, - "operator": "AND", - "fuzziness": "AUTO", - } + query = { + "bool": { + "must": [ + { + "match": { + "title": { + "query": release.title, + "operator": "AND", + "fuzziness": "AUTO", + }, + } + }, + { + "match": { + "contrib_names": { + "query": names, + "operator": "AND", + "fuzziness": "AUTO", } - }, - ], - }, + } + }, + ], }, - "size": size, } - resp = es.search(body=body, index="fatcat_release") + resp = es.search(query=query, index="fatcat_release", size=size, track_total_hits=True) if es_compat_hits_total(resp) > 0: return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api) - body = { - "track_total_hits": True, - "query": { - "bool": { - "should": [ - { - "match": { - "title": { - "query": release.title, - "operator": "AND", - "fuzziness": "AUTO", - }, + query = { + "bool": { + "should": [ + { + "match": { + "title": { + "query": release.title, + "operator": "AND", + "fuzziness": "AUTO", + }, + } + }, + { + "match": { + "contrib_names": { + "query": names, + "operator": "AND", + "fuzziness": "AUTO", } - }, - { - "match": { - "contrib_names": { - "query": names, - "operator": "AND", - "fuzziness": "AUTO", - } - } - }, - ], - }, + } + }, + ], }, - "size": size, } - resp = es.search(body=body, index="fatcat_release") + resp = es.search(query=query, index="fatcat_release", size=size, track_total_hits=True) if es_compat_hits_total(resp) > 0: return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api) # Note: If the title is short, we 
will get lots of results here; do we need # to check for title length or result set length length or result set # length here? - body = { - "track_total_hits": True, - "query": { - "match": { - "title": { - "query": release.title, - "operator": "AND", - } + query = { + "match": { + "title": { + "query": release.title, + "operator": "AND", } - }, - "size": size, + } } - resp = es.search(body=body, index="fatcat_release") + resp = es.search(query=query, index="fatcat_release", size=size, track_total_hits=True) if es_compat_hits_total(resp) > 0: return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api) # Get fuzzy. # https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness - body = { - "track_total_hits": True, - "query": { - "match": { - "title": { - "query": release.title, - "operator": "AND", - "fuzziness": "AUTO", - } + query = { + "match": { + "title": { + "query": release.title, + "operator": "AND", + "fuzziness": "AUTO", } - }, - "size": size, + } } - resp = es.search(body=body, index="fatcat_release") + resp = es.search(query=query, index="fatcat_release", size=size, track_total_hits=True) if es_compat_hits_total(resp) > 0: return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api) -- cgit v1.2.3 From 409392d66c3a6debe5bc69c0e2308209ac74ee35 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Tue, 16 Nov 2021 20:02:20 +0100 Subject: use elasticsearch <7.14 search args --- fuzzycat/matching.py | 58 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 47 insertions(+), 11 deletions(-) (limited to 'fuzzycat/matching.py') diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py index 1531ac6..c83e48c 100644 --- a/fuzzycat/matching.py +++ b/fuzzycat/matching.py @@ -126,7 +126,12 @@ class FuzzyReleaseMatcher: } result = [] - resp = self.es.search(query=query, size=self.size, track_total_hits=True, index=self.index) + resp = self.es.search(index=self.index, + body={ 
+ "query": query, + "size": self.size, + "track_total_hits": True + }) hits_total = es_compat_hits_total(resp) if hits_total == 0: return result @@ -181,7 +186,12 @@ class FuzzyReleaseMatcher: }, } result = [] - resp = self.es.search(query=query, size=self.size, track_total_hits=True, index=self.index) + resp = self.es.search(index=self.index, + body={ + "query": query, + "size": self.size, + "track_total_hits": True + }) if es_compat_hits_total(resp) == 0: return result if es_compat_hits_total(resp) > self.size: @@ -234,7 +244,12 @@ class FuzzyReleaseMatcher: }, } result = [] - resp = self.es.search(query=query, size=self.size, track_total_hits=True, index=self.index) + resp = self.es.search(index=self.index, + body={ + "query": query, + "size": self.size, + "track_total_hits": True + }) if es_compat_hits_total(resp) == 0: return result if es_compat_hits_total(resp) > self.size: @@ -279,7 +294,12 @@ class FuzzyReleaseMatcher: }, } result = [] - resp = self.es.search(query=query, size=self.size, track_total_hits=True, index=self.index) + resp = self.es.search(body={ + "query": query, + "size": self.size, + "track_total_hits": True + }, + index=self.index) if es_compat_hits_total(resp) == 0: return result if es_compat_hits_total(resp) > self.size: @@ -324,7 +344,12 @@ class FuzzyReleaseMatcher: }, } result = [] - resp = self.es.search(query=query, size=self.size, track_total_hits=True, index=self.index) + resp = self.es.search(index=self.index, + body={ + "query": query, + "size": self.size, + "track_total_hits": True + }) if es_compat_hits_total(resp) == 0: return result if es_compat_hits_total(resp) > self.size: @@ -357,7 +382,12 @@ class FuzzyReleaseMatcher: }, } result = [] - resp = self.es.search(query=query, size=self.size, track_total_hits=True, index=self.index) + resp = self.es.search(index=self.index, + body={ + "query": query, + "size": self.size, + "track_total_hits": True + }) if es_compat_hits_total(resp) == 0: return result if 
es_compat_hits_total(resp) > self.size: @@ -388,7 +418,12 @@ class FuzzyReleaseMatcher: }, } result = [] - resp = self.es.search(query=query, size=self.size, track_total_hits=True, index=self.index) + resp = self.es.search(index=self.index, + body={ + "query": query, + "size": self.size, + "track_total_hits": True + }) if es_compat_hits_total(resp) == 0: return result if es_compat_hits_total(resp) > self.size: @@ -430,6 +465,7 @@ def match_release_fuzzy( size: int = 5, es: Optional[Union[str, Type[elasticsearch.client.Elasticsearch]]] = None, api: DefaultApi = None, + index: str = "fatcat_release", ) -> List[ReleaseEntity]: """ Given a release entity, return a number similar release entities from @@ -520,7 +556,7 @@ def match_release_fuzzy( ], }, } - resp = es.search(query=query, index="fatcat_release", size=size, track_total_hits=True) + resp = es.search(index=index, body={"query": query, "size": size, "track_total_hits": True}) if es_compat_hits_total(resp) > 0: return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api) @@ -548,7 +584,7 @@ def match_release_fuzzy( ], }, } - resp = es.search(query=query, index="fatcat_release", size=size, track_total_hits=True) + resp = es.search(index=index, body={"query": query, "size": size, "track_total_hits": True}) if es_compat_hits_total(resp) > 0: return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api) @@ -563,7 +599,7 @@ def match_release_fuzzy( } } } - resp = es.search(query=query, index="fatcat_release", size=size, track_total_hits=True) + resp = es.search(index=index, body={"query": query, "size": size, "track_total_hits": True}) if es_compat_hits_total(resp) > 0: return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api) @@ -578,7 +614,7 @@ def match_release_fuzzy( } } } - resp = es.search(query=query, index="fatcat_release", size=size, track_total_hits=True) + resp = es.search(index=index, body={"query": query, "size": size, 
"track_total_hits": True}) if es_compat_hits_total(resp) > 0: return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api) -- cgit v1.2.3