aboutsummaryrefslogtreecommitdiffstats
path: root/fuzzycat/matching.py
diff options
context:
space:
mode:
Diffstat (limited to 'fuzzycat/matching.py')
-rw-r--r--fuzzycat/matching.py559
1 files changed, 482 insertions, 77 deletions
diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py
index b358899..1531ac6 100644
--- a/fuzzycat/matching.py
+++ b/fuzzycat/matching.py
@@ -1,22 +1,430 @@
+import logging
import os
import re
import sys
-from typing import List, Optional, Type, Union
+from typing import Any, List, Optional, Type, Union
import elasticsearch
import elasticsearch_dsl
import fatcat_openapi_client
import requests
-from fatcat_openapi_client import ContainerEntity, DefaultApi, ReleaseEntity
+from fatcat_openapi_client import (ContainerEntity, DefaultApi, ReleaseContrib, ReleaseEntity)
from fatcat_openapi_client.rest import ApiException
from fuzzycat.config import settings
+from fuzzycat.contrib import (ContribListMatcher, FuzzyStringSimilarity, JaccardIndexThreshold,
+ Pipeline)
from fuzzycat.entities import entity_from_dict, entity_from_json
from fuzzycat.utils import es_compat_hits_total
FATCAT_API_URL = settings.get("FATCAT_API_URL", "https://api.fatcat.wiki/v0")
+class FuzzyReleaseMatcher:
+    """
+    FuzzyReleaseMatcher tries to find similar items to a given release in
+    elasticsearch. Exact matches first, then fuzzy.
+
+    In the best case, elasticsearch would automatically rank the most relevant
+    docs first, even with partial data. We still try to steer the matches by
+    using a query cascade. This is configurable. The last query should be a
+    generic one.
+
+    The goal here is to get a set of potential matches; verification has to
+    happen separately.
+
+    TODO:
+
+    Example case not yet working well ("Stuehrenberg" vs "Stührenberg"):
+
+    >>> result = matcher.match(entity_from_dict({"title": "internet archive",
+                                                 "contribs": [{"raw_name":
+                                                               "Stührenberg"}],
+                                                 "ext_ids": {}},
+                                                ReleaseEntity))
+
+    > Should return: https://fatcat.wiki/release/pu7e7tbctna2foqyyxztfw3ufy,
+    https://fatcat.wiki/release/search?q=St%C3%BChrenberg+internet+archive&generic=1
+    (not returning anything via frontend either)
+
+    Make sure we can switch from function to class:
+
+    * [ ] 5 test cases for both
+
+    """
+
+    # External identifier attributes, tried in this order by match_release_by_id.
+    _EXT_ID_ATTRS = (
+        "doi",
+        "pmid",
+        "wikidata_qid",
+        "core",
+        "pmcid",
+        "arxiv",
+        "dblp",
+        "doaj",
+        "jstor",
+        "isbn13",
+        "ark",
+        "mag",
+        "oai",
+    )
+
+    def __init__(self, es="https://search.fatcat.wiki", api=None, index="fatcat_release", size=10):
+        """
+        Set up the search client, the API client and query defaults.
+
+        If `es` is a string, a new elasticsearch client is created for it; any
+        other truthy value is used as the client directly; a falsy value
+        yields a default-constructed client. A falsy `api` is replaced by
+        `public_api(FATCAT_API_URL)`. `index` and `size` are passed to every
+        search request.
+        """
+        if isinstance(es, str):
+            self.es = elasticsearch.Elasticsearch([es])
+        else:
+            self.es = es if es else elasticsearch.Elasticsearch()
+        self.api = api if api else public_api(FATCAT_API_URL)
+        self.index = index
+        self.size = size
+        self.logger = logging.getLogger("fuzzy")
+
+    @staticmethod
+    def _normalize(value) -> str:
+        """
+        Return value stripped and lowercased; None or empty becomes "".
+        """
+        return value.strip().lower() if value else ""
+
+    @staticmethod
+    def _contrib_name_set(contribs) -> set:
+        """
+        Pipeline step: set of normalized raw names, skipping contribs
+        without a raw name (these would otherwise fail on `.strip()`).
+        """
+        return {c.raw_name.strip().lower() for c in (contribs or []) if c.raw_name}
+
+    @staticmethod
+    def _title_clause(title, operator="AND", fuzzy=False) -> dict:
+        """
+        Build a match clause on the title field, optionally with fuzziness.
+        """
+        params = {"query": title, "operator": operator}
+        if fuzzy:
+            params["fuzziness"] = "AUTO"
+        return {"match": {"title": params}}
+
+    @staticmethod
+    def _contrib_name_clauses(contribs) -> list:
+        """
+        One match clause per contrib, requiring all terms of the raw name.
+        """
+        return [{
+            "match": {
+                "contrib_names": {
+                    "query": c.raw_name,
+                    "operator": "AND",
+                }
+            }
+        } for c in contribs if c.raw_name]
+
+    @staticmethod
+    def _contrib_token_clauses(contribs) -> list:
+        """
+        One match clause per whitespace separated token of each raw name.
+        """
+        return [{
+            "match": {
+                "contrib_names": {
+                    "query": token,
+                }
+            }
+        } for c in contribs if c.raw_name for token in c.raw_name.split()]
+
+    def _search(self, must):
+        """
+        Run a bool/must query; return the raw response and the total number
+        of hits as reported by es_compat_hits_total.
+        """
+        query = {"bool": {"must": must}}
+        resp = self.es.search(query=query, size=self.size, track_total_hits=True, index=self.index)
+        return resp, es_compat_hits_total(resp)
+
+    def _entities(self, resp) -> List[ReleaseEntity]:
+        """
+        Convert a search response into release entities.
+        """
+        return response_to_entity_list(resp,
+                                       entity_type=ReleaseEntity,
+                                       size=self.size,
+                                       api=self.api)
+
+    def _filter_candidates(self, entities, release, matcher=None) -> List[ReleaseEntity]:
+        """
+        Keep entities whose normalized title equals the release title and,
+        if a matcher is given, whose contribs compare as matching.
+        """
+        wanted = self._normalize(release.title)
+        kept = []
+        for candidate in entities:
+            if self._normalize(candidate.title) != wanted:
+                continue
+            if matcher is not None and not matcher.compare(candidate.contribs, release.contribs):
+                continue
+            kept.append(candidate)
+        return kept
+
+    def match_release_by_id(self, release, **kwargs) -> List[ReleaseEntity]:
+        """
+        Check for exact matches by identifier.
+
+        Identifiers are tried in the order of `_EXT_ID_ATTRS`; the first
+        successful lookup is returned as a single element list. Lookups
+        failing with HTTP 400 or 404 are treated as "no match"; any other
+        API error is re-raised.
+        """
+        ext_ids = release.ext_ids
+        if ext_ids is None:
+            return []
+        for attr in self._EXT_ID_ATTRS:
+            # Default to None, so an ext_ids object lacking an attribute does not fail.
+            value = getattr(ext_ids, attr, None)
+            if not value:
+                continue
+            try:
+                found = self.api.lookup_release(**{attr: value})
+            except fatcat_openapi_client.rest.ApiException as err:
+                if err.status in (404, 400):
+                    continue
+                raise
+            if found:
+                return [found]
+        return []
+
+    def match_release_exact_title_exact_contrib(self, release) -> List[ReleaseEntity]:
+        """
+        Match exact title and exact contrib names. Case insensitive, order of
+        contribs does not matter.
+
+        Logs a warning if there are more hits than `size`; only the returned
+        page is considered then.
+        """
+        if release.title is None or release.contribs is None:
+            return []
+        must = [self._title_clause(release.title)] + self._contrib_name_clauses(release.contribs)
+        resp, total = self._search(must)
+        if total == 0:
+            return []
+        if total > self.size:
+            self.logger.warning("more than %s hits: %s", self.size, total)
+        # Require the contrib name sets to be identical.
+        matcher = ContribListMatcher(
+            cmp=JaccardIndexThreshold(1.0),
+            pipeline=Pipeline([self._contrib_name_set]),
+        )
+        return self._filter_candidates(self._entities(resp), release, matcher)
+
+    def match_release_exact_title_partial_contrib(self, release) -> List[ReleaseEntity]:
+        """
+        Allow for exact authors, but ok, if some are missing.
+
+        Raises NotImplementedError, if there are more hits than `size`.
+        """
+        if release.title is None or release.contribs is None:
+            return []
+        must = [self._title_clause(release.title)] + self._contrib_name_clauses(release.contribs)
+        resp, total = self._search(must)
+        if total == 0:
+            return []
+        if total > self.size:
+            raise NotImplementedError('result set too large: {}'.format(total))
+        # Require at least half the contribs to be shared.
+        matcher = ContribListMatcher(
+            cmp=JaccardIndexThreshold(0.5),
+            pipeline=Pipeline([self._contrib_name_set]),
+        )
+        return self._filter_candidates(self._entities(resp), release, matcher)
+
+    def match_release_exact_title_fuzzy_contrib(self, release) -> List[ReleaseEntity]:
+        """
+        Exact title but ok if authors differ (slightly).
+
+        Raises NotImplementedError, if there are more hits than `size`.
+        """
+        if release.title is None or release.contribs is None:
+            return []
+        must = [self._title_clause(release.title)] + self._contrib_token_clauses(release.contribs)
+        resp, total = self._search(must)
+        if total == 0:
+            return []
+        if total > self.size:
+            raise NotImplementedError(
+                'todo: scroll required for larger result sets: {}'.format(total))
+        matcher = ContribListMatcher(
+            cmp=FuzzyStringSimilarity(min_ratio=60),
+            pipeline=Pipeline([self._contrib_name_set]),
+        )
+        return self._filter_candidates(self._entities(resp), release, matcher)
+
+    def match_release_exact_title(self, release) -> List[ReleaseEntity]:
+        """
+        Exact title, but any author. For common titles, this will yield 100s or
+        1000s of results.
+        """
+        if release.title is None:
+            return []
+        resp, total = self._search([self._title_clause(release.title)])
+        if total == 0:
+            return []
+        if total > self.size:
+            self.logger.warning("too many hits: %s", total)
+        return self._filter_candidates(self._entities(resp), release)
+
+    def match_release_fuzzy_title_fuzzy_contrib(self, release) -> List[ReleaseEntity]:
+        """
+        Using elasticsearch fuzziness option (which is not that fuzzy).
+
+        Results are returned as is, without post-filtering. Raises
+        ValueError, if there are more hits than `size`.
+        """
+        if release.title is None or release.contribs is None:
+            return []
+        must = [self._title_clause(release.title, fuzzy=True)]
+        must += self._contrib_token_clauses(release.contribs)
+        resp, total = self._search(must)
+        if total == 0:
+            return []
+        if total > self.size:
+            raise ValueError('too many hits: {}'.format(total))
+        return self._entities(resp)
+
+    def match_release_generic(self, release) -> List[ReleaseEntity]:
+        """
+        Final catch all variant via title.
+        """
+        if release.title is None:
+            return []
+        resp, total = self._search([self._title_clause(release.title, operator="OR", fuzzy=True)])
+        if total == 0:
+            return []
+        if total > self.size:
+            self.logger.warning("too many hits: %s", total)
+        return self._entities(resp)
+
+    def match_release_generic_fuzzy_contrib(self, release) -> List[ReleaseEntity]:
+        """
+        Only match contribs, if they exist.
+        """
+        if not release.contribs:
+            return []
+        must = self._contrib_token_clauses(release.contribs)
+        if not must:
+            # Without any name token the query would not be restricted by
+            # contributor at all.
+            return []
+        resp, total = self._search(must)
+        if total == 0:
+            return []
+        if total > self.size:
+            self.logger.warning("too many hits: %s", total)
+        return self._entities(resp)
+
+    def match_cascade(self, release, *qs, **kwargs):
+        """
+        Returns the result from the first query that returns a result. All query
+        functions need to be defined on this class (for now).
+
+        Keyword arguments are forwarded to every query function, so each one
+        passed in must accept them.
+        """
+        for q in qs:
+            self.logger.debug("[cascade] %s", q)
+            result = q(release, **kwargs)
+            if len(result) > 0:
+                return result
+        return []
+
+    def match(self, release: Optional[ReleaseEntity]) -> List[ReleaseEntity]:
+        """
+        Match returns a list of match candidates given a release entity.
+
+        Runs the query cascade from most to least specific and returns the
+        result of the first query yielding anything; an empty list, if
+        release is falsy or nothing matches.
+        """
+        if not release:
+            return []
+        return self.match_cascade(
+            release,
+            self.match_release_by_id,
+            self.match_release_exact_title_exact_contrib,
+            self.match_release_exact_title_partial_contrib,
+            self.match_release_exact_title_fuzzy_contrib,
+            self.match_release_exact_title,
+            self.match_release_fuzzy_title_fuzzy_contrib,
+            self.match_release_generic,
+            self.match_release_generic_fuzzy_contrib,
+        )
+
+
+
+
def match_release_fuzzy(
release: ReleaseEntity,
size: int = 5,
@@ -28,6 +436,8 @@ def match_release_fuzzy(
fatcat using Elasticsearch.
TODO: rename "es" parameter to "es_client", which would be clearer
+
+ This function is deprecated; use the FuzzyReleaseMatcher class instead.
"""
assert isinstance(release, ReleaseEntity)
@@ -41,6 +451,17 @@ def match_release_fuzzy(
if api is None:
api = public_api(FATCAT_API_URL)
+ # > query cascade
+ #
+ # [x] 1 exact ids
+ # [ ] 2 exact title and exact contrib
+ # [ ] 3 exact title and fuzzy contrib
+ # [ ] 4 exact title
+ # [ ] 5 title w/o stopwords, fuzzy contrib
+ # [ ] 6 title w/o stopwords
+ # [ ] 7 fuzzy title and fuzzy contrib
+ # [ ] 8 fuzzy whole document
+
# Try to match by external identifier.
# TODO: use api, ability to disable; benchmark
ext_ids = release.ext_ids
@@ -75,105 +496,89 @@ def match_release_fuzzy(
if release.title is not None and release.contribs is not None:
names = " ".join([c.raw_name for c in release.contribs])
- body = {
- "track_total_hits": True,
- "query": {
- "bool": {
- "must": [
- {
- "match": {
- "title": {
- "query": release.title,
- "operator": "AND",
- "fuzziness": "AUTO",
- },
- }
- },
- {
- "match": {
- "contrib_names": {
- "query": names,
- "operator": "AND",
- "fuzziness": "AUTO",
- }
+ query = {
+ "bool": {
+ "must": [
+ {
+ "match": {
+ "title": {
+ "query": release.title,
+ "operator": "AND",
+ "fuzziness": "AUTO",
+ },
+ }
+ },
+ {
+ "match": {
+ "contrib_names": {
+ "query": names,
+ "operator": "AND",
+ "fuzziness": "AUTO",
}
- },
- ],
- },
+ }
+ },
+ ],
},
- "size": size,
}
- resp = es.search(body=body, index="fatcat_release")
+ resp = es.search(query=query, index="fatcat_release", size=size, track_total_hits=True)
if es_compat_hits_total(resp) > 0:
return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api)
- body = {
- "track_total_hits": True,
- "query": {
- "bool": {
- "should": [
- {
- "match": {
- "title": {
- "query": release.title,
- "operator": "AND",
- "fuzziness": "AUTO",
- },
+ query = {
+ "bool": {
+ "should": [
+ {
+ "match": {
+ "title": {
+ "query": release.title,
+ "operator": "AND",
+ "fuzziness": "AUTO",
+ },
+ }
+ },
+ {
+ "match": {
+ "contrib_names": {
+ "query": names,
+ "operator": "AND",
+ "fuzziness": "AUTO",
}
- },
- {
- "match": {
- "contrib_names": {
- "query": names,
- "operator": "AND",
- "fuzziness": "AUTO",
- }
- }
- },
- ],
- },
+ }
+ },
+ ],
},
- "size": size,
}
- resp = es.search(body=body, index="fatcat_release")
+ resp = es.search(query=query, index="fatcat_release", size=size, track_total_hits=True)
if es_compat_hits_total(resp) > 0:
return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api)
# Note: If the title is short, we will get lots of results here; do we need
# to check for title length or result set length length or result set
# length here?
- body = {
- "track_total_hits": True,
- "query": {
- "match": {
- "title": {
- "query": release.title,
- "operator": "AND",
- }
+ query = {
+ "match": {
+ "title": {
+ "query": release.title,
+ "operator": "AND",
}
- },
- "size": size,
+ }
}
- resp = es.search(body=body, index="fatcat_release")
+ resp = es.search(query=query, index="fatcat_release", size=size, track_total_hits=True)
if es_compat_hits_total(resp) > 0:
return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api)
# Get fuzzy.
# https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness
- body = {
- "track_total_hits": True,
- "query": {
- "match": {
- "title": {
- "query": release.title,
- "operator": "AND",
- "fuzziness": "AUTO",
- }
+ query = {
+ "match": {
+ "title": {
+ "query": release.title,
+ "operator": "AND",
+ "fuzziness": "AUTO",
}
- },
- "size": size,
+ }
}
- resp = es.search(body=body, index="fatcat_release")
+ resp = es.search(query=query, index="fatcat_release", size=size, track_total_hits=True)
if es_compat_hits_total(resp) > 0:
return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api)