From 0c84af603894049dd8edd95da18d8990ab0516d1 Mon Sep 17 00:00:00 2001
From: Martin Czygan
Date: Fri, 5 Nov 2021 17:19:07 +0100
Subject: turn "match_release_fuzzy" into a class

The goal of this refactoring was to make the matching process a bit more
configurable by using a class and a cascade of queries. For a limited test
set, `FuzzyReleaseMatcher.match` works the same as `match_release_fuzzy`.
---
 fuzzycat/contrib.py             | 453 ++++++++++++++++++++++++++++++++
 fuzzycat/grobid_unstructured.py |   2 +-
 fuzzycat/matching.py            | 559 ++++++++++++++++++++++++++++++++++------
 fuzzycat/sandcrawler.py         |   5 +-
 fuzzycat/verify.py              |   7 +-
 5 files changed, 942 insertions(+), 84 deletions(-)
 create mode 100644 fuzzycat/contrib.py

(limited to 'fuzzycat')

diff --git a/fuzzycat/contrib.py b/fuzzycat/contrib.py
new file mode 100644
index 0000000..93753ab
--- /dev/null
+++ b/fuzzycat/contrib.py
@@ -0,0 +1,453 @@
+"""
+Contrib related comparisons.
+
+Example: an ngram-based ContribMatcher, which compares two normalized raw
+name token sets with a Jaccard index.
+
+    matcher = ContribMatcher(cmp=JaccardIndexThreshold(0.5),
+                             pipeline=Pipeline([
+                                 lambda rc: rc.raw_name,
+                                 str.strip,
+                                 str.lower,
+                                 Ngram(n=3),
+                                 set,
+                             ]))
+    result = matcher.compare(a, b)
+    ...
+
+Some notes from the dataset.
+
+* 692,893,828 contribs
+* 64,680,311 uniq
+
+Top contrib names; many appear on datasets, which explains the high counts.
+
+3069752 Kessy Abarenkov
+2383819 Leho Tedersoo
+2383748 Karl-Henrik Larsson
+2383745 Urmas Kõljalg
+2383699 Mohammad Bahram
+2383692 Martin Ryberg
+2382832 R. Henrik Nilsson
+1702455 Markus Döring
+1702427 Tom May
+1702415 Dmitry Schigel
+1702391 Santiago Sánchez-Ramírez
+ 841475 GLIS Of The ITPGRFA
+ 682144 James Scott
+ 682053 Michael Weiss
+ 681404 Susumu Takamatsu
+ 681388 A. Elizabeth Arnold
+ 681347 Artur Alves
+ 681341 Ellen Larsson
+ 681338 Maarja Öpik
+ 681335 Ursula Eberhardt
+ 681324 Nhu Nguyen
+ 681293 Otto Miettinen
+ 681292 Viacheslav Spirin
+ 681287 Gareth W. Griffith
+ 681283 Bálint Dima
+ 681278 Ursula Peintner
+ 681276 Tuula Niskanen
+ 681276 Olinto Liparini Pereira
+ 681275 Kare Liimatainen
+"""
+
+import collections
+import functools
+import itertools
+import logging
+import operator
+import re
+import string
+from typing import Any, Callable, List, Optional, Set
+
+import jellyfish
+from thefuzz import process
+from fatcat_openapi_client import ReleaseContrib
+
+logger = logging.getLogger("fuzzycat")
+
+
+class Ngram:
+    """
+    Turn a string into a list of overlapping tokens.
+    """
+    def __init__(self, n: int = 3):
+        if n < 1:
+            raise ValueError("positive n required")
+        self.n = n
+
+    def __call__(self, s: str) -> List[str]:
+        if 0 < len(s) < self.n:
+            return [s]
+        return [s[i:i + self.n] for i in range(len(s) - self.n + 1)]
+
+
+class JaccardIndexThreshold:
+    """
+    A Jaccard index threshold that can be used to compare two sets. Two empty
+    sets are equal.
+    """
+    def __init__(self, threshold: float = 0.5, verbose=False):
+        self.threshold = threshold
+        self.verbose = verbose
+
+    def __call__(self, a: Set, b: Set) -> bool:
+        if len(a) == 0 and len(b) == 0:
+            return True
+        index = len(a & b) / len(a | b)
+        if self.verbose:
+            logger.debug("[jaccard] {}".format(index))
+        return index >= self.threshold
+
+
+class FuzzyStringSimilarity:
+    """
+    For two sets of strings, run fuzzy matching with "thefuzz" -
+    https://github.com/seatgeek/thefuzz - which among other things uses
+    Levenshtein distance.
+
+    The min ratio can range from 0 to 100 (with 100 allowing exact matches
+    only).
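+
+    A sketch of the intended use (the sets below are illustrative):
+
+        sim = FuzzyStringSimilarity(min_ratio=75)
+        sim({"jane austen"}, {"jane austen", "j austen"})  # exact hit -> True
+        sim({"xqz"}, {"abc"})                              # dissimilar -> False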
+ """ + def __init__(self, min_ratio=75): + self.min_ratio = min_ratio + + def __call__(self, a: Set, b: Set) -> bool: + agg = 0 + for v in a: + match, score = thefuzz.exctractOne(v, b) + agg += score + return score > self.min_ratio + + +class Pipeline: + """ + A list of functions to execute, f -> g -> h, etc. Note that the output + type of f needs to match the input type of g, etc. + """ + def __init__(self, pipeline: Optional[List[Any]] = None, verbose: bool = False): + self.verbose = verbose + if pipeline is None: + self.pipeline = [ + lambda v: v, + ] + else: + self.pipeline = pipeline + + def run(self, value: Any) -> Any: + v = value + for i, f in enumerate(self.pipeline, start=1): + v = f(v) + if self.verbose: + logger.debug("[{}/{}] {}".format(i, len(self.pipeline), v)) + return v + + def __call__(self, value: Any, verbose: bool = False) -> Any: + self.verbose = verbose + return self.run(value) + + +# default_release_contrib_pipeline normalizes the raw name. +default_release_contrib_pipeline = Pipeline([ + lambda rc: rc.raw_name, + str.strip, + str.lower, +]) + +# default_release_contrib_list_pipeline turns contribs list into a contrib set. +default_release_contrib_list_pipeline = Pipeline([ + lambda seq: set((c.raw_name for c in seq)), +]) + + +class ContribMatcher: + """ + Compare two contrib entities and determine a match status, based on some + configuration. The final values of the `pipeline` will be compared with + `cmp`, which by default is equality. + + Other `cmp` options may generate ngrams and use jaccard index with some + threshold or decide on a string similarity metric. + + This is essentially just a shell, the various comparison methods live in + the tuple (pipeline, cmp). + """ + def __init__(self, + pipeline: Optional[List[Any]] = default_release_contrib_list_pipeline, + cmp: Callable[[Any, Any], bool] = operator.__eq__): + self.pipeline = pipeline + self.cmp = cmp + + def compare(self, a: ReleaseContrib, b: ReleaseContrib) -> bool: + """ + Compare returns True, if a and b are considered the same, given a + transformation pipeline and a comparison operation. + """ + u = self.pipeline(a) + v = self.pipeline(b) + return self.cmp(u, v) + + +class ContribListMatcher: + """ + Compare two lists of contribs. Each contrib entry is passed through the + same pipeline. + + Often two issues (separate or combined). + + - contrib missing, e.g. + - "Gentle Sunder Shrestha", "Gentle S Shrestha", "S. Shrestha", "Gentle Shrestha", ... + """ + def __init__(self, + pipeline: Optional[List[Any]] = default_release_contrib_list_pipeline, + cmp: Callable[[Any, Any], bool] = JaccardIndexThreshold(1.0)): + self.pipeline = pipeline + self.cmp = cmp + + def compare(self, + a: List[ReleaseContrib], + b: List[ReleaseContrib], + verbose: bool = False) -> bool: + """ + Compare two lists of contribs, pass each one through the pipeline. The + result may be a list or any other type. The comparison function needs + to be compatible. 
+ """ + u = self.pipeline(a, verbose=verbose) + v = self.pipeline(b, verbose=verbose) + return self.cmp(u, v) + + +def cleanup_single_ws(s: str) -> str: + return re.sub(r"[ ]{2,}", " ", s) + + +def cleanup_remove_ws(s: str) -> str: + return re.sub(r"[\n\r\t\s]*", '', s) + + +def cleanup_keep_letters_digits_ws(s: str) -> str: + return ''.join((c for c in s if c in string.ascii_letters + string.digits + " ")) + + +def test_cleanup_single_ws(): + Case = collections.namedtuple("Case", "s result") + cases = ( + Case("", ""), + Case("abc", "abc"), + Case("abc abc", "abc abc"), + Case("abc abc", "abc abc"), + Case(" abc abc", " abc abc"), + Case(" abc abc", " abc abc"), + ) + for c in cases: + assert c.result == cleanup_single_ws(c.s) + + +def test_cleanup_remove_ws(): + Case = collections.namedtuple("Case", "s result") + cases = ( + Case("", ""), + Case("abc", "abc"), + Case("abc abc", "abcabc"), + Case("abc abc", "abcabc"), + Case(" abc abc", "abcabc"), + ) + for c in cases: + assert c.result == cleanup_remove_ws(c.s), c + + +def test_ngram(): + Case = collections.namedtuple("Case", "s n result") + cases = ( + Case("", 1, []), + Case("", 2, []), + Case("a", 2, ["a"]), + Case("ab", 2, ["ab"]), + Case("abcdef", 2, ['ab', 'bc', 'cd', 'de', 'ef']), + Case("abcdef", 4, ['abcd', 'bcde', 'cdef']), + Case("Nina Rogo", 3, ["Nin", "ina", "na ", "a R", " Ro", "Rog", "ogo"]), + ) + for c in cases: + ngram = Ngram(n=c.n) + assert ngram(c.s) == c.result + + +def test_pipeline(): + Case = collections.namedtuple("Case", "pipeline input result") + cases = (Case(Pipeline([lambda v: v["a"], str.strip, str.lower, + Ngram(n=3), set]), {"a": " X123 "}, {'123', 'x12'})), + for c in cases: + result = c.pipeline(c.input) + assert result == c.result + + +def test_jaccard_index_threshold(): + Case = collections.namedtuple("Case", "a b threshold result") + cases = ( + Case(set(), set(), 1.0, True), + Case(set(), set(["a"]), 1.0, False), + Case(set(["a"]), set(["a"]), 1.0, True), + Case(set(["a"]), set(["a", "b"]), 1.0, False), + Case(set(["a"]), set(["a", "b"]), 0.5, True), + Case(set(["a"]), set(["a", "b", "c"]), 0.5, False), + ) + for c in cases: + jit = JaccardIndexThreshold(threshold=c.threshold) + result = jit(c.a, c.b) + assert result == c.result + + +def test_ngram_contrib_matcher(caplog): + Case = collections.namedtuple("Case", "a b result") + cases = ( + Case( + ReleaseContrib(raw_name="Jane Austen"), + ReleaseContrib(raw_name="J.Austen"), + True, + ), + Case( + ReleaseContrib(raw_name="Fjodor Dostojewski"), + ReleaseContrib(raw_name="Fyodor Dostoevsky"), + False, + ), + Case( + ReleaseContrib(raw_name="Fjodor M. 
Dostojewski"), + ReleaseContrib(raw_name="Fyodor Dostoevsky"), + False, + ), + Case( + ReleaseContrib(raw_name="Fjodor Michailowitsch Dostojewski"), + ReleaseContrib(raw_name="Fyodor Dostoevsky"), + False, + ), + ) + matcher = ContribMatcher(cmp=JaccardIndexThreshold(0.4, verbose=True), + pipeline=Pipeline([ + lambda rc: rc.raw_name, + str.strip, + str.lower, + cleanup_remove_ws, + cleanup_keep_letters_digits_ws, + Ngram(n=3), + set, + ], + verbose=True)) + for c in cases: + with caplog.at_level(logging.DEBUG): + result = matcher.compare(c.a, c.b) + assert result == c.result + + +def test_jellyfish_soundex_contrib_matcher(caplog): + Case = collections.namedtuple("Case", "a b result") + cases = ( + Case( + ReleaseContrib(raw_name="Jane Austen"), + ReleaseContrib(raw_name="J.Austen"), + True, + ), + Case( + ReleaseContrib(raw_name="Fjodor Dostojewski"), + ReleaseContrib(raw_name="Fyodor Dostoevsky"), + False, + ), + Case( + ReleaseContrib(raw_name="Fjodor M. Dostojewski"), + ReleaseContrib(raw_name="Fyodor Dostoevsky"), + False, + ), + Case( + ReleaseContrib(raw_name="Fjodor Michailowitsch Dostojewski"), + ReleaseContrib(raw_name="Fyodor Dostoevsky"), + False, + ), + ) + matcher = ContribMatcher(cmp=JaccardIndexThreshold(0.3, verbose=True), + pipeline=Pipeline([ + lambda rc: rc.raw_name, + str.strip, + str.lower, + functools.partial(re.sub, r"[.;]", " "), + cleanup_keep_letters_digits_ws, + lambda s: set((jellyfish.soundex(v) for v in s.split())), + ], + verbose=True)) + for c in cases: + with caplog.at_level(logging.DEBUG): + result = matcher.compare(c.a, c.b) + assert result == c.result + + +def test_jellyfish_nysiis_contrib_matcher(caplog): + Case = collections.namedtuple("Case", "a b result") + cases = ( + Case( + ReleaseContrib(raw_name="Jane Austen"), + ReleaseContrib(raw_name="J.Austen"), + True, + ), + Case( + ReleaseContrib(raw_name="Fjodor Dostojewski"), + ReleaseContrib(raw_name="Fyodor Dostoevsky"), + False, + ), + Case( + ReleaseContrib(raw_name="Fjodor M. Dostojewski"), + ReleaseContrib(raw_name="Fyodor Dostoevsky"), + False, + ), + Case( + ReleaseContrib(raw_name="Fjodor Michailowitsch Dostojewski"), + ReleaseContrib(raw_name="Fyodor Dostoevsky"), + False, + ), + ) + matcher = ContribMatcher(cmp=JaccardIndexThreshold(0.3, verbose=True), + pipeline=Pipeline([ + lambda rc: rc.raw_name, + str.strip, + str.lower, + functools.partial(re.sub, r"[.;]", " "), + cleanup_keep_letters_digits_ws, + lambda s: set((jellyfish.nysiis(v) for v in s.split())), + ], + verbose=True)) + for c in cases: + with caplog.at_level(logging.DEBUG): + result = matcher.compare(c.a, c.b) + assert result == c.result + + +def test_default_contrib_list_matcher(caplog): + Case = collections.namedtuple("Case", "a b result") + cases = ( + Case( + [], + [], + True, + ), + Case( + [ReleaseContrib(raw_name="Michael Jordan")], + [ReleaseContrib(raw_name="Michael Jordan")], + True, + ), + Case( + [ReleaseContrib(raw_name="Michael Jordan")], + [ReleaseContrib(raw_name="michael jordan")], + False, + ), + Case( + [ReleaseContrib(raw_name="Amadeu Llebaria")], + [ReleaseContrib(raw_name="A. 
Llebaria")], + False, + ), + ) + matcher = ContribListMatcher() + for c in cases: + with caplog.at_level(logging.DEBUG): + result = matcher.compare(c.a, c.b, verbose=True) + assert result == c.result diff --git a/fuzzycat/grobid_unstructured.py b/fuzzycat/grobid_unstructured.py index 7470cd7..1765f42 100644 --- a/fuzzycat/grobid_unstructured.py +++ b/fuzzycat/grobid_unstructured.py @@ -15,7 +15,7 @@ from typing import Optional import requests from fatcat_openapi_client import ReleaseContrib, ReleaseEntity, ReleaseExtIds -from grobid_tei_xml import parse_citation_xml, GrobidBiblio +from grobid_tei_xml import GrobidBiblio, parse_citation_xml from fuzzycat.config import settings from fuzzycat.utils import clean_doi diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py index b358899..1531ac6 100644 --- a/fuzzycat/matching.py +++ b/fuzzycat/matching.py @@ -1,22 +1,430 @@ +import logging import os import re import sys -from typing import List, Optional, Type, Union +from typing import Any, List, Optional, Type, Union import elasticsearch import elasticsearch_dsl import fatcat_openapi_client import requests -from fatcat_openapi_client import ContainerEntity, DefaultApi, ReleaseEntity +from fatcat_openapi_client import (ContainerEntity, DefaultApi, ReleaseContrib, ReleaseEntity) from fatcat_openapi_client.rest import ApiException from fuzzycat.config import settings +from fuzzycat.contrib import (ContribListMatcher, FuzzyStringSimilarity, JaccardIndexThreshold, + Pipeline) from fuzzycat.entities import entity_from_dict, entity_from_json from fuzzycat.utils import es_compat_hits_total FATCAT_API_URL = settings.get("FATCAT_API_URL", "https://api.fatcat.wiki/v0") +class FuzzyReleaseMatcher: + """ + FuzzyReleaseMatcher tries to find similar items to a given release in + elasticsearch. Exact matches first, then fuzzy. + + In the best case, elasticsearch would automatically rank the most relevant + docs first, even with partial data. We still try to steer the matches by + using a query cascade. This is configurable. The last query should be a + generic. + + The goal here is to get a set of potential matches; verification has to. + happen separately. + + TODO: + + Example case not yet working well ("Stuehrenberg" vs "Stührenberg"): + + >>> result = matcher.match(entity_from_dict({"title": "internet archive", + "contribs": [{"raw_name": + "Stührenberg"}], + "ext_ids": {}}, + ReleaseEntity)) + + > Should return: https://fatcat.wiki/release/pu7e7tbctna2foqyyxztfw3ufy, + https://fatcat.wiki/release/search?q=St%C3%BChrenberg+internet+archive&generic=1 + (not returning anything via frontend either) + + Make sure we can switch from function to class: + + * [ ] 5 test cases for both + + """ + def __init__(self, es="https://search.fatcat.wiki", api=None, index="fatcat_release", size=10): + if isinstance(es, str): + self.es = elasticsearch.Elasticsearch([es]) + else: + self.es = es if es else elasticsearch.Elasticsearch() + self.api = api if api else public_api(FATCAT_API_URL) + self.index = index + self.size = size + self.logger = logging.getLogger("fuzzy") + + def match_release_by_id(self, release, **kwargs) -> List[ReleaseEntity]: + """ + Check for exact matches by identifier. 
+ """ + ext_ids = release.ext_ids + attrs = ( + "doi", + "pmid", + "wikidata_qid", + "core", + "pmcid", + "arxiv", + "dblp", + "doaj", + "jstor", + "isbn13", + "ark", + "mag", + "oai", + ) + for attr in attrs: + value = getattr(ext_ids, attr) + if not value: + continue + try: + r = self.api.lookup_release(**{attr: value}) + except fatcat_openapi_client.rest.ApiException as err: + if err.status in [404, 400]: + r = None + else: + raise err + if r: + return [r] + return [] + + def match_release_exact_title_exact_contrib(self, release): + """ + Match exact title and exact contrib names. Case insensitive, order of + contribs does not matter. + """ + if release.title is None or release.contribs is None: + return [] + contrib_queries = [{ + "match": { + "contrib_names": { + "query": contrib.raw_name, + "operator": "AND", + } + } + } for contrib in release.contribs] + query = { + "bool": { + "must": [{ + "match": { + "title": { + "query": release.title, + "operator": "AND", + }, + } + }] + contrib_queries, + }, + } + result = [] + + resp = self.es.search(query=query, size=self.size, track_total_hits=True, index=self.index) + hits_total = es_compat_hits_total(resp) + if hits_total == 0: + return result + if hits_total > self.size: + self.logger.warn('more than {} hits: {}'.format(self.size, hits_total)) + + entities = response_to_entity_list(resp, + entity_type=ReleaseEntity, + size=self.size, + api=self.api) + + # Require overlap of contrib. + matcher = ContribListMatcher( + cmp=JaccardIndexThreshold(1.0), + pipeline=Pipeline([ + lambda contribs: set((c.raw_name.strip().lower() for c in contribs)), + ]), + ) + + for re in entities: + if re.title.strip().lower() != release.title.strip().lower(): + continue + if not matcher.compare(re.contribs, release.contribs): + continue + result.append(re) + return result + + def match_release_exact_title_partial_contrib(self, release): + """ + Allow for exact authors, but ok, if some are missing. + """ + if release.title is None or release.contribs is None: + return [] + contrib_queries = [{ + "match": { + "contrib_names": { + "query": contrib.raw_name, + "operator": "AND", + } + } + } for contrib in release.contribs] + query = { + "bool": { + "must": [{ + "match": { + "title": { + "query": release.title, + "operator": "AND", + }, + } + }] + contrib_queries, + }, + } + result = [] + resp = self.es.search(query=query, size=self.size, track_total_hits=True, index=self.index) + if es_compat_hits_total(resp) == 0: + return result + if es_compat_hits_total(resp) > self.size: + raise NotImplementedError('result set too large: {}'.format(es)) + entities = response_to_entity_list(resp, + entity_type=ReleaseEntity, + size=self.size, + api=self.api) + + # Require at least half the contribs to be shared. + matcher = ContribListMatcher( + cmp=JaccardIndexThreshold(0.5), + pipeline=Pipeline([ + lambda contribs: set((c.raw_name.strip().lower() for c in contribs)), + ]), + ) + + for re in entities: + if re.title.strip().lower() != release.title.strip().lower(): + continue + if not matcher.compare(re.contribs, release.contribs): + continue + result.append(re) + return result + + def match_release_exact_title_fuzzy_contrib(self, release): + """ + Exact title but ok it authors differ (slightly). 
+ """ + if release.title is None or release.contribs is None: + return [] + contrib_tokens = [tok for c in release.contribs for tok in c.raw_name.split()] + contrib_queries = [{ + "match": { + "contrib_names": { + "query": token, + } + } + } for token in contrib_tokens] + query = { + "bool": { + "must": [{ + "match": { + "title": { + "query": release.title, + "operator": "AND", + }, + } + }] + contrib_queries, + }, + } + result = [] + resp = self.es.search(query=query, size=self.size, track_total_hits=True, index=self.index) + if es_compat_hits_total(resp) == 0: + return result + if es_compat_hits_total(resp) > self.size: + raise NotImplementedError('todo: scroll required for larger result sets: {}'.format(es)) + entities = response_to_entity_list(resp, + entity_type=ReleaseEntity, + size=self.size, + api=self.api) + + matcher = ContribListMatcher( + cmp=FuzzyStringSimilarity(min_ratio=60), + pipeline=Pipeline([ + lambda contribs: set((c.raw_name.strip().lower() for c in contribs)), + ]), + ) + + for re in entities: + if re.title.strip().lower() != release.title.strip().lower(): + continue + if not matcher.compare(re.contribs, release.contribs): + continue + result.append(re) + return result + + def match_release_exact_title(self, release): + """ + Exact title, but any author. For common titles, this will yield 100s or + 1000s or results. + """ + if release.title is None: + return [] + query = { + "bool": { + "must": [{ + "match": { + "title": { + "query": release.title, + "operator": "AND", + }, + } + }], + }, + } + result = [] + resp = self.es.search(query=query, size=self.size, track_total_hits=True, index=self.index) + if es_compat_hits_total(resp) == 0: + return result + if es_compat_hits_total(resp) > self.size: + self.logger.warn('too many hits: {}'.format(es_compat_hits_total(resp))) + entities = response_to_entity_list(resp, + entity_type=ReleaseEntity, + size=self.size, + api=self.api) + for re in entities: + if re.title.strip().lower() != release.title.strip().lower(): + continue + result.append(re) + return result + + def match_release_fuzzy_title_fuzzy_contrib(self, release): + """ + Using elasticsearch fuzziness option (which is not that fuzzy). + """ + if release.title is None or release.contribs is None: + return [] + contrib_tokens = [tok for c in release.contribs for tok in c.raw_name.split()] + contrib_queries = [{ + "match": { + "contrib_names": { + "query": token, + } + } + } for token in contrib_tokens] + query = { + "bool": { + "must": [ + { + "match": { + "title": { + "query": release.title, + "operator": "AND", + "fuzziness": "AUTO", + }, + } + }, + ] + contrib_queries, + }, + } + result = [] + resp = self.es.search(query=query, size=self.size, track_total_hits=True, index=self.index) + if es_compat_hits_total(resp) == 0: + return result + if es_compat_hits_total(resp) > self.size: + raise ValueError('too many hits: {}'.format(es_compat_hits_total(resp))) + entities = response_to_entity_list(resp, + entity_type=ReleaseEntity, + size=self.size, + api=self.api) + return entities + + def match_release_generic(self, release): + """ + Final catch all variant via title. 
+ """ + if release.title is None: + return [] + query = { + "bool": { + "must": [ + { + "match": { + "title": { + "query": release.title, + "operator": "OR", + "fuzziness": "AUTO", + }, + } + }, + ], + }, + } + result = [] + resp = self.es.search(query=query, size=self.size, track_total_hits=True, index=self.index) + if es_compat_hits_total(resp) == 0: + return result + if es_compat_hits_total(resp) > self.size: + self.logger.warn('too many hits: {}'.format(es_compat_hits_total(resp))) + entities = response_to_entity_list(resp, + entity_type=ReleaseEntity, + size=self.size, + api=self.api) + return entities + + def match_release_generic_fuzzy_contrib(self, release): + """ + Only match contribs, if they exist. + """ + if release.contribs is None: + return [] + contrib_tokens = [tok for c in release.contribs for tok in c.raw_name.split()] + contrib_queries = [{ + "match": { + "contrib_names": { + "query": token, + } + } + } for token in contrib_tokens] + query = { + "bool": { + "must": contrib_queries, + }, + } + result = [] + resp = self.es.search(query=query, size=self.size, track_total_hits=True, index=self.index) + if es_compat_hits_total(resp) == 0: + return result + if es_compat_hits_total(resp) > self.size: + self.logger.warn('too many hits: {}'.format(es_compat_hits_total(resp))) + entities = response_to_entity_list(resp, + entity_type=ReleaseEntity, + size=self.size, + api=self.api) + return entities + + def match_cascade(self, release, *qs, **kwargs): + """ + Returns the result from the first query that returns a result. All query + functions need to be defined on this class (for now). + """ + for q in qs: + self.logger.debug("[cascade] {}".format(q)) + result = q(release, **kwargs) + if len(result) > 0: + return result + return [] + + def match(self, release: Optional[ReleaseEntity]) -> List[ReleaseEntity]: + """ + Match returns a list of match candidates given a release entity. + """ + if not release: + return [] + return self.match_cascade( + release, self.match_release_by_id, self.match_release_exact_title_exact_contrib, + self.match_release_exact_title_partial_contrib, + self.match_release_exact_title_fuzzy_contrib, self.match_release_exact_title, + self.match_release_fuzzy_title_fuzzy_contrib, self.match_release_generic, + self.match_release_generic_fuzzy_contrib) + + def match_release_fuzzy( release: ReleaseEntity, size: int = 5, @@ -28,6 +436,8 @@ def match_release_fuzzy( fatcat using Elasticsearch. TODO: rename "es" parameter to "es_client", which would be clearer + + This is deprecated, move to matcher class. """ assert isinstance(release, ReleaseEntity) @@ -41,6 +451,17 @@ def match_release_fuzzy( if api is None: api = public_api(FATCAT_API_URL) + # > query cascade + # + # [x] 1 exact ids + # [ ] 2 exact title and exact contrib + # [ ] 3 exact title and fuzzy contrib + # [ ] 4 exact title + # [ ] 5 title w/o stopwords, fuzzy contrib + # [ ] 6 title w/o stopwords + # [ ] 7 fuzzy title and fuzzy contrib + # [ ] 8 fuzzy whole document + # Try to match by external identifier. 
     # TODO: use api, ability to disable; benchmark
     ext_ids = release.ext_ids
@@ -75,105 +496,89 @@ def match_release_fuzzy(
 
     if release.title is not None and release.contribs is not None:
         names = " ".join([c.raw_name for c in release.contribs])
-        body = {
-            "track_total_hits": True,
-            "query": {
-                "bool": {
-                    "must": [
-                        {
-                            "match": {
-                                "title": {
-                                    "query": release.title,
-                                    "operator": "AND",
-                                    "fuzziness": "AUTO",
-                                },
-                            }
-                        },
-                        {
-                            "match": {
-                                "contrib_names": {
-                                    "query": names,
-                                    "operator": "AND",
-                                    "fuzziness": "AUTO",
-                                }
-                            }
-                        },
-                    ],
-                },
-            },
-            "size": size,
-        }
+        query = {
+            "bool": {
+                "must": [
+                    {
+                        "match": {
+                            "title": {
+                                "query": release.title,
+                                "operator": "AND",
+                                "fuzziness": "AUTO",
+                            },
+                        }
+                    },
+                    {
+                        "match": {
+                            "contrib_names": {
+                                "query": names,
+                                "operator": "AND",
+                                "fuzziness": "AUTO",
+                            }
+                        }
+                    },
+                ],
+            },
+        }
-        resp = es.search(body=body, index="fatcat_release")
+        resp = es.search(query=query, index="fatcat_release", size=size, track_total_hits=True)
         if es_compat_hits_total(resp) > 0:
             return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api)
 
-        body = {
-            "track_total_hits": True,
-            "query": {
-                "bool": {
-                    "should": [
-                        {
-                            "match": {
-                                "title": {
-                                    "query": release.title,
-                                    "operator": "AND",
-                                    "fuzziness": "AUTO",
-                                },
-                            }
-                        },
-                        {
-                            "match": {
-                                "contrib_names": {
-                                    "query": names,
-                                    "operator": "AND",
-                                    "fuzziness": "AUTO",
-                                }
-                            }
-                        },
-                    ],
-                },
-            },
-            "size": size,
-        }
+        query = {
+            "bool": {
+                "should": [
+                    {
+                        "match": {
+                            "title": {
+                                "query": release.title,
+                                "operator": "AND",
+                                "fuzziness": "AUTO",
+                            },
+                        }
+                    },
+                    {
+                        "match": {
+                            "contrib_names": {
+                                "query": names,
+                                "operator": "AND",
+                                "fuzziness": "AUTO",
+                            }
+                        }
+                    },
+                ],
+            },
+        }
-        resp = es.search(body=body, index="fatcat_release")
+        resp = es.search(query=query, index="fatcat_release", size=size, track_total_hits=True)
         if es_compat_hits_total(resp) > 0:
             return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api)
 
     # Note: If the title is short, we will get lots of results here; do we
     # need to check for title length or result set length here?
-    body = {
-        "track_total_hits": True,
-        "query": {
-            "match": {
-                "title": {
-                    "query": release.title,
-                    "operator": "AND",
-                }
-            }
-        },
-        "size": size,
-    }
+    query = {
+        "match": {
+            "title": {
+                "query": release.title,
+                "operator": "AND",
+            }
+        }
+    }
-    resp = es.search(body=body, index="fatcat_release")
+    resp = es.search(query=query, index="fatcat_release", size=size, track_total_hits=True)
     if es_compat_hits_total(resp) > 0:
         return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api)
 
     # Get fuzzy.
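     # "AUTO" fuzziness derives the allowed edit distance from term length:
     # roughly 0 for 1-2 characters, 1 for 3-5, 2 for anything longer; see
     # the reference below.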
     # https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness
-    body = {
-        "track_total_hits": True,
-        "query": {
-            "match": {
-                "title": {
-                    "query": release.title,
-                    "operator": "AND",
-                    "fuzziness": "AUTO",
-                }
-            }
-        },
-        "size": size,
-    }
+    query = {
+        "match": {
+            "title": {
+                "query": release.title,
+                "operator": "AND",
+                "fuzziness": "AUTO",
+            }
+        }
+    }
-    resp = es.search(body=body, index="fatcat_release")
+    resp = es.search(query=query, index="fatcat_release", size=size, track_total_hits=True)
     if es_compat_hits_total(resp) > 0:
         return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api)
diff --git a/fuzzycat/sandcrawler.py b/fuzzycat/sandcrawler.py
index 958756a..63b85e6 100644
--- a/fuzzycat/sandcrawler.py
+++ b/fuzzycat/sandcrawler.py
@@ -1,6 +1,7 @@
-import regex
 import unicodedata
 
+import regex
+
 # from http://zderadicka.eu/removing-diacritics-marks-from-strings/
 SANDCRAWLER_CHAR_MAP = {
     '\N{Latin capital letter AE}': 'AE',
@@ -63,6 +64,7 @@ SANDCRAWLER_REMOVE_CHAR_REGEX = regex.compile(
     r"[\s\p{Punctuation}\p{M}\p{InCombiningDiacriticalMarks}\u2000-\u206F\u2E00-\u2E7F’·“”‘’“”«»「」¿–±§_`°ʖ©®¤=<>|+$^~≈√∫≤≥÷ƒ∆¬£¢∞¥◊€]"
 )
 
+
 def sandcrawler_slugify(raw: str) -> str:
     """
     Python re-implementation of sandcrawler Scala code for string comparison
@@ -155,4 +157,3 @@ def test_sandcrawler_slugify() -> None:
             print(unicodedata.name(c))
         print(in_str)
         assert sandcrawler_slugify(in_str) == out_str
-
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 9eb808b..f570511 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -233,10 +233,9 @@ def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]:
         if has_doi_prefix(a_doi, "10.3403") and has_doi_prefix(b_doi, "10.3403"):
             if a_doi + "u" == b_doi or b_doi + "u" == a_doi:
                 return Verify(Status.STRONG, Reason.CUSTOM_BSI_UNDATED)
-        if a_title == b_title and ((dict_has_key(a, "extra.subtitle")
-                                    and not dict_has_key(b, "extra.subtitle")) or
-                                   (dict_has_key(b, "extra.subtitle")
-                                    and not dict_has_key(a, "extra.subtitle"))):
+        if a_title == b_title and (
+                (dict_has_key(a, "extra.subtitle") and not dict_has_key(b, "extra.subtitle")) or
+                (dict_has_key(b, "extra.subtitle") and not dict_has_key(a, "extra.subtitle"))):
             return Verify(Status.STRONG, Reason.CUSTOM_BSI_SUBDOC)
     except PathAccessError:
         pass
-- 
cgit v1.2.3