diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-11-05 17:19:07 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-11-16 18:58:42 +0100 |
commit | 0c84af603894049dd8edd95da18d8990ab0516d1 (patch) | |
tree | 08fb4ad2b3a498e2edac73972f97e427e0194759 /fuzzycat/contrib.py | |
parent | 282f315c6ba3643c8c614220ab2f7e1d55de3658 (diff) | |
download | fuzzycat-0c84af603894049dd8edd95da18d8990ab0516d1.tar.gz fuzzycat-0c84af603894049dd8edd95da18d8990ab0516d1.zip |
turn "match_release_fuzzy" into a class
Goal of this refactoring was to make the matching process a bit more
configurable by using a class and a cascade of queries.
For a limited test set: `FuzzyReleaseMatcher.match` is works the same as
`match_release_fuzzy`.
Diffstat (limited to 'fuzzycat/contrib.py')
-rw-r--r-- | fuzzycat/contrib.py | 453 |
1 files changed, 453 insertions, 0 deletions
diff --git a/fuzzycat/contrib.py b/fuzzycat/contrib.py new file mode 100644 index 0000000..93753ab --- /dev/null +++ b/fuzzycat/contrib.py @@ -0,0 +1,453 @@ +""" +Contrib related comparisons. + +Example: NgramContribMatcher, which compares two normalized raw name tokens +with a jaccard index. + + matcher = ContribMatcher(cmp=JaccardIndexThreshold(0.5), + pipeline=Pipeline([ + lambda rc: rc.raw_name, + str.strip, + str.lower, + Ngram(n=3), + set, + ])) + result = matcher.compare(a, b) + ... + +Some notes from the dataset. + +* 692,893,828 contribs +* 64,680,311 uniq + +Top contrib names, many have their name on datasets, which explains the high +number. + +3069752 Kessy Abarenkov +2383819 Leho Tedersoo +2383748 Karl-Henrik Larsson +2383745 Urmas Kõljalg +2383699 Mohammad Bahram +2383692 Martin Ryberg +2382832 R. Henrik Nilsson +1702455 Markus Döring +1702427 Tom May +1702415 Dmitry Schigel +1702391 Santiago Sánchez-Ramírez + 841475 GLIS Of The ITPGRFA + 682144 James Scott + 682053 Michael Weiss + 681404 Susumu Takamatsu + 681388 A. Elizabeth Arnold + 681347 Artur Alves + 681341 Ellen Larsson + 681338 Maarja Öpik + 681335 Ursula Eberhardt + 681324 Nhu Nguyen + 681293 Otto Miettinen + 681292 Viacheslav Spirin + 681287 Gareth W. Griffith + 681283 Bálint Dima + 681278 Ursula Peintner + 681276 Tuula Niskanen + 681276 Olinto Liparini Pereira + 681275 Kare Liimatainen +""" + +import collections +import functools +import itertools +import logging +import operator +import re +import string +from typing import Any, Callable, List, Optional, Set + +import jellyfish +import thefuzz +from fatcat_openapi_client import ReleaseContrib + +logger = logging.getLogger("fuzzycat") + + +class Ngram: + """ + Turn a string into a list of overlapping tokens. + """ + def __init__(self, n: int = 3): + if n < 1: + raise ValueError("positive n required") + self.n = n + + def __call__(self, s: str) -> List[str]: + if 0 < len(s) < self.n: + return [s] + return [s[i:i + self.n] for i in range(len(s) - self.n + 1)] + + +class JaccardIndexThreshold: + """ + A Jaccard index threshold that can be used to compare two sets. Two empty + sets are equal. + """ + def __init__(self, threshold: float = 0.5, verbose=False): + self.threshold = threshold + self.verbose = verbose + + def __call__(self, a: Set, b: Set) -> bool: + if len(a) == 0 and len(b) == 0: + return True + index = len(a & b) / len(a | b) + if self.verbose: + logger.debug("[jaccard] {}".format(index)) + return index >= self.threshold + + +class FuzzyStringSimilarity: + """ + For two sets of strings, run fuzzy matching with "thefuzz" - + https://github.com/seatgeek/thefuzz, which among other things uses + Levenshtein distance. + + The min ratio can range from 0 to 100 (with 100 allowing exact matches + only). + """ + def __init__(self, min_ratio=75): + self.min_ratio = min_ratio + + def __call__(self, a: Set, b: Set) -> bool: + agg = 0 + for v in a: + match, score = thefuzz.exctractOne(v, b) + agg += score + return score > self.min_ratio + + +class Pipeline: + """ + A list of functions to execute, f -> g -> h, etc. Note that the output + type of f needs to match the input type of g, etc. + """ + def __init__(self, pipeline: Optional[List[Any]] = None, verbose: bool = False): + self.verbose = verbose + if pipeline is None: + self.pipeline = [ + lambda v: v, + ] + else: + self.pipeline = pipeline + + def run(self, value: Any) -> Any: + v = value + for i, f in enumerate(self.pipeline, start=1): + v = f(v) + if self.verbose: + logger.debug("[{}/{}] {}".format(i, len(self.pipeline), v)) + return v + + def __call__(self, value: Any, verbose: bool = False) -> Any: + self.verbose = verbose + return self.run(value) + + +# default_release_contrib_pipeline normalizes the raw name. +default_release_contrib_pipeline = Pipeline([ + lambda rc: rc.raw_name, + str.strip, + str.lower, +]) + +# default_release_contrib_list_pipeline turns contribs list into a contrib set. +default_release_contrib_list_pipeline = Pipeline([ + lambda seq: set((c.raw_name for c in seq)), +]) + + +class ContribMatcher: + """ + Compare two contrib entities and determine a match status, based on some + configuration. The final values of the `pipeline` will be compared with + `cmp`, which by default is equality. + + Other `cmp` options may generate ngrams and use jaccard index with some + threshold or decide on a string similarity metric. + + This is essentially just a shell, the various comparison methods live in + the tuple (pipeline, cmp). + """ + def __init__(self, + pipeline: Optional[List[Any]] = default_release_contrib_list_pipeline, + cmp: Callable[[Any, Any], bool] = operator.__eq__): + self.pipeline = pipeline + self.cmp = cmp + + def compare(self, a: ReleaseContrib, b: ReleaseContrib) -> bool: + """ + Compare returns True, if a and b are considered the same, given a + transformation pipeline and a comparison operation. + """ + u = self.pipeline(a) + v = self.pipeline(b) + return self.cmp(u, v) + + +class ContribListMatcher: + """ + Compare two lists of contribs. Each contrib entry is passed through the + same pipeline. + + Often two issues (separate or combined). + + - contrib missing, e.g. + - "Gentle Sunder Shrestha", "Gentle S Shrestha", "S. Shrestha", "Gentle Shrestha", ... + """ + def __init__(self, + pipeline: Optional[List[Any]] = default_release_contrib_list_pipeline, + cmp: Callable[[Any, Any], bool] = JaccardIndexThreshold(1.0)): + self.pipeline = pipeline + self.cmp = cmp + + def compare(self, + a: List[ReleaseContrib], + b: List[ReleaseContrib], + verbose: bool = False) -> bool: + """ + Compare two lists of contribs, pass each one through the pipeline. The + result may be a list or any other type. The comparison function needs + to be compatible. + """ + u = self.pipeline(a, verbose=verbose) + v = self.pipeline(b, verbose=verbose) + return self.cmp(u, v) + + +def cleanup_single_ws(s: str) -> str: + return re.sub(r"[ ]{2,}", " ", s) + + +def cleanup_remove_ws(s: str) -> str: + return re.sub(r"[\n\r\t\s]*", '', s) + + +def cleanup_keep_letters_digits_ws(s: str) -> str: + return ''.join((c for c in s if c in string.ascii_letters + string.digits + " ")) + + +def test_cleanup_single_ws(): + Case = collections.namedtuple("Case", "s result") + cases = ( + Case("", ""), + Case("abc", "abc"), + Case("abc abc", "abc abc"), + Case("abc abc", "abc abc"), + Case(" abc abc", " abc abc"), + Case(" abc abc", " abc abc"), + ) + for c in cases: + assert c.result == cleanup_single_ws(c.s) + + +def test_cleanup_remove_ws(): + Case = collections.namedtuple("Case", "s result") + cases = ( + Case("", ""), + Case("abc", "abc"), + Case("abc abc", "abcabc"), + Case("abc abc", "abcabc"), + Case(" abc abc", "abcabc"), + ) + for c in cases: + assert c.result == cleanup_remove_ws(c.s), c + + +def test_ngram(): + Case = collections.namedtuple("Case", "s n result") + cases = ( + Case("", 1, []), + Case("", 2, []), + Case("a", 2, ["a"]), + Case("ab", 2, ["ab"]), + Case("abcdef", 2, ['ab', 'bc', 'cd', 'de', 'ef']), + Case("abcdef", 4, ['abcd', 'bcde', 'cdef']), + Case("Nina Rogo", 3, ["Nin", "ina", "na ", "a R", " Ro", "Rog", "ogo"]), + ) + for c in cases: + ngram = Ngram(n=c.n) + assert ngram(c.s) == c.result + + +def test_pipeline(): + Case = collections.namedtuple("Case", "pipeline input result") + cases = (Case(Pipeline([lambda v: v["a"], str.strip, str.lower, + Ngram(n=3), set]), {"a": " X123 "}, {'123', 'x12'})), + for c in cases: + result = c.pipeline(c.input) + assert result == c.result + + +def test_jaccard_index_threshold(): + Case = collections.namedtuple("Case", "a b threshold result") + cases = ( + Case(set(), set(), 1.0, True), + Case(set(), set(["a"]), 1.0, False), + Case(set(["a"]), set(["a"]), 1.0, True), + Case(set(["a"]), set(["a", "b"]), 1.0, False), + Case(set(["a"]), set(["a", "b"]), 0.5, True), + Case(set(["a"]), set(["a", "b", "c"]), 0.5, False), + ) + for c in cases: + jit = JaccardIndexThreshold(threshold=c.threshold) + result = jit(c.a, c.b) + assert result == c.result + + +def test_ngram_contrib_matcher(caplog): + Case = collections.namedtuple("Case", "a b result") + cases = ( + Case( + ReleaseContrib(raw_name="Jane Austen"), + ReleaseContrib(raw_name="J.Austen"), + True, + ), + Case( + ReleaseContrib(raw_name="Fjodor Dostojewski"), + ReleaseContrib(raw_name="Fyodor Dostoevsky"), + False, + ), + Case( + ReleaseContrib(raw_name="Fjodor M. Dostojewski"), + ReleaseContrib(raw_name="Fyodor Dostoevsky"), + False, + ), + Case( + ReleaseContrib(raw_name="Fjodor Michailowitsch Dostojewski"), + ReleaseContrib(raw_name="Fyodor Dostoevsky"), + False, + ), + ) + matcher = ContribMatcher(cmp=JaccardIndexThreshold(0.4, verbose=True), + pipeline=Pipeline([ + lambda rc: rc.raw_name, + str.strip, + str.lower, + cleanup_remove_ws, + cleanup_keep_letters_digits_ws, + Ngram(n=3), + set, + ], + verbose=True)) + for c in cases: + with caplog.at_level(logging.DEBUG): + result = matcher.compare(c.a, c.b) + assert result == c.result + + +def test_jellyfish_soundex_contrib_matcher(caplog): + Case = collections.namedtuple("Case", "a b result") + cases = ( + Case( + ReleaseContrib(raw_name="Jane Austen"), + ReleaseContrib(raw_name="J.Austen"), + True, + ), + Case( + ReleaseContrib(raw_name="Fjodor Dostojewski"), + ReleaseContrib(raw_name="Fyodor Dostoevsky"), + False, + ), + Case( + ReleaseContrib(raw_name="Fjodor M. Dostojewski"), + ReleaseContrib(raw_name="Fyodor Dostoevsky"), + False, + ), + Case( + ReleaseContrib(raw_name="Fjodor Michailowitsch Dostojewski"), + ReleaseContrib(raw_name="Fyodor Dostoevsky"), + False, + ), + ) + matcher = ContribMatcher(cmp=JaccardIndexThreshold(0.3, verbose=True), + pipeline=Pipeline([ + lambda rc: rc.raw_name, + str.strip, + str.lower, + functools.partial(re.sub, r"[.;]", " "), + cleanup_keep_letters_digits_ws, + lambda s: set((jellyfish.soundex(v) for v in s.split())), + ], + verbose=True)) + for c in cases: + with caplog.at_level(logging.DEBUG): + result = matcher.compare(c.a, c.b) + assert result == c.result + + +def test_jellyfish_nysiis_contrib_matcher(caplog): + Case = collections.namedtuple("Case", "a b result") + cases = ( + Case( + ReleaseContrib(raw_name="Jane Austen"), + ReleaseContrib(raw_name="J.Austen"), + True, + ), + Case( + ReleaseContrib(raw_name="Fjodor Dostojewski"), + ReleaseContrib(raw_name="Fyodor Dostoevsky"), + False, + ), + Case( + ReleaseContrib(raw_name="Fjodor M. Dostojewski"), + ReleaseContrib(raw_name="Fyodor Dostoevsky"), + False, + ), + Case( + ReleaseContrib(raw_name="Fjodor Michailowitsch Dostojewski"), + ReleaseContrib(raw_name="Fyodor Dostoevsky"), + False, + ), + ) + matcher = ContribMatcher(cmp=JaccardIndexThreshold(0.3, verbose=True), + pipeline=Pipeline([ + lambda rc: rc.raw_name, + str.strip, + str.lower, + functools.partial(re.sub, r"[.;]", " "), + cleanup_keep_letters_digits_ws, + lambda s: set((jellyfish.nysiis(v) for v in s.split())), + ], + verbose=True)) + for c in cases: + with caplog.at_level(logging.DEBUG): + result = matcher.compare(c.a, c.b) + assert result == c.result + + +def test_default_contrib_list_matcher(caplog): + Case = collections.namedtuple("Case", "a b result") + cases = ( + Case( + [], + [], + True, + ), + Case( + [ReleaseContrib(raw_name="Michael Jordan")], + [ReleaseContrib(raw_name="Michael Jordan")], + True, + ), + Case( + [ReleaseContrib(raw_name="Michael Jordan")], + [ReleaseContrib(raw_name="michael jordan")], + False, + ), + Case( + [ReleaseContrib(raw_name="Amadeu Llebaria")], + [ReleaseContrib(raw_name="A. Llebaria")], + False, + ), + ) + matcher = ContribListMatcher() + for c in cases: + with caplog.at_level(logging.DEBUG): + result = matcher.compare(c.a, c.b, verbose=True) + assert result == c.result |