aboutsummaryrefslogtreecommitdiffstats
path: root/fuzzycat/contrib.py
diff options
context:
space:
mode:
Diffstat (limited to 'fuzzycat/contrib.py')
-rw-r--r--fuzzycat/contrib.py453
1 files changed, 453 insertions, 0 deletions
diff --git a/fuzzycat/contrib.py b/fuzzycat/contrib.py
new file mode 100644
index 0000000..93753ab
--- /dev/null
+++ b/fuzzycat/contrib.py
@@ -0,0 +1,453 @@
+"""
+Contrib related comparisons.
+
+Example: NgramContribMatcher, which compares two normalized raw name tokens
+with a jaccard index.
+
+ matcher = ContribMatcher(cmp=JaccardIndexThreshold(0.5),
+ pipeline=Pipeline([
+ lambda rc: rc.raw_name,
+ str.strip,
+ str.lower,
+ Ngram(n=3),
+ set,
+ ]))
+ result = matcher.compare(a, b)
+ ...
+
+Some notes from the dataset.
+
+* 692,893,828 contribs
+* 64,680,311 uniq
+
+Top contrib names, many have their name on datasets, which explains the high
+number.
+
+3069752 Kessy Abarenkov
+2383819 Leho Tedersoo
+2383748 Karl-Henrik Larsson
+2383745 Urmas Kõljalg
+2383699 Mohammad Bahram
+2383692 Martin Ryberg
+2382832 R. Henrik Nilsson
+1702455 Markus Döring
+1702427 Tom May
+1702415 Dmitry Schigel
+1702391 Santiago Sánchez-Ramírez
+ 841475 GLIS Of The ITPGRFA
+ 682144 James Scott
+ 682053 Michael Weiss
+ 681404 Susumu Takamatsu
+ 681388 A. Elizabeth Arnold
+ 681347 Artur Alves
+ 681341 Ellen Larsson
+ 681338 Maarja Öpik
+ 681335 Ursula Eberhardt
+ 681324 Nhu Nguyen
+ 681293 Otto Miettinen
+ 681292 Viacheslav Spirin
+ 681287 Gareth W. Griffith
+ 681283 Bálint Dima
+ 681278 Ursula Peintner
+ 681276 Tuula Niskanen
+ 681276 Olinto Liparini Pereira
+ 681275 Kare Liimatainen
+"""
+
+import collections
+import functools
+import itertools
+import logging
+import operator
+import re
+import string
+from typing import Any, Callable, List, Optional, Set
+
+import jellyfish
+import thefuzz
+from fatcat_openapi_client import ReleaseContrib
+
+logger = logging.getLogger("fuzzycat")
+
+
+class Ngram:
+ """
+ Turn a string into a list of overlapping tokens.
+ """
+ def __init__(self, n: int = 3):
+ if n < 1:
+ raise ValueError("positive n required")
+ self.n = n
+
+ def __call__(self, s: str) -> List[str]:
+ if 0 < len(s) < self.n:
+ return [s]
+ return [s[i:i + self.n] for i in range(len(s) - self.n + 1)]
+
+
+class JaccardIndexThreshold:
+ """
+ A Jaccard index threshold that can be used to compare two sets. Two empty
+ sets are equal.
+ """
+ def __init__(self, threshold: float = 0.5, verbose=False):
+ self.threshold = threshold
+ self.verbose = verbose
+
+ def __call__(self, a: Set, b: Set) -> bool:
+ if len(a) == 0 and len(b) == 0:
+ return True
+ index = len(a & b) / len(a | b)
+ if self.verbose:
+ logger.debug("[jaccard] {}".format(index))
+ return index >= self.threshold
+
+
+class FuzzyStringSimilarity:
+ """
+ For two sets of strings, run fuzzy matching with "thefuzz" -
+ https://github.com/seatgeek/thefuzz, which among other things uses
+ Levenshtein distance.
+
+ The min ratio can range from 0 to 100 (with 100 allowing exact matches
+ only).
+ """
+ def __init__(self, min_ratio=75):
+ self.min_ratio = min_ratio
+
+ def __call__(self, a: Set, b: Set) -> bool:
+ agg = 0
+ for v in a:
+ match, score = thefuzz.exctractOne(v, b)
+ agg += score
+ return score > self.min_ratio
+
+
+class Pipeline:
+ """
+ A list of functions to execute, f -> g -> h, etc. Note that the output
+ type of f needs to match the input type of g, etc.
+ """
+ def __init__(self, pipeline: Optional[List[Any]] = None, verbose: bool = False):
+ self.verbose = verbose
+ if pipeline is None:
+ self.pipeline = [
+ lambda v: v,
+ ]
+ else:
+ self.pipeline = pipeline
+
+ def run(self, value: Any) -> Any:
+ v = value
+ for i, f in enumerate(self.pipeline, start=1):
+ v = f(v)
+ if self.verbose:
+ logger.debug("[{}/{}] {}".format(i, len(self.pipeline), v))
+ return v
+
+ def __call__(self, value: Any, verbose: bool = False) -> Any:
+ self.verbose = verbose
+ return self.run(value)
+
+
+# default_release_contrib_pipeline normalizes the raw name.
+default_release_contrib_pipeline = Pipeline([
+ lambda rc: rc.raw_name,
+ str.strip,
+ str.lower,
+])
+
+# default_release_contrib_list_pipeline turns contribs list into a contrib set.
+default_release_contrib_list_pipeline = Pipeline([
+ lambda seq: set((c.raw_name for c in seq)),
+])
+
+
+class ContribMatcher:
+ """
+ Compare two contrib entities and determine a match status, based on some
+ configuration. The final values of the `pipeline` will be compared with
+ `cmp`, which by default is equality.
+
+ Other `cmp` options may generate ngrams and use jaccard index with some
+ threshold or decide on a string similarity metric.
+
+ This is essentially just a shell, the various comparison methods live in
+ the tuple (pipeline, cmp).
+ """
+ def __init__(self,
+ pipeline: Optional[List[Any]] = default_release_contrib_list_pipeline,
+ cmp: Callable[[Any, Any], bool] = operator.__eq__):
+ self.pipeline = pipeline
+ self.cmp = cmp
+
+ def compare(self, a: ReleaseContrib, b: ReleaseContrib) -> bool:
+ """
+ Compare returns True, if a and b are considered the same, given a
+ transformation pipeline and a comparison operation.
+ """
+ u = self.pipeline(a)
+ v = self.pipeline(b)
+ return self.cmp(u, v)
+
+
+class ContribListMatcher:
+ """
+ Compare two lists of contribs. Each contrib entry is passed through the
+ same pipeline.
+
+ Often two issues (separate or combined).
+
+ - contrib missing, e.g.
+ - "Gentle Sunder Shrestha", "Gentle S Shrestha", "S. Shrestha", "Gentle Shrestha", ...
+ """
+ def __init__(self,
+ pipeline: Optional[List[Any]] = default_release_contrib_list_pipeline,
+ cmp: Callable[[Any, Any], bool] = JaccardIndexThreshold(1.0)):
+ self.pipeline = pipeline
+ self.cmp = cmp
+
+ def compare(self,
+ a: List[ReleaseContrib],
+ b: List[ReleaseContrib],
+ verbose: bool = False) -> bool:
+ """
+ Compare two lists of contribs, pass each one through the pipeline. The
+ result may be a list or any other type. The comparison function needs
+ to be compatible.
+ """
+ u = self.pipeline(a, verbose=verbose)
+ v = self.pipeline(b, verbose=verbose)
+ return self.cmp(u, v)
+
+
+def cleanup_single_ws(s: str) -> str:
+ return re.sub(r"[ ]{2,}", " ", s)
+
+
+def cleanup_remove_ws(s: str) -> str:
+ return re.sub(r"[\n\r\t\s]*", '', s)
+
+
+def cleanup_keep_letters_digits_ws(s: str) -> str:
+ return ''.join((c for c in s if c in string.ascii_letters + string.digits + " "))
+
+
+def test_cleanup_single_ws():
+ Case = collections.namedtuple("Case", "s result")
+ cases = (
+ Case("", ""),
+ Case("abc", "abc"),
+ Case("abc abc", "abc abc"),
+ Case("abc abc", "abc abc"),
+ Case(" abc abc", " abc abc"),
+ Case(" abc abc", " abc abc"),
+ )
+ for c in cases:
+ assert c.result == cleanup_single_ws(c.s)
+
+
+def test_cleanup_remove_ws():
+ Case = collections.namedtuple("Case", "s result")
+ cases = (
+ Case("", ""),
+ Case("abc", "abc"),
+ Case("abc abc", "abcabc"),
+ Case("abc abc", "abcabc"),
+ Case(" abc abc", "abcabc"),
+ )
+ for c in cases:
+ assert c.result == cleanup_remove_ws(c.s), c
+
+
+def test_ngram():
+ Case = collections.namedtuple("Case", "s n result")
+ cases = (
+ Case("", 1, []),
+ Case("", 2, []),
+ Case("a", 2, ["a"]),
+ Case("ab", 2, ["ab"]),
+ Case("abcdef", 2, ['ab', 'bc', 'cd', 'de', 'ef']),
+ Case("abcdef", 4, ['abcd', 'bcde', 'cdef']),
+ Case("Nina Rogo", 3, ["Nin", "ina", "na ", "a R", " Ro", "Rog", "ogo"]),
+ )
+ for c in cases:
+ ngram = Ngram(n=c.n)
+ assert ngram(c.s) == c.result
+
+
+def test_pipeline():
+ Case = collections.namedtuple("Case", "pipeline input result")
+ cases = (Case(Pipeline([lambda v: v["a"], str.strip, str.lower,
+ Ngram(n=3), set]), {"a": " X123 "}, {'123', 'x12'})),
+ for c in cases:
+ result = c.pipeline(c.input)
+ assert result == c.result
+
+
+def test_jaccard_index_threshold():
+ Case = collections.namedtuple("Case", "a b threshold result")
+ cases = (
+ Case(set(), set(), 1.0, True),
+ Case(set(), set(["a"]), 1.0, False),
+ Case(set(["a"]), set(["a"]), 1.0, True),
+ Case(set(["a"]), set(["a", "b"]), 1.0, False),
+ Case(set(["a"]), set(["a", "b"]), 0.5, True),
+ Case(set(["a"]), set(["a", "b", "c"]), 0.5, False),
+ )
+ for c in cases:
+ jit = JaccardIndexThreshold(threshold=c.threshold)
+ result = jit(c.a, c.b)
+ assert result == c.result
+
+
+def test_ngram_contrib_matcher(caplog):
+ Case = collections.namedtuple("Case", "a b result")
+ cases = (
+ Case(
+ ReleaseContrib(raw_name="Jane Austen"),
+ ReleaseContrib(raw_name="J.Austen"),
+ True,
+ ),
+ Case(
+ ReleaseContrib(raw_name="Fjodor Dostojewski"),
+ ReleaseContrib(raw_name="Fyodor Dostoevsky"),
+ False,
+ ),
+ Case(
+ ReleaseContrib(raw_name="Fjodor M. Dostojewski"),
+ ReleaseContrib(raw_name="Fyodor Dostoevsky"),
+ False,
+ ),
+ Case(
+ ReleaseContrib(raw_name="Fjodor Michailowitsch Dostojewski"),
+ ReleaseContrib(raw_name="Fyodor Dostoevsky"),
+ False,
+ ),
+ )
+ matcher = ContribMatcher(cmp=JaccardIndexThreshold(0.4, verbose=True),
+ pipeline=Pipeline([
+ lambda rc: rc.raw_name,
+ str.strip,
+ str.lower,
+ cleanup_remove_ws,
+ cleanup_keep_letters_digits_ws,
+ Ngram(n=3),
+ set,
+ ],
+ verbose=True))
+ for c in cases:
+ with caplog.at_level(logging.DEBUG):
+ result = matcher.compare(c.a, c.b)
+ assert result == c.result
+
+
+def test_jellyfish_soundex_contrib_matcher(caplog):
+ Case = collections.namedtuple("Case", "a b result")
+ cases = (
+ Case(
+ ReleaseContrib(raw_name="Jane Austen"),
+ ReleaseContrib(raw_name="J.Austen"),
+ True,
+ ),
+ Case(
+ ReleaseContrib(raw_name="Fjodor Dostojewski"),
+ ReleaseContrib(raw_name="Fyodor Dostoevsky"),
+ False,
+ ),
+ Case(
+ ReleaseContrib(raw_name="Fjodor M. Dostojewski"),
+ ReleaseContrib(raw_name="Fyodor Dostoevsky"),
+ False,
+ ),
+ Case(
+ ReleaseContrib(raw_name="Fjodor Michailowitsch Dostojewski"),
+ ReleaseContrib(raw_name="Fyodor Dostoevsky"),
+ False,
+ ),
+ )
+ matcher = ContribMatcher(cmp=JaccardIndexThreshold(0.3, verbose=True),
+ pipeline=Pipeline([
+ lambda rc: rc.raw_name,
+ str.strip,
+ str.lower,
+ functools.partial(re.sub, r"[.;]", " "),
+ cleanup_keep_letters_digits_ws,
+ lambda s: set((jellyfish.soundex(v) for v in s.split())),
+ ],
+ verbose=True))
+ for c in cases:
+ with caplog.at_level(logging.DEBUG):
+ result = matcher.compare(c.a, c.b)
+ assert result == c.result
+
+
+def test_jellyfish_nysiis_contrib_matcher(caplog):
+ Case = collections.namedtuple("Case", "a b result")
+ cases = (
+ Case(
+ ReleaseContrib(raw_name="Jane Austen"),
+ ReleaseContrib(raw_name="J.Austen"),
+ True,
+ ),
+ Case(
+ ReleaseContrib(raw_name="Fjodor Dostojewski"),
+ ReleaseContrib(raw_name="Fyodor Dostoevsky"),
+ False,
+ ),
+ Case(
+ ReleaseContrib(raw_name="Fjodor M. Dostojewski"),
+ ReleaseContrib(raw_name="Fyodor Dostoevsky"),
+ False,
+ ),
+ Case(
+ ReleaseContrib(raw_name="Fjodor Michailowitsch Dostojewski"),
+ ReleaseContrib(raw_name="Fyodor Dostoevsky"),
+ False,
+ ),
+ )
+ matcher = ContribMatcher(cmp=JaccardIndexThreshold(0.3, verbose=True),
+ pipeline=Pipeline([
+ lambda rc: rc.raw_name,
+ str.strip,
+ str.lower,
+ functools.partial(re.sub, r"[.;]", " "),
+ cleanup_keep_letters_digits_ws,
+ lambda s: set((jellyfish.nysiis(v) for v in s.split())),
+ ],
+ verbose=True))
+ for c in cases:
+ with caplog.at_level(logging.DEBUG):
+ result = matcher.compare(c.a, c.b)
+ assert result == c.result
+
+
+def test_default_contrib_list_matcher(caplog):
+ Case = collections.namedtuple("Case", "a b result")
+ cases = (
+ Case(
+ [],
+ [],
+ True,
+ ),
+ Case(
+ [ReleaseContrib(raw_name="Michael Jordan")],
+ [ReleaseContrib(raw_name="Michael Jordan")],
+ True,
+ ),
+ Case(
+ [ReleaseContrib(raw_name="Michael Jordan")],
+ [ReleaseContrib(raw_name="michael jordan")],
+ False,
+ ),
+ Case(
+ [ReleaseContrib(raw_name="Amadeu Llebaria")],
+ [ReleaseContrib(raw_name="A. Llebaria")],
+ False,
+ ),
+ )
+ matcher = ContribListMatcher()
+ for c in cases:
+ with caplog.at_level(logging.DEBUG):
+ result = matcher.compare(c.a, c.b, verbose=True)
+ assert result == c.result