"""
Contrib related comparisons.

Example: NgramContribMatcher, which compares two normalized raw name tokens
with a jaccard index.

    matcher = ContribMatcher(cmp=JaccardIndexThreshold(0.5),
                             pipeline=Pipeline([
                                 lambda rc: rc.raw_name,
                                 str.strip,
                                 str.lower,
                                 Ngram(n=3),
                                 set,
                             ]))
    result = matcher.compare(a, b)
    ...

Some notes from the dataset.

* 692,893,828 contribs
*  64,680,311 uniq

Top contrib names, many have their name on datasets, which explains the high
number.

3069752 Kessy Abarenkov
2383819 Leho Tedersoo
2383748 Karl-Henrik Larsson
2383745 Urmas Kõljalg
2383699 Mohammad Bahram
2383692 Martin Ryberg
2382832 R. Henrik Nilsson
1702455 Markus Döring
1702427 Tom May
1702415 Dmitry Schigel
1702391 Santiago Sánchez-Ramírez
 841475 GLIS Of The ITPGRFA
 682144 James Scott
 682053 Michael Weiss
 681404 Susumu Takamatsu
 681388 A. Elizabeth Arnold
 681347 Artur Alves
 681341 Ellen Larsson
 681338 Maarja Öpik
 681335 Ursula Eberhardt
 681324 Nhu Nguyen
 681293 Otto Miettinen
 681292 Viacheslav Spirin
 681287 Gareth W. Griffith
 681283 Bálint Dima
 681278 Ursula Peintner
 681276 Tuula Niskanen
 681276 Olinto Liparini Pereira
 681275 Kare Liimatainen
"""

import collections
import functools
import itertools
import logging
import operator
import re
import string
from typing import Any, Callable, List, Optional, Set

import jellyfish
import thefuzz
from fatcat_openapi_client import ReleaseContrib

logger = logging.getLogger("fuzzycat")


class Ngram:
    """
    Turn a string into a list of overlapping tokens.
    """
    def __init__(self, n: int = 3):
        if n < 1:
            raise ValueError("positive n required")
        self.n = n

    def __call__(self, s: str) -> List[str]:
        if 0 < len(s) < self.n:
            return [s]
        return [s[i:i + self.n] for i in range(len(s) - self.n + 1)]


class JaccardIndexThreshold:
    """
    A Jaccard index threshold that can be used to compare two sets. Two empty
    sets are equal.
    """
    def __init__(self, threshold: float = 0.5, verbose=False):
        self.threshold = threshold
        self.verbose = verbose

    def __call__(self, a: Set, b: Set) -> bool:
        if len(a) == 0 and len(b) == 0:
            return True
        index = len(a & b) / len(a | b)
        if self.verbose:
            logger.debug("[jaccard] {}".format(index))
        return index >= self.threshold


class FuzzyStringSimilarity:
    """
    For two sets of strings, run fuzzy matching with "thefuzz" -
    https://github.com/seatgeek/thefuzz, which among other things uses
    Levenshtein distance.

    The min ratio can range from 0 to 100 (with 100 allowing exact matches
    only).
    """
    def __init__(self, min_ratio=75):
        self.min_ratio = min_ratio

    def __call__(self, a: Set, b: Set) -> bool:
        agg = 0
        for v in a:
            match, score = thefuzz.exctractOne(v, b)
            agg += score
        return score > self.min_ratio


class Pipeline:
    """
    A list of functions to execute, f -> g -> h, etc. Note that the output
    type of f needs to match the input type of g, etc.
    """
    def __init__(self, pipeline: Optional[List[Any]] = None, verbose: bool = False):
        self.verbose = verbose
        if pipeline is None:
            self.pipeline = [
                lambda v: v,
            ]
        else:
            self.pipeline = pipeline

    def run(self, value: Any) -> Any:
        v = value
        for i, f in enumerate(self.pipeline, start=1):
            v = f(v)
            if self.verbose:
                logger.debug("[{}/{}] {}".format(i, len(self.pipeline), v))
        return v

    def __call__(self, value: Any, verbose: bool = False) -> Any:
        self.verbose = verbose
        return self.run(value)


# default_release_contrib_pipeline normalizes the raw name.
default_release_contrib_pipeline = Pipeline([
    lambda rc: rc.raw_name,
    str.strip,
    str.lower,
])

# default_release_contrib_list_pipeline turns contribs list into a contrib set.
default_release_contrib_list_pipeline = Pipeline([
    lambda seq: set((c.raw_name for c in seq)),
])


class ContribMatcher:
    """
    Compare two contrib entities and determine a match status, based on some
    configuration. The final values of the `pipeline` will be compared with
    `cmp`, which by default is equality.

    Other `cmp` options may generate ngrams and use jaccard index with some
    threshold or decide on a string similarity metric.

    This is essentially just a shell, the various comparison methods live in
    the tuple (pipeline, cmp).
    """
    def __init__(self,
                 pipeline: Optional[List[Any]] = default_release_contrib_list_pipeline,
                 cmp: Callable[[Any, Any], bool] = operator.__eq__):
        self.pipeline = pipeline
        self.cmp = cmp

    def compare(self, a: ReleaseContrib, b: ReleaseContrib) -> bool:
        """
        Compare returns True, if a and b are considered the same, given a
        transformation pipeline and a comparison operation.
        """
        u = self.pipeline(a)
        v = self.pipeline(b)
        return self.cmp(u, v)


class ContribListMatcher:
    """
    Compare two lists of contribs. Each contrib entry is passed through the
    same pipeline.

    Often two issues (separate or combined).

    - contrib missing, e.g.
    - "Gentle Sunder Shrestha", "Gentle S Shrestha", "S. Shrestha", "Gentle Shrestha", ...
    """
    def __init__(self,
                 pipeline: Optional[List[Any]] = default_release_contrib_list_pipeline,
                 cmp: Callable[[Any, Any], bool] = JaccardIndexThreshold(1.0)):
        self.pipeline = pipeline
        self.cmp = cmp

    def compare(self,
                a: List[ReleaseContrib],
                b: List[ReleaseContrib],
                verbose: bool = False) -> bool:
        """
        Compare two lists of contribs, pass each one through the pipeline. The
        result may be a list or any other type. The comparison function needs
        to be compatible.
        """
        u = self.pipeline(a, verbose=verbose)
        v = self.pipeline(b, verbose=verbose)
        return self.cmp(u, v)


def cleanup_single_ws(s: str) -> str:
    return re.sub(r"[ ]{2,}", " ", s)


def cleanup_remove_ws(s: str) -> str:
    return re.sub(r"[\n\r\t\s]*", '', s)


def cleanup_keep_letters_digits_ws(s: str) -> str:
    return ''.join((c for c in s if c in string.ascii_letters + string.digits + " "))


def test_cleanup_single_ws():
    Case = collections.namedtuple("Case", "s result")
    cases = (
        Case("", ""),
        Case("abc", "abc"),
        Case("abc abc", "abc abc"),
        Case("abc  abc", "abc abc"),
        Case("   abc  abc", " abc abc"),
        Case("   abc  abc", " abc abc"),
    )
    for c in cases:
        assert c.result == cleanup_single_ws(c.s)


def test_cleanup_remove_ws():
    Case = collections.namedtuple("Case", "s result")
    cases = (
        Case("", ""),
        Case("abc", "abc"),
        Case("abc abc", "abcabc"),
        Case("abc  abc", "abcabc"),
        Case("   abc  abc", "abcabc"),
    )
    for c in cases:
        assert c.result == cleanup_remove_ws(c.s), c


def test_ngram():
    Case = collections.namedtuple("Case", "s n result")
    cases = (
        Case("", 1, []),
        Case("", 2, []),
        Case("a", 2, ["a"]),
        Case("ab", 2, ["ab"]),
        Case("abcdef", 2, ['ab', 'bc', 'cd', 'de', 'ef']),
        Case("abcdef", 4, ['abcd', 'bcde', 'cdef']),
        Case("Nina Rogo", 3, ["Nin", "ina", "na ", "a R", " Ro", "Rog", "ogo"]),
    )
    for c in cases:
        ngram = Ngram(n=c.n)
        assert ngram(c.s) == c.result


def test_pipeline():
    Case = collections.namedtuple("Case", "pipeline input result")
    cases = (Case(Pipeline([lambda v: v["a"], str.strip, str.lower,
                            Ngram(n=3), set]), {"a": " X123  "}, {'123', 'x12'})),
    for c in cases:
        result = c.pipeline(c.input)
        assert result == c.result


def test_jaccard_index_threshold():
    Case = collections.namedtuple("Case", "a b threshold result")
    cases = (
        Case(set(), set(), 1.0, True),
        Case(set(), set(["a"]), 1.0, False),
        Case(set(["a"]), set(["a"]), 1.0, True),
        Case(set(["a"]), set(["a", "b"]), 1.0, False),
        Case(set(["a"]), set(["a", "b"]), 0.5, True),
        Case(set(["a"]), set(["a", "b", "c"]), 0.5, False),
    )
    for c in cases:
        jit = JaccardIndexThreshold(threshold=c.threshold)
        result = jit(c.a, c.b)
        assert result == c.result


def test_ngram_contrib_matcher(caplog):
    Case = collections.namedtuple("Case", "a b result")
    cases = (
        Case(
            ReleaseContrib(raw_name="Jane Austen"),
            ReleaseContrib(raw_name="J.Austen"),
            True,
        ),
        Case(
            ReleaseContrib(raw_name="Fjodor Dostojewski"),
            ReleaseContrib(raw_name="Fyodor Dostoevsky"),
            False,
        ),
        Case(
            ReleaseContrib(raw_name="Fjodor M. Dostojewski"),
            ReleaseContrib(raw_name="Fyodor Dostoevsky"),
            False,
        ),
        Case(
            ReleaseContrib(raw_name="Fjodor Michailowitsch Dostojewski"),
            ReleaseContrib(raw_name="Fyodor Dostoevsky"),
            False,
        ),
    )
    matcher = ContribMatcher(cmp=JaccardIndexThreshold(0.4, verbose=True),
                             pipeline=Pipeline([
                                 lambda rc: rc.raw_name,
                                 str.strip,
                                 str.lower,
                                 cleanup_remove_ws,
                                 cleanup_keep_letters_digits_ws,
                                 Ngram(n=3),
                                 set,
                             ],
                                               verbose=True))
    for c in cases:
        with caplog.at_level(logging.DEBUG):
            result = matcher.compare(c.a, c.b)
            assert result == c.result


def test_jellyfish_soundex_contrib_matcher(caplog):
    Case = collections.namedtuple("Case", "a b result")
    cases = (
        Case(
            ReleaseContrib(raw_name="Jane Austen"),
            ReleaseContrib(raw_name="J.Austen"),
            True,
        ),
        Case(
            ReleaseContrib(raw_name="Fjodor Dostojewski"),
            ReleaseContrib(raw_name="Fyodor Dostoevsky"),
            False,
        ),
        Case(
            ReleaseContrib(raw_name="Fjodor M. Dostojewski"),
            ReleaseContrib(raw_name="Fyodor Dostoevsky"),
            False,
        ),
        Case(
            ReleaseContrib(raw_name="Fjodor Michailowitsch Dostojewski"),
            ReleaseContrib(raw_name="Fyodor Dostoevsky"),
            False,
        ),
    )
    matcher = ContribMatcher(cmp=JaccardIndexThreshold(0.3, verbose=True),
                             pipeline=Pipeline([
                                 lambda rc: rc.raw_name,
                                 str.strip,
                                 str.lower,
                                 functools.partial(re.sub, r"[.;]", " "),
                                 cleanup_keep_letters_digits_ws,
                                 lambda s: set((jellyfish.soundex(v) for v in s.split())),
                             ],
                                               verbose=True))
    for c in cases:
        with caplog.at_level(logging.DEBUG):
            result = matcher.compare(c.a, c.b)
            assert result == c.result


def test_jellyfish_nysiis_contrib_matcher(caplog):
    Case = collections.namedtuple("Case", "a b result")
    cases = (
        Case(
            ReleaseContrib(raw_name="Jane Austen"),
            ReleaseContrib(raw_name="J.Austen"),
            True,
        ),
        Case(
            ReleaseContrib(raw_name="Fjodor Dostojewski"),
            ReleaseContrib(raw_name="Fyodor Dostoevsky"),
            False,
        ),
        Case(
            ReleaseContrib(raw_name="Fjodor M. Dostojewski"),
            ReleaseContrib(raw_name="Fyodor Dostoevsky"),
            False,
        ),
        Case(
            ReleaseContrib(raw_name="Fjodor Michailowitsch Dostojewski"),
            ReleaseContrib(raw_name="Fyodor Dostoevsky"),
            False,
        ),
    )
    matcher = ContribMatcher(cmp=JaccardIndexThreshold(0.3, verbose=True),
                             pipeline=Pipeline([
                                 lambda rc: rc.raw_name,
                                 str.strip,
                                 str.lower,
                                 functools.partial(re.sub, r"[.;]", " "),
                                 cleanup_keep_letters_digits_ws,
                                 lambda s: set((jellyfish.nysiis(v) for v in s.split())),
                             ],
                                               verbose=True))
    for c in cases:
        with caplog.at_level(logging.DEBUG):
            result = matcher.compare(c.a, c.b)
            assert result == c.result


def test_default_contrib_list_matcher(caplog):
    Case = collections.namedtuple("Case", "a b result")
    cases = (
        Case(
            [],
            [],
            True,
        ),
        Case(
            [ReleaseContrib(raw_name="Michael Jordan")],
            [ReleaseContrib(raw_name="Michael Jordan")],
            True,
        ),
        Case(
            [ReleaseContrib(raw_name="Michael Jordan")],
            [ReleaseContrib(raw_name="michael jordan")],
            False,
        ),
        Case(
            [ReleaseContrib(raw_name="Amadeu Llebaria")],
            [ReleaseContrib(raw_name="A. Llebaria")],
            False,
        ),
    )
    matcher = ContribListMatcher()
    for c in cases:
        with caplog.at_level(logging.DEBUG):
            result = matcher.compare(c.a, c.b, verbose=True)
            assert result == c.result