3 files changed, 165 insertions, 0 deletions
diff --git a/fuzzycat/__init__.py b/fuzzycat/__init__.py
index 485f44a..7536cd4 100644
--- a/fuzzycat/__init__.py
+++ b/fuzzycat/__init__.py
@@ -1 +1,3 @@
 __version__ = "0.1.1"
+
+from status import MatchStatus
diff --git a/fuzzycat/status.py b/fuzzycat/status.py
new file mode 100644
index 0000000..19c1817
--- /dev/null
+++ b/fuzzycat/status.py
@@ -0,0 +1,12 @@
+class MatchStatus(Enum):
+    """
+    When matching two entities, use these levels to express match strength.
+    When in doubt, use AMBIGIOUS. DIFFERENT should be used only, when it is
+    certain, that items do not match.
+    """
+
+    EXACT = 0
+    STRONG = 1
+    WEAK = 2
+    AMBIGIOUS = 3
+    DIFFERENT = 4
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index e3e04c0..fb98dcd 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -1,6 +1,9 @@
 # coding: utf-8
 
 import collections
+import itertools
+import re
+import string
 from typing import Any, Callable, DefaultDict, Dict, List
 
 """
@@ -80,3 +83,151 @@ class StringAnnotator:
             result = f(s)
             annotations.update(result)
         return annotations
+
+
+def normalize_whitespace(s: str) -> str:
+    """
+    Remove trailing spaces and normalize whitespace.
+    """
+    return re.sub(r"\s{2,}", " ", s.strip())
+
+
+def normalize_ampersand(s: str) -> str:
+    """
+    Normalize ampersand to and.
+    """
+    return s.replace(" & ", " and ")
+
+
+def letter_to_non_letter_ratio(s: str) -> float:
+    """
+    Non letters are defined by printable w/o letters.
+    """
+    if len(s) == 0:
+        return 0.0
+    non_letters = set(string.printable) - set(string.ascii_letters)
+    non_letter_count = sum(c in non_letters for c in s)
+    return non_letter_count / len(s)
+
+
+def alphanumeric_ratio(s: str) -> float:
+    """
+    Ratio of letters, digit and whitespace to total string length.
+    """
+    if len(s) == 0:
+        return 0.0
+    alphanumeric = set(string.ascii_letters) | set(string.digits) | set([" "])
+    alphanumeric_count = sum(c in alphanumeric for c in s)
+    return alphanumeric_count / len(s)
+
+
+def alphanumeric_only(s: str) -> str:
+    """
+    Remove all non-alphanumeric content from string.
+    """
+    alphanumeric = set(string.ascii_letters) | set(string.digits) | set([" "])
+    return "".join((c for c in s if c in alphanumeric))
+
+
+def parenthesized_year(s: str) -> Optional[str]:
+    """
+    Return the year only, if it is in parentheses, e.g. Hello (2020).
+    """
+    match = re.search(r"[\(\[]\s*([12][\d]{3})\s*[\]\)]", s)
+    if match:
+        return match.group(1)
+    return None
+
+
+def has_non_letters_ratio(s: str, threshold: float = 0.4) -> bool:
+    """
+    Check the ratio of non-letters in a string, e.g. for things like "A.R.G.H"
+    """
+    if len(s) == 0:
+        return False
+    return (sum(c not in string.ascii_letters for c in s) / len(s)) > threshold
+
+
+def is_single_word_printable(s: str) -> bool:
+    """
+    True, if s is a single token of printable characters.
+    """
+    return all(c in string.printable for c in s) and s.split() == 1
+
+
+def extract_wikidata_qids(s: str) -> List[str]:
+    """
+    Given a string, extract all qids.
+    """
+    return re.findall(r"Q[0-9]{1,10}", s)
+
+
+def extract_issns(s: str) -> List[str]:
+    """
+    Given a string return a list of valid ISSN.
+    """
+    pattern = r"[0-9]{4,4}-[0-9]{3,3}[0-9xX]"
+    return [v for v in re.findall(pattern, s) if is_valid_issn(v)]
+
+
+def longest_common_prefix(a: Sequence, b: Sequence) -> Sequence:
+    """
+    Return the longest common prefix of a and b. The length of the return value
+    is at most min(len(a), len(b)).
+    """
+    a, b = sorted((a, b), key=len)
+    for i, (u, v) in enumerate(zip(a, b)):
+        if u != v:
+            return a[:i]
+    return a
+
+
+def common_prefix_length_ratio(a: Sequence, b: Sequence) -> float:
+    """
+    Return a float between 0.0 and 1.0 expressing the ratio between the length
+    of the common shared prefix to the length of the longest sequence.
+    """
+    maxlen = max(len(a), len(b))
+    if maxlen == 0:
+        return 0.0
+    return len(longest_common_prefix(a, b)) / maxlen
+
+
+def hamming_distance(s: str, t: str) -> int:
+    """
+    Return hamming distance of s and t.
+    """
+    return sum((u != v for u, v in itertools.zip_longest(s, t)))
+
+
+def calculate_issn_checkdigit(s: str) -> str:
+    """
+    Given a string of length 7, return the ISSN check value (digit or X) as
+    string.
+    """
+    if len(s) != 7:
+        raise ValueError("seven digits required")
+    ss = sum((int(digit) * f for digit, f in zip(s, range(8, 1, -1))))
+    _, mod = divmod(ss, 11)
+    checkdigit = 0 if mod == 0 else 11 - mod
+    result = "X" if checkdigit == 10 else "{}".format(checkdigit)
+    return result
+
+
+def is_valid_issn(issn: str) -> bool:
+    """
+    Return True, if the ISSN is valid. This does not mean it is registered.
+    """
+    if "-" in issn:
+        issn = issn.replace("-", "")
+    if len(issn) != 8:
+        raise ValueError("invalid issn length: {}".format(issn))
+    checkdigit = calculate_issn_checkdigit(issn[:7])
+    return issn[7] == "{}".format(checkdigit)
+
+
+def keys_with_values(d: Dict) -> List[Any]:
+    """
+    Return all keys of a dictionary which have non-falsy values.
+    """
+    return [k for k, v in d.items() if v]