diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2020-08-12 11:49:03 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-08-12 11:49:03 +0200 |
commit | 67d7e2bf3f2cef0fd87ba77c86679f83aac8b2cb (patch) | |
tree | d61deada2bd3e7e4eed64ec16455f0d3f9b10097 | |
parent | 9a84b1bfc5cd6b6349ec62070163e8ffbe7671eb (diff) | |
download | fuzzycat-67d7e2bf3f2cef0fd87ba77c86679f83aac8b2cb.tar.gz fuzzycat-67d7e2bf3f2cef0fd87ba77c86679f83aac8b2cb.zip |
import utility functions
-rw-r--r-- | fuzzycat/__init__.py | 2 | ||||
-rw-r--r-- | fuzzycat/status.py | 15 | ||||
-rw-r--r-- | fuzzycat/utils.py | 157 |
3 files changed, 173 insertions, 1 deletion
diff --git a/fuzzycat/__init__.py b/fuzzycat/__init__.py
index 485f44a..7536cd4 100644
--- a/fuzzycat/__init__.py
+++ b/fuzzycat/__init__.py
@@ -1 +1,3 @@
 __version__ = "0.1.1"
+
+from .status import MatchStatus
diff --git a/fuzzycat/status.py b/fuzzycat/status.py
new file mode 100644
index 0000000..19c1817
--- /dev/null
+++ b/fuzzycat/status.py
@@ -0,0 +1,15 @@
+from enum import Enum
+
+
+class MatchStatus(Enum):
+    """
+    When matching two entities, use these levels to express match strength.
+    When in doubt, use AMBIGIOUS. DIFFERENT should be used only, when it is
+    certain, that items do not match.
+    """
+
+    EXACT = 0
+    STRONG = 1
+    WEAK = 2
+    AMBIGIOUS = 3
+    DIFFERENT = 4
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index e3e04c0..fb98dcd 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -1,6 +1,9 @@
 # coding: utf-8
 
 import collections
-from typing import Any, Callable, DefaultDict, Dict, List
+import itertools
+import re
+import string
+from typing import Any, Callable, DefaultDict, Dict, List, Optional, Sequence
 
 """
@@ -80,3 +83,155 @@ class StringAnnotator:
             result = f(s)
             annotations.update(result)
         return annotations
+
+
+def normalize_whitespace(s: str) -> str:
+    """
+    Strip leading and trailing whitespace and collapse every inner run of
+    whitespace (spaces, tabs, newlines) into a single space.
+    """
+    return re.sub(r"\s+", " ", s.strip())
+
+
+def normalize_ampersand(s: str) -> str:
+    """
+    Normalize ampersand to and.
+    """
+    return s.replace(" & ", " and ")
+
+
+def letter_to_non_letter_ratio(s: str) -> float:
+    """
+    Return the share of characters in s that are printable ASCII but not
+    letters, e.g. digits, punctuation and whitespace; 0.0 for an empty string.
+    """
+    if len(s) == 0:
+        return 0.0
+    non_letters = set(string.printable) - set(string.ascii_letters)
+    non_letter_count = sum(c in non_letters for c in s)
+    return non_letter_count / len(s)
+
+
+def alphanumeric_ratio(s: str) -> float:
+    """
+    Ratio of letters, digits and spaces to total string length.
+    """
+    if len(s) == 0:
+        return 0.0
+    alphanumeric = set(string.ascii_letters) | set(string.digits) | set([" "])
+    alphanumeric_count = sum(c in alphanumeric for c in s)
+    return alphanumeric_count / len(s)
+
+
+def alphanumeric_only(s: str) -> str:
+    """
+    Remove all non-alphanumeric content from string.
+    """
+    alphanumeric = set(string.ascii_letters) | set(string.digits) | set([" "])
+    return "".join((c for c in s if c in alphanumeric))
+
+
+def parenthesized_year(s: str) -> Optional[str]:
+    """
+    Return the year only, if it is in parentheses, e.g. Hello (2020).
+    """
+    match = re.search(r"[\(\[]\s*([12][\d]{3})\s*[\]\)]", s)
+    if match:
+        return match.group(1)
+    return None
+
+
+def has_non_letters_ratio(s: str, threshold: float = 0.4) -> bool:
+    """
+    Check the ratio of non-letters in a string, e.g. for things like "A.R.G.H"
+    """
+    if len(s) == 0:
+        return False
+    return (sum(c not in string.ascii_letters for c in s) / len(s)) > threshold
+
+
+def is_single_word_printable(s: str) -> bool:
+    """
+    True, if s is a single token of printable characters.
+    """
+    return all(c in string.printable for c in s) and len(s.split()) == 1
+
+
+def extract_wikidata_qids(s: str) -> List[str]:
+    """
+    Given a string, extract all qids.
+    """
+    return re.findall(r"Q[0-9]{1,10}", s)
+
+
+def extract_issns(s: str) -> List[str]:
+    """
+    Given a string return a list of valid ISSN.
+    """
+    pattern = r"[0-9]{4,4}-[0-9]{3,3}[0-9xX]"
+    return [v for v in re.findall(pattern, s) if is_valid_issn(v)]
+
+
+def longest_common_prefix(a: Sequence, b: Sequence) -> Sequence:
+    """
+    Return the longest common prefix of a and b. The length of the return value
+    is at most min(len(a), len(b)).
+    """
+    a, b = sorted((a, b), key=len)
+    for i, (u, v) in enumerate(zip(a, b)):
+        if u != v:
+            return a[:i]
+    return a
+
+
+def common_prefix_length_ratio(a: Sequence, b: Sequence) -> float:
+    """
+    Return a float between 0.0 and 1.0 expressing the ratio between the length
+    of the common shared prefix to the length of the longest sequence.
+    """
+    maxlen = max(len(a), len(b))
+    if maxlen == 0:
+        return 0.0
+    return len(longest_common_prefix(a, b)) / maxlen
+
+
+def hamming_distance(s: str, t: str) -> int:
+    """
+    Return hamming distance of s and t. If the lengths differ, every surplus
+    position of the longer string counts as a mismatch.
+    """
+    return sum((u != v for u, v in itertools.zip_longest(s, t)))
+
+
+def calculate_issn_checkdigit(s: str) -> str:
+    """
+    Given a string of length 7, return the ISSN check value (digit or X) as
+    string.
+    """
+    if len(s) != 7:
+        raise ValueError("seven digits required")
+    ss = sum((int(digit) * f for digit, f in zip(s, range(8, 1, -1))))
+    _, mod = divmod(ss, 11)
+    checkdigit = 0 if mod == 0 else 11 - mod
+    result = "X" if checkdigit == 10 else "{}".format(checkdigit)
+    return result
+
+
+def is_valid_issn(issn: str) -> bool:
+    """
+    Return True, if the ISSN is valid. This does not mean it is registered.
+    A lowercase "x" check digit is accepted, as in extract_issns.
+    """
+    if "-" in issn:
+        issn = issn.replace("-", "")
+    if len(issn) != 8:
+        raise ValueError("invalid issn length: {}".format(issn))
+    checkdigit = calculate_issn_checkdigit(issn[:7])
+    return issn[7].upper() == checkdigit
+
+
+def keys_with_values(d: Dict) -> List[Any]:
+    """
+    Return all keys of a dictionary which have non-falsy values.
+    """
+    return [k for k, v in d.items() if v]