diff options
Diffstat (limited to 'fuzzycat/utils.py')
-rw-r--r-- | fuzzycat/utils.py | 249 |
1 files changed, 0 insertions, 249 deletions
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py deleted file mode 100644 index 9d2a2f7..0000000 --- a/fuzzycat/utils.py +++ /dev/null @@ -1,249 +0,0 @@ -# coding: utf-8 - -import collections -import itertools -import json -import re -import string -from typing import Any, Callable, DefaultDict, Dict, List, Optional, Sequence -""" -A couple of utilities, may be split up into separate modules. -""" - - -class SetEncoder(json.JSONEncoder): - """ - Helper to encode python sets into JSON lists. - So you can write something like this: - json.dumps({"things": set([1, 2, 3])}, cls=SetEncoder) - """ - def default(self, obj): - """ - Decorate call to standard implementation. - """ - if isinstance(obj, set): - return list(obj) - return json.JSONEncoder.default(self, obj) - - -class StringPipeline: - """ - Minimalistic grouping of functions applied on an input string to produce - some cleaned or normalized output. Pipeline functions are Func[[str], str]. - - >>> cleanups = StringPipeline([ - ... str.lower, - ... remove_html_tags, - ... normalize_whitespace, - ... normalize_ampersand, - ... ]) - >>> cleanups("<a>Input & Output</a>") - input and output - - """ - def __init__(self, fs: List[Callable[[str], str]]): - self.fs = fs - - def __call__(self, s: str) -> str: - return self.run(s) - - def run(self, s: str) -> str: - """ - Apply all function and return result. Deprecated: just call the object. - """ - for f in self.fs: - s = f(s) - return s - - -class StringAnnotator: - """ - Experimental, rationale: In some way, feature engineering; we want to - derive metrics, number from the string, do this consistently and compactly. - E.g. once we have dozens of "speaking" characteristics, a case based method - might become more readble. - - if s.is_single_token and s.some_ratio > 0.4: - return MatchStatus.AMBIGIOUS - - Could also subclass string and pluck more methods on it (might be even - reusable). - - .... - - Given a string, derive a couple of metrics, based on functions. The - annotation is a dict, mapping an annotation key to a value of any type. - - >>> metrics = StringAnnotator([ - ... has_html_tags, - ... has_only_printable_characters, - ... is_single_token, - ... length, - ... has_year_in_parentheses, - ... ]) - >>> metrics.run("Journal of Pataphysics 2038-2032") - {"value": "Journal of Pataphysics 2038-2032", "is_single_token": False, ... } - - TODO(martin): - - * SimpleNamespace, dotdict, Dataclass. - * string_utils.py or similar - * maybe adopt SpaCy or similar - """ - def __init__(self, fs: List[Callable[[str], Dict[str, Any]]]): - self.fs = fs - - def run(self, s: str) -> Dict[str, Any]: - annotations: DefaultDict[str, Any] = collections.defaultdict(dict) - for f in self.fs: - result = f(s) - annotations.update(result) - return annotations - - -def normalize_whitespace(s: str) -> str: - """ - Remove trailing spaces and normalize whitespace. - """ - return re.sub(r"\s{2,}", " ", s.strip()) - - -def normalize_ampersand(s: str) -> str: - """ - Normalize ampersand to and. - """ - return s.replace(" & ", " and ") - - -def letter_to_non_letter_ratio(s: str) -> float: - """ - Non letters are defined by printable w/o letters. - """ - if len(s) == 0: - return 0.0 - non_letters = set(string.printable) - set(string.ascii_letters) - non_letter_count = sum(c in non_letters for c in s) - return non_letter_count / len(s) - - -def alphanumeric_ratio(s: str) -> float: - """ - Ratio of letters, digit and whitespace to total string length. - """ - if len(s) == 0: - return 0.0 - alphanumeric = set(string.ascii_letters) | set(string.digits) | set([" "]) - alphanumeric_count = sum(c in alphanumeric for c in s) - return alphanumeric_count / len(s) - - -def alphanumeric_only(s: str) -> str: - """ - Remove all non-alphanumeric content from string. - """ - alphanumeric = set(string.ascii_letters) | set(string.digits) | set([" "]) - return "".join((c for c in s if c in alphanumeric)) - - -def parenthesized_year(s: str) -> Optional[str]: - """ - Return the year only, if it is in parentheses, e.g. Hello (2020). - """ - match = re.search(r"[\(\[]\s*([12][\d]{3})\s*[\]\)]", s) - if match: - return match.group(1) - return None - - -def has_non_letters_ratio(s: str, threshold: float = 0.4) -> bool: - """ - Check the ratio of non-letters in a string, e.g. for things like "A.R.G.H" - """ - if len(s) == 0: - return False - return (sum(c not in string.ascii_letters for c in s) / len(s)) > threshold - - -def is_single_word_printable(s: str) -> bool: - """ - True, if s is a single token of printable characters. - """ - return all(c in string.printable for c in s) and s.split() == 1 - - -def extract_wikidata_qids(s: str) -> List[str]: - """ - Given a string, extract all qids. - """ - return re.findall(r"Q[0-9]{1,10}", s) - - -def extract_issns(s: str) -> List[str]: - """ - Given a string return a list of valid ISSN. - """ - pattern = r"[0-9]{4,4}-[0-9]{3,3}[0-9xX]" - return [v for v in re.findall(pattern, s) if is_valid_issn(v)] - - -def longest_common_prefix(a: Sequence, b: Sequence) -> Sequence: - """ - Return the longest common prefix of a and b. The length of the return value - is at most min(len(a), len(b)). - """ - a, b = sorted((a, b), key=len) - for i, (u, v) in enumerate(zip(a, b)): - if u != v: - return a[:i] - return a - - -def common_prefix_length_ratio(a: Sequence, b: Sequence) -> float: - """ - Return a float between 0.0 and 1.0 expressing the ratio between the length - of the common shared prefix to the length of the longest sequence. - """ - maxlen = max(len(a), len(b)) - if maxlen == 0: - return 0.0 - return len(longest_common_prefix(a, b)) / maxlen - - -def hamming_distance(s: str, t: str) -> int: - """ - Return hamming distance of s and t. - """ - return sum((u != v for u, v in itertools.zip_longest(s, t))) - - -def calculate_issn_checkdigit(s: str) -> str: - """ - Given a string of length 7, return the ISSN check value (digit or X) as - string. - """ - if len(s) != 7: - raise ValueError("seven digits required") - ss = sum((int(digit) * f for digit, f in zip(s, range(8, 1, -1)))) - _, mod = divmod(ss, 11) - checkdigit = 0 if mod == 0 else 11 - mod - result = "X" if checkdigit == 10 else "{}".format(checkdigit) - return result - - -def is_valid_issn(issn: str) -> bool: - """ - Return True, if the ISSN is valid. This does not mean it is registered. - """ - if "-" in issn: - issn = issn.replace("-", "") - if len(issn) != 8: - raise ValueError("invalid issn length: {}".format(issn)) - checkdigit = calculate_issn_checkdigit(issn[:7]) - return issn[7] == "{}".format(checkdigit) - - -def keys_with_values(d: Dict) -> List[Any]: - """ - Return all keys of a dictionary which have non-falsy values. - """ - return [k for k, v in d.items() if v] |