aboutsummaryrefslogtreecommitdiffstats
path: root/fuzzycat/utils.py
diff options
context:
space:
mode:
Diffstat (limited to 'fuzzycat/utils.py')
-rw-r--r--fuzzycat/utils.py249
1 files changed, 0 insertions, 249 deletions
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
deleted file mode 100644
index 9d2a2f7..0000000
--- a/fuzzycat/utils.py
+++ /dev/null
@@ -1,249 +0,0 @@
-# coding: utf-8
-
-import collections
-import itertools
-import json
-import re
-import string
-from typing import Any, Callable, DefaultDict, Dict, List, Optional, Sequence
-"""
-A couple of utilities, may be split up into separate modules.
-"""
-
-
-class SetEncoder(json.JSONEncoder):
- """
- Helper to encode python sets into JSON lists.
- So you can write something like this:
- json.dumps({"things": set([1, 2, 3])}, cls=SetEncoder)
- """
- def default(self, obj):
- """
- Decorate call to standard implementation.
- """
- if isinstance(obj, set):
- return list(obj)
- return json.JSONEncoder.default(self, obj)
-
-
-class StringPipeline:
- """
- Minimalistic grouping of functions applied on an input string to produce
- some cleaned or normalized output. Pipeline functions are Func[[str], str].
-
- >>> cleanups = StringPipeline([
- ... str.lower,
- ... remove_html_tags,
- ... normalize_whitespace,
- ... normalize_ampersand,
- ... ])
- >>> cleanups("<a>Input & Output</a>")
- input and output
-
- """
- def __init__(self, fs: List[Callable[[str], str]]):
- self.fs = fs
-
- def __call__(self, s: str) -> str:
- return self.run(s)
-
- def run(self, s: str) -> str:
- """
- Apply all function and return result. Deprecated: just call the object.
- """
- for f in self.fs:
- s = f(s)
- return s
-
-
-class StringAnnotator:
- """
- Experimental, rationale: In some way, feature engineering; we want to
- derive metrics, number from the string, do this consistently and compactly.
- E.g. once we have dozens of "speaking" characteristics, a case based method
- might become more readble.
-
- if s.is_single_token and s.some_ratio > 0.4:
- return MatchStatus.AMBIGIOUS
-
- Could also subclass string and pluck more methods on it (might be even
- reusable).
-
- ....
-
- Given a string, derive a couple of metrics, based on functions. The
- annotation is a dict, mapping an annotation key to a value of any type.
-
- >>> metrics = StringAnnotator([
- ... has_html_tags,
- ... has_only_printable_characters,
- ... is_single_token,
- ... length,
- ... has_year_in_parentheses,
- ... ])
- >>> metrics.run("Journal of Pataphysics 2038-2032")
- {"value": "Journal of Pataphysics 2038-2032", "is_single_token": False, ... }
-
- TODO(martin):
-
- * SimpleNamespace, dotdict, Dataclass.
- * string_utils.py or similar
- * maybe adopt SpaCy or similar
- """
- def __init__(self, fs: List[Callable[[str], Dict[str, Any]]]):
- self.fs = fs
-
- def run(self, s: str) -> Dict[str, Any]:
- annotations: DefaultDict[str, Any] = collections.defaultdict(dict)
- for f in self.fs:
- result = f(s)
- annotations.update(result)
- return annotations
-
-
-def normalize_whitespace(s: str) -> str:
- """
- Remove trailing spaces and normalize whitespace.
- """
- return re.sub(r"\s{2,}", " ", s.strip())
-
-
-def normalize_ampersand(s: str) -> str:
- """
- Normalize ampersand to and.
- """
- return s.replace(" & ", " and ")
-
-
-def letter_to_non_letter_ratio(s: str) -> float:
- """
- Non letters are defined by printable w/o letters.
- """
- if len(s) == 0:
- return 0.0
- non_letters = set(string.printable) - set(string.ascii_letters)
- non_letter_count = sum(c in non_letters for c in s)
- return non_letter_count / len(s)
-
-
-def alphanumeric_ratio(s: str) -> float:
- """
- Ratio of letters, digit and whitespace to total string length.
- """
- if len(s) == 0:
- return 0.0
- alphanumeric = set(string.ascii_letters) | set(string.digits) | set([" "])
- alphanumeric_count = sum(c in alphanumeric for c in s)
- return alphanumeric_count / len(s)
-
-
-def alphanumeric_only(s: str) -> str:
- """
- Remove all non-alphanumeric content from string.
- """
- alphanumeric = set(string.ascii_letters) | set(string.digits) | set([" "])
- return "".join((c for c in s if c in alphanumeric))
-
-
-def parenthesized_year(s: str) -> Optional[str]:
- """
- Return the year only, if it is in parentheses, e.g. Hello (2020).
- """
- match = re.search(r"[\(\[]\s*([12][\d]{3})\s*[\]\)]", s)
- if match:
- return match.group(1)
- return None
-
-
-def has_non_letters_ratio(s: str, threshold: float = 0.4) -> bool:
- """
- Check the ratio of non-letters in a string, e.g. for things like "A.R.G.H"
- """
- if len(s) == 0:
- return False
- return (sum(c not in string.ascii_letters for c in s) / len(s)) > threshold
-
-
-def is_single_word_printable(s: str) -> bool:
- """
- True, if s is a single token of printable characters.
- """
- return all(c in string.printable for c in s) and s.split() == 1
-
-
-def extract_wikidata_qids(s: str) -> List[str]:
- """
- Given a string, extract all qids.
- """
- return re.findall(r"Q[0-9]{1,10}", s)
-
-
-def extract_issns(s: str) -> List[str]:
- """
- Given a string return a list of valid ISSN.
- """
- pattern = r"[0-9]{4,4}-[0-9]{3,3}[0-9xX]"
- return [v for v in re.findall(pattern, s) if is_valid_issn(v)]
-
-
-def longest_common_prefix(a: Sequence, b: Sequence) -> Sequence:
- """
- Return the longest common prefix of a and b. The length of the return value
- is at most min(len(a), len(b)).
- """
- a, b = sorted((a, b), key=len)
- for i, (u, v) in enumerate(zip(a, b)):
- if u != v:
- return a[:i]
- return a
-
-
-def common_prefix_length_ratio(a: Sequence, b: Sequence) -> float:
- """
- Return a float between 0.0 and 1.0 expressing the ratio between the length
- of the common shared prefix to the length of the longest sequence.
- """
- maxlen = max(len(a), len(b))
- if maxlen == 0:
- return 0.0
- return len(longest_common_prefix(a, b)) / maxlen
-
-
-def hamming_distance(s: str, t: str) -> int:
- """
- Return hamming distance of s and t.
- """
- return sum((u != v for u, v in itertools.zip_longest(s, t)))
-
-
-def calculate_issn_checkdigit(s: str) -> str:
- """
- Given a string of length 7, return the ISSN check value (digit or X) as
- string.
- """
- if len(s) != 7:
- raise ValueError("seven digits required")
- ss = sum((int(digit) * f for digit, f in zip(s, range(8, 1, -1))))
- _, mod = divmod(ss, 11)
- checkdigit = 0 if mod == 0 else 11 - mod
- result = "X" if checkdigit == 10 else "{}".format(checkdigit)
- return result
-
-
-def is_valid_issn(issn: str) -> bool:
- """
- Return True, if the ISSN is valid. This does not mean it is registered.
- """
- if "-" in issn:
- issn = issn.replace("-", "")
- if len(issn) != 8:
- raise ValueError("invalid issn length: {}".format(issn))
- checkdigit = calculate_issn_checkdigit(issn[:7])
- return issn[7] == "{}".format(checkdigit)
-
-
-def keys_with_values(d: Dict) -> List[Any]:
- """
- Return all keys of a dictionary which have non-falsy values.
- """
- return [k for k, v in d.items() if v]