# coding: utf-8
#
# Origin: fuzzycat/utils.py as added in commit
# ee05baed7b35df6918a8899b955222748ac21161 ("add basic str utils",
# Martin Czygan, 2020-08-12).
"""
A couple of utilities, may be split up into separate modules.
"""

# NOTE: `collections` and `DefaultDict` are no longer used below; they are
# kept so that the module namespace stays the same for existing importers.
import collections  # noqa: F401
from typing import Any, Callable, DefaultDict, Dict, List  # noqa: F401


class StringPipeline:
    """
    Minimalistic grouping of functions applied on an input string to produce
    some cleaned or normalized output. Pipeline functions are
    Callable[[str], str] and are applied in the order given.

    >>> cleanups = StringPipeline([str.strip, str.lower])
    >>> cleanups.run("  Input & Output ")
    'input & output'

    Intended usage, as a sketch (the helper functions named here are not
    defined in this module)::

        cleanups = StringPipeline([
            str.lower,
            remove_html_tags,
            normalize_whitespace,
            normalize_ampersand,
        ])
        cleanups.run("Input & Output")  # "input and output"

    """

    def __init__(self, fs: List[Callable[[str], str]]):
        """
        Store the functions to apply; `fs` is kept by reference, not copied.
        """
        self.fs = fs

    def run(self, s: str) -> str:
        """
        Apply all functions, in order, and return the result. Each function
        receives the output of the previous one; with no functions, `s` is
        returned unchanged.
        """
        for f in self.fs:
            s = f(s)
        return s


class StringAnnotator:
    """
    Experimental, rationale: In some way, feature engineering; we want to
    derive metrics, numbers from the string, do this consistently and
    compactly. E.g. once we have dozens of "speaking" characteristics, a case
    based method might become more readable.

        if s.is_single_token and s.some_ratio > 0.4:
            return MatchStatus.AMBIGUOUS

    Could also subclass string and attach more methods to it (might be even
    reusable).

    ....

    Given a string, derive a couple of metrics, based on functions. The
    annotation is a dict, mapping an annotation key to a value of any type.
    Each function takes the string and returns a dict; the dicts are merged
    in order, so a later function overrides an earlier one on the same key.

    >>> metrics = StringAnnotator([
    ...     lambda s: {"length": len(s)},
    ...     lambda s: {"is_single_token": len(s.split()) == 1},
    ... ])
    >>> metrics.run("Journal of Pataphysics 2038-2032")
    {'length': 32, 'is_single_token': False}

    The input string itself is not added to the result; include a function
    that returns it (e.g. under a "value" key) if it is wanted.

    Intended usage, as a sketch (the helper functions named here are not
    defined in this module)::

        metrics = StringAnnotator([
            has_html_tags,
            has_only_printable_characters,
            is_single_token,
            length,
            has_year_in_parentheses,
        ])

    TODO(martin):

    * SimpleNamespace, dotdict, Dataclass.
    * string_utils.py or similar
    * maybe adopt SpaCy or similar
    """

    def __init__(self, fs: List[Callable[[str], Dict[str, Any]]]):
        """
        Store the annotation functions; `fs` is kept by reference, not copied.
        """
        self.fs = fs

    def run(self, s: str) -> Dict[str, Any]:
        """
        Call every function on `s` and return the merged annotations.

        The result is a plain dict: looking up an annotation that no function
        produced raises KeyError instead of silently yielding (and inserting)
        an empty dict, as a defaultdict(dict) would.
        """
        annotations: Dict[str, Any] = {}
        for f in self.fs:
            annotations.update(f(s))
        return annotations