aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2020-08-12 11:43:38 +0200
committerMartin Czygan <martin.czygan@gmail.com>2020-08-12 11:43:38 +0200
commitee05baed7b35df6918a8899b955222748ac21161 (patch)
tree48ab03f55ab73125c2382c58afa5e969516d9303
parent031e1a70dab2d8c6bcc93ce64a7cc42d5eb7fcda (diff)
downloadfuzzycat-ee05baed7b35df6918a8899b955222748ac21161.tar.gz
fuzzycat-ee05baed7b35df6918a8899b955222748ac21161.zip
add basic str utils
-rw-r--r--fuzzycat/main.py1
-rw-r--r--fuzzycat/utils.py82
2 files changed, 83 insertions, 0 deletions
diff --git a/fuzzycat/main.py b/fuzzycat/main.py
index 5a6f4a7..8da283b 100644
--- a/fuzzycat/main.py
+++ b/fuzzycat/main.py
@@ -1,4 +1,5 @@
from fuzzycat import __version__
+
def main():
    """
    Command line entry point: print a greeting including the package version.
    """
    greeting = "hello fuzzycat {}".format(__version__)
    print(greeting)
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
new file mode 100644
index 0000000..e3e04c0
--- /dev/null
+++ b/fuzzycat/utils.py
@@ -0,0 +1,82 @@
+# coding: utf-8
+
+import collections
+from typing import Any, Callable, DefaultDict, Dict, List
+
+"""
+A couple of utilities, may be split up into separate modules.
+"""
+
+
class StringPipeline:
    """
    A small, ordered chain of string-to-string functions.

    Each function receives the output of the previous one; the first function
    receives the input string. Typical use is chaining cleanup or
    normalization steps. Every step must be a Callable[[str], str].

    >>> cleanups = StringPipeline([
    ...     str.strip,
    ...     str.lower,
    ... ])
    >>> cleanups.run("  Input AND Output ")
    'input and output'

    """

    def __init__(self, fs: List[Callable[[str], str]]):
        # Kept by reference (not copied); steps are applied in list order.
        self.fs = fs

    def run(self, s: str) -> str:
        """
        Feed `s` through every step in order and return the final string.

        With no steps configured, `s` is returned unchanged.
        """
        current = s
        for step in self.fs:
            current = step(current)
        return current
+
+
class StringAnnotator:
    """
    Experimental, rationale: In some way, feature engineering; we want to
    derive metrics and numbers from a string, and do this consistently and
    compactly. E.g. once we have dozens of "speaking" characteristics, a case
    based method might become more readable:

        if s.is_single_token and s.some_ratio > 0.4:
            return MatchStatus.AMBIGUOUS

    Could also subclass string and pluck more methods on it (might be even
    reusable).

    ....

    Given a string, derive a couple of metrics, based on functions. Each
    function takes the string and returns a dict of annotations, mapping an
    annotation key to a value of any type. The dicts are merged in order, so
    a later function overrides an earlier one on key collisions. The input
    string itself is not part of the result unless a function adds it.

    >>> metrics = StringAnnotator([
    ...     lambda s: {"length": len(s)},
    ...     lambda s: {"is_single_token": len(s.split()) == 1},
    ... ])
    >>> metrics.run("Journal of Pataphysics 2038-2032")
    {'length': 32, 'is_single_token': False}

    TODO(martin):

    * SimpleNamespace, dotdict, Dataclass.
    * string_utils.py or similar
    * maybe adopt SpaCy or similar
    """

    def __init__(self, fs: List[Callable[[str], Dict[str, Any]]]):
        # Kept by reference (not copied); functions are applied in list order.
        self.fs = fs

    def run(self, s: str) -> Dict[str, Any]:
        """
        Apply every annotation function to `s` and return the merged result.

        Returns a plain dict (empty if no functions are configured), so a
        lookup of a key that no function produced raises KeyError.
        """
        # A plain dict on purpose: a defaultdict would silently create and
        # return an empty dict for a mistyped or missing annotation key.
        annotations: Dict[str, Any] = {}
        for f in self.fs:
            annotations.update(f(s))
        return annotations