add tss key option

author: Martin Czygan <martin.czygan@gmail.com> 2020-11-10 17:49:24 +0100
committer: Martin Czygan <martin.czygan@gmail.com> 2020-11-10 17:49:24 +0100
commit: 219fded46071a98434e28ca100c1a5cbc33fcba1 (patch)
tree: 1be8ced5279ee7f79a697bf0178b35f0db53273a
parent: 414ea00bdb51766a27375e6ca058e4178fb71b1b (diff)
download: fuzzycat-219fded46071a98434e28ca100c1a5cbc33fcba1.tar.gz
fuzzycat-219fded46071a98434e28ca100c1a5cbc33fcba1.zip
3 files changed, 33 insertions, 14 deletions
diff --git a/fuzzycat/build.py b/fuzzycat/build.py
index 687c17e..e49e7d7 100644
--- a/fuzzycat/build.py
+++ b/fuzzycat/build.py
@@ -1,5 +1,5 @@
 """
-Build auxiliary data structures.
+WIP: Build auxiliary data structures for lookup.
 """
 
 import fileinput
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index db20320..fa54ec6 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -52,32 +52,38 @@ class KeyDoc(BaseModel):
 get_ident_title = operator.itemgetter("ident", "title")
 ws_replacer = str.maketrans({"\t": " ", "\n": " "})
 non_word_re = re.compile(r'[\W_]+', re.UNICODE)
+printable_no_punct = string.digits + string.ascii_letters + string.whitespace
+
+def slugify_string(s: str) -> str:
+    """
+    Lowercase s and keep only ASCII letters, digits and whitespace.
+    """
+    return ''.join((c for c in s.lower() if c in printable_no_punct))
 
 # Notes: untie from release_entity, as we are only using a few fields. Maybe
 # it's a jsob blob, with a pydantic spec and schema.
 
 
 def release_key_title(doc: KeyDoc) -> Tuple[str, str]:
-    id, title = get_ident_title(doc)
+    ident, title = get_ident_title(doc)
     if not title:
-        raise ValueError('title missing')
+        raise ValueError('title missing for {}'.format(ident))
     title = title.translate(ws_replacer).strip()
-    return (id, title)
+    return (ident, title)
 
 
 def release_key_title_normalized(doc: KeyDoc) -> Tuple[str, str]:
-    id, title = release_key_title(doc)
-    title = re.sub(r'[ ]{2,}', ' ', title)
-    title = title.lower()
-    return (id, non_word_re.sub('', title))
+    ident, title = release_key_title(doc)
+    title = re.sub(r'[ ]{2,}', ' ', title).lower()
+    return (ident, non_word_re.sub('', title))
 
 
 def release_key_title_nysiis(doc: KeyDoc) -> Tuple[str, str]:
-    id, title = release_key_title(doc)
-    return (id, fuzzy.nysiis(title))
+    ident, title = release_key_title(doc)
+    return (ident, fuzzy.nysiis(title))
 
 
-def release_key_title_authors_ngram(doc: KeyDoc) -> Tuple[str, str]:
+def release_key_title_ngram(doc: KeyDoc, n=3) -> Tuple[str, str]:
     """
     Derive a key from title and authors. Authors in contribs list:
 
@@ -90,10 +96,16 @@ def release_key_title_authors_ngram(doc: KeyDoc) -> Tuple[str, str]:
     ],
 
     Tokenize title, remote stopwords, lookup first three, lookup last three,
-    plus authors.
+    plus authors. TODO(miku): authors.
     """
-    # SS: compare ngram sets?
-
+    ident, title = get_ident_title(doc)
+    slug_title = slugify_string(title)
+    tokens = slug_title.split()
+    if len(tokens) < 2 * n:
+        key = ''.join(tokens)
+    else:
+        key = ''.join(tokens[:n] + tokens[-n:])
+    return (ident, key)
 
 def sort_by_column(filename: str,
                    opts: str = "-k 2",
diff --git a/fuzzycat/main.py b/fuzzycat/main.py
index dfc0925..7d298e6 100644
--- a/fuzzycat/main.py
+++ b/fuzzycat/main.py
@@ -32,6 +32,7 @@ def run_cluster(args):
         'title': release_key_title,
         'tnorm': release_key_title_normalized,
         'tnysi': release_key_title_nysiis,
+        'tss': release_key_title_ngram,
     }
     cluster = Cluster(files=args.files,
                       keyfunc=types.get(args.type),
@@ -42,10 +43,16 @@ def run_cluster(args):
 
 
 def run_verify(args):
+    """
+    TODO.
+    """
     print('verify')
 
 
 def run_build(args):
+    """
+    Trying out.
+    """
     if args.type == "ss":
         builder = NgramLookup(files=args.files, output=args.output)
         builder.run()
author	Martin Czygan <martin.czygan@gmail.com>	2020-11-10 17:49:24 +0100
committer	Martin Czygan <martin.czygan@gmail.com>	2020-11-10 17:49:24 +0100
commit	219fded46071a98434e28ca100c1a5cbc33fcba1 (patch)
tree	1be8ced5279ee7f79a697bf0178b35f0db53273a
parent	414ea00bdb51766a27375e6ca058e4178fb71b1b (diff)
download	fuzzycat-219fded46071a98434e28ca100c1a5cbc33fcba1.tar.gz fuzzycat-219fded46071a98434e28ca100c1a5cbc33fcba1.zip