aboutsummaryrefslogtreecommitdiffstats
path: root/fuzzycat
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2020-11-10 17:49:24 +0100
committerMartin Czygan <martin.czygan@gmail.com>2020-11-10 17:49:24 +0100
commit219fded46071a98434e28ca100c1a5cbc33fcba1 (patch)
tree1be8ced5279ee7f79a697bf0178b35f0db53273a /fuzzycat
parent414ea00bdb51766a27375e6ca058e4178fb71b1b (diff)
downloadfuzzycat-219fded46071a98434e28ca100c1a5cbc33fcba1.tar.gz
fuzzycat-219fded46071a98434e28ca100c1a5cbc33fcba1.zip
add tss key option
Diffstat (limited to 'fuzzycat')
-rw-r--r--fuzzycat/build.py2
-rw-r--r--fuzzycat/cluster.py38
-rw-r--r--fuzzycat/main.py7
3 files changed, 33 insertions, 14 deletions
diff --git a/fuzzycat/build.py b/fuzzycat/build.py
index 687c17e..e49e7d7 100644
--- a/fuzzycat/build.py
+++ b/fuzzycat/build.py
@@ -1,5 +1,5 @@
"""
-Build auxiliary data structures.
+WIP: Build auxiliary data structures for lookup.
"""
import fileinput
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index db20320..fa54ec6 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -52,32 +52,38 @@ class KeyDoc(BaseModel):
get_ident_title = operator.itemgetter("ident", "title")
ws_replacer = str.maketrans({"\t": " ", "\n": " "})
non_word_re = re.compile(r'[\W_]+', re.UNICODE)
+printable_no_punct = string.digits + string.letters + string.whitespace
+
+def slugify_string(s: str) -> str:
+ """
+ Lowercases the string and keeps only letters, digits and whitespace.
+ """
+ return ''.join((c for c in s.lower() if c in printable_no_punct))
# Notes: untie from release_entity, as we are only using a few fields. Maybe
# it's a JSON blob, with a pydantic spec and schema.
def release_key_title(doc: KeyDoc) -> Tuple[str, str]:
- id, title = get_ident_title(doc)
+ ident, title = get_ident_title(doc)
if not title:
- raise ValueError('title missing')
+ raise ValueError('title missing for {}'.format(ident))
title = title.translate(ws_replacer).strip()
- return (id, title)
+ return (ident, title)
def release_key_title_normalized(doc: KeyDoc) -> Tuple[str, str]:
- id, title = release_key_title(doc)
- title = re.sub(r'[ ]{2,}', ' ', title)
- title = title.lower()
- return (id, non_word_re.sub('', title))
+ ident, title = release_key_title(doc)
+ title = re.sub(r'[ ]{2,}', ' ', title).lower()
+ return (ident, non_word_re.sub('', title))
def release_key_title_nysiis(doc: KeyDoc) -> Tuple[str, str]:
- id, title = release_key_title(doc)
- return (id, fuzzy.nysiis(title))
+ ident, title = release_key_title(doc)
+ return (ident, fuzzy.nysiis(title))
-def release_key_title_authors_ngram(doc: KeyDoc) -> Tuple[str, str]:
+def release_key_title_ngram(doc: KeyDoc, n=3) -> Tuple[str, str]:
"""
Derive a key from title and authors. Authors in contribs list:
@@ -90,10 +96,16 @@ def release_key_title_authors_ngram(doc: KeyDoc) -> Tuple[str, str]:
],
Tokenize title, remove stopwords, lookup first three, lookup last three,
- plus authors.
+ plus authors. TODO(miku): authors.
"""
- # SS: compare ngram sets?
-
+ ident, title = get_ident_title(doc)
+ slug_title = slug_title(title)
+ tokens = slug_title.split()
+ if len(tokens) < 2 * n:
+ key = ''.join(tokens)
+ else:
+ key = ''.join(tokens[:3] + tokens[-3:])
+ return (ident, key)
def sort_by_column(filename: str,
opts: str = "-k 2",
diff --git a/fuzzycat/main.py b/fuzzycat/main.py
index dfc0925..7d298e6 100644
--- a/fuzzycat/main.py
+++ b/fuzzycat/main.py
@@ -32,6 +32,7 @@ def run_cluster(args):
'title': release_key_title,
'tnorm': release_key_title_normalized,
'tnysi': release_key_title_nysiis,
+ 'tss': release_key_title_ngram,
}
cluster = Cluster(files=args.files,
keyfunc=types.get(args.type),
@@ -42,10 +43,16 @@ def run_cluster(args):
def run_verify(args):
+ """
+ TODO.
+ """
print('verify')
def run_build(args):
+ """
+ Trying out.
+ """
if args.type == "ss":
builder = NgramLookup(files=args.files, output=args.output)
builder.run()