diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2020-11-10 17:49:24 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-11-10 17:49:24 +0100 |
commit | 219fded46071a98434e28ca100c1a5cbc33fcba1 (patch) | |
tree | 1be8ced5279ee7f79a697bf0178b35f0db53273a | |
parent | 414ea00bdb51766a27375e6ca058e4178fb71b1b (diff) | |
download | fuzzycat-219fded46071a98434e28ca100c1a5cbc33fcba1.tar.gz fuzzycat-219fded46071a98434e28ca100c1a5cbc33fcba1.zip |
add tss key option
-rw-r--r-- | fuzzycat/build.py | 2 | ||||
-rw-r--r-- | fuzzycat/cluster.py | 38 | ||||
-rw-r--r-- | fuzzycat/main.py | 7 |
3 files changed, 33 insertions, 14 deletions
diff --git a/fuzzycat/build.py b/fuzzycat/build.py
index 687c17e..e49e7d7 100644
--- a/fuzzycat/build.py
+++ b/fuzzycat/build.py
@@ -1,5 +1,5 @@
 """
-Build auxiliary data structures.
+WIP: Build auxiliary data structures for lookup.
 """
 
 import fileinput
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index db20320..fa54ec6 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -52,32 +52,38 @@ class KeyDoc(BaseModel):
 get_ident_title = operator.itemgetter("ident", "title")
 ws_replacer = str.maketrans({"\t": " ", "\n": " "})
 non_word_re = re.compile(r'[\W_]+', re.UNICODE)
+printable_no_punct = string.digits + string.letters + string.whitespace
+
+def slugify_string(s: str) -> str:
+    """
+    Keeps ascii chars and single whitespace only.
+    """
+    return ''.join((c for c in s.lower() if c in printable_no_punct))
 
 # Notes: untie from release_entity, as we are only using a few fields. Maybe
 # it's a jsob blob, with a pydantic spec and schema.
 
 
 def release_key_title(doc: KeyDoc) -> Tuple[str, str]:
-    id, title = get_ident_title(doc)
+    ident, title = get_ident_title(doc)
     if not title:
-        raise ValueError('title missing')
+        raise ValueError('title missing for {}'.format(ident))
     title = title.translate(ws_replacer).strip()
-    return (id, title)
+    return (ident, title)
 
 
 def release_key_title_normalized(doc: KeyDoc) -> Tuple[str, str]:
-    id, title = release_key_title(doc)
-    title = re.sub(r'[ ]{2,}', ' ', title)
-    title = title.lower()
-    return (id, non_word_re.sub('', title))
+    ident, title = release_key_title(doc)
+    title = re.sub(r'[ ]{2,}', ' ', title).lower()
+    return (ident, non_word_re.sub('', title))
 
 
 def release_key_title_nysiis(doc: KeyDoc) -> Tuple[str, str]:
-    id, title = release_key_title(doc)
-    return (id, fuzzy.nysiis(title))
+    ident, title = release_key_title(doc)
+    return (ident, fuzzy.nysiis(title))
 
 
-def release_key_title_authors_ngram(doc: KeyDoc) -> Tuple[str, str]:
+def release_key_title_ngram(doc: KeyDoc, n=3) -> Tuple[str, str]:
     """
     Derive a key from title and authors. Authors in contribs list:
 
@@ -90,10 +96,16 @@ def release_key_title_authors_ngram(doc: KeyDoc) -> Tuple[str, str]:
         ],
 
     Tokenize title, remote stopwords, lookup first three, lookup last three,
-    plus authors.
+    plus authors. TODO(miku): authors.
     """
-    # SS: compare ngram sets?
-
+    ident, title = get_ident_title(doc)
+    slug_title = slug_title(title)
+    tokens = slug_title.split()
+    if len(tokens) < 2 * n:
+        key = ''.join(tokens)
+    else:
+        key = ''.join(tokens[:3] + tokens[-3:])
+    return (ident, key)
 
 def sort_by_column(filename: str,
                    opts: str = "-k 2",
diff --git a/fuzzycat/main.py b/fuzzycat/main.py
index dfc0925..7d298e6 100644
--- a/fuzzycat/main.py
+++ b/fuzzycat/main.py
@@ -32,6 +32,7 @@ def run_cluster(args):
         'title': release_key_title,
         'tnorm': release_key_title_normalized,
         'tnysi': release_key_title_nysiis,
+        'tss': release_key_title_ngram,
     }
     cluster = Cluster(files=args.files,
                       keyfunc=types.get(args.type),
@@ -42,10 +43,16 @@ def run_cluster(args):
 
 
 def run_verify(args):
+    """
+    TODO.
+    """
     print('verify')
 
 
 def run_build(args):
+    """
+    Trying out.
+    """
     if args.type == "ss":
         builder = NgramLookup(files=args.files, output=args.output)
         builder.run()