diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2020-11-05 16:02:38 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-11-05 16:02:38 +0100 |
commit | a125f6d1354bb2e38e774c7e204d8a640555fca0 (patch) | |
tree | 5396f9f5b1ddc6c31979c1f0ad5bf52cd63a32bf /fuzzycat | |
parent | 242ad04f821294b27e1cbc85beed06099a764d5f (diff) | |
download | fuzzycat-a125f6d1354bb2e38e774c7e204d8a640555fca0.tar.gz fuzzycat-a125f6d1354bb2e38e774c7e204d8a640555fca0.zip |
add cluster tests
Diffstat (limited to 'fuzzycat')
-rw-r--r-- | fuzzycat/cluster.py | 8 | ||||
-rw-r--r-- | fuzzycat/main.py | 2 |
2 files changed, 6 insertions, 4 deletions
```diff
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index a40db06..b2f739f 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -6,13 +6,13 @@ Clustering stage.
 import fileinput
 import itertools
 import json
+import logging
 import operator
 import os
 import re
 import subprocess
 import sys
 import tempfile
-import logging
 
 import fuzzy
 
@@ -29,6 +29,7 @@ get_ident_title = operator.itemgetter("ident", "title")
 ws_replacer = str.maketrans({"\t": " ", "\n": " "})
 non_word_re = re.compile(r'[\W_]+', re.UNICODE)
 
+
 def release_key_title(release_entity):
     id, title = get_ident_title(release_entity)
     if not title:
@@ -48,6 +49,7 @@ def release_key_title_nysiis(release_entity):
     id, title = release_key_title(release_entity)
     return (id, fuzzy.nysiis(title))
 
+
 def release_key_title_authors_ngram(release_entity):
     """
     Derive a key from title and authors.
@@ -55,7 +57,6 @@ def release_key_title_authors_ngram(release_entity):
     # SS: compare ngram sets?
 
 
-
 def sort_by_column(filename, opts="-k 2", fast=True, mode="w", prefix="fuzzycat-", tmpdir=None):
     """
     Sort tabular file with sort(1), returns the filename of the sorted file.
@@ -81,8 +82,9 @@ def group_by(seq, key=None, value=None, comment=""):
         doc = {
             "k": k.strip(),
             "v": [value(v) for v in g],
-            "c": comment,
         }
+        if comment:
+            doc["c"] = comment
         yield doc
 
 
diff --git a/fuzzycat/main.py b/fuzzycat/main.py
index 44e0857..5f9efc3 100644
--- a/fuzzycat/main.py
+++ b/fuzzycat/main.py
@@ -11,9 +11,9 @@ Run, e.g. fuzzycat cluster --help for more options. Example:
 
 """
 import argparse
+import logging
 import sys
 import tempfile
-import logging
 
 from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_normalized,
                               release_key_title_nysiis)
```