about summary refs log tree commit diff stats
path: root/fuzzycat
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2020-11-05 16:02:38 +0100
committer Martin Czygan <martin.czygan@gmail.com> 2020-11-05 16:02:38 +0100
commit a125f6d1354bb2e38e774c7e204d8a640555fca0 (patch)
tree 5396f9f5b1ddc6c31979c1f0ad5bf52cd63a32bf /fuzzycat
parent 242ad04f821294b27e1cbc85beed06099a764d5f (diff)
download fuzzycat-a125f6d1354bb2e38e774c7e204d8a640555fca0.tar.gz
fuzzycat-a125f6d1354bb2e38e774c7e204d8a640555fca0.zip
add cluster tests
Diffstat (limited to 'fuzzycat')
-rw-r--r-- fuzzycat/cluster.py 8
-rw-r--r-- fuzzycat/main.py 2
2 files changed, 6 insertions, 4 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index a40db06..b2f739f 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -6,13 +6,13 @@ Clustering stage.
import fileinput
import itertools
import json
+import logging
import operator
import os
import re
import subprocess
import sys
import tempfile
-import logging
import fuzzy
@@ -29,6 +29,7 @@ get_ident_title = operator.itemgetter("ident", "title")
ws_replacer = str.maketrans({"\t": " ", "\n": " "})
non_word_re = re.compile(r'[\W_]+', re.UNICODE)
+
def release_key_title(release_entity):
id, title = get_ident_title(release_entity)
if not title:
@@ -48,6 +49,7 @@ def release_key_title_nysiis(release_entity):
id, title = release_key_title(release_entity)
return (id, fuzzy.nysiis(title))
+
def release_key_title_authors_ngram(release_entity):
"""
Derive a key from title and authors.
@@ -55,7 +57,6 @@ def release_key_title_authors_ngram(release_entity):
# SS: compare ngram sets?
-
def sort_by_column(filename, opts="-k 2", fast=True, mode="w", prefix="fuzzycat-", tmpdir=None):
"""
Sort tabular file with sort(1), returns the filename of the sorted file.
@@ -81,8 +82,9 @@ def group_by(seq, key=None, value=None, comment=""):
doc = {
"k": k.strip(),
"v": [value(v) for v in g],
- "c": comment,
}
+ if comment:
+ doc["c"] = comment
yield doc
diff --git a/fuzzycat/main.py b/fuzzycat/main.py
index 44e0857..5f9efc3 100644
--- a/fuzzycat/main.py
+++ b/fuzzycat/main.py
@@ -11,9 +11,9 @@ Run, e.g. fuzzycat cluster --help for more options. Example:
"""
import argparse
+import logging
import sys
import tempfile
-import logging
from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_normalized,
release_key_title_nysiis)