aboutsummaryrefslogtreecommitdiffstats
path: root/fuzzycat
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2020-11-05 01:23:38 +0100
committerMartin Czygan <martin.czygan@gmail.com>2020-11-05 01:23:38 +0100
commit97479edb44033b7ff9cb09bde500e49c5bf49e68 (patch)
tree3c0b3b79d5af929a4fd943b45415d70854b8494f /fuzzycat
parent321beac2b8b724532103ccc872becda33f33cd77 (diff)
downloadfuzzycat-97479edb44033b7ff9cb09bde500e49c5bf49e68.tar.gz
fuzzycat-97479edb44033b7ff9cb09bde500e49c5bf49e68.zip
add tests
Diffstat (limited to 'fuzzycat')
-rw-r--r--fuzzycat/cluster.py55
-rw-r--r--fuzzycat/main.py2
2 files changed, 34 insertions, 23 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index ee19611..6058b37 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -12,6 +12,7 @@ import re
import subprocess
import sys
import tempfile
+import logging
import fuzzy
@@ -21,30 +22,39 @@ __all__ = [
"release_key_title_nysiis",
"sort_by_column",
"group_by",
+ "Cluster",
]
get_ident_title = operator.itemgetter("ident", "title")
ws_replacer = str.maketrans({"\t": " ", "\n": " "})
non_word_re = re.compile(r'[\W_]+', re.UNICODE)
-
-def release_key_title(re):
- id, title = get_ident_title(re)
+def release_key_title(release_entity):
+ id, title = get_ident_title(release_entity)
if not title:
raise ValueError('title missing')
title = title.translate(ws_replacer).strip()
return (id, title)
-def release_key_title_normalized(re):
- id, title = release_key_title(re)
+def release_key_title_normalized(release_entity):
+ id, title = release_key_title(release_entity)
+ title = re.sub(r'[ ]{2,}', ' ', title)
+ title = title.lower()
return (id, non_word_re.sub('', title))
-def release_key_title_nysiis(re):
- id, title = release_key_title(re)
+def release_key_title_nysiis(release_entity):
+ id, title = release_key_title(release_entity)
return (id, fuzzy.nysiis(title))
+def release_key_title_authors_ngram(release_entity):
+ """
+ Derive a key from title and authors.
+ """
+ # SS: compare ngram sets?
+
+
def sort_by_column(filename, opts="-k 2", fast=True, mode="w", prefix="fuzzycat-", tmpdir=None):
"""
@@ -62,19 +72,18 @@ def sort_by_column(filename, opts="-k 2", fast=True, mode="w", prefix="fuzzycat-
return tf.name
-def group_by(filename, key=None, value=None, comment=""):
+def group_by(seq, key=None, value=None, comment=""):
"""
Iterate over lines in filename, group by key (a callable deriving the key
from the line), then apply value callable to emit a minimal document.
"""
- with open(filename) as f:
- for k, g in itertools.groupby(f, key=key):
- doc = {
- "k": k.strip(),
- "v": [value(v) for v in g],
- "c": comment,
- }
- yield doc
+ for k, g in itertools.groupby(seq, key=key):
+ doc = {
+ "k": k.strip(),
+ "v": [value(v) for v in g],
+ "c": comment,
+ }
+ yield doc
def cut(f=0, sep='\t', ignore_missing_column=True):
@@ -87,8 +96,7 @@ def cut(f=0, sep='\t', ignore_missing_column=True):
if f >= len(parts):
if ignore_missing_column:
return ""
- else:
- raise ValueError('cannot split value {} into {} parts'.format(value, f))
+ raise ValueError('cannot split value {} into {} parts'.format(value, f))
return parts[f]
return func
@@ -113,7 +121,7 @@ class Cluster:
self.output = output
self.prefix = prefix
self.tmpdir = tmpdir
- self.verbose = verbose
+ self.logger = logging.getLogger('fuzzycat.cluster')
def run(self):
"""
@@ -129,11 +137,12 @@ class Cluster:
print("{}\t{}".format(id, key), file=tf)
except (KeyError, ValueError):
continue
- if self.verbose:
- print(tf.name, file=sys.stderr)
+ self.logger.debug("intermediate file at {}".format(tf.name))
sbc = sort_by_column(tf.name, opts='-k 2', prefix=self.prefix, tmpdir=self.tmpdir)
- for doc in group_by(sbc, key=cut(f=1), value=cut(f=0), comment=keyfunc.__name__):
- json.dump(doc, self.output)
+ with open(sbc) as f:
+ comment = keyfunc.__name__
+ for doc in group_by(f, key=cut(f=1), value=cut(f=0), comment=comment):
+ json.dump(doc, self.output)
os.remove(sbc)
os.remove(tf.name)
diff --git a/fuzzycat/main.py b/fuzzycat/main.py
index 5eaa4a2..7f47181 100644
--- a/fuzzycat/main.py
+++ b/fuzzycat/main.py
@@ -13,6 +13,7 @@ Run, e.g. fuzzycat cluster --help for more options. Example:
import argparse
import sys
import tempfile
+import logging
from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_normalized,
release_key_title_nysiis)
@@ -37,6 +38,7 @@ def run_verify(args):
if __name__ == '__main__':
+ logging.basicConfig(level=logging.DEBUG)
parser = argparse.ArgumentParser(prog='fuzzycat',
description=__doc__,
usage='%(prog)s command [options]',