diff options
-rw-r--r-- | Makefile | 4 | ||||
-rw-r--r-- | Pipfile | 1 | ||||
-rw-r--r-- | fuzzycat/cluster.py | 55 | ||||
-rw-r--r-- | fuzzycat/main.py | 2 | ||||
-rw-r--r-- | tests/test_cluster.py | 47 |
5 files changed, 86 insertions, 23 deletions
@@ -29,6 +29,10 @@ dist: ## Create source distribution and wheel cov: ## Run coverage report pytest --cov=fuzzycat tests/ +.PHONY: test +test: ## Run coverage report + pytest -v tests/ + .PHONY: lint lint: $(PY_FILES) pylint fuzzycat @@ -9,6 +9,7 @@ yapf = "*" ipython = "*" twine = "*" fuzzycat = {editable = true, path = "."} +pytest = "*" [packages] elasticsearch = "*" diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py index ee19611..6058b37 100644 --- a/fuzzycat/cluster.py +++ b/fuzzycat/cluster.py @@ -12,6 +12,7 @@ import re import subprocess import sys import tempfile +import logging import fuzzy @@ -21,30 +22,39 @@ __all__ = [ "release_key_title_nysiis", "sort_by_column", "group_by", + "Cluster", ] get_ident_title = operator.itemgetter("ident", "title") ws_replacer = str.maketrans({"\t": " ", "\n": " "}) non_word_re = re.compile(r'[\W_]+', re.UNICODE) - -def release_key_title(re): - id, title = get_ident_title(re) +def release_key_title(release_entity): + id, title = get_ident_title(release_entity) if not title: raise ValueError('title missing') title = title.translate(ws_replacer).strip() return (id, title) -def release_key_title_normalized(re): - id, title = release_key_title(re) +def release_key_title_normalized(release_entity): + id, title = release_key_title(release_entity) + title = re.sub(r'[ ]{2,}', ' ', title) + title = title.lower() return (id, non_word_re.sub('', title)) -def release_key_title_nysiis(re): - id, title = release_key_title(re) +def release_key_title_nysiis(release_entity): + id, title = release_key_title(release_entity) return (id, fuzzy.nysiis(title)) +def release_key_title_authors_ngram(release_entity): + """ + Derive a key from title and authors. + """ + # SS: compare ngram sets? + + def sort_by_column(filename, opts="-k 2", fast=True, mode="w", prefix="fuzzycat-", tmpdir=None): """ @@ -62,19 +72,18 @@ def sort_by_column(filename, opts="-k 2", fast=True, mode="w", prefix="fuzzycat- return tf.name -def group_by(filename, key=None, value=None, comment=""): +def group_by(seq, key=None, value=None, comment=""): """ Iterate over lines in filename, group by key (a callable deriving the key from the line), then apply value callable to emit a minimal document. """ - with open(filename) as f: - for k, g in itertools.groupby(f, key=key): - doc = { - "k": k.strip(), - "v": [value(v) for v in g], - "c": comment, - } - yield doc + for k, g in itertools.groupby(seq, key=key): + doc = { + "k": k.strip(), + "v": [value(v) for v in g], + "c": comment, + } + yield doc def cut(f=0, sep='\t', ignore_missing_column=True): @@ -87,8 +96,7 @@ def cut(f=0, sep='\t', ignore_missing_column=True): if f >= len(parts): if ignore_missing_column: return "" - else: - raise ValueError('cannot split value {} into {} parts'.format(value, f)) + raise ValueError('cannot split value {} into {} parts'.format(value, f)) return parts[f] return func @@ -113,7 +121,7 @@ class Cluster: self.output = output self.prefix = prefix self.tmpdir = tmpdir - self.verbose = verbose + self.logger = logging.getLogger('fuzzycat.cluster') def run(self): """ @@ -129,11 +137,12 @@ class Cluster: print("{}\t{}".format(id, key), file=tf) except (KeyError, ValueError): continue - if self.verbose: - print(tf.name, file=sys.stderr) + self.logger.debug("intermediate file at {}".format(tf.name)) sbc = sort_by_column(tf.name, opts='-k 2', prefix=self.prefix, tmpdir=self.tmpdir) - for doc in group_by(sbc, key=cut(f=1), value=cut(f=0), comment=keyfunc.__name__): - json.dump(doc, self.output) + with open(sbc) as f: + comment = keyfunc.__name__ + for doc in group_by(f, key=cut(f=1), value=cut(f=0), comment=comment): + json.dump(doc, self.output) os.remove(sbc) os.remove(tf.name) diff --git a/fuzzycat/main.py b/fuzzycat/main.py index 5eaa4a2..7f47181 100644 --- a/fuzzycat/main.py +++ b/fuzzycat/main.py @@ -13,6 +13,7 @@ Run, e.g. fuzzycat cluster --help for more options. Example: import argparse import sys import tempfile +import logging from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_normalized, release_key_title_nysiis) @@ -37,6 +38,7 @@ def run_verify(args): if __name__ == '__main__': + logging.basicConfig(level=logging.DEBUG) parser = argparse.ArgumentParser(prog='fuzzycat', description=__doc__, usage='%(prog)s command [options]', diff --git a/tests/test_cluster.py b/tests/test_cluster.py index 8b13789..aa015a6 100644 --- a/tests/test_cluster.py +++ b/tests/test_cluster.py @@ -1 +1,48 @@ +import pytest +import collections +from fuzzycat.cluster import release_key_title, release_key_title_normalized, release_key_title_nysiis +Case = collections.namedtuple("Case", 'input output') + +def test_release_key_title(): + with pytest.raises(KeyError): + release_key_title({}) + with pytest.raises(KeyError, match='title'): + release_key_title({'ident': '123'}) + with pytest.raises(KeyError, match='ident'): + release_key_title({'title': 'deep learning backdoor'}) + with pytest.raises(ValueError, match='title.*missing'): + release_key_title({'ident': '', 'title': ''}) + cases = ( + Case(input={'ident': '', 'title': 'simhash'}, output=('', 'simhash')), + Case(input={'ident': '', 'title': 'Simhash'}, output=('', 'Simhash')), + Case(input={'ident': '', 'title': 'Sim hash'}, output=('', 'Sim hash')), + ) + for case in cases: + assert case.output == release_key_title(case.input) + + +def test_release_key_title_normalized(): + cases = ( + Case(input={'ident': '', 'title': 'simhash'}, output=('', 'simhash')), + Case(input={'ident': '', 'title': 'Simhash'}, output=('', 'simhash')), + Case(input={'ident': '', 'title': 'Sim hash'}, output=('', 'simhash')), + Case(input={'ident': '', 'title': 'THE year 1929'}, output=('', 'theyear1929')), + Case(input={'ident': '', 'title': '2019?'}, output=('', '2019')), + Case(input={'ident': '123', 'title': 'H~~2019?'}, output=('123', 'h2019')), + ) + for case in cases: + assert case.output == release_key_title_normalized(case.input), 'failed case {}'.format(case.input) + +def test_release_key_title_nysiis(): + cases = ( + Case(input={'ident': '', 'title': 'simhash'}, output=('', 'SANAS')), + Case(input={'ident': '', 'title': 'Simhash'}, output=('', 'SANAS')), + Case(input={'ident': '', 'title': 'Sim hash'}, output=('', 'SANAS')), + Case(input={'ident': '', 'title': 'THE year 1929'}, output=('', 'TAR')), + Case(input={'ident': '', 'title': '2019?'}, output=('', '')), + Case(input={'ident': '123', 'title': 'H~~2019?'}, output=('123', 'H')), + Case(input={'ident': '123', 'title': '世界'}, output=('123', '')), + ) + for case in cases: + assert case.output == release_key_title_nysiis(case.input), 'failed case {}'.format(case.input) |