aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- Makefile 4
-rw-r--r-- Pipfile 1
-rw-r--r-- fuzzycat/cluster.py 55
-rw-r--r-- fuzzycat/main.py 2
-rw-r--r-- tests/test_cluster.py 47
5 files changed, 86 insertions, 23 deletions
diff --git a/Makefile b/Makefile
index 80cf6a2..9518e56 100644
--- a/Makefile
+++ b/Makefile
@@ -29,6 +29,10 @@ dist: ## Create source distribution and wheel
cov: ## Run coverage report
pytest --cov=fuzzycat tests/
+.PHONY: test
+test: ## Run tests (verbose)
+ pytest -v tests/
+
.PHONY: lint
lint: $(PY_FILES)
pylint fuzzycat
diff --git a/Pipfile b/Pipfile
index c37d6fb..99d6a6b 100644
--- a/Pipfile
+++ b/Pipfile
@@ -9,6 +9,7 @@ yapf = "*"
ipython = "*"
twine = "*"
fuzzycat = {editable = true, path = "."}
+pytest = "*"
[packages]
elasticsearch = "*"
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index ee19611..6058b37 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -12,6 +12,7 @@ import re
import subprocess
import sys
import tempfile
+import logging
import fuzzy
@@ -21,30 +22,39 @@ __all__ = [
"release_key_title_nysiis",
"sort_by_column",
"group_by",
+ "Cluster",
]
get_ident_title = operator.itemgetter("ident", "title")
ws_replacer = str.maketrans({"\t": " ", "\n": " "})
non_word_re = re.compile(r'[\W_]+', re.UNICODE)
-
-def release_key_title(re):
- id, title = get_ident_title(re)
+def release_key_title(release_entity):
+ id, title = get_ident_title(release_entity)
if not title:
raise ValueError('title missing')
title = title.translate(ws_replacer).strip()
return (id, title)
-def release_key_title_normalized(re):
- id, title = release_key_title(re)
+def release_key_title_normalized(release_entity):
+ id, title = release_key_title(release_entity)
+ title = re.sub(r'[ ]{2,}', ' ', title)
+ title = title.lower()
return (id, non_word_re.sub('', title))
-def release_key_title_nysiis(re):
- id, title = release_key_title(re)
+def release_key_title_nysiis(release_entity):
+ id, title = release_key_title(release_entity)
return (id, fuzzy.nysiis(title))
+def release_key_title_authors_ngram(release_entity):
+ """
+ Derive a key from title and authors.
+ """
+ # SS: compare ngram sets?
+
+
def sort_by_column(filename, opts="-k 2", fast=True, mode="w", prefix="fuzzycat-", tmpdir=None):
"""
@@ -62,19 +72,18 @@ def sort_by_column(filename, opts="-k 2", fast=True, mode="w", prefix="fuzzycat-
return tf.name
-def group_by(filename, key=None, value=None, comment=""):
+def group_by(seq, key=None, value=None, comment=""):
"""
Iterate over lines in filename, group by key (a callable deriving the key
from the line), then apply value callable to emit a minimal document.
"""
- with open(filename) as f:
- for k, g in itertools.groupby(f, key=key):
- doc = {
- "k": k.strip(),
- "v": [value(v) for v in g],
- "c": comment,
- }
- yield doc
+ for k, g in itertools.groupby(seq, key=key):
+ doc = {
+ "k": k.strip(),
+ "v": [value(v) for v in g],
+ "c": comment,
+ }
+ yield doc
def cut(f=0, sep='\t', ignore_missing_column=True):
@@ -87,8 +96,7 @@ def cut(f=0, sep='\t', ignore_missing_column=True):
if f >= len(parts):
if ignore_missing_column:
return ""
- else:
- raise ValueError('cannot split value {} into {} parts'.format(value, f))
+ raise ValueError('cannot split value {} into {} parts'.format(value, f))
return parts[f]
return func
@@ -113,7 +121,7 @@ class Cluster:
self.output = output
self.prefix = prefix
self.tmpdir = tmpdir
- self.verbose = verbose
+ self.logger = logging.getLogger('fuzzycat.cluster')
def run(self):
"""
@@ -129,11 +137,12 @@ class Cluster:
print("{}\t{}".format(id, key), file=tf)
except (KeyError, ValueError):
continue
- if self.verbose:
- print(tf.name, file=sys.stderr)
+ self.logger.debug("intermediate file at {}".format(tf.name))
sbc = sort_by_column(tf.name, opts='-k 2', prefix=self.prefix, tmpdir=self.tmpdir)
- for doc in group_by(sbc, key=cut(f=1), value=cut(f=0), comment=keyfunc.__name__):
- json.dump(doc, self.output)
+ with open(sbc) as f:
+ comment = keyfunc.__name__
+ for doc in group_by(f, key=cut(f=1), value=cut(f=0), comment=comment):
+ json.dump(doc, self.output)
os.remove(sbc)
os.remove(tf.name)
diff --git a/fuzzycat/main.py b/fuzzycat/main.py
index 5eaa4a2..7f47181 100644
--- a/fuzzycat/main.py
+++ b/fuzzycat/main.py
@@ -13,6 +13,7 @@ Run, e.g. fuzzycat cluster --help for more options. Example:
import argparse
import sys
import tempfile
+import logging
from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_normalized,
release_key_title_nysiis)
@@ -37,6 +38,7 @@ def run_verify(args):
if __name__ == '__main__':
+ logging.basicConfig(level=logging.DEBUG)
parser = argparse.ArgumentParser(prog='fuzzycat',
description=__doc__,
usage='%(prog)s command [options]',
diff --git a/tests/test_cluster.py b/tests/test_cluster.py
index 8b13789..aa015a6 100644
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@@ -1 +1,48 @@
+import pytest
+import collections
+from fuzzycat.cluster import release_key_title, release_key_title_normalized, release_key_title_nysiis
+Case = collections.namedtuple("Case", 'input output')
+
+def test_release_key_title():
+ with pytest.raises(KeyError):
+ release_key_title({})
+ with pytest.raises(KeyError, match='title'):
+ release_key_title({'ident': '123'})
+ with pytest.raises(KeyError, match='ident'):
+ release_key_title({'title': 'deep learning backdoor'})
+ with pytest.raises(ValueError, match='title.*missing'):
+ release_key_title({'ident': '', 'title': ''})
+ cases = (
+ Case(input={'ident': '', 'title': 'simhash'}, output=('', 'simhash')),
+ Case(input={'ident': '', 'title': 'Simhash'}, output=('', 'Simhash')),
+ Case(input={'ident': '', 'title': 'Sim hash'}, output=('', 'Sim hash')),
+ )
+ for case in cases:
+ assert case.output == release_key_title(case.input)
+
+
+def test_release_key_title_normalized():
+ cases = (
+ Case(input={'ident': '', 'title': 'simhash'}, output=('', 'simhash')),
+ Case(input={'ident': '', 'title': 'Simhash'}, output=('', 'simhash')),
+ Case(input={'ident': '', 'title': 'Sim hash'}, output=('', 'simhash')),
+ Case(input={'ident': '', 'title': 'THE year 1929'}, output=('', 'theyear1929')),
+ Case(input={'ident': '', 'title': '2019?'}, output=('', '2019')),
+ Case(input={'ident': '123', 'title': 'H~~2019?'}, output=('123', 'h2019')),
+ )
+ for case in cases:
+ assert case.output == release_key_title_normalized(case.input), 'failed case {}'.format(case.input)
+
+def test_release_key_title_nysiis():
+ cases = (
+ Case(input={'ident': '', 'title': 'simhash'}, output=('', 'SANAS')),
+ Case(input={'ident': '', 'title': 'Simhash'}, output=('', 'SANAS')),
+ Case(input={'ident': '', 'title': 'Sim hash'}, output=('', 'SANAS')),
+ Case(input={'ident': '', 'title': 'THE year 1929'}, output=('', 'TAR')),
+ Case(input={'ident': '', 'title': '2019?'}, output=('', '')),
+ Case(input={'ident': '123', 'title': 'H~~2019?'}, output=('123', 'H')),
+ Case(input={'ident': '123', 'title': '世界'}, output=('123', '')),
+ )
+ for case in cases:
+ assert case.output == release_key_title_nysiis(case.input), 'failed case {}'.format(case.input)