aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2020-11-02 20:02:14 +0100
committerMartin Czygan <martin.czygan@gmail.com>2020-11-02 20:02:14 +0100
commit9982d7235b8adb39857a0a270f0b3d7dc7c03bbd (patch)
tree75e3d3592241308f3336b763619558876b0194a8
parent0e243a4cb43ff1da084cf2c82836d8e403ef1c4e (diff)
downloadfuzzycat-9982d7235b8adb39857a0a270f0b3d7dc7c03bbd.tar.gz
fuzzycat-9982d7235b8adb39857a0a270f0b3d7dc7c03bbd.zip
cluster: cleanup class based approach
-rw-r--r--fuzzycat/cluster.py53
-rw-r--r--fuzzycat/main.py22
2 files changed, 52 insertions, 23 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index fbc7712..3282f3b 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -14,18 +14,13 @@ __all__ = [
"release_key_title_normalized",
"release_key_title_nysiis",
"sort_file_by_column",
+ "group_by",
]
get_ident_title = operator.itemgetter("ident", "title")
-ws_replacer = str.maketrans("\t", " ", "\n", " ")
+# Map tabs and newlines to spaces, so values stay on one line of a TSV file.
+ws_replacer = str.maketrans({"\t": " ", "\n": " "})
-non_word_re = re.compile('[\W_]+', re.UNICODE)
+# Raw string: "\W" in a plain literal is an invalid escape sequence.
+non_word_re = re.compile(r'[\W_]+', re.UNICODE)
-def cut(value, f=0, sep='\t'):
- """
- Split value by separator and return a single column.
- """
- return value.split(sep)[f]
-
def release_key_title(re):
id, title = get_ident_title(re)
if not title:
@@ -41,13 +36,15 @@ def release_key_title_nysiis(re):
id, title = release_key_title(re)
return (id, fuzzy.nysiis(title))
-def sort_by_column(filename, opts="-k 2", fast=True, mode="w", prefix="fuzzycat-"):
+def sort_by_column(filename, opts="-k 2", fast=True, mode="w", prefix="fuzzycat-", tmpdir=None):
"""
Sort tabular file with sort(1), returns the filename of the sorted file.
TODO: use separate /fast/tmp for sort.
"""
with tempfile.NamedTemporaryFile(delete=False, mode=mode, prefix=prefix) as tf:
env = os.environ.copy()
+ if tmpdir is not None:
+ env["TMPDIR"] = tmpdir
if fast:
env["LC_ALL"] = "C"
subprocess.run(["sort"] + opts.split() + [filename], stdout=tf, env=env)
@@ -55,6 +52,10 @@ def sort_by_column(filename, opts="-k 2", fast=True, mode="w", prefix="fuzzycat-
return tf.name
def group_by(filename, key=None, value=None, comment=""):
+ """
+ Iterate over lines in filename, group by key (a callable deriving the key
+ from the line), then apply value callable to emit a minimal document.
+ """
with open(filename) as f:
for k, g in itertools.groupby(f, key=key):
doc = {
@@ -64,29 +65,45 @@ def group_by(filename, key=None, value=None, comment=""):
}
yield doc
+def cut(f=0, sep='\t'):
+    """
+    Return a callable extracting column f (zero-based) from a line split on
+    sep; it raises ValueError for short lines. TODO: move to generic place.
+    """
+    def extract(value):  # Not named "f": that shadowed the column index.
+        parts = value.split(sep)
+        if len(parts) < f + 1:  # Too few columns, parts[f] would not exist.
+            raise ValueError('cannot split value into {} parts'.format(f + 1))
+        return parts[f]
+    return extract
+
class Cluster:
     """
     Cluster scaffold for release entities.
     """
-    def __init__(self, files=None, output=None, keyfunc=lambda v: v, tmp_prefix='fuzzycat-'):
+    def __init__(self, files="-", output=sys.stdout, keyfunc=lambda v: v, prefix='fuzzycat-', tmpdir=None):
+        """
+        Files can be a list of files or "-" for stdin. Output is a writable
+        stream; keyfunc is called with each parsed JSON line and has to
+        return an (id, key) pair. Prefix and tmpdir are used for temporary
+        files and passed on to the sort step.
+        """
         self.files = files
-        self.tmp_prefix = tmp_prefix
         self.keyfunc = keyfunc
         self.output = output
-        if self.output is None:
-            self.output = sys.stdout
+        self.prefix = prefix
+        self.tmpdir = tmpdir
     def run(self):
-        with tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=self.tmp_prefix) as tf:
+        """
+        Run clustering and write output to given stream or file. Lines for
+        which keyfunc raises KeyError or ValueError are skipped.
+        """
+        keyfunc = self.keyfunc # Save a lookup in loop.
+        with tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=self.prefix) as tf:
-            for line in fileinput.input(files=files):
+            # Read from self.files; a bare "files" name is undefined here.
+            for line in fileinput.input(files=self.files):
                 try:
-                    id, key = self.keyfunc(json.loads(line))
+                    id, key = keyfunc(json.loads(line))
+                    print("{}\t{}".format(id, key), file=tf)
                 except (KeyError, ValueError):
                     continue
-                else:
-                    print("{}\t{}".format(id, key), file=tf)
-
-        sbc = sort_by_column(tf.name, opts='-k 2', prefix=self.tmp_prefix)
+        sbc = sort_by_column(tf.name, opts='-k 2', prefix=self.prefix, tmpdir=self.tmpdir)
         for doc in group_by(sbc, key=cut(f=1), value=cut(f=0), comment=keyfunc.__name__):
             json.dump(doc, self.output)
+            self.output.write("\n")  # One JSON document per line.
diff --git a/fuzzycat/main.py b/fuzzycat/main.py
index 8c566d2..71f6f17 100644
--- a/fuzzycat/main.py
+++ b/fuzzycat/main.py
@@ -1,10 +1,22 @@
import argparse
-import elasticsearch
-import tempfile
import sys
+import tempfile
+
+import elasticsearch
+
+from fuzzycat.cluster import (Cluster, release_key_title,
+ release_key_title_normalized,
+ release_key_title_nysiis)
+
def run_cluster(args):
-    print('cluster')
+    """
+    Cluster the input files with the key function selected by args.type.
+    Raises ValueError if args.type does not name a known key function.
+    """
+    types = {
+        'title': release_key_title,
+        'tnorm': release_key_title_normalized,
+        'tnysi': release_key_title_nysiis,
+    }
+    keyfunc = types.get(args.type)
+    if keyfunc is None:
+        # Fail early; a None keyfunc would only surface as a TypeError in run.
+        raise ValueError('unknown cluster type: {}'.format(args.type))
+    cluster = Cluster(files=args.files, keyfunc=keyfunc, tmpdir=args.tmpdir, prefix=args.prefix)
+    cluster.run()  # Without this call the command is a no-op.
+
def run_verify(args):
print('verify')
@@ -14,12 +26,13 @@ if __name__ == '__main__':
usage='%(prog)s command [options]',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument('--tmp-prefix', default='fuzzycat-', help='temp file prefix')
+    parser.add_argument('--prefix', default='fuzzycat-', help='temp file prefix')
     parser.add_argument('--tmpdir', default=tempfile.gettempdir(), help='temporary directory')
     subparsers = parser.add_subparsers()
     sub_cluster = subparsers.add_parser('cluster', help='group entities')
     sub_cluster.set_defaults(func=run_cluster)
+    # The value is handed to Cluster(files=...) and read, not written.
+    sub_cluster.add_argument('-f', '--files', default="-", help='input files, "-" for stdin')
     sub_cluster.add_argument('-t', '--type', default='title', help='cluster algorithm')
sub_verify = subparsers.add_parser('verify', help='verify groups')
@@ -31,4 +44,3 @@ if __name__ == '__main__':
sys.exit(1)
args.func(args)
-