diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2020-11-03 17:44:28 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-11-03 17:44:28 +0100 |
commit | aad41200bb5a1679f435ea570d43259a3409353d (patch) | |
tree | 870d477f007c2f53e7117ece70f9ab2490ee0a87 | |
parent | d0fadf51a74e7f1e9048bd0945b4046bc6fe0994 (diff) | |
download | fuzzycat-aad41200bb5a1679f435ea570d43259a3409353d.tar.gz fuzzycat-aad41200bb5a1679f435ea570d43259a3409353d.zip |
add --verbose flag
-rw-r--r-- | fuzzycat/__init__.py | 1 | ||||
-rw-r--r-- | fuzzycat/cluster.py | 37 | ||||
-rw-r--r-- | fuzzycat/main.py | 12 | ||||
-rw-r--r-- | fuzzycat/verify.py | 5 |
4 files changed, 41 insertions, 14 deletions
diff --git a/fuzzycat/__init__.py b/fuzzycat/__init__.py index 276f3b2..bbab024 100644 --- a/fuzzycat/__init__.py +++ b/fuzzycat/__init__.py @@ -1,2 +1 @@ __version__ = "0.1.4" - diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py index 3282f3b..3d39a91 100644 --- a/fuzzycat/cluster.py +++ b/fuzzycat/cluster.py @@ -3,9 +3,15 @@ Clustering stage. """ import functools +import fileinput import operator import re import sys +import tempfile +import json +import os +import subprocess +import itertools import fuzzy @@ -21,6 +27,7 @@ get_ident_title = operator.itemgetter("ident", "title") ws_replacer = str.maketrans({"\t": " ", "\n": " "}) non_word_re = re.compile('[\W_]+', re.UNICODE) + def release_key_title(re): id, title = get_ident_title(re) if not title: @@ -28,14 +35,17 @@ def release_key_title(re): title = title.translate(ws_replacer).strip() return (id, title) + def release_key_title_normalized(re): id, title = release_key_title(re) return (id, non_word_re.sub('', title)) + def release_key_title_nysiis(re): id, title = release_key_title(re) return (id, fuzzy.nysiis(title)) + def sort_by_column(filename, opts="-k 2", fast=True, mode="w", prefix="fuzzycat-", tmpdir=None): """ Sort tabular file with sort(1), returns the filename of the sorted file. @@ -51,6 +61,7 @@ def sort_by_column(filename, opts="-k 2", fast=True, mode="w", prefix="fuzzycat- return tf.name + def group_by(filename, key=None, value=None, comment=""): """ Iterate over lines in filename, group by key (a callable deriving the key @@ -65,23 +76,32 @@ def group_by(filename, key=None, value=None, comment=""): } yield doc + def cut(f=0, sep='\t'): """ Return a callable, that extracts a given column from a file with a specific separator. TODO: move this into more generic place. """ - def f(value): - parts = value.split(sep) - if len(parts) > f + 1: + def func(value): + parts = value.strip().split(sep) + if len(parts) + 1 < f: raise ValueError('cannot split value into {} parts'.format(f)) return parts[f] - return f + + return func + class Cluster: """ Cluster scaffold for release entities. """ - def __init__(self, files="-", output=sys.stdout, keyfunc=lambda v: v, prefix='fuzzycat-', tmpdir=None): + def __init__(self, + files="-", + output=sys.stdout, + keyfunc=lambda v: v, + prefix='fuzzycat-', + tmpdir=None, + verbose=False): """ Files can be a list of files or "-" for stdin. """ @@ -90,14 +110,17 @@ class Cluster: self.output = output self.prefix = prefix self.tmpdir = tmpdir + self.verbose = verbose def run(self): """ Run clustering and write output to given stream or file. """ - keyfunc = self.keyfunc # Save a lookup in loop. + keyfunc = self.keyfunc # Save a lookup in loop. with tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=self.prefix) as tf: - for line in fileinput.input(files=files): + for i, line in enumerate(fileinput.input(files=self.files)): + if self.verbose and i % 100000 == 0: + print("{}".format(i), file=sys.stderr) try: id, key = keyfunc(json.loads(line)) print("{}\t{}".format(id, key), file=tf) diff --git a/fuzzycat/main.py b/fuzzycat/main.py index e1f4236..2be07cb 100644 --- a/fuzzycat/main.py +++ b/fuzzycat/main.py @@ -4,8 +4,7 @@ import tempfile import elasticsearch -from fuzzycat.cluster import (Cluster, release_key_title, - release_key_title_normalized, +from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_normalized, release_key_title_nysiis) @@ -15,12 +14,18 @@ def run_cluster(args): 'tnorm': release_key_title_normalized, 'tnysi': release_key_title_nysiis, } - cluster = Cluster(files=args.files, keyfunc=types.get(args.type), tmpdir=args.tmpdir, prefix=args.prefix) + cluster = Cluster(files=args.files, + keyfunc=types.get(args.type), + tmpdir=args.tmpdir, + prefix=args.prefix, + verbose=args.verbose) cluster.run() + def run_verify(args): print('verify') + if __name__ == '__main__': parser = argparse.ArgumentParser(prog='fuzzycat', usage='%(prog)s command [options]', @@ -28,6 +33,7 @@ if __name__ == '__main__': parser.add_argument('--prefix', default='fuzzycat-', help='temp file prefix') parser.add_argument('--tmpdir', default=tempfile.gettempdir(), help='temporary directory') + parser.add_argument('--verbose', default=False, action='store_true', help='be verbose') subparsers = parser.add_subparsers() sub_cluster = subparsers.add_parser('cluster', help='group entities') diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index d4677b0..9f5eaa8 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -24,6 +24,7 @@ store, or some other cache """ + def fetch_release_entity(ident, api="https://api.fatcat.wiki/v0"): """ Fetches a single release entity. @@ -31,11 +32,9 @@ def fetch_release_entity(ident, api="https://api.fatcat.wiki/v0"): link = "https://api.fatcat.wiki/v0/release/{}".format(ident) return requests.get(link).json() + def ident_to_release_entities(ids): """ Turn a list of ids into release entities. """ return [fetch_release_entity(id) for id in ids] - - - |