diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2020-11-05 16:51:43 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-11-05 16:51:43 +0100 |
commit | 9bf2d7772794045f47421e32cfd8a3aa43f4af0d (patch) | |
tree | 98fe53f0dbdbebd51ac5f5686f6c9501ba9ad98d | |
parent | 79ecfc3b2bde60a3b7a67cdbdda695785767b342 (diff) | |
download | fuzzycat-9bf2d7772794045f47421e32cfd8a3aa43f4af0d.tar.gz fuzzycat-9bf2d7772794045f47421e32cfd8a3aa43f4af0d.zip |
improve logging
-rw-r--r-- | fuzzycat/cluster.py | 13 | ||||
-rw-r--r-- | fuzzycat/main.py | 11 |
2 files changed, 18 insertions, 6 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py index b2f739f..995bfea 100644 --- a/fuzzycat/cluster.py +++ b/fuzzycat/cluster.py @@ -3,6 +3,7 @@ Clustering stage. """ +import collections import fileinput import itertools import json @@ -30,7 +31,7 @@ ws_replacer = str.maketrans({"\t": " ", "\n": " "}) non_word_re = re.compile(r'[\W_]+', re.UNICODE) -def release_key_title(release_entity): +def release_key_title(release_entity, get_ident_title=get_ident_title): id, title = get_ident_title(release_entity) if not title: raise ValueError('title missing') @@ -129,20 +130,26 @@ class Cluster: Run clustering and write output to given stream or file. """ keyfunc = self.keyfunc # Save a lookup in loop. + counter = collections.Counter() with tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=self.prefix) as tf: for line in fileinput.input(files=self.files): try: id, key = keyfunc(json.loads(line)) print("{}\t{}".format(id, key), file=tf) - except (KeyError, ValueError): + except (KeyError, ValueError) as exc: + counter["key_extraction_failed"] += 1 continue - self.logger.debug("intermediate file at {}".format(tf.name)) + else: + counter["key_ok"] += 1 sbc = sort_by_column(tf.name, opts='-k 2', prefix=self.prefix, tmpdir=self.tmpdir) with open(sbc) as f: comment = keyfunc.__name__ for doc in group_by(f, key=cut(f=1), value=cut(f=0), comment=comment): + counter["groups"] += 1 json.dump(doc, self.output) self.output.write("\n") os.remove(sbc) os.remove(tf.name) + + return counter diff --git a/fuzzycat/main.py b/fuzzycat/main.py index 5f9efc3..dab8802 100644 --- a/fuzzycat/main.py +++ b/fuzzycat/main.py @@ -12,6 +12,7 @@ Run, e.g. fuzzycat cluster --help for more options. Example: import argparse import logging +import json import sys import tempfile @@ -20,6 +21,7 @@ from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_norm def run_cluster(args): + logger = logging.getLogger('main.run_cluster') types = { 'title': release_key_title, 'tnorm': release_key_title_normalized, @@ -29,15 +31,18 @@ def run_cluster(args): keyfunc=types.get(args.type), tmpdir=args.tmpdir, prefix=args.prefix) - cluster.run() - + stats = cluster.run() + logger.debug(json.dumps(stats)) def run_verify(args): print('verify') if __name__ == '__main__': - logging.basicConfig(level=logging.DEBUG) + logging.basicConfig( + level=logging.DEBUG, + format='%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s', + datefmt='%Y-%m-%d %H:%M:%S') parser = argparse.ArgumentParser(prog='fuzzycat', description=__doc__, usage='%(prog)s command [options]', |