about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2020-11-05 16:51:43 +0100
committer Martin Czygan <martin.czygan@gmail.com> 2020-11-05 16:51:43 +0100
commit 9bf2d7772794045f47421e32cfd8a3aa43f4af0d (patch)
tree 98fe53f0dbdbebd51ac5f5686f6c9501ba9ad98d
parent 79ecfc3b2bde60a3b7a67cdbdda695785767b342 (diff)
download fuzzycat-9bf2d7772794045f47421e32cfd8a3aa43f4af0d.tar.gz
download fuzzycat-9bf2d7772794045f47421e32cfd8a3aa43f4af0d.zip
improve logging
-rw-r--r-- fuzzycat/cluster.py 13
-rw-r--r-- fuzzycat/main.py 11
2 files changed, 18 insertions, 6 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index b2f739f..995bfea 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -3,6 +3,7 @@
Clustering stage.
"""
+import collections
import fileinput
import itertools
import json
@@ -30,7 +31,7 @@ ws_replacer = str.maketrans({"\t": " ", "\n": " "})
non_word_re = re.compile(r'[\W_]+', re.UNICODE)
-def release_key_title(release_entity):
+def release_key_title(release_entity, get_ident_title=get_ident_title):
id, title = get_ident_title(release_entity)
if not title:
raise ValueError('title missing')
@@ -129,20 +130,26 @@ class Cluster:
Run clustering and write output to given stream or file.
"""
keyfunc = self.keyfunc # Save a lookup in loop.
+ counter = collections.Counter()
with tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=self.prefix) as tf:
for line in fileinput.input(files=self.files):
try:
id, key = keyfunc(json.loads(line))
print("{}\t{}".format(id, key), file=tf)
- except (KeyError, ValueError):
+ except (KeyError, ValueError) as exc:
+ counter["key_extraction_failed"] += 1
continue
- self.logger.debug("intermediate file at {}".format(tf.name))
+ else:
+ counter["key_ok"] += 1
sbc = sort_by_column(tf.name, opts='-k 2', prefix=self.prefix, tmpdir=self.tmpdir)
with open(sbc) as f:
comment = keyfunc.__name__
for doc in group_by(f, key=cut(f=1), value=cut(f=0), comment=comment):
+ counter["groups"] += 1
json.dump(doc, self.output)
self.output.write("\n")
os.remove(sbc)
os.remove(tf.name)
+
+ return counter
diff --git a/fuzzycat/main.py b/fuzzycat/main.py
index 5f9efc3..dab8802 100644
--- a/fuzzycat/main.py
+++ b/fuzzycat/main.py
@@ -12,6 +12,7 @@ Run, e.g. fuzzycat cluster --help for more options. Example:
import argparse
import logging
+import json
import sys
import tempfile
@@ -20,6 +21,7 @@ from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_norm
def run_cluster(args):
+ logger = logging.getLogger('main.run_cluster')
types = {
'title': release_key_title,
'tnorm': release_key_title_normalized,
@@ -29,15 +31,18 @@ def run_cluster(args):
keyfunc=types.get(args.type),
tmpdir=args.tmpdir,
prefix=args.prefix)
- cluster.run()
-
+ stats = cluster.run()
+ logger.debug(json.dumps(stats))
def run_verify(args):
print('verify')
if __name__ == '__main__':
- logging.basicConfig(level=logging.DEBUG)
+ logging.basicConfig(
+ level=logging.DEBUG,
+ format='%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s',
+ datefmt='%Y-%m-%d %H:%M:%S')
parser = argparse.ArgumentParser(prog='fuzzycat',
description=__doc__,
usage='%(prog)s command [options]',