improve logging

author: Martin Czygan <martin.czygan@gmail.com> 2020-11-05 16:51:43 +0100
committer: Martin Czygan <martin.czygan@gmail.com> 2020-11-05 16:51:43 +0100
commit: 9bf2d7772794045f47421e32cfd8a3aa43f4af0d (patch)
tree: 98fe53f0dbdbebd51ac5f5686f6c9501ba9ad98d /fuzzycat/cluster.py
parent: 79ecfc3b2bde60a3b7a67cdbdda695785767b342 (diff)
download: fuzzycat-9bf2d7772794045f47421e32cfd8a3aa43f4af0d.tar.gz
fuzzycat-9bf2d7772794045f47421e32cfd8a3aa43f4af0d.zip
1 files changed, 10 insertions, 3 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index b2f739f..995bfea 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -3,6 +3,7 @@
 Clustering stage.
 """
 
+import collections
 import fileinput
 import itertools
 import json
@@ -30,7 +31,7 @@ ws_replacer = str.maketrans({"\t": " ", "\n": " "})
 non_word_re = re.compile(r'[\W_]+', re.UNICODE)
 
 
-def release_key_title(release_entity):
+def release_key_title(release_entity, get_ident_title=get_ident_title):
     id, title = get_ident_title(release_entity)
     if not title:
         raise ValueError('title missing')
@@ -129,20 +130,26 @@ class Cluster:
         Run clustering and write output to given stream or file.
         """
         keyfunc = self.keyfunc  # Save a lookup in loop.
+        counter = collections.Counter()
         with tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=self.prefix) as tf:
             for line in fileinput.input(files=self.files):
                 try:
                     id, key = keyfunc(json.loads(line))
                     print("{}\t{}".format(id, key), file=tf)
-                except (KeyError, ValueError):
+                except (KeyError, ValueError) as exc:
+                    counter["key_extraction_failed"] += 1
                     continue
-        self.logger.debug("intermediate file at {}".format(tf.name))
+                else:
+                    counter["key_ok"] += 1
         sbc = sort_by_column(tf.name, opts='-k 2', prefix=self.prefix, tmpdir=self.tmpdir)
         with open(sbc) as f:
             comment = keyfunc.__name__
             for doc in group_by(f, key=cut(f=1), value=cut(f=0), comment=comment):
+                counter["groups"] += 1
                 json.dump(doc, self.output)
                 self.output.write("\n")
 
         os.remove(sbc)
         os.remove(tf.name)
+
+        return counter
author	Martin Czygan <martin.czygan@gmail.com>	2020-11-05 16:51:43 +0100
committer	Martin Czygan <martin.czygan@gmail.com>	2020-11-05 16:51:43 +0100
commit	9bf2d7772794045f47421e32cfd8a3aa43f4af0d (patch)
tree	98fe53f0dbdbebd51ac5f5686f6c9501ba9ad98d /fuzzycat/cluster.py
parent	79ecfc3b2bde60a3b7a67cdbdda695785767b342 (diff)
download	fuzzycat-9bf2d7772794045f47421e32cfd8a3aa43f4af0d.tar.gz fuzzycat-9bf2d7772794045f47421e32cfd8a3aa43f4af0d.zip