about | summary | refs | log | tree | commit | diff | stats
path: root/fuzzycat/cluster.py
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2020-11-05 16:51:43 +0100
committer Martin Czygan <martin.czygan@gmail.com> 2020-11-05 16:51:43 +0100
commit 9bf2d7772794045f47421e32cfd8a3aa43f4af0d (patch)
tree 98fe53f0dbdbebd51ac5f5686f6c9501ba9ad98d /fuzzycat/cluster.py
parent 79ecfc3b2bde60a3b7a67cdbdda695785767b342 (diff)
download fuzzycat-9bf2d7772794045f47421e32cfd8a3aa43f4af0d.tar.gz
fuzzycat-9bf2d7772794045f47421e32cfd8a3aa43f4af0d.zip
improve logging
Diffstat (limited to 'fuzzycat/cluster.py')
-rw-r--r-- fuzzycat/cluster.py 13
1 files changed, 10 insertions, 3 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index b2f739f..995bfea 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -3,6 +3,7 @@
Clustering stage.
"""
+import collections
import fileinput
import itertools
import json
@@ -30,7 +31,7 @@ ws_replacer = str.maketrans({"\t": " ", "\n": " "})
non_word_re = re.compile(r'[\W_]+', re.UNICODE)
-def release_key_title(release_entity):
+def release_key_title(release_entity, get_ident_title=get_ident_title):
id, title = get_ident_title(release_entity)
if not title:
raise ValueError('title missing')
@@ -129,20 +130,26 @@ class Cluster:
Run clustering and write output to given stream or file.
"""
keyfunc = self.keyfunc # Save a lookup in loop.
+ counter = collections.Counter()
with tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=self.prefix) as tf:
for line in fileinput.input(files=self.files):
try:
id, key = keyfunc(json.loads(line))
print("{}\t{}".format(id, key), file=tf)
- except (KeyError, ValueError):
+ except (KeyError, ValueError) as exc:
+ counter["key_extraction_failed"] += 1
continue
- self.logger.debug("intermediate file at {}".format(tf.name))
+ else:
+ counter["key_ok"] += 1
sbc = sort_by_column(tf.name, opts='-k 2', prefix=self.prefix, tmpdir=self.tmpdir)
with open(sbc) as f:
comment = keyfunc.__name__
for doc in group_by(f, key=cut(f=1), value=cut(f=0), comment=comment):
+ counter["groups"] += 1
json.dump(doc, self.output)
self.output.write("\n")
os.remove(sbc)
os.remove(tf.name)
+
+ return counter