add --verbose flag

author: Martin Czygan <martin.czygan@gmail.com> 2020-11-03 17:44:28 +0100
committer: Martin Czygan <martin.czygan@gmail.com> 2020-11-03 17:44:28 +0100
commit: aad41200bb5a1679f435ea570d43259a3409353d (patch)
tree: 870d477f007c2f53e7117ece70f9ab2490ee0a87
parent: d0fadf51a74e7f1e9048bd0945b4046bc6fe0994 (diff)
download: fuzzycat-aad41200bb5a1679f435ea570d43259a3409353d.tar.gz fuzzycat-aad41200bb5a1679f435ea570d43259a3409353d.zip
4 files changed, 41 insertions, 14 deletions
diff --git a/fuzzycat/__init__.py b/fuzzycat/__init__.py
index 276f3b2..bbab024 100644
--- a/fuzzycat/__init__.py
+++ b/fuzzycat/__init__.py
@@ -1,2 +1 @@
 __version__ = "0.1.4"
-
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index 3282f3b..3d39a91 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -3,9 +3,15 @@ Clustering stage.
 """
 
 import functools
+import fileinput
 import operator
 import re
 import sys
+import tempfile
+import json
+import os
+import subprocess
+import itertools
 
 import fuzzy
 
@@ -21,6 +27,7 @@ get_ident_title = operator.itemgetter("ident", "title")
 ws_replacer = str.maketrans({"\t": " ", "\n": " "})
 non_word_re = re.compile('[\W_]+', re.UNICODE)
 
+
 def release_key_title(re):
     id, title = get_ident_title(re)
     if not title:
@@ -28,14 +35,17 @@ def release_key_title(re):
     title = title.translate(ws_replacer).strip()
     return (id, title)
 
+
 def release_key_title_normalized(re):
     id, title = release_key_title(re)
     return (id, non_word_re.sub('', title))
 
+
 def release_key_title_nysiis(re):
     id, title = release_key_title(re)
     return (id, fuzzy.nysiis(title))
 
+
 def sort_by_column(filename, opts="-k 2", fast=True, mode="w", prefix="fuzzycat-", tmpdir=None):
     """
     Sort tabular file with sort(1), returns the filename of the sorted file.
@@ -51,6 +61,7 @@ def sort_by_column(filename, opts="-k 2", fast=True, mode="w", prefix="fuzzycat-
 
     return tf.name
 
+
 def group_by(filename, key=None, value=None, comment=""):
     """
     Iterate over lines in filename, group by key (a callable deriving the key
@@ -65,23 +76,32 @@ def group_by(filename, key=None, value=None, comment=""):
             }
             yield doc
 
+
 def cut(f=0, sep='\t'):
     """
     Return a callable, that extracts a given column from a file with a specific
     separator. TODO: move this into more generic place.
     """
-    def f(value):
-        parts = value.split(sep)
-        if len(parts) > f + 1:
+    def func(value):
+        parts = value.strip().split(sep)
+        if len(parts) + 1 < f:
             raise ValueError('cannot split value into {} parts'.format(f))
         return parts[f]
-    return f
+
+    return func
+
 
 class Cluster:
     """
     Cluster scaffold for release entities.
     """
-    def __init__(self, files="-", output=sys.stdout, keyfunc=lambda v: v, prefix='fuzzycat-', tmpdir=None):
+    def __init__(self,
+                 files="-",
+                 output=sys.stdout,
+                 keyfunc=lambda v: v,
+                 prefix='fuzzycat-',
+                 tmpdir=None,
+                 verbose=False):
         """
         Files can be a list of files or "-" for stdin.
         """
@@ -90,14 +110,17 @@ class Cluster:
         self.output = output
         self.prefix = prefix
         self.tmpdir = tmpdir
+        self.verbose = verbose
 
     def run(self):
         """
         Run clustering and write output to given stream or file.
         """
-        keyfunc = self.keyfunc # Save a lookup in loop.
+        keyfunc = self.keyfunc  # Save a lookup in loop.
         with tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=self.prefix) as tf:
-            for line in fileinput.input(files=files):
+            for i, line in enumerate(fileinput.input(files=self.files)):
+                if self.verbose and i % 100000 == 0:
+                    print("{}".format(i), file=sys.stderr)
                 try:
                     id, key = keyfunc(json.loads(line))
                     print("{}\t{}".format(id, key), file=tf)
diff --git a/fuzzycat/main.py b/fuzzycat/main.py
index e1f4236..2be07cb 100644
--- a/fuzzycat/main.py
+++ b/fuzzycat/main.py
@@ -4,8 +4,7 @@ import tempfile
 
 import elasticsearch
 
-from fuzzycat.cluster import (Cluster, release_key_title,
-                              release_key_title_normalized,
+from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_normalized,
                               release_key_title_nysiis)
 
 
@@ -15,12 +14,18 @@ def run_cluster(args):
         'tnorm': release_key_title_normalized,
         'tnysi': release_key_title_nysiis,
     }
-    cluster = Cluster(files=args.files, keyfunc=types.get(args.type), tmpdir=args.tmpdir, prefix=args.prefix)
+    cluster = Cluster(files=args.files,
+                      keyfunc=types.get(args.type),
+                      tmpdir=args.tmpdir,
+                      prefix=args.prefix,
+                      verbose=args.verbose)
     cluster.run()
 
+
 def run_verify(args):
     print('verify')
 
+
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(prog='fuzzycat',
                                      usage='%(prog)s command [options]',
@@ -28,6 +33,7 @@ if __name__ == '__main__':
 
     parser.add_argument('--prefix', default='fuzzycat-', help='temp file prefix')
     parser.add_argument('--tmpdir', default=tempfile.gettempdir(), help='temporary directory')
+    parser.add_argument('--verbose', default=False, action='store_true', help='be verbose')
     subparsers = parser.add_subparsers()
 
     sub_cluster = subparsers.add_parser('cluster', help='group entities')
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index d4677b0..9f5eaa8 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -24,6 +24,7 @@ store, or some other cache
 
 """
 
+
 def fetch_release_entity(ident, api="https://api.fatcat.wiki/v0"):
     """
     Fetches a single release entity.
@@ -31,11 +32,9 @@ def fetch_release_entity(ident, api="https://api.fatcat.wiki/v0"):
     link = "https://api.fatcat.wiki/v0/release/{}".format(ident)
     return requests.get(link).json()
 
+
 def ident_to_release_entities(ids):
     """
     Turn a list of ids into release entities.
     """
     return [fetch_release_entity(id) for id in ids]
-
-
-
author	Martin Czygan <martin.czygan@gmail.com>	2020-11-03 17:44:28 +0100
committer	Martin Czygan <martin.czygan@gmail.com>	2020-11-03 17:44:28 +0100
commit	aad41200bb5a1679f435ea570d43259a3409353d (patch)
tree	870d477f007c2f53e7117ece70f9ab2490ee0a87
parent	d0fadf51a74e7f1e9048bd0945b4046bc6fe0994 (diff)
download	fuzzycat-aad41200bb5a1679f435ea570d43259a3409353d.tar.gz fuzzycat-aad41200bb5a1679f435ea570d43259a3409353d.zip