about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Martin Czygan <martin.czygan@gmail.com> 2020-11-03 17:44:28 +0100
committer: Martin Czygan <martin.czygan@gmail.com> 2020-11-03 17:44:28 +0100
commit: aad41200bb5a1679f435ea570d43259a3409353d (patch)
tree: 870d477f007c2f53e7117ece70f9ab2490ee0a87
parent: d0fadf51a74e7f1e9048bd0945b4046bc6fe0994 (diff)
download: fuzzycat-aad41200bb5a1679f435ea570d43259a3409353d.tar.gz
download: fuzzycat-aad41200bb5a1679f435ea570d43259a3409353d.zip
add --verbose flag
-rw-r--r-- fuzzycat/__init__.py 1
-rw-r--r-- fuzzycat/cluster.py 37
-rw-r--r-- fuzzycat/main.py 12
-rw-r--r-- fuzzycat/verify.py 5
4 files changed, 41 insertions, 14 deletions
diff --git a/fuzzycat/__init__.py b/fuzzycat/__init__.py
index 276f3b2..bbab024 100644
--- a/fuzzycat/__init__.py
+++ b/fuzzycat/__init__.py
@@ -1,2 +1 @@
__version__ = "0.1.4"
-
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index 3282f3b..3d39a91 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -3,9 +3,15 @@ Clustering stage.
"""
import functools
+import fileinput
import operator
import re
import sys
+import tempfile
+import json
+import os
+import subprocess
+import itertools
import fuzzy
@@ -21,6 +27,7 @@ get_ident_title = operator.itemgetter("ident", "title")
ws_replacer = str.maketrans({"\t": " ", "\n": " "})
non_word_re = re.compile('[\W_]+', re.UNICODE)
+
def release_key_title(re):
id, title = get_ident_title(re)
if not title:
@@ -28,14 +35,17 @@ def release_key_title(re):
title = title.translate(ws_replacer).strip()
return (id, title)
+
def release_key_title_normalized(re):
id, title = release_key_title(re)
return (id, non_word_re.sub('', title))
+
def release_key_title_nysiis(re):
id, title = release_key_title(re)
return (id, fuzzy.nysiis(title))
+
def sort_by_column(filename, opts="-k 2", fast=True, mode="w", prefix="fuzzycat-", tmpdir=None):
"""
Sort tabular file with sort(1), returns the filename of the sorted file.
@@ -51,6 +61,7 @@ def sort_by_column(filename, opts="-k 2", fast=True, mode="w", prefix="fuzzycat-
return tf.name
+
def group_by(filename, key=None, value=None, comment=""):
"""
Iterate over lines in filename, group by key (a callable deriving the key
@@ -65,23 +76,32 @@ def group_by(filename, key=None, value=None, comment=""):
}
yield doc
+
def cut(f=0, sep='\t'):
"""
Return a callable, that extracts a given column from a file with a specific
separator. TODO: move this into more generic place.
"""
- def f(value):
- parts = value.split(sep)
- if len(parts) > f + 1:
+ def func(value):
+ parts = value.strip().split(sep)
+ if len(parts) + 1 < f:
raise ValueError('cannot split value into {} parts'.format(f))
return parts[f]
- return f
+
+ return func
+
class Cluster:
"""
Cluster scaffold for release entities.
"""
- def __init__(self, files="-", output=sys.stdout, keyfunc=lambda v: v, prefix='fuzzycat-', tmpdir=None):
+ def __init__(self,
+ files="-",
+ output=sys.stdout,
+ keyfunc=lambda v: v,
+ prefix='fuzzycat-',
+ tmpdir=None,
+ verbose=False):
"""
Files can be a list of files or "-" for stdin.
"""
@@ -90,14 +110,17 @@ class Cluster:
self.output = output
self.prefix = prefix
self.tmpdir = tmpdir
+ self.verbose = verbose
def run(self):
"""
Run clustering and write output to given stream or file.
"""
- keyfunc = self.keyfunc # Save a lookup in loop.
+ keyfunc = self.keyfunc # Save a lookup in loop.
with tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=self.prefix) as tf:
- for line in fileinput.input(files=files):
+ for i, line in enumerate(fileinput.input(files=self.files)):
+ if self.verbose and i % 100000 == 0:
+ print("{}".format(i), file=sys.stderr)
try:
id, key = keyfunc(json.loads(line))
print("{}\t{}".format(id, key), file=tf)
diff --git a/fuzzycat/main.py b/fuzzycat/main.py
index e1f4236..2be07cb 100644
--- a/fuzzycat/main.py
+++ b/fuzzycat/main.py
@@ -4,8 +4,7 @@ import tempfile
import elasticsearch
-from fuzzycat.cluster import (Cluster, release_key_title,
- release_key_title_normalized,
+from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_normalized,
release_key_title_nysiis)
@@ -15,12 +14,18 @@ def run_cluster(args):
'tnorm': release_key_title_normalized,
'tnysi': release_key_title_nysiis,
}
- cluster = Cluster(files=args.files, keyfunc=types.get(args.type), tmpdir=args.tmpdir, prefix=args.prefix)
+ cluster = Cluster(files=args.files,
+ keyfunc=types.get(args.type),
+ tmpdir=args.tmpdir,
+ prefix=args.prefix,
+ verbose=args.verbose)
cluster.run()
+
def run_verify(args):
print('verify')
+
if __name__ == '__main__':
parser = argparse.ArgumentParser(prog='fuzzycat',
usage='%(prog)s command [options]',
@@ -28,6 +33,7 @@ if __name__ == '__main__':
parser.add_argument('--prefix', default='fuzzycat-', help='temp file prefix')
parser.add_argument('--tmpdir', default=tempfile.gettempdir(), help='temporary directory')
+ parser.add_argument('--verbose', default=False, action='store_true', help='be verbose')
subparsers = parser.add_subparsers()
sub_cluster = subparsers.add_parser('cluster', help='group entities')
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index d4677b0..9f5eaa8 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -24,6 +24,7 @@ store, or some other cache
"""
+
def fetch_release_entity(ident, api="https://api.fatcat.wiki/v0"):
"""
Fetches a single release entity.
@@ -31,11 +32,9 @@ def fetch_release_entity(ident, api="https://api.fatcat.wiki/v0"):
link = "https://api.fatcat.wiki/v0/release/{}".format(ident)
return requests.get(link).json()
+
def ident_to_release_entities(ids):
"""
Turn a list of ids into release entities.
"""
return [fetch_release_entity(id) for id in ids]
-
-
-