about | summary | refs | log | tree | commit | diff | stats
path: root/fuzzycat
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2020-11-04 20:42:54 +0100
committer Martin Czygan <martin.czygan@gmail.com> 2020-11-04 20:42:54 +0100
commit 321beac2b8b724532103ccc872becda33f33cd77 (patch)
tree ff6500d694b5e89935b6e22f63916db886ed9474 /fuzzycat
parent 99228171f2e0668de744dae71ad66085234be356 (diff)
download fuzzycat-321beac2b8b724532103ccc872becda33f33cd77.tar.gz
fuzzycat-321beac2b8b724532103ccc872becda33f33cd77.zip
address pylint issues
Diffstat (limited to 'fuzzycat')
-rw-r--r-- fuzzycat/cluster.py 16
-rw-r--r-- fuzzycat/main.py 26
2 files changed, 22 insertions, 20 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index 4d5580d..ee19611 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -1,17 +1,17 @@
+# pylint: disable=C0103
"""
Clustering stage.
"""
-import functools
import fileinput
+import itertools
+import json
import operator
+import os
import re
+import subprocess
import sys
import tempfile
-import json
-import os
-import subprocess
-import itertools
import fuzzy
@@ -19,13 +19,13 @@ __all__ = [
"release_key_title",
"release_key_title_normalized",
"release_key_title_nysiis",
- "sort_file_by_column",
+ "sort_by_column",
"group_by",
]
get_ident_title = operator.itemgetter("ident", "title")
ws_replacer = str.maketrans({"\t": " ", "\n": " "})
-non_word_re = re.compile('[\W_]+', re.UNICODE)
+non_word_re = re.compile(r'[\W_]+', re.UNICODE)
def release_key_title(re):
@@ -57,7 +57,7 @@ def sort_by_column(filename, opts="-k 2", fast=True, mode="w", prefix="fuzzycat-
env["TMPDIR"] = tmpdir
if fast:
env["LC_ALL"] = "C"
- subprocess.run(["sort"] + opts.split() + [filename], stdout=tf, env=env)
+ subprocess.run(["sort"] + opts.split() + [filename], stdout=tf, env=env, check=True)
return tf.name
diff --git a/fuzzycat/main.py b/fuzzycat/main.py
index 5279eee..5eaa4a2 100644
--- a/fuzzycat/main.py
+++ b/fuzzycat/main.py
@@ -1,21 +1,19 @@
#!/usr/bin/env python
+"""Usage: fuzzycat COMMAND [options]
-"""
-Command line clustering tool.
+Commands: cluster, verify
-Example usage:
+Run, e.g. fuzzycat cluster --help for more options. Example:
- $ zstdcat -T0 release_export_expanded.json.zst | \
- parallel --tmpdir /bigger/tmp --roundrobin --pipe -j 4 \
- python -m fuzzycat.main --tmpdir /bigger/tmp -t tnorm
+ $ zstdcat -T0 release_export_expanded.json.zst |
+ parallel --tmpdir /fast/tmp --roundrobin --pipe -j 4 |
+ python -m fuzzycat.main cluster --tmpdir /fast/tmp -t tnorm > clusters.jsonl
"""
import argparse
import sys
import tempfile
-import elasticsearch
-
from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_normalized,
release_key_title_nysiis)
@@ -42,6 +40,7 @@ if __name__ == '__main__':
parser = argparse.ArgumentParser(prog='fuzzycat',
description=__doc__,
usage='%(prog)s command [options]',
+ add_help=False,
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--prefix', default='fuzzycat-', help='temp file prefix')
@@ -49,17 +48,20 @@ if __name__ == '__main__':
parser.add_argument('--verbose', default=False, action='store_true', help='be verbose')
subparsers = parser.add_subparsers()
- sub_cluster = subparsers.add_parser('cluster', help='group entities')
+ sub_cluster = subparsers.add_parser('cluster', help='group entities', parents=[parser])
sub_cluster.set_defaults(func=run_cluster)
sub_cluster.add_argument('-f', '--files', default="-", help='output files')
- sub_cluster.add_argument('-t', '--type', default='title', help='cluster algorithm: title, tnorm, tnysi')
+ sub_cluster.add_argument('-t',
+ '--type',
+ default='title',
+ help='cluster algorithm: title, tnorm, tnysi')
- sub_verify = subparsers.add_parser('verify', help='verify groups')
+ sub_verify = subparsers.add_parser('verify', help='verify groups', parents=[parser])
sub_verify.set_defaults(func=run_verify)
args = parser.parse_args()
if not args.__dict__.get("func"):
- print('fuzzycat: use -h or --help for usage', file=sys.stderr)
+ print(__doc__, file=sys.stderr)
sys.exit(1)
args.func(args)