diff options
-rw-r--r-- | .pylintrc | 3 | ||||
-rw-r--r-- | Makefile | 5 | ||||
-rw-r--r-- | fuzzycat/cluster.py | 16 | ||||
-rw-r--r-- | fuzzycat/main.py | 26 | ||||
-rw-r--r-- | tests/test_cluster.py | 1 |
5 files changed, 30 insertions, 21 deletions
diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 0000000..72e94cb --- /dev/null +++ b/.pylintrc @@ -0,0 +1,3 @@ +[MESSAGES CONTROL] + +disable=C0301 @@ -1,5 +1,6 @@ SHELL := /bin/bash FATCAT_BULK_EXPORT_ITEM := fatcat_bulk_exports_2020-08-05 +PY_FILES := $(shell find fuzzycat -name '*.py') .PHONY: help help: ## Print info about all commands @@ -28,6 +29,10 @@ dist: ## Create source distribution and wheel cov: ## Run coverage report pytest --cov=fuzzycat tests/ +.PHONY: lint +lint: $(PY_FILES) + pylint fuzzycat + .PHONY: clean clean: ## Clean all artifacts rm -rf build diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py index 4d5580d..ee19611 100644 --- a/fuzzycat/cluster.py +++ b/fuzzycat/cluster.py @@ -1,17 +1,17 @@ +# pylint: disable=C0103 """ Clustering stage. """ -import functools import fileinput +import itertools +import json import operator +import os import re +import subprocess import sys import tempfile -import json -import os -import subprocess -import itertools import fuzzy @@ -19,13 +19,13 @@ __all__ = [ "release_key_title", "release_key_title_normalized", "release_key_title_nysiis", - "sort_file_by_column", + "sort_by_column", "group_by", ] get_ident_title = operator.itemgetter("ident", "title") ws_replacer = str.maketrans({"\t": " ", "\n": " "}) -non_word_re = re.compile('[\W_]+', re.UNICODE) +non_word_re = re.compile(r'[\W_]+', re.UNICODE) def release_key_title(re): @@ -57,7 +57,7 @@ def sort_by_column(filename, opts="-k 2", fast=True, mode="w", prefix="fuzzycat- env["TMPDIR"] = tmpdir if fast: env["LC_ALL"] = "C" - subprocess.run(["sort"] + opts.split() + [filename], stdout=tf, env=env) + subprocess.run(["sort"] + opts.split() + [filename], stdout=tf, env=env, check=True) return tf.name diff --git a/fuzzycat/main.py b/fuzzycat/main.py index 5279eee..5eaa4a2 100644 --- a/fuzzycat/main.py +++ b/fuzzycat/main.py @@ -1,21 +1,19 @@ #!/usr/bin/env python +"""Usage: fuzzycat COMMAND [options] -""" -Command line clustering tool. +Commands: cluster, verify -Example usage: +Run, e.g. fuzzycat cluster --help for more options. Example: - $ zstdcat -T0 release_export_expanded.json.zst | \ - parallel --tmpdir /bigger/tmp --roundrobin --pipe -j 4 \ - python -m fuzzycat.main --tmpdir /bigger/tmp -t tnorm + $ zstdcat -T0 release_export_expanded.json.zst | + parallel --tmpdir /fast/tmp --roundrobin --pipe -j 4 | + python -m fuzzycat.main cluster --tmpdir /fast/tmp -t tnorm > clusters.jsonl """ import argparse import sys import tempfile -import elasticsearch - from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_normalized, release_key_title_nysiis) @@ -42,6 +40,7 @@ if __name__ == '__main__': parser = argparse.ArgumentParser(prog='fuzzycat', description=__doc__, usage='%(prog)s command [options]', + add_help=False, formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--prefix', default='fuzzycat-', help='temp file prefix') @@ -49,17 +48,20 @@ if __name__ == '__main__': parser.add_argument('--verbose', default=False, action='store_true', help='be verbose') subparsers = parser.add_subparsers() - sub_cluster = subparsers.add_parser('cluster', help='group entities') + sub_cluster = subparsers.add_parser('cluster', help='group entities', parents=[parser]) sub_cluster.set_defaults(func=run_cluster) sub_cluster.add_argument('-f', '--files', default="-", help='output files') - sub_cluster.add_argument('-t', '--type', default='title', help='cluster algorithm: title, tnorm, tnysi') + sub_cluster.add_argument('-t', + '--type', + default='title', + help='cluster algorithm: title, tnorm, tnysi') - sub_verify = subparsers.add_parser('verify', help='verify groups') + sub_verify = subparsers.add_parser('verify', help='verify groups', parents=[parser]) sub_verify.set_defaults(func=run_verify) args = parser.parse_args() if not args.__dict__.get("func"): - print('fuzzycat: use -h or --help for usage', file=sys.stderr) + print(__doc__, file=sys.stderr) sys.exit(1) args.func(args) diff --git a/tests/test_cluster.py b/tests/test_cluster.py index 139597f..8b13789 100644 --- a/tests/test_cluster.py +++ b/tests/test_cluster.py @@ -1,2 +1 @@ - |