about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r--  .pylintrc              3
-rw-r--r--  Makefile               5
-rw-r--r--  fuzzycat/cluster.py    16
-rw-r--r--  fuzzycat/main.py       26
-rw-r--r--  tests/test_cluster.py  1
5 files changed, 30 insertions, 21 deletions
diff --git a/.pylintrc b/.pylintrc
new file mode 100644
index 0000000..72e94cb
--- /dev/null
+++ b/.pylintrc
@@ -0,0 +1,3 @@
+[MESSAGES CONTROL]
+
+disable=C0301
diff --git a/Makefile b/Makefile
index 3e98846..80cf6a2 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,6 @@
SHELL := /bin/bash
FATCAT_BULK_EXPORT_ITEM := fatcat_bulk_exports_2020-08-05
+PY_FILES := $(shell find fuzzycat -name '*.py')
.PHONY: help
help: ## Print info about all commands
@@ -28,6 +29,10 @@ dist: ## Create source distribution and wheel
cov: ## Run coverage report
pytest --cov=fuzzycat tests/
+.PHONY: lint
+lint: $(PY_FILES)
+ pylint fuzzycat
+
.PHONY: clean
clean: ## Clean all artifacts
rm -rf build
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index 4d5580d..ee19611 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -1,17 +1,17 @@
+# pylint: disable=C0103
"""
Clustering stage.
"""
-import functools
import fileinput
+import itertools
+import json
import operator
+import os
import re
+import subprocess
import sys
import tempfile
-import json
-import os
-import subprocess
-import itertools
import fuzzy
@@ -19,13 +19,13 @@ __all__ = [
"release_key_title",
"release_key_title_normalized",
"release_key_title_nysiis",
- "sort_file_by_column",
+ "sort_by_column",
"group_by",
]
get_ident_title = operator.itemgetter("ident", "title")
ws_replacer = str.maketrans({"\t": " ", "\n": " "})
-non_word_re = re.compile('[\W_]+', re.UNICODE)
+non_word_re = re.compile(r'[\W_]+', re.UNICODE)
def release_key_title(re):
@@ -57,7 +57,7 @@ def sort_by_column(filename, opts="-k 2", fast=True, mode="w", prefix="fuzzycat-
env["TMPDIR"] = tmpdir
if fast:
env["LC_ALL"] = "C"
- subprocess.run(["sort"] + opts.split() + [filename], stdout=tf, env=env)
+ subprocess.run(["sort"] + opts.split() + [filename], stdout=tf, env=env, check=True)
return tf.name
diff --git a/fuzzycat/main.py b/fuzzycat/main.py
index 5279eee..5eaa4a2 100644
--- a/fuzzycat/main.py
+++ b/fuzzycat/main.py
@@ -1,21 +1,19 @@
#!/usr/bin/env python
+"""Usage: fuzzycat COMMAND [options]
-"""
-Command line clustering tool.
+Commands: cluster, verify
-Example usage:
+Run, e.g. fuzzycat cluster --help for more options. Example:
- $ zstdcat -T0 release_export_expanded.json.zst | \
- parallel --tmpdir /bigger/tmp --roundrobin --pipe -j 4 \
- python -m fuzzycat.main --tmpdir /bigger/tmp -t tnorm
+ $ zstdcat -T0 release_export_expanded.json.zst |
+ parallel --tmpdir /fast/tmp --roundrobin --pipe -j 4 |
+ python -m fuzzycat.main cluster --tmpdir /fast/tmp -t tnorm > clusters.jsonl
"""
import argparse
import sys
import tempfile
-import elasticsearch
-
from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_normalized,
release_key_title_nysiis)
@@ -42,6 +40,7 @@ if __name__ == '__main__':
parser = argparse.ArgumentParser(prog='fuzzycat',
description=__doc__,
usage='%(prog)s command [options]',
+ add_help=False,
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--prefix', default='fuzzycat-', help='temp file prefix')
@@ -49,17 +48,20 @@ if __name__ == '__main__':
parser.add_argument('--verbose', default=False, action='store_true', help='be verbose')
subparsers = parser.add_subparsers()
- sub_cluster = subparsers.add_parser('cluster', help='group entities')
+ sub_cluster = subparsers.add_parser('cluster', help='group entities', parents=[parser])
sub_cluster.set_defaults(func=run_cluster)
sub_cluster.add_argument('-f', '--files', default="-", help='output files')
- sub_cluster.add_argument('-t', '--type', default='title', help='cluster algorithm: title, tnorm, tnysi')
+ sub_cluster.add_argument('-t',
+ '--type',
+ default='title',
+ help='cluster algorithm: title, tnorm, tnysi')
- sub_verify = subparsers.add_parser('verify', help='verify groups')
+ sub_verify = subparsers.add_parser('verify', help='verify groups', parents=[parser])
sub_verify.set_defaults(func=run_verify)
args = parser.parse_args()
if not args.__dict__.get("func"):
- print('fuzzycat: use -h or --help for usage', file=sys.stderr)
+ print(__doc__, file=sys.stderr)
sys.exit(1)
args.func(args)
diff --git a/tests/test_cluster.py b/tests/test_cluster.py
index 139597f..8b13789 100644
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@@ -1,2 +1 @@
-