about summary refs log tree commit diff stats
path: root/fuzzycat/__main__.py
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2020-11-12 00:23:18 +0100
committer Martin Czygan <martin.czygan@gmail.com> 2020-11-12 00:23:22 +0100
commit 037b077efba7ee351fe2c6b5d0217c6d426261e0 (patch)
tree ed003b0917f626cbf081c04f79de5fdc766b505d /fuzzycat/__main__.py
parent f0b5857b45b0982cb587fce6b847d1d039794636 (diff)
download fuzzycat-037b077efba7ee351fe2c6b5d0217c6d426261e0.tar.gz
fuzzycat-037b077efba7ee351fe2c6b5d0217c6d426261e0.zip
move main.py to __main__.py
Diffstat (limited to 'fuzzycat/__main__.py')
-rw-r--r-- fuzzycat/__main__.py 121
1 files changed, 121 insertions, 0 deletions
diff --git a/fuzzycat/__main__.py b/fuzzycat/__main__.py
new file mode 100644
index 0000000..900d5c0
--- /dev/null
+++ b/fuzzycat/__main__.py
@@ -0,0 +1,121 @@
+#!/usr/bin/env python
+"""Usage: fuzzycat COMMAND [options]
+
+Commands: cluster, verify, build
+
+Run, e.g. fuzzycat cluster --help for more options. Example:
+
+ $ zstdcat -T0 release_export_expanded.json.zst |
+ parallel --tmpdir /fast/tmp --roundrobin --pipe -j 4 |
+ python -m fuzzycat cluster --tmpdir /fast/tmp -t tnorm > clusters.jsonl
+"""
+
+import argparse
+import cProfile as profile
+import fileinput
+import json
+import io
+import logging
+import pstats
+import sys
+import tempfile
+
+from fuzzycat.build import NgramLookup, TitleTokenList
+from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_ngram,
+ release_key_title_normalized, release_key_title_nysiis)
+
+
+def run_cluster(args):
+    """
+    Run a Cluster over the input files, using the key function selected by
+    args.type, and log the returned stats as JSON at debug level.
+
+    Reads files, type, tmpdir and prefix from the parsed arguments.
+    """
+    key_functions = {
+        'title': release_key_title,
+        'tnorm': release_key_title_normalized,
+        'tnysi': release_key_title_nysiis,
+        'tss': release_key_title_ngram,
+    }
+    # An unrecognized type yields None here, which is handed to Cluster as is.
+    keyfunc = key_functions.get(args.type)
+    clusterer = Cluster(files=args.files,
+                        keyfunc=keyfunc,
+                        tmpdir=args.tmpdir,
+                        prefix=args.prefix)
+    stats = clusterer.run()
+    logging.getLogger('main.run_cluster').debug(json.dumps(dict(stats)))
+
+
+def run_verify(args):
+    """
+    Placeholder for the verify command; it currently does nothing.
+
+    TODO: do not fetch data again that is already on disk from the
+    clustering step.
+    """
+    return None
+
+
+def run_build(args):
+    """
+    Build an auxiliary database (experimental).
+
+    args.type selects the builder: "ss" runs NgramLookup, "tt" runs
+    TitleTokenList; both receive args.files and args.output. Any other type
+    raises NotImplementedError.
+    """
+    # Reject unsupported types up front, before touching any builder.
+    if args.type not in ("ss", "tt"):
+        raise NotImplementedError()
+    builder_class = NgramLookup if args.type == "ss" else TitleTokenList
+    builder_class(files=args.files, output=args.output).run()
+
+
+if __name__ == '__main__':
+    # Command line entry point: configure logging, parse arguments, dispatch
+    # to the selected subcommand and optionally profile the run.
+    logging.basicConfig(
+        level=logging.DEBUG,
+        format='%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s',
+        datefmt='%Y-%m-%d %H:%M:%S')
+    # add_help=False: this parser is also passed as a parent to every
+    # subparser below, and each subparser adds its own -h/--help option.
+    parser = argparse.ArgumentParser(prog='fuzzycat',
+                                     description=__doc__,
+                                     usage='%(prog)s command [options]',
+                                     add_help=False,
+                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+    # Options shared by all subcommands (inherited through parents=[parser]).
+    parser.add_argument('--prefix', default='fuzzycat-', help='temp file prefix')
+    parser.add_argument('--tmpdir', default=tempfile.gettempdir(), help='temporary directory')
+    parser.add_argument('-P', '--profile', action='store_true', help='profile program')
+    subparsers = parser.add_subparsers()
+
+    sub_cluster = subparsers.add_parser('cluster', help='group entities', parents=[parser])
+    sub_cluster.set_defaults(func=run_cluster)
+    sub_cluster.add_argument('-f', '--files', default="-", help='input files')
+    sub_cluster.add_argument('-t',
+                             '--type',
+                             default='title',
+                             help='cluster algorithm: title, tnorm, tnysi, tss')
+
+    sub_verify = subparsers.add_parser('verify', help='verify groups', parents=[parser])
+    sub_verify.add_argument('-f', '--files', default="-", help='input files')
+    sub_verify.set_defaults(func=run_verify)
+
+    sub_build = subparsers.add_parser('build', help='build auxiliary databases', parents=[parser])
+    sub_build.add_argument('-f', '--files', default="-", help='input files')
+    sub_build.add_argument('-t', '--type', default="ss", help='type of database to build')
+    sub_build.add_argument('-o',
+                           '--output',
+                           type=argparse.FileType('w'),
+                           default=sys.stdout,
+                           help='output file')
+    sub_build.set_defaults(func=run_build)
+
+    args = parser.parse_args()
+    # Only a subcommand sets the func default; without one, show the usage
+    # text and exit with a non-zero status.
+    if not getattr(args, "func", None):
+        print(__doc__, file=sys.stderr)
+        sys.exit(1)
+
+    profiler = None
+    if args.profile:
+        # Debug logging is switched off for the duration of a profiled run.
+        logging.disable(logging.DEBUG)
+        profiler = profile.Profile()
+        profiler.enable()
+
+    try:
+        args.func(args)
+    finally:
+        # Emit the report even when the command raised or exited, so that a
+        # failing run can still be profiled.
+        if profiler is not None:
+            profiler.disable()
+            report = io.StringIO()
+            stats = pstats.Stats(profiler, stream=report).sort_stats('cumulative')
+            stats.print_stats()
+            print(report.getvalue(), file=sys.stderr)