aboutsummaryrefslogtreecommitdiffstats
path: root/fuzzycat/__main__.py
blob: a65eb633f59f93906dc324e5c6ec0914d468b4b5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#!/usr/bin/env python
"""Usage: fuzzycat COMMAND [options]

Commands: cluster, verify

Run, e.g. fuzzycat cluster --help for more options. Example:

    $ zstdcat -T0 release_export_expanded.json.zst |
      parallel --tmpdir /fast/tmp --roundrobin --pipe -j 4 |
      python -m fuzzycat.main cluster --tmpdir /fast/tmp -t tnorm > clusters.jsonl
"""

import argparse
import cProfile as profile
import fileinput
import io
import json
import logging
import pstats
import sys
import tempfile

from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_ngram,
                              release_key_title_normalized, release_key_title_nysiis,
                              release_key_title_sandcrawler)


def run_cluster(args):
    logger = logging.getLogger('main.run_cluster')
    types = {
        'title': release_key_title,
        'tnorm': release_key_title_normalized,
        'tnysi': release_key_title_nysiis,
        'tss': release_key_title_ngram,
        'tsandcrawler': release_key_title_sandcrawler,
    }
    key_denylist = None
    if args.key_denylist:
        with open(args.key_denylist, 'r') as f:
            key_denylist = [l.strip() for l in f.readlines()]
    cluster = Cluster(iterable=fileinput.input(args.files),
                      key=types.get(args.type),
                      tmpdir=args.tmpdir,
                      key_denylist=key_denylist,
                      prefix=args.prefix)
    stats = cluster.run()
    logger.debug(json.dumps(dict(stats)))


def run_verify(args):
    """
    TODO. Ok, we should not fetch data we have on disk (at the clustering
    step).
    """
    pass


if __name__ == '__main__':
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    parser = argparse.ArgumentParser(prog='fuzzycat',
                                     description=__doc__,
                                     usage='%(prog)s command [options]',
                                     add_help=False,
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--prefix', default='fuzzycat-', help='temp file prefix')
    parser.add_argument('--tmpdir', default=tempfile.gettempdir(), help='temporary directory')
    parser.add_argument('-P', '--profile', action='store_true', help='profile program')
    subparsers = parser.add_subparsers()

    sub_cluster = subparsers.add_parser('cluster', help='group entities', parents=[parser])
    sub_cluster.set_defaults(func=run_cluster)
    sub_cluster.add_argument('-f', '--files', default="-", help='input files')
    sub_cluster.add_argument('--key-denylist', help='file path to key denylist')
    sub_cluster.add_argument('-t',
                             '--type',
                             default='title',
                             help='cluster algorithm: title, tnorm, tnysi, tss, tsandcrawler')

    sub_verify = subparsers.add_parser('verify', help='verify groups', parents=[parser])
    sub_verify.add_argument('-f', '--files', default="-", help='input files')
    sub_verify.set_defaults(func=run_verify)

    args = parser.parse_args()
    if not args.__dict__.get("func"):
        print(__doc__, file=sys.stderr)
        sys.exit(1)

    if args.profile:
        logging.disable(logging.DEBUG)
        pr = profile.Profile()
        pr.enable()

    args.func(args)

    if args.profile:
        pr.disable()
        s = io.StringIO()
        ps = pstats.Stats(pr, stream=s).sort_stats('cumulative')
        ps.print_stats()
        print(s.getvalue(), file=sys.stderr)