aboutsummaryrefslogtreecommitdiffstats
path: root/fuzzycat/__main__.py
blob: 900d5c06c416c31990e74fa5dbeae6e76c9b8a4d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/usr/bin/env python
"""Usage: fuzzycat COMMAND [options]

Commands: cluster, verify, build

Run, e.g. fuzzycat cluster --help for more options. Example:

    $ zstdcat -T0 release_export_expanded.json.zst |
      parallel --tmpdir /fast/tmp --roundrobin --pipe -j 4 |
      python -m fuzzycat.main cluster --tmpdir /fast/tmp -t tnorm > clusters.jsonl
"""

import argparse
import cProfile as profile
import fileinput
import json
import io
import logging
import pstats
import sys
import tempfile

from fuzzycat.build import NgramLookup, TitleTokenList
from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_ngram,
                              release_key_title_normalized, release_key_title_nysiis)


def run_cluster(args):
    """
    Run the clustering step for the `cluster` subcommand.

    Maps ``args.type`` to one of the imported release key functions, builds a
    ``Cluster`` from ``args.files``, ``args.tmpdir`` and ``args.prefix``, runs
    it and logs the returned stats as JSON at debug level.

    Raises ValueError if ``args.type`` does not name a known key function.
    """
    logger = logging.getLogger('main.run_cluster')
    types = {
        'title': release_key_title,
        'tnorm': release_key_title_normalized,
        'tnysi': release_key_title_nysiis,
        'tss': release_key_title_ngram,
    }
    # Fail early on an unknown type; a plain .get() would hand keyfunc=None to
    # Cluster and the mistake would only show up later, far from its cause.
    try:
        keyfunc = types[args.type]
    except KeyError:
        raise ValueError('unknown cluster type {!r}, expected one of: {}'.format(
            args.type, ', '.join(sorted(types)))) from None
    cluster = Cluster(files=args.files,
                      keyfunc=keyfunc,
                      tmpdir=args.tmpdir,
                      prefix=args.prefix)
    stats = cluster.run()
    # dict(stats): converted to a plain dict before serializing.
    logger.debug(json.dumps(dict(stats)))


def run_verify(args):
    """
    Placeholder for the `verify` subcommand; currently does nothing.

    TODO: verification should not fetch data again that we already have on
    disk from the clustering step.
    """
    return None


def run_build(args):
    """
    Build an auxiliary database (experimental).

    ``args.type`` selects the builder: "ss" uses NgramLookup, "tt" uses
    TitleTokenList. The builder is created with ``args.files`` and
    ``args.output`` and then run. Any other type raises NotImplementedError.
    """
    kind = args.type
    if kind != "ss" and kind != "tt":
        raise NotImplementedError()
    # Pick the class lazily so only the selected builder is looked up.
    builder_cls = NgramLookup if kind == "ss" else TitleTokenList
    builder_cls(files=args.files, output=args.output).run()


def _build_parser():
    """Build the top-level parser with the cluster, verify and build subcommands."""
    # add_help=False: this parser is also used as a parent of each subcommand
    # parser, and argparse would otherwise see two -h/--help options.
    parser = argparse.ArgumentParser(prog='fuzzycat',
                                     description=__doc__,
                                     usage='%(prog)s command [options]',
                                     add_help=False,
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--prefix', default='fuzzycat-', help='temp file prefix')
    parser.add_argument('--tmpdir', default=tempfile.gettempdir(), help='temporary directory')
    parser.add_argument('-P', '--profile', action='store_true', help='profile program')
    subparsers = parser.add_subparsers()

    # parents=[parser] makes the shared options above available on each
    # subcommand as well.
    sub_cluster = subparsers.add_parser('cluster', help='group entities', parents=[parser])
    sub_cluster.set_defaults(func=run_cluster)
    sub_cluster.add_argument('-f', '--files', default="-", help='input files')
    # choices: reject unknown algorithms at parse time with a usage error.
    sub_cluster.add_argument('-t',
                             '--type',
                             default='title',
                             choices=('title', 'tnorm', 'tnysi', 'tss'),
                             help='cluster algorithm: title, tnorm, tnysi, tss')

    sub_verify = subparsers.add_parser('verify', help='verify groups', parents=[parser])
    sub_verify.add_argument('-f', '--files', default="-", help='input files')
    sub_verify.set_defaults(func=run_verify)

    sub_build = subparsers.add_parser('build', help='build auxiliary databases', parents=[parser])
    sub_build.add_argument('-f', '--files', default="-", help='input files')
    sub_build.add_argument('-t',
                           '--type',
                           default="ss",
                           choices=('ss', 'tt'),
                           help='type of database to build')
    sub_build.add_argument('-o',
                           '--output',
                           type=argparse.FileType('w'),
                           default=sys.stdout,
                           help='output file')
    sub_build.set_defaults(func=run_build)
    return parser


def main(argv=None):
    """
    Command line entry point.

    Configures logging, parses ``argv`` (``sys.argv[1:]`` when None) and
    dispatches to the function registered for the chosen subcommand. Without
    a subcommand, the module docstring is printed to stderr and the process
    exits with status 1. With -P/--profile, debug logging is disabled, the
    command runs under cProfile and cumulative stats are printed to stderr.
    """
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')

    args = _build_parser().parse_args(argv)
    # "func" is only set by a subcommand's set_defaults().
    func = getattr(args, 'func', None)
    if not func:
        print(__doc__, file=sys.stderr)
        sys.exit(1)

    if not args.profile:
        func(args)
        return

    logging.disable(logging.DEBUG)
    pr = profile.Profile()
    pr.enable()
    try:
        func(args)
    finally:
        # Report even if the command raised, so a failed run still yields
        # its profile.
        pr.disable()
        s = io.StringIO()
        pstats.Stats(pr, stream=s).sort_stats('cumulative').print_stats()
        print(s.getvalue(), file=sys.stderr)


if __name__ == '__main__':
    main()