diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2020-11-12 20:45:17 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-11-12 20:45:17 +0100 |
commit | 4eab32b1c5929b2d7f2e6d8fed76bdb49bf9c699 (patch) | |
tree | c326180ad8e5431026cd2f46da571307d226aa38 /fuzzycat/__main__.py | |
parent | 7c1927dd2800069b74bbe2f561127122daa0870f (diff) | |
parent | 30eab70787584a333714b18f1d64f362e4768730 (diff) | |
download | fuzzycat-4eab32b1c5929b2d7f2e6d8fed76bdb49bf9c699.tar.gz fuzzycat-4eab32b1c5929b2d7f2e6d8fed76bdb49bf9c699.zip |
Merge branch 'bnewbold-sandcrawler' of https://github.com/bnewbold/fuzzycat into bnewbold-bnewbold-sandcrawler
* 'bnewbold-sandcrawler' of https://github.com/bnewbold/fuzzycat:
sandcrawler slugify: yet more unicode corner-cases
add sandcrawler-style title key method
cluster: count empty keys (and don't return them)
pipenv: explicit regex dependency
gitignore: add .swp (vim)
make: run pytest over fuzzycat/ to catch inline tests
add support for key denylist
Diffstat (limited to 'fuzzycat/__main__.py')
-rw-r--r-- | fuzzycat/__main__.py | 37 |
1 file changed, 10 insertions, 27 deletions
diff --git a/fuzzycat/__main__.py b/fuzzycat/__main__.py index 3845245..a65eb63 100644 --- a/fuzzycat/__main__.py +++ b/fuzzycat/__main__.py @@ -20,9 +20,9 @@ import pstats import sys import tempfile -from fuzzycat.build import NgramLookup, TitleTokenList from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_ngram, - release_key_title_normalized, release_key_title_nysiis) + release_key_title_normalized, release_key_title_nysiis, + release_key_title_sandcrawler) def run_cluster(args): @@ -32,10 +32,16 @@ def run_cluster(args): 'tnorm': release_key_title_normalized, 'tnysi': release_key_title_nysiis, 'tss': release_key_title_ngram, + 'tsandcrawler': release_key_title_sandcrawler, } + key_denylist = None + if args.key_denylist: + with open(args.key_denylist, 'r') as f: + key_denylist = [l.strip() for l in f.readlines()] cluster = Cluster(iterable=fileinput.input(args.files), key=types.get(args.type), tmpdir=args.tmpdir, + key_denylist=key_denylist, prefix=args.prefix) stats = cluster.run() logger.debug(json.dumps(dict(stats))) @@ -49,20 +55,6 @@ def run_verify(args): pass -def run_build(args): - """ - Trying out. 
- """ - if args.type == "ss": - builder = NgramLookup(files=args.files, output=args.output) - builder.run() - elif args.type == "tt": - builder = TitleTokenList(files=args.files, output=args.output) - builder.run() - else: - raise NotImplementedError() - - if __name__ == '__main__': logging.basicConfig( level=logging.DEBUG, @@ -82,25 +74,16 @@ if __name__ == '__main__': sub_cluster = subparsers.add_parser('cluster', help='group entities', parents=[parser]) sub_cluster.set_defaults(func=run_cluster) sub_cluster.add_argument('-f', '--files', default="-", help='input files') + sub_cluster.add_argument('--key-denylist', help='file path to key denylist') sub_cluster.add_argument('-t', '--type', default='title', - help='cluster algorithm: title, tnorm, tnysi, tss') + help='cluster algorithm: title, tnorm, tnysi, tss, tsandcrawler') sub_verify = subparsers.add_parser('verify', help='verify groups', parents=[parser]) sub_verify.add_argument('-f', '--files', default="-", help='input files') sub_verify.set_defaults(func=run_verify) - sub_build = subparsers.add_parser('build', help='build auxiliary databases', parents=[parser]) - sub_build.add_argument('-f', '--files', default="-", help='input files') - sub_build.add_argument('-t', '--type', default="ss", help='type of database to build') - sub_build.add_argument('-o', - '--output', - type=argparse.FileType('w'), - default=sys.stdout, - help='output file') - sub_build.set_defaults(func=run_build) - args = parser.parse_args() if not args.__dict__.get("func"): print(__doc__, file=sys.stderr) |