about summary refs log tree commit diff stats
path: root/fuzzycat/__main__.py
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2020-11-12 20:45:17 +0100
committer Martin Czygan <martin.czygan@gmail.com> 2020-11-12 20:45:17 +0100
commit 4eab32b1c5929b2d7f2e6d8fed76bdb49bf9c699 (patch)
tree c326180ad8e5431026cd2f46da571307d226aa38 /fuzzycat/__main__.py
parent 7c1927dd2800069b74bbe2f561127122daa0870f (diff)
parent 30eab70787584a333714b18f1d64f362e4768730 (diff)
download fuzzycat-4eab32b1c5929b2d7f2e6d8fed76bdb49bf9c699.tar.gz
fuzzycat-4eab32b1c5929b2d7f2e6d8fed76bdb49bf9c699.zip
Merge branch 'bnewbold-sandcrawler' of https://github.com/bnewbold/fuzzycat into bnewbold-bnewbold-sandcrawler
* 'bnewbold-sandcrawler' of https://github.com/bnewbold/fuzzycat:
  sandcrawler slugify: yet more unicode corner-cases
  add sandcrawler-style title key method
  cluster: count empty keys (and don't return them)
  pipenv: explicit regex dependency
  gitignore: add .swp (vim)
  make: run pytest over fuzzycat/ to catch inline tests
  add support for key denylist
Diffstat (limited to 'fuzzycat/__main__.py')
-rw-r--r-- fuzzycat/__main__.py | 37
1 files changed, 10 insertions, 27 deletions
diff --git a/fuzzycat/__main__.py b/fuzzycat/__main__.py
index 3845245..a65eb63 100644
--- a/fuzzycat/__main__.py
+++ b/fuzzycat/__main__.py
@@ -20,9 +20,9 @@ import pstats
import sys
import tempfile
-from fuzzycat.build import NgramLookup, TitleTokenList
from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_ngram,
- release_key_title_normalized, release_key_title_nysiis)
+ release_key_title_normalized, release_key_title_nysiis,
+ release_key_title_sandcrawler)
def run_cluster(args):
@@ -32,10 +32,16 @@ def run_cluster(args):
'tnorm': release_key_title_normalized,
'tnysi': release_key_title_nysiis,
'tss': release_key_title_ngram,
+ 'tsandcrawler': release_key_title_sandcrawler,
}
+ key_denylist = None
+ if args.key_denylist:
+ with open(args.key_denylist, 'r') as f:
+ key_denylist = [l.strip() for l in f.readlines()]
cluster = Cluster(iterable=fileinput.input(args.files),
key=types.get(args.type),
tmpdir=args.tmpdir,
+ key_denylist=key_denylist,
prefix=args.prefix)
stats = cluster.run()
logger.debug(json.dumps(dict(stats)))
@@ -49,20 +55,6 @@ def run_verify(args):
pass
-def run_build(args):
- """
- Trying out.
- """
- if args.type == "ss":
- builder = NgramLookup(files=args.files, output=args.output)
- builder.run()
- elif args.type == "tt":
- builder = TitleTokenList(files=args.files, output=args.output)
- builder.run()
- else:
- raise NotImplementedError()
-
-
if __name__ == '__main__':
logging.basicConfig(
level=logging.DEBUG,
@@ -82,25 +74,16 @@ if __name__ == '__main__':
sub_cluster = subparsers.add_parser('cluster', help='group entities', parents=[parser])
sub_cluster.set_defaults(func=run_cluster)
sub_cluster.add_argument('-f', '--files', default="-", help='input files')
+ sub_cluster.add_argument('--key-denylist', help='file path to key denylist')
sub_cluster.add_argument('-t',
'--type',
default='title',
- help='cluster algorithm: title, tnorm, tnysi, tss')
+ help='cluster algorithm: title, tnorm, tnysi, tss, tsandcrawler')
sub_verify = subparsers.add_parser('verify', help='verify groups', parents=[parser])
sub_verify.add_argument('-f', '--files', default="-", help='input files')
sub_verify.set_defaults(func=run_verify)
- sub_build = subparsers.add_parser('build', help='build auxiliary databases', parents=[parser])
- sub_build.add_argument('-f', '--files', default="-", help='input files')
- sub_build.add_argument('-t', '--type', default="ss", help='type of database to build')
- sub_build.add_argument('-o',
- '--output',
- type=argparse.FileType('w'),
- default=sys.stdout,
- help='output file')
- sub_build.set_defaults(func=run_build)
-
args = parser.parse_args()
if not args.__dict__.get("func"):
print(__doc__, file=sys.stderr)