From 03f9d780824bd6a7015633d733326c2cae45c66b Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Thu, 22 Oct 2020 00:53:54 +0200 Subject: access values at once with itemgetter --- fuzzycat/cluster.py | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py index a28cf58..3b7f3f5 100644 --- a/fuzzycat/cluster.py +++ b/fuzzycat/cluster.py @@ -37,6 +37,7 @@ import subprocess import tempfile import re import string +import operator import orjson as json import fuzzy @@ -83,16 +84,17 @@ def cluster_by_title(args): Basic example for a three stage process: extract, sort, group. Speed is about: 20K/s (json roundtrip, sorting, grouping). """ + files = args.files if len(args.files) > 0 else ('-', ) + fg = operator.itemgetter("ident", "title") + with tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=args.tmp_prefix) as tf: - for line in fileinput.input(files=args.files if len(args.files) > 0 else ('-', )): - doc = json.loads(line) + for line in fileinput.input(files=files) try: - id = doc["ident"] - title = doc["title"] + doc = json.loads(line) + id, title = fg(doc) if not title: continue - else: - title = title.replace("\t", " ").replace("\n", " ").strip() + title = title.replace("\t", " ").replace("\n", " ").strip() except KeyError as err: continue print("%s\t%s" % (id, title), file=tf) @@ -108,18 +110,19 @@ def cluster_by_title_normalized(args): """ Normalize title, e.g. analysisofheritability. 17k/s. """ + files = args.files if len(args.files) > 0 else ('-', ) + fg = operator.itemgetter("ident", "title") pattern = re.compile('[\W_]+', re.UNICODE) + with tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=args.tmp_prefix) as tf: - for line in fileinput.input(files=args.files if len(args.files) > 0 else ('-', )): - doc = json.loads(line) + for line in fileinput.input(files=files): try: - id = doc["ident"] - title = doc["title"] + doc = json.loads(line) + id, title = fg(doc) if not title: continue - else: - title = title.replace("\t", " ").replace("\n", " ").strip().lower() - title = pattern.sub('', title) + title = title.replace("\t", " ").replace("\n", " ").strip().lower() + title = pattern.sub('', title) except KeyError as err: continue print("%s\t%s" % (id, title), file=tf) @@ -135,19 +138,19 @@ def cluster_by_title_nysiis(args): """ Soundex on title. """ + files = args.files if len(args.files) > 0 else ('-', ) + fg = operator.itemgetter("ident", "title") + with tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=args.tmp_prefix) as tf: - for line in fileinput.input(files=args.files if len(args.files) > 0 else ('-', )): - doc = json.loads(line) + for line in fileinput.input(files=files): try: - id = doc["ident"] - title = doc["title"] + doc = json.loads(line) + id, title = fg(doc) if not title: continue - else: - title = fuzzy.nysiis(title) + title = fuzzy.nysiis(title) except KeyError as err: continue - print("%s\t%s" % (id, title), file=tf) sbc = sort_by_column(tf.name, opts="-k 2") -- cgit v1.2.3