aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2020-10-22 00:53:54 +0200
committer Martin Czygan <martin.czygan@gmail.com> 2020-10-22 00:53:54 +0200
commit 03f9d780824bd6a7015633d733326c2cae45c66b (patch)
tree f776fd0104cb0e37aed53acf3ab334e39b7137bd
parent 48edbef5b4d9b204110c0542615dbd7084e5c767 (diff)
download fuzzycat-03f9d780824bd6a7015633d733326c2cae45c66b.tar.gz
fuzzycat-03f9d780824bd6a7015633d733326c2cae45c66b.zip
access values at once with itemgetter
-rw-r--r-- fuzzycat/cluster.py 43
1 files changed, 23 insertions, 20 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index a28cf58..3b7f3f5 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -37,6 +37,7 @@ import subprocess
import tempfile
import re
import string
+import operator
import orjson as json
import fuzzy
@@ -83,16 +84,17 @@ def cluster_by_title(args):
Basic example for a three stage process: extract, sort, group. Speed is
about: 20K/s (json roundtrip, sorting, grouping).
"""
+ files = args.files if len(args.files) > 0 else ('-', )
+ fg = operator.itemgetter("ident", "title")
+
with tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=args.tmp_prefix) as tf:
- for line in fileinput.input(files=args.files if len(args.files) > 0 else ('-', )):
- doc = json.loads(line)
+ for line in fileinput.input(files=files):
try:
- id = doc["ident"]
- title = doc["title"]
+ doc = json.loads(line)
+ id, title = fg(doc)
if not title:
continue
- else:
- title = title.replace("\t", " ").replace("\n", " ").strip()
+ title = title.replace("\t", " ").replace("\n", " ").strip()
except KeyError as err:
continue
print("%s\t%s" % (id, title), file=tf)
@@ -108,18 +110,19 @@ def cluster_by_title_normalized(args):
"""
Normalize title, e.g. analysisofheritability. 17k/s.
"""
+ files = args.files if len(args.files) > 0 else ('-', )
+ fg = operator.itemgetter("ident", "title")
pattern = re.compile('[\W_]+', re.UNICODE)
+
with tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=args.tmp_prefix) as tf:
- for line in fileinput.input(files=args.files if len(args.files) > 0 else ('-', )):
- doc = json.loads(line)
+ for line in fileinput.input(files=files):
try:
- id = doc["ident"]
- title = doc["title"]
+ doc = json.loads(line)
+ id, title = fg(doc)
if not title:
continue
- else:
- title = title.replace("\t", " ").replace("\n", " ").strip().lower()
- title = pattern.sub('', title)
+ title = title.replace("\t", " ").replace("\n", " ").strip().lower()
+ title = pattern.sub('', title)
except KeyError as err:
continue
print("%s\t%s" % (id, title), file=tf)
@@ -135,19 +138,19 @@ def cluster_by_title_nysiis(args):
"""
Soundex on title.
"""
+ files = args.files if len(args.files) > 0 else ('-', )
+ fg = operator.itemgetter("ident", "title")
+
with tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=args.tmp_prefix) as tf:
- for line in fileinput.input(files=args.files if len(args.files) > 0 else ('-', )):
- doc = json.loads(line)
+ for line in fileinput.input(files=files):
try:
- id = doc["ident"]
- title = doc["title"]
+ doc = json.loads(line)
+ id, title = fg(doc)
if not title:
continue
- else:
- title = fuzzy.nysiis(title)
+ title = fuzzy.nysiis(title)
except KeyError as err:
continue
-
print("%s\t%s" % (id, title), file=tf)
sbc = sort_by_column(tf.name, opts="-k 2")