aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--fuzzycat/cluster.py43
1 files changed, 23 insertions, 20 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index a28cf58..3b7f3f5 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -37,6 +37,7 @@ import subprocess
import tempfile
import re
import string
+import operator
import orjson as json
import fuzzy
@@ -83,16 +84,17 @@ def cluster_by_title(args):
Basic example for a three stage process: extract, sort, group. Speed is
about: 20K/s (json roundtrip, sorting, grouping).
"""
+ files = args.files if len(args.files) > 0 else ('-', )
+ fg = operator.itemgetter("ident", "title")
+
with tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=args.tmp_prefix) as tf:
- for line in fileinput.input(files=args.files if len(args.files) > 0 else ('-', )):
- doc = json.loads(line)
+ for line in fileinput.input(files=files):
try:
- id = doc["ident"]
- title = doc["title"]
+ doc = json.loads(line)
+ id, title = fg(doc)
if not title:
continue
- else:
- title = title.replace("\t", " ").replace("\n", " ").strip()
+ title = title.replace("\t", " ").replace("\n", " ").strip()
except KeyError as err:
continue
print("%s\t%s" % (id, title), file=tf)
@@ -108,18 +110,19 @@ def cluster_by_title_normalized(args):
"""
Normalize title, e.g. analysisofheritability. 17k/s.
"""
+ files = args.files if len(args.files) > 0 else ('-', )
+ fg = operator.itemgetter("ident", "title")
pattern = re.compile('[\W_]+', re.UNICODE)
+
with tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=args.tmp_prefix) as tf:
- for line in fileinput.input(files=args.files if len(args.files) > 0 else ('-', )):
- doc = json.loads(line)
+ for line in fileinput.input(files=files):
try:
- id = doc["ident"]
- title = doc["title"]
+ doc = json.loads(line)
+ id, title = fg(doc)
if not title:
continue
- else:
- title = title.replace("\t", " ").replace("\n", " ").strip().lower()
- title = pattern.sub('', title)
+ title = title.replace("\t", " ").replace("\n", " ").strip().lower()
+ title = pattern.sub('', title)
except KeyError as err:
continue
print("%s\t%s" % (id, title), file=tf)
@@ -135,19 +138,19 @@ def cluster_by_title_nysiis(args):
"""
Soundex on title.
"""
+ files = args.files if len(args.files) > 0 else ('-', )
+ fg = operator.itemgetter("ident", "title")
+
with tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=args.tmp_prefix) as tf:
- for line in fileinput.input(files=args.files if len(args.files) > 0 else ('-', )):
- doc = json.loads(line)
+ for line in fileinput.input(files=files):
try:
- id = doc["ident"]
- title = doc["title"]
+ doc = json.loads(line)
+ id, title = fg(doc)
if not title:
continue
- else:
- title = fuzzy.nysiis(title)
+ title = fuzzy.nysiis(title)
except KeyError as err:
continue
-
print("%s\t%s" % (id, title), file=tf)
sbc = sort_by_column(tf.name, opts="-k 2")