aboutsummaryrefslogtreecommitdiffstats
path: root/fuzzycat
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2020-10-18 20:25:53 +0200
committerMartin Czygan <martin.czygan@gmail.com>2020-10-21 03:47:23 +0200
commite33a0f359dd36284c31eb619c6eddd617ef3a779 (patch)
treece1b240455c20673118e0ec9cbb3167f67a25980 /fuzzycat
parent26aa121848d41860a398cac8b549531e5f21f03e (diff)
downloadfuzzycat-e33a0f359dd36284c31eb619c6eddd617ef3a779.tar.gz
fuzzycat-e33a0f359dd36284c31eb619c6eddd617ef3a779.zip
cluster variants
Diffstat (limited to 'fuzzycat')
-rw-r--r--fuzzycat/cluster.py163
-rw-r--r--fuzzycat/fatcat/main.py4
-rw-r--r--fuzzycat/fatcat/matching.py10
3 files changed, 171 insertions, 6 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
new file mode 100644
index 0000000..94e42e3
--- /dev/null
+++ b/fuzzycat/cluster.py
@@ -0,0 +1,163 @@
+"""
+Clustering part of matching.
+
+We want to have a generic and fast way to derive various clusters. Input is
+JSON lines of release entities, e.g. from a database dump.
+
+Map and reduce.
+
+* input (json) blob -> (ident, value) -> group by value -> emit idents per group
+
+"""
+
+import argparse
+import fileinput
+import itertools
+import json
+import os
+import subprocess
+import tempfile
+import re
+import string
+
+import orjson as json
+import fuzzy
+
+DEFAULT_CACHE_DIR = os.path.join(os.path.expanduser("~"), ".cache", "fuzzycat")
+
+
def sort_by_column(filename, mode="w", opts="-k 2", fast=True):
    """
    Sort a tabular file with sort(1), return the filename of the sorted file.

    The sorted lines are written to a new temporary file, which is not
    removed automatically; the caller is responsible for deleting it.

    Args:
        filename: path of the file to sort.
        mode: mode used to open the temporary output file.
        opts: options for sort(1), given as one whitespace separated string.
        fast: if true, run sort(1) with LC_ALL=C.

    Returns:
        Name of the temporary file holding the sorted output.

    Raises:
        subprocess.CalledProcessError: if sort(1) exits with a non-zero
            status; the partial output file is removed in that case.
    """
    env = os.environ.copy()
    if fast:
        env["LC_ALL"] = "C"
    command = ["sort"] + opts.split() + [filename]
    with tempfile.NamedTemporaryFile(delete=False, mode=mode) as tf:
        # The environment has to be handed over explicitly, otherwise the
        # LC_ALL override above never reaches sort(1).
        completed = subprocess.run(command, stdout=tf, env=env)
    if completed.returncode != 0:
        # Do not hand a truncated or empty file to the caller as if it were
        # a valid sort result.
        os.remove(tf.name)
        raise subprocess.CalledProcessError(completed.returncode, command)
    return tf.name
+
def group_by_column(filename, key=None, value=None, comment=""):
    """
    Group the lines of a sorted file with the given key function. Use another
    function to extract the value kept for each line.

    Lines are grouped with itertools.groupby, so only consecutive lines with
    an equal key end up in the same group; the file should be sorted by that
    key beforehand.

    Args:
        filename: path of the file to read.
        key: callable mapping a line to its grouping key (a string); if None,
            the line itself is the key.
        value: callable mapping a line to the value stored for it; if None,
            the line without its trailing newline is stored.
        comment: arbitrary marker copied into every emitted document.

    Yields:
        One dict per group: "v" holds the list of values, "c" the comment and
        "k" the key with surrounding whitespace stripped.
    """
    if value is None:
        # The previous default of None was called unconditionally and raised
        # a TypeError; fall back to the plain line instead.
        def value(line):
            return line.rstrip("\n")

    with open(filename) as f:
        for k, g in itertools.groupby(f, key=key):
            yield {
                "v": [value(v) for v in g],
                "c": comment,
                "k": k.strip(),
            }
+
+# XXX: LineOps
+
def cut(f=0, sep='\t'):
    """
    Build a field extractor in the spirit of cut(1), counting fields from zero.

    The returned callable splits its argument on sep and hands back field
    number f; nothing else (such as a trailing newline) is removed.
    """
    def pick(line):
        fields = line.split(sep)
        return fields[f]

    return pick
+
def _single_line(text):
    """
    Replace tabs and newlines with spaces and strip the result, so the text
    fits into a single column of a tab separated line.
    """
    return text.replace("\t", " ").replace("\n", " ").strip()


def _cluster_by_key(args, derive_key, comment="t"):
    """
    Shared three stage pipeline of all title cluster variants: extract, sort,
    group.

    Reads JSON lines from args.files (stdin, if that list is empty), writes
    one "ident<TAB>key" line per usable document into a temporary file, sorts
    that file by the key column and prints one JSON document per group of
    equal keys to stdout.

    Documents without an "ident" or "title" field, with an empty title, or
    with a title for which derive_key returns an empty key are skipped; blank
    input lines are ignored. Without the empty-key check, all titles that
    reduce to nothing would be merged into one cluster with an empty key.

    Args:
        args: parsed command line arguments; only args.files is used.
        derive_key: callable mapping a non-empty title to the cluster key.
            The key must not contain tabs or newlines.
        comment: marker passed through to group_by_column.
    """
    files = args.files if len(args.files) > 0 else ('-', )
    extracted = None
    ordered = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, mode="w") as tf:
            extracted = tf.name
            with fileinput.input(files=files) as lines:
                for line in lines:
                    if not line.strip():
                        continue
                    doc = json.loads(line)
                    try:
                        ident = doc["ident"]
                        title = doc["title"]
                    except KeyError:
                        continue
                    if not title:
                        continue
                    key = derive_key(title)
                    if not key:
                        continue
                    print("%s\t%s" % (ident, key), file=tf)

        ordered = sort_by_column(extracted, opts="-k 2")
        for doc in group_by_column(ordered, key=cut(f=1), value=cut(f=0), comment=comment):
            print(json.dumps(doc).decode("utf-8"))
    finally:
        # Both intermediate files are created with delete=False, so remove
        # them here, also when a stage fails half way through.
        for name in (ordered, extracted):
            if name is not None and os.path.exists(name):
                os.remove(name)


def cluster_by_title(args):
    """
    Cluster by title; tabs and newlines are replaced with spaces and
    surrounding whitespace is stripped, otherwise the title is used as is.

    Basic example for a three stage process: extract, sort, group. Speed is
    about: 20K/s (json roundtrip, sorting, grouping).
    """
    _cluster_by_key(args, _single_line)


def cluster_by_title_normalized(args):
    """
    Cluster by normalized title: lowercased, with all runs of non-word
    characters and underscores removed, e.g. analysisofheritability. 17k/s.
    """
    pattern = re.compile(r'[\W_]+', re.UNICODE)
    # The pattern also removes every whitespace character, so no separate
    # tab/newline cleanup is needed for this variant.
    _cluster_by_key(args, lambda title: pattern.sub('', title.lower()))


def cluster_by_title_nysiis(args):
    """
    Cluster by the NYSIIS code of the title, as computed by fuzzy.nysiis.
    """
    _cluster_by_key(args, fuzzy.nysiis)
+
def main():
    """
    Command line entry point for clustering.

    Parses the command line, then either lists the available cluster
    variants (-l/--list) or runs the variant selected with -t/--type on the
    given files (stdin, if no files are given).

    An unknown variant is reported through the argument parser, i.e. usage
    and message are written to stderr and the process exits with status 2,
    so a typo cannot be mistaken for a successful run with empty output.
    """
    types = {
        "title": cluster_by_title,
        "title_normalized": cluster_by_title_normalized,
        "title_nysiis": cluster_by_title_nysiis,
    }
    parser = argparse.ArgumentParser(prog='fuzzycat-cluster',
                                     usage='%(prog)s [options]',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-t", "--type", default="title", help="clustering variant to use")
    parser.add_argument("-l", "--list", action="store_true", help="list cluster variants")
    parser.add_argument('files',
                        metavar='FILE',
                        nargs='*',
                        help='files to read, if empty, stdin is used')
    args = parser.parse_args()
    if args.list:
        print("\n".join(types))
        return
    func = types.get(args.type)
    if func is None:
        # parser.error does not return; it prints to stderr and exits.
        parser.error("invalid type: {}".format(args.type))
    func(args)
diff --git a/fuzzycat/fatcat/main.py b/fuzzycat/fatcat/main.py
index 805e69e..07e4ad4 100644
--- a/fuzzycat/fatcat/main.py
+++ b/fuzzycat/fatcat/main.py
@@ -3,9 +3,11 @@
Command line entry point for ad-hoc testing.
"""
+import argparse
+
from fatcat_openapi_client import ReleaseEntity, ReleaseExtIds
+
from fuzzycat.fatcat.matching import match_release_fuzzy
-import argparse
def main():
diff --git a/fuzzycat/fatcat/matching.py b/fuzzycat/fatcat/matching.py
index ba0fef5..04ec275 100644
--- a/fuzzycat/fatcat/matching.py
+++ b/fuzzycat/fatcat/matching.py
@@ -15,16 +15,17 @@ Match methods return candidates, verify methods return a match status.
Candidate generation will use external data from search and hence is expensive. Verification is fast.
"""
-from typing import List, Optional, Union, Set
+from typing import List, Optional, Set, Union
import elasticsearch
from fatcat_openapi_client import (ApiException, ContainerEntity, DefaultApi, ReleaseEntity,
ReleaseExtIds, WorkEntity)
from fatcat_openapi_client.api.default_api import DefaultApi
-from fuzzycat.fatcat.common import MatchStatus, response_to_entity_list, compare_ext_ids
-from fuzzycat.serials import serialsdb
from fuzzycat import cleanups
+from fuzzycat.fatcat.common import (MatchStatus, compare_ext_ids, response_to_entity_list)
+from fuzzycat.serials import serialsdb
+
def match_container_fuzzy(container: ContainerEntity,
size: int = 5,
@@ -198,8 +199,7 @@ def verify_serial_name(a: str, b: str) -> MatchStatus:
Serial name verification. Serial names are a subset of container names.
There are about 2M serials.
"""
-
- def verify(a : Set[str], b : Set[str]) -> MatchStatus:
+ def verify(a: Set[str], b: Set[str]) -> MatchStatus:
# If any name yields multiple ISSN-L, we cannot decide.
if len(a) > 1: