author    | Martin Czygan <martin.czygan@gmail.com> | 2020-10-18 20:25:53 +0200
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-10-21 03:47:23 +0200
commit    | e33a0f359dd36284c31eb619c6eddd617ef3a779
tree      | ce1b240455c20673118e0ec9cbb3167f67a25980
parent    | 26aa121848d41860a398cac8b549531e5f21f03e
cluster variants
-rw-r--r-- | Makefile                    |   4
-rw-r--r-- | fuzzycat/cluster.py         | 163
-rw-r--r-- | fuzzycat/fatcat/main.py     |   4
-rw-r--r-- | fuzzycat/fatcat/matching.py |  10
-rw-r--r-- | notes/Workflow.md           |  54
-rw-r--r-- | setup.py                    |   1
6 files changed, 228 insertions, 8 deletions
@@ -16,8 +16,8 @@ deps: ## Install dependencies from setup.py into pipenv
 .PHONY: style
 style: ## Apply import sorting and black source formatting on all files
-	isort --atomic .
-	yapf -p -i -r fuzzycat
+	isort --atomic -rc fuzzycat/*
+	yapf -p -i -r fuzzycat/*
 	yapf -p -i -r tests
 
 .PHONY: dist
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
new file mode 100644
index 0000000..94e42e3
--- /dev/null
+++ b/fuzzycat/cluster.py
@@ -0,0 +1,163 @@
+"""
+Clustering part of matching.
+
+We want a generic and fast way to derive various clusters. Input is a file of
+JSON lines with release entities, e.g. from a database dump.
+
+Map and reduce:
+
+* input (json) blob -> (ident, value) -> group by value -> emit idents per group
+
+"""
+
+import argparse
+import fileinput
+import itertools
+import os
+import re
+import string
+import subprocess
+import tempfile
+
+import fuzzy
+import orjson as json
+
+DEFAULT_CACHE_DIR = os.path.join(os.path.expanduser("~"), ".cache", "fuzzycat")
+
+
+def sort_by_column(filename, mode="w", opts="-k 2", fast=True):
+    """
+    Sort a tabular file with sort(1); returns the filename of the sorted file.
+    """
+    with tempfile.NamedTemporaryFile(delete=False, mode=mode) as tf:
+        env = os.environ.copy()
+        if fast:
+            # Byte-wise collation (no locale) speeds up sort(1) considerably.
+            env["LC_ALL"] = "C"
+        subprocess.run(["sort"] + opts.split() + [filename], stdout=tf, env=env)
+
+    return tf.name
+
+
+def group_by_column(filename, key=None, value=None, comment=""):
+    """
+    Group a sorted file with a given key function. Use another function to
+    extract the value. Both key and value must be callables.
+    """
+    with open(filename) as f:
+        for k, g in itertools.groupby(f, key=key):
+            doc = {
+                "v": [value(v) for v in g],
+                "c": comment,
+                "k": k.strip(),
+            }
+            yield doc
+
+
+# XXX: LineOps
+
+def cut(f=0, sep='\t'):
+    """
+    Similar to cut(1), but zero indexed.
+    """
+    return lambda v: v.split(sep)[f]
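+
+# A hypothetical sketch of how the helpers above compose (the filename is
+# made up for illustration): given a two column TSV of (ident, title), sort
+# by the title column, then collect idents per distinct title.
+#
+#   sorted_file = sort_by_column("ident_title.tsv", opts="-k 2")
+#   for doc in group_by_column(sorted_file, key=cut(f=1), value=cut(f=0), comment="t"):
+#       print(json.dumps(doc).decode("utf-8"))  # {"k": title, "v": [idents], "c": "t"}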
+ """ + with tempfile.NamedTemporaryFile(delete=False, mode="w") as tf: + for line in fileinput.input(files=args.files if len(args.files) > 0 else ('-', )): + doc = json.loads(line) + try: + id = doc["ident"] + title = doc["title"] + if not title: + continue + else: + title = fuzzy.nysiis(title) + except KeyError as err: + continue + + print("%s\t%s" % (id, title)) + print("%s\t%s" % (id, title), file=tf) + + sbc = sort_by_column(tf.name, opts="-k 2") + for doc in group_by_column(sbc, key=cut(f=1), value=cut(f=0), comment="t"): + print(json.dumps(doc).decode("utf-8")) + + os.remove(sbc) + os.remove(tf.name) + +def main(): + types = { + "title": cluster_by_title, + "title_normalized": cluster_by_title_normalized, + "title_nysiis": cluster_by_title_nysiis, + } + parser = argparse.ArgumentParser(prog='fuzzycat-cluster', + usage='%(prog)s [options]', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("-t", "--type", default="title", help="clustering variant to use") + parser.add_argument("-l", "--list", action="store_true", help="list cluster variants") + parser.add_argument('files', metavar='FILE', nargs='*', help='files to read, if empty, stdin is used') + args = parser.parse_args() + if args.list: + print("\n".join(types.keys())) + return + func = types.get(args.type) + if func is None: + print("invalid type: {}".format(args.type)) + return + func(args) diff --git a/fuzzycat/fatcat/main.py b/fuzzycat/fatcat/main.py index 805e69e..07e4ad4 100644 --- a/fuzzycat/fatcat/main.py +++ b/fuzzycat/fatcat/main.py @@ -3,9 +3,11 @@ Command line entry point for ad-hoc testing. """ +import argparse + from fatcat_openapi_client import ReleaseEntity, ReleaseExtIds + from fuzzycat.fatcat.matching import match_release_fuzzy -import argparse def main(): diff --git a/fuzzycat/fatcat/matching.py b/fuzzycat/fatcat/matching.py index ba0fef5..04ec275 100644 --- a/fuzzycat/fatcat/matching.py +++ b/fuzzycat/fatcat/matching.py @@ -15,16 +15,17 @@ Match methods return candidates, verify methods return a match status. Candidate generation will use external data from search and hence is expensive. Verification is fast. """ -from typing import List, Optional, Union, Set +from typing import List, Optional, Set, Union import elasticsearch from fatcat_openapi_client import (ApiException, ContainerEntity, DefaultApi, ReleaseEntity, ReleaseExtIds, WorkEntity) from fatcat_openapi_client.api.default_api import DefaultApi -from fuzzycat.fatcat.common import MatchStatus, response_to_entity_list, compare_ext_ids -from fuzzycat.serials import serialsdb from fuzzycat import cleanups +from fuzzycat.fatcat.common import (MatchStatus, compare_ext_ids, response_to_entity_list) +from fuzzycat.serials import serialsdb + def match_container_fuzzy(container: ContainerEntity, size: int = 5, @@ -198,8 +199,7 @@ def verify_serial_name(a: str, b: str) -> MatchStatus: Serial name verification. Serial names are a subset of container names. There are about 2M serials. """ - - def verify(a : Set[str], b : Set[str]) -> MatchStatus: + def verify(a: Set[str], b: Set[str]) -> MatchStatus: # If any name yields multiple ISSN-L, we cannot decide. if len(a) > 1: diff --git a/notes/Workflow.md b/notes/Workflow.md new file mode 100644 index 0000000..abf0d76 --- /dev/null +++ b/notes/Workflow.md @@ -0,0 +1,54 @@ +# Workflow + +Separate problem in half, first find clusters, then examine clusters (as +proposed). 
+
+As for performance, the feature needs to be calculated in one pass; the
+grouping then reduces to a sort in a second pass.
+
+The output could be a TSV file, with the method followed by the release
+identifiers.
+
+```
+rawt o3utonw5qzhddo7l4lmwptgeey nnpmnwln7be2zb5hd2qanq3r7q
+```
+
+Or JSON lines for a bit more structure.
+
+```
+{"m": "rawt", "c": ["o3utonw5qzhddo7l4lmwptgeey", "nnpmnwln7be2zb5hd2qanq3r7q"]}
+```
+
+```
+$ zstdcat -T0 release_export_expanded.json.zst | fuzzycat-cluster -t title > clusters.json
+```
+
+### Performance considerations
+
+* [orjson](https://github.com/ijl/orjson), [pysimdjson](https://github.com/TkTech/pysimdjson)
+
+## Examining clusters
+
+There will be various methods for examining clusters as well.
+
+We need to fetch releases by identifier; this can be the full record or some
+partial record that has been cached somewhere.
+
+The input is then a list of releases, and the output would be an equally
+sized or smaller cluster of releases which we assume represent the same
+record.
+
+Apart from that, there may be other relations, e.g. records that are not
+exactly the same but closely related, like releases that differ mostly in
+the year.
diff --git a/setup.py b/setup.py
@@ -25,6 +25,7 @@ with open("README.md", "r") as fh:
     entry_points={"console_scripts": [
         "fuzzycat=fuzzycat.fatcat.main:main",
         "fuzzycat-issn=fuzzycat.issn:main",
+        "fuzzycat-cluster=fuzzycat.cluster:main",
     ],},
     install_requires=[
         "fatcat-openapi-client",