author    | Martin Czygan <martin.czygan@gmail.com> | 2020-10-18 20:25:53 +0200
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-10-21 03:47:23 +0200
commit    | e33a0f359dd36284c31eb619c6eddd617ef3a779
tree      | ce1b240455c20673118e0ec9cbb3167f67a25980
parent    | 26aa121848d41860a398cac8b549531e5f21f03e
cluster variants
-rw-r--r-- | Makefile                    |   4
-rw-r--r-- | fuzzycat/cluster.py         | 163
-rw-r--r-- | fuzzycat/fatcat/main.py     |   4
-rw-r--r-- | fuzzycat/fatcat/matching.py |  10
-rw-r--r-- | notes/Workflow.md           |  54
-rw-r--r-- | setup.py                    |   1
6 files changed, 228 insertions, 8 deletions
@@ -16,8 +16,8 @@ deps: ## Install dependencies from setup.py into pipenv
 .PHONY: style
 style: ## Apply import sorting and black source formatting on all files
-	isort --atomic .
-	yapf -p -i -r fuzzycat
+	isort --atomic -rc fuzzycat/*
+	yapf -p -i -r fuzzycat/*
 	yapf -p -i -r tests
 
 .PHONY: dist
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
new file mode 100644
index 0000000..94e42e3
--- /dev/null
+++ b/fuzzycat/cluster.py
@@ -0,0 +1,163 @@
+"""
+Clustering part of matching.
+
+We want a generic and fast way to derive various clusters. Input is a file of
+JSON lines with release entities, e.g. from a database dump.
+
+Map and reduce:
+
+* input (json) blob -> (ident, value) -> group by value -> emit idents per group
+
+"""
+
+import argparse
+import fileinput
+import itertools
+import os
+import re
+import string
+import subprocess
+import tempfile
+
+import fuzzy
+import orjson as json
+
+DEFAULT_CACHE_DIR = os.path.join(os.path.expanduser("~"), ".cache", "fuzzycat")
+
+
+def sort_by_column(filename, mode="w", opts="-k 2", fast=True):
+    """
+    Sort a tabular file with sort(1); returns the filename of the sorted file.
+    """
+    with tempfile.NamedTemporaryFile(delete=False, mode=mode) as tf:
+        env = os.environ.copy()
+        if fast:
+            # Byte-wise collation (no locale) speeds up sort(1) considerably.
+            env["LC_ALL"] = "C"
+        subprocess.run(["sort"] + opts.split() + [filename], stdout=tf, env=env)
+
+    return tf.name
+
+
+def group_by_column(filename, key=None, value=None, comment=""):
+    """
+    Group a sorted file with a given key function. Use another function to
+    extract the value. Both key and value must be callables.
+    """
+    with open(filename) as f:
+        for k, g in itertools.groupby(f, key=key):
+            doc = {
+                "v": [value(v) for v in g],
+                "c": comment,
+                "k": k.strip(),
+            }
+            yield doc
+
+
+# XXX: LineOps
+
+def cut(f=0, sep='\t'):
+    """
+    Similar to cut(1), but zero indexed.
+    """
+    return lambda v: v.split(sep)[f]
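+
+# A hypothetical sketch of how the helpers above compose (the filename is
+# made up for illustration): given a two column TSV of (ident, title), sort
+# by the title column, then collect idents per distinct title.
+#
+#   sorted_file = sort_by_column("ident_title.tsv", opts="-k 2")
+#   for doc in group_by_column(sorted_file, key=cut(f=1), value=cut(f=0), comment="t"):
+#       print(json.dumps(doc).decode("utf-8"))  # {"k": title, "v": [idents], "c": "t"}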
+ """ + with tempfile.NamedTemporaryFile(delete=False, mode="w") as tf: + for line in fileinput.input(files=args.files if len(args.files) > 0 else ('-', )): + doc = json.loads(line) + try: + id = doc["ident"] + title = doc["title"] + if not title: + continue + else: + title = fuzzy.nysiis(title) + except KeyError as err: + continue + + print("%s\t%s" % (id, title)) + print("%s\t%s" % (id, title), file=tf) + + sbc = sort_by_column(tf.name, opts="-k 2") + for doc in group_by_column(sbc, key=cut(f=1), value=cut(f=0), comment="t"): + print(json.dumps(doc).decode("utf-8")) + + os.remove(sbc) + os.remove(tf.name) + +def main(): + types = { + "title": cluster_by_title, + "title_normalized": cluster_by_title_normalized, + "title_nysiis": cluster_by_title_nysiis, + } + parser = argparse.ArgumentParser(prog='fuzzycat-cluster', + usage='%(prog)s [options]', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("-t", "--type", default="title", help="clustering variant to use") + parser.add_argument("-l", "--list", action="store_true", help="list cluster variants") + parser.add_argument('files', metavar='FILE', nargs='*', help='files to read, if empty, stdin is used') + args = parser.parse_args() + if args.list: + print("\n".join(types.keys())) + return + func = types.get(args.type) + if func is None: + print("invalid type: {}".format(args.type)) + return + func(args) diff --git a/fuzzycat/fatcat/main.py b/fuzzycat/fatcat/main.py index 805e69e..07e4ad4 100644 --- a/fuzzycat/fatcat/main.py +++ b/fuzzycat/fatcat/main.py @@ -3,9 +3,11 @@ Command line entry point for ad-hoc testing. """ +import argparse + from fatcat_openapi_client import ReleaseEntity, ReleaseExtIds + from fuzzycat.fatcat.matching import match_release_fuzzy -import argparse def main(): diff --git a/fuzzycat/fatcat/matching.py b/fuzzycat/fatcat/matching.py index ba0fef5..04ec275 100644 --- a/fuzzycat/fatcat/matching.py +++ b/fuzzycat/fatcat/matching.py @@ -15,16 +15,17 @@ Match methods return candidates, verify methods return a match status. Candidate generation will use external data from search and hence is expensive. Verification is fast. """ -from typing import List, Optional, Union, Set +from typing import List, Optional, Set, Union import elasticsearch from fatcat_openapi_client import (ApiException, ContainerEntity, DefaultApi, ReleaseEntity, ReleaseExtIds, WorkEntity) from fatcat_openapi_client.api.default_api import DefaultApi -from fuzzycat.fatcat.common import MatchStatus, response_to_entity_list, compare_ext_ids -from fuzzycat.serials import serialsdb from fuzzycat import cleanups +from fuzzycat.fatcat.common import (MatchStatus, compare_ext_ids, response_to_entity_list) +from fuzzycat.serials import serialsdb + def match_container_fuzzy(container: ContainerEntity, size: int = 5, @@ -198,8 +199,7 @@ def verify_serial_name(a: str, b: str) -> MatchStatus: Serial name verification. Serial names are a subset of container names. There are about 2M serials. """ - - def verify(a : Set[str], b : Set[str]) -> MatchStatus: + def verify(a: Set[str], b: Set[str]) -> MatchStatus: # If any name yields multiple ISSN-L, we cannot decide. if len(a) > 1: diff --git a/notes/Workflow.md b/notes/Workflow.md new file mode 100644 index 0000000..abf0d76 --- /dev/null +++ b/notes/Workflow.md @@ -0,0 +1,54 @@ +# Workflow + +Separate problem in half, first find clusters, then examine clusters (as +proposed). 
+
+As for performance, the feature needs to be calculated in one pass; the
+grouping then reduces to a sort in a second pass.
+
+The output could be a TSV file, with the method followed by the release
+identifiers.
+
+```
+rawt o3utonw5qzhddo7l4lmwptgeey nnpmnwln7be2zb5hd2qanq3r7q
+```
+
+Or JSON lines for a bit more structure.
+
+```
+{"m": "rawt", "c": ["o3utonw5qzhddo7l4lmwptgeey", "nnpmnwln7be2zb5hd2qanq3r7q"]}
+```
+
+```
+$ zstdcat -T0 release_export_expanded.json.zst | fuzzycat-cluster -t title > clusters.json
+```
+
+### Performance considerations
+
+* [orjson](https://github.com/ijl/orjson), [pysimdjson](https://github.com/TkTech/pysimdjson)
+
+## Examining clusters
+
+There will be various methods for examining clusters as well.
+
+We need to fetch releases by identifier; this can be the full record or some
+partial record that has been cached somewhere.
+
+The input is then a list of releases, and the output would be an equally
+sized or smaller cluster of releases which we assume represent the same
+record.
+
+Apart from that, there may be other relations, e.g. records that are not
+exactly the same but closely related, like releases that differ mostly in
+the year.
diff --git a/setup.py b/setup.py
@@ -25,6 +25,7 @@ with open("README.md", "r") as fh:
     entry_points={"console_scripts": [
         "fuzzycat=fuzzycat.fatcat.main:main",
         "fuzzycat-issn=fuzzycat.issn:main",
+        "fuzzycat-cluster=fuzzycat.cluster:main",
     ],},
     install_requires=[
         "fatcat-openapi-client",