author | Martin Czygan <martin.czygan@gmail.com> | 2021-09-24 13:58:51 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-09-24 13:58:51 +0200 |
commit | 478d7d06ad9e56145cb94f3461c355b1ba9eb491 (patch) | |
tree | fa467290e8c8df41a1e97a6de751d0f7e790c9de | |
parent | 86cc3191ce03042ef4a0c6c8a44f4094a140b802 (diff) | |
download | fuzzycat-478d7d06ad9e56145cb94f3461c355b1ba9eb491.tar.gz fuzzycat-478d7d06ad9e56145cb94f3461c355b1ba9eb491.zip |
start larger refactoring: remove cluster
background: verifying hundreds of millions of documents turned out to be
slow; anecdata: running clustering and verification over 1.8B inputs
took over 50h; cf. the Go port (skate), which needed about 2-4h for the
same operations. Also: with Go we do not need the extra GNU parallel
wrapping.
In any case, we aim for the fuzzycat refactoring to provide:
* better, more configurable verification and small-scale matching
* removal of the batch clustering code (and improved refcat docs)
* a place for more generic, similarity-based utilities
The most important piece in fuzzycat is a CSV file containing
hand-picked test examples for verification - and the code that is able
to fulfill that test suite. We want to make this part more robust (see
the sketch below).
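A minimal sketch of what driving that suite could look like. The CSV layout (`ident_a`, `ident_b`, `expected_status` columns) and the `fetch_release` helper are illustrative assumptions; only `fuzzycat.verify.verify` and its `Verify(status, reason)` result come from this codebase.

```python
# Sketch only: the CSV columns and fetch_release are assumed, not part
# of this commit; verify() and its Verify(status, reason) result are real.
import csv

from fuzzycat.verify import verify


def run_suite(csv_path, fetch_release):
    """fetch_release: caller-supplied ident -> release dict (hypothetical)."""
    mismatches = []
    with open(csv_path) as f:
        for row in csv.DictReader(f):
            result = verify(fetch_release(row["ident_a"]),
                            fetch_release(row["ident_b"]))
            # how Status stringifies is an implementation detail; adjust
            # the comparison to the actual enum/str representation
            if str(result.status) != row["expected_status"]:
                mismatches.append((row, result.status, result.reason))
    return mismatches
```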
-rw-r--r-- | fuzzycat/__main__.py | 55
-rw-r--r-- | fuzzycat/cluster.py | 454
-rw-r--r-- | fuzzycat/matching.py | 2
-rw-r--r-- | fuzzycat/sandcrawler.py | 158
-rw-r--r-- | fuzzycat/utils.py | 15
-rw-r--r-- | fuzzycat/verify.py | 16
-rw-r--r-- | tests/test_cluster.py | 189
-rw-r--r-- | tests/test_matching.py | 6
-rw-r--r-- | tests/test_utils.py | 16
9 files changed, 188 insertions, 723 deletions
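The batch clustering removed below boils down to a key-sort-group pattern. A compact, in-memory sketch of that pattern, assuming a simplified stand-in key function (the removed `Cluster` class does the same via `sort(1)` and temporary files to scale past memory):

```python
# In-memory sketch of the removed clustering pattern: derive a key per
# record, sort by key, group with itertools.groupby. The key function
# is a simplified stand-in for release_key_title_normalized.
import itertools
import json
import re


def title_key(doc):
    return re.sub(r"[\W_]+", "", doc["title"].lower())


lines = [
    '{"ident": "1", "title": "Hello, World"}',
    '{"ident": "2", "title": "hello world!"}',
    '{"ident": "3", "title": "other"}',
]

docs = sorted((json.loads(line) for line in lines), key=title_key)
clusters = [{"k": k, "v": list(group)}
            for k, group in itertools.groupby(docs, key=title_key)]
# keep clusters with at least two members (min_cluster_size=2)
print([c for c in clusters if len(c["v"]) >= 2])
```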
diff --git a/fuzzycat/__main__.py b/fuzzycat/__main__.py index 10c856d..7792df6 100644 --- a/fuzzycat/__main__.py +++ b/fuzzycat/__main__.py @@ -3,23 +3,14 @@ COMMANDS - cluster verify verify_single verify_ref release_match unstructured - Run, e.g. fuzzycat cluster --help for more options. - EXAMPLES - Clustering with GNU parallel. - - $ zstdcat -T0 release_export_expanded.json.zst | - parallel --tmpdir /fast/tmp --roundrobin --pipe -j 4 | - python -m fuzzycat.main cluster --tmpdir /fast/tmp -t tnorm > clusters.jsonl - Bulk verification. $ zstdcat -T0 cluster_tsandcrawler.json.zst | @@ -67,9 +58,6 @@ import elasticsearch import requests from fatcat_openapi_client import ReleaseEntity -from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_ngram, - release_key_title_normalized, release_key_title_nysiis, - release_key_title_sandcrawler) from fuzzycat.entities import entity_to_dict from fuzzycat.grobid_unstructured import grobid_parse_unstructured from fuzzycat.matching import anything_to_entity, match_release_fuzzy @@ -82,32 +70,6 @@ logging.getLogger("requests").setLevel(logging.WARNING) logging.getLogger("urllib3").setLevel(logging.WARNING) -def run_cluster(args): - """ - Run clustering over release entities from database dump. - """ - logger = logging.getLogger('main.run_cluster') - types = { - 'title': release_key_title, - 'tnorm': release_key_title_normalized, - 'tnysi': release_key_title_nysiis, - 'tss': release_key_title_ngram, - 'tsandcrawler': release_key_title_sandcrawler, - } - key_denylist = None - if args.key_denylist: - with open(args.key_denylist, 'r') as f: - key_denylist = [l.strip() for l in f.readlines()] - cluster = Cluster(iterable=fileinput.input(args.files), - key=types.get(args.type), - tmpdir=args.tmpdir, - compress=args.compress, - key_denylist=key_denylist, - prefix=args.prefix) - cluster.run() - logger.debug(json.dumps(dict(cluster.counter))) - - def run_verify(args): """ Run match verification over dataset from clustering step. @@ -253,23 +215,6 @@ if __name__ == '__main__': parser.add_argument("-v", "--verbose", help="be verbose", action='store_true') subparsers = parser.add_subparsers() - sub_cluster = subparsers.add_parser('cluster', help='group entities', parents=[parser]) - sub_cluster.set_defaults(func=run_cluster) - sub_cluster.add_argument('-C', - '--compress', - action="store_true", - help='compress intermediate results') - sub_cluster.add_argument('-f', '--files', default="-", help='input files') - sub_cluster.add_argument('--key-denylist', help='file path to key denylist') - sub_cluster.add_argument('--min-cluster-size', - default=2, - type=int, - help='ignore smaller clusters') - sub_cluster.add_argument('-t', - '--type', - default='title', - help='cluster algorithm: title, tnorm, tnysi, tss, tsandcrawler') - sub_verify = subparsers.add_parser('verify', help='verify groups', parents=[parser]) sub_verify.add_argument('-f', '--files', default="-", help='input files') sub_verify.add_argument('--max-cluster-size', diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py deleted file mode 100644 index 7994be7..0000000 --- a/fuzzycat/cluster.py +++ /dev/null @@ -1,454 +0,0 @@ -# pylint: disable=C0103 -""" -Clustering stage. - -* [x] verify needs whole document -* [ ] parallelization misses groups -* [ ] cached match key store (tsv, sqlite3), something ~/.cache/... 
-* [x] reproducibly run tests -* [x] place for put md/tsv record tests - ----- - -* [x] hadoop -> py (bn) -* [ ] gnu parallel, share command line -- note (bn) - ----- - -Ideas: - -* lookup potential matches; TSV [key, ...]; sort -* maybe new "schema" - size vs "common schema" -- key <TAB> {"bibjson": ...} -* merge-join - -``` -$ python -m fuzzycat keygen -s "algo" < ours | sort -k1,1 > a.tsv -$ python -m fuzzycat keygen -s "algo" < other | sort -k1,1 > b.tsv -$ merge-join a.tsv b.tsv -``` - -A couple of "keygen" algos. - -> 10k/s, 1B, ~day - -Partial fields should be ok. - -Q: - -* nysiis - -Deps. - -* pydantic; json "omitempty" -- get rid of it? -* orjson (serialize datetime) -- maybe enough w/ dataclasses w/ dataclasses - -fuzzycat.main -> `__main__.py` - -* elasticsearch-py >> elasticsearch - -Matching releases to non-release entities. - ----- - -Features and integration. - -* work grouping at import time; random pdfs; requires strong verification (vs refcat) -* email out to OCI - -""" - -import collections -import itertools -import json -import multiprocessing -import operator -import os -import re -import subprocess -import sys -import tempfile -import unicodedata -from dataclasses import dataclass -from typing import IO, Any, Callable, Dict, Generator, List, Optional, Tuple - -import jellyfish -import regex -import zstandard - -from fuzzycat.utils import cut, shellout, slugify_string, zstdlines - -__all__ = [ - "release_key_title", - "release_key_title_normalized", - "release_key_title_nysiis", - "release_key_title_sandcrawler", - "Cluster", -] - - -@dataclass -class KeyDoc: - """ - A document from which we can derive a key, e.g. a release entity. - """ - ident: str - title: str - - -get_ident_title = operator.itemgetter("ident", "title") -ws_replacer = str.maketrans({"\t": " ", "\n": " "}) -non_word_re = re.compile(r'[\W_]+', re.UNICODE) - -# Notes: untie from release_entity, as we are only using a few fields. Maybe -# it's a jsob blob, with a pydantic spec and schema. - - -def release_key_title(doc: KeyDoc) -> Tuple[str, str]: - ident, title = get_ident_title(doc) - if not title: - raise ValueError('title missing for {}'.format(ident)) - title = title.translate(ws_replacer).strip() - return (ident, title) - - -def release_key_title_normalized(doc: KeyDoc) -> Tuple[str, str]: - ident, title = release_key_title(doc) - title = re.sub(r'[ ]{2,}', ' ', title).lower() - return (ident, non_word_re.sub('', title)) - - -def release_key_title_nysiis(doc: KeyDoc) -> Tuple[str, str]: - """ - Use NYSIIS New York State Identification and Intelligence System. 
- """ - ident, title = release_key_title(doc) - return (ident, jellyfish.nysiis(title)) - - -# from http://zderadicka.eu/removing-diacritics-marks-from-strings/ -SANDCRAWLER_CHAR_MAP = { - '\N{Latin capital letter AE}': 'AE', - '\N{Latin small letter ae}': 'ae', - '\N{Latin capital letter Eth}': 'D', - '\N{Latin small letter eth}': 'd', - '\N{Latin capital letter O with stroke}': 'O', - '\N{Latin small letter o with stroke}': 'o', - '\N{Latin capital letter Thorn}': 'Th', - '\N{Latin small letter thorn}': 'th', - '\N{Latin small letter sharp s}': 's', - '\N{Latin capital letter D with stroke}': 'D', - '\N{Latin small letter d with stroke}': 'd', - '\N{Latin capital letter H with stroke}': 'H', - '\N{Latin small letter h with stroke}': 'h', - '\N{Latin small letter dotless i}': 'i', - '\N{Latin small letter kra}': 'k', - '\N{Latin capital letter L with stroke}': 'L', - '\N{Latin small letter l with stroke}': 'l', - '\N{Latin capital letter Eng}': 'N', - '\N{Latin small letter eng}': 'n', - '\N{Latin capital ligature OE}': 'Oe', - '\N{Latin small ligature oe}': 'oe', - '\N{Latin capital letter T with stroke}': 'T', - '\N{Latin small letter t with stroke}': 't', - - # bnewbold additions; mostly Latin-ish OCR ambiguous - '\N{MICRO SIGN}': 'u', - '\N{LATIN SMALL LETTER C}': 'c', - '\N{LATIN SMALL LETTER F WITH HOOK}': 'f', - '\N{Greek Small Letter Alpha}': 'a', - '\N{Greek Small Letter Beta}': 'b', - '\N{Greek Small Letter Iota}': 'i', - '\N{Greek Small Letter Kappa}': 'k', - '\N{Greek Small Letter Chi}': 'x', - '\N{Greek Small Letter Upsilon}': 'u', - '\N{Greek Small Letter Nu}': 'v', - '\N{Greek Small Letter Gamma}': 'y', - '\N{Greek Small Letter Tau}': 't', - '\N{Greek Small Letter Omicron}': 'o', - # bnewbold map-to-null (for non-printing stuff not in the regex) - '\N{PARTIAL DIFFERENTIAL}': '', - '\N{LATIN LETTER INVERTED GLOTTAL STOP}': '', - '\N{N-ARY SUMMATION}': '', - '\N{N-ARY PRODUCT}': '', - '\N{MODIFIER LETTER CIRCUMFLEX ACCENT}': '', - '\N{SNOWMAN}': '', - '\N{CARON}': '', -} - -SANDCRAWLER_PREFIX_REMOVE = [ - "original article: ", - "original article ", - "article: ", - "title: ", -] - -# regex that matches all characters which should be removed -SANDCRAWLER_REMOVE_CHAR_REGEX = regex.compile( - r"[\s\p{Punctuation}\p{M}\p{InCombiningDiacriticalMarks}\u2000-\u206F\u2E00-\u2E7F’·“”‘’“”«»「」¿–±§_`°ʖ©®¤=<>|+$^~≈√∫≤≥÷ƒ∆¬£¢∞¥◊€]" -) - - -def sandcrawler_slugify(raw: str) -> str: - """ - Python re-implementation of sandcrawler Scala code for string comparison - ("scorable" strings) - """ - slug = raw.strip().lower() - - # transforms before running regex - for prefix in SANDCRAWLER_PREFIX_REMOVE: - if slug.startswith(prefix): - slug = slug[:len(prefix)] - - slug = slug.replace("'", "'") - - # iterate over all chars and replace from map, if in map; then lower-case again - slug = ''.join([SANDCRAWLER_CHAR_MAP.get(c, c) for c in slug]).lower() - - # early bailout before executing regex - if not slug: - return "" - - slug = unicodedata.normalize('NFKD', slug) - slug = SANDCRAWLER_REMOVE_CHAR_REGEX.sub('', slug) - - return slug.lower() - - -def test_sandcrawler_slugify() -> None: - test_cases = [ - ("", ""), - ("asdf", "asdf"), - ("'Hello World!'", "helloworld"), - ("ASDF", "asdf"), - ("as\n df", "asdf"), - ("as\u0142 bb \u00f8", "aslbbo"), - ("`hello¿", "hello"), - ("علمية", "علمية"), - ("期刊的数字", "期刊的数字"), - ("les pré-impressions explorées à partir", "lespreimpressionsexploreesapartir"), - ("γ-Globulin", "yglobulin"), - - # "MICRO SIGN" - ("\xb5meter", "umeter"), - # "GREEK SMALL LETTER MU" - 
("\u03bcmeter", "\u03bcmeter"), - - # TODO: ("salt ∧ pepper", "saltpepper"), - # TODO: ("new <b>and</b> improved", "newandimproved"), - - # some via https://github.com/minimaxir/big-list-of-naughty-strings/blob/master/blns.txt - ("-9223372036854775808/-1", "92233720368547758081"), - (r",./;'[]\-= <>?:\"{}|_+ !@#$%^&*()`~", ""), - (" \n\r \x85 \u1680\u2002\u2003\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u202f\u205f\u3000", - ""), - (r"Ω≈ç√∫˜≤≥÷", "ωc"), - (r"åß∂ƒ©˙∆˚¬…æ", "asfae"), - (r"œ∑´®†¥¨ˆøπ“‘", "oeoπ"), - (r"¡™£¢∞§¶•ªº–≠ ", "tmao"), - (r"¸˛Ç◊ı˜Â¯˘¿", "cia"), - (r"ÅÍÎÏ˝ÓÔÒÚÆ☃", "aiiiooouae"), - (r"Œ„´‰ˇÁ¨ˆØ∏”’", "oeao"), - (r"`⁄€‹›fifl‡°·‚—±", "fifl"), - (r"ЁЂЃЄЅІЇЈЉЊЋЌЍЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя", - "еђгєѕііјљњћкиуџабвгдежзииклмнопрстуфхцчшщъыьэюяабвгдежзииклмнопрстуфхцчшщъыьэюя"), - (r"⁰⁴⁵₀₁₂", "045012"), - (r"社會科學院語學研究所", "社會科學院語學研究所"), - # TODO: ("パーティーへ行かないか", "パーティーへ行かないか"), - # TODO: ("表ポあA鷗ŒéB逍Üߪąñ丂㐀𠀀", "表ポあa鷗oeebB逍usaan丂㐀𠀀"), - (r"( ͡° ͜ʖ ͡°)", ""), - # emoji ok? I guess - (r"👾 🙇 💁 🙅 🙆 🙋 🙎 🙍", "👾🙇💁🙅🙆🙋🙎🙍"), - (r"2️⃣ 3️⃣ 4️⃣ 5️⃣", "2345"), - (r"﷽ ", "﷽"), - (r"\"̗̺͖̹̯͓Ṯ̤͍̥͇͈h̲́e͏͓̼̗̙̼̣͔ ͇̜̱̠͓͍ͅN͕͠e̗̱z̘̝̜̺͙p̤̺̹͍̯͚e̠̻̠͜r̨̤͍̺̖͔̖̖d̠̟̭̬̝͟i̦͖̩͓͔̤a̠̗̬͉̙n͚͜ ̻̞̰͚ͅh̵͉i̳̞v̢͇ḙ͎͟-҉̭̩̼͔m̤̭̫i͕͇̝̦n̗͙ḍ̟ ̯̲͕͞ǫ̟̯̰̲͙̻̝f ̪̰̰̗̖̭̘͘c̦͍̲̞͍̩̙ḥ͚a̮͎̟̙͜ơ̩̹͎s̤.̝̝ ҉Z̡̖̜͖̰̣͉̜a͖̰͙̬͡l̲̫̳͍̩g̡̟̼̱͚̞̬ͅo̗͜.̟", - "thenezperdianhivemindofchaoszalgo"), - (r"The quick brown fox jumps over the lazy dog", "thequickbrownfoxjumpsoverthelazydog"), - (r"The quick brown fox jumps over the lazy dog", "thequickbrownfoxjumpsoverthelazydog"), - (r"𝕋𝕙𝕖 𝕢𝕦𝕚𝕔𝕜 𝕓𝕣𝕠𝕨𝕟 𝕗𝕠𝕩 𝕛𝕦𝕞𝕡𝕤 𝕠𝕧𝕖𝕣 𝕥𝕙𝕖 𝕝𝕒𝕫𝕪 𝕕𝕠𝕘 ", "thequickbrownfoxjumpsoverthelazydog"), - ] - - for in_str, out_str in test_cases: - if sandcrawler_slugify(in_str) != out_str: - for c in list(sandcrawler_slugify(in_str)): - try: - print(unicodedata.name(c)) - except ValueError: - print(ord(c)) - #print(ord(c)) - print("----") - for c in list(out_str): - print(unicodedata.name(c)) - print(in_str) - assert sandcrawler_slugify(in_str) == out_str - - -def release_key_title_sandcrawler(doc: KeyDoc) -> Tuple[str, str]: - ident, title = release_key_title(doc) - slug = sandcrawler_slugify(title) - return (ident, slug) - - -def release_key_title_ngram(doc: KeyDoc, n=3) -> Tuple[str, str]: - """ - Derive a key from title. - - Tokenize title, remote stopwords, lookup first three, lookup last three, - plus authors. TODO(miku): authors. - """ - ident, title = get_ident_title(doc) - slug_title = slugify_string(title) - tokens = slug_title.split() - if len(tokens) < 2 * n: - key = ''.join(tokens) - else: - key = ''.join(tokens[:3] + tokens[-3:]) - return (ident, key) - - -class Cluster: - """ - Setup and run clustering over a potentially large (100m) number of records. - - Two main options are iterable (TODO: work on parsed docs), and the key - function to apply to value to group by. - - TODO: We want compression. 
- """ - def __init__(self, - iterable: collections.abc.Iterable, - key: Callable[[Any], Tuple[str, str]], - output: IO[str] = sys.stdout, - key_denylist: Optional[List[str]] = None, - prefix: str = "fuzzycat-", - tmpdir: str = tempfile.gettempdir(), - strict: bool = False, - min_cluster_size: int = 2, - max_cluster_size: int = 100, - compress=False, - verbose=True): - self.iterable: collections.abc.Iterable = iterable - self.key: Callable[[Any], Tuple[str, str]] = key - self.output: IO[str] = output - self.prefix: str = prefix - self.tmpdir: str = tmpdir - self.strict = strict - self.key_denylist = key_denylist - self.min_cluster_size = min_cluster_size - self.max_cluster_size = max_cluster_size - self.verbose = verbose - self.compress = compress - self.counter: Dict[str, int] = collections.Counter({ - "key_fail": 0, - "key_ok": 0, - "key_empty": 0, - "key_denylist": 0, - "num_clusters": 0, - }) - - def run(self): - """ - First map documents to keys, then group by keys, outline: json -> tsv - -> sort -> group -> json. - """ - with tempfile.NamedTemporaryFile(delete=False, mode="wb", prefix=self.prefix) as tf: - if self.compress: - zc = zstandard.ZstdCompressor(level=9, threads=multiprocessing.cpu_count()) - writer = zc.stream_writer(tf) - else: - writer = tf - for i, line in enumerate(self.iterable): - if self.verbose and i % 100000 == 0: - print("@{}".format(i), file=sys.stderr) - try: - doc = json.loads(line) - id, key = self.key(doc) - except (KeyError, ValueError): - if self.strict: - raise - self.counter["key_fail"] += 1 - continue - if not key: - self.counter["key_empty"] += 1 - continue - if self.key_denylist and key in self.key_denylist: - self.counter["key_denylist"] += 1 - continue - self.counter["key_ok"] += 1 - # XXX: if the line itself contains tabs, we need to remove - # them here; maybe offer TSV and JSON output and extra flag - # XXX: this needs to be compressed (e.g. with 2B records, we - # fill up disk too quickly) - data = bytes("{}\t{}\t{}\n".format(id, key, - line.replace("\t", " ").strip()), - encoding="utf-8") - writer.write(data) - if self.compress: - writer.flush(zstandard.FLUSH_FRAME) - - sf = self.sort(tf.name, opts='-k 2') - if self.compress: - f = zstdlines(sf) - else: - f = open(sf) - - for doc in self.group_by(f, key=cut(f=1)): - if len(doc["v"]) < self.min_cluster_size: - continue - self.counter["num_clusters"] += 1 - json.dump(doc, self.output) - self.output.write("\n") - - os.remove(sf) - os.remove(tf.name) - return self.counter - - def sort(self, filename: str, opts: str = "-k 2", fast: bool = True, mode: str = "w"): - """ - Sort tabular file with sort(1), returns the filename of the sorted - file. Options to sort can be passed in via opts keyword argument. - """ - with tempfile.NamedTemporaryFile(delete=False, mode=mode, prefix=self.prefix) as tf: - env = os.environ.copy() - env["TMPDIR"] = self.tmpdir - if fast: - env["LC_ALL"] = "C" - if self.compress: - output = shellout( - "zstdcat -T0 {input} | LC_ALL=C TMPDIR={tmpdir} sort {opts} | zstd -T0 -c9 > {output}", - input=filename, - tmpdir=self.tmpdir, - opts=opts) - else: - subprocess.run(["sort"] + opts.split() + [filename], stdout=tf, env=env, check=True) - output = tf.name - - return output - - def group_by(self, - seq: collections.abc.Iterable, - key: Callable[[Any], str] = None) -> Generator[Any, None, None]: - """ - Extract a key from elements of an iterable and group them. Just as - uniq(1), the input iterable must be ordered (by the key that is - extracted) for this to work. 
- - There might be large clusters, which would currently exceed memory, - hence the max_cluster_size option. - """ - for k, g in itertools.groupby(seq, key=key): - payload = [] - for i, line in enumerate(g): - if i > 0 and i == self.max_cluster_size: - print('max cluster size cut off for: {}'.format(k), file=sys.stderr) - break - # XXX: This is a bit too much "serde", get rid of this. - fields = line.split("\t") - if len(fields) < 3: - continue - payload.append(json.loads(fields[2])) - doc = { - "k": k.strip(), - "v": payload, - } - yield doc diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py index 310dfc2..bcda46d 100644 --- a/fuzzycat/matching.py +++ b/fuzzycat/matching.py @@ -73,7 +73,6 @@ def match_release_fuzzy( if r: return [r] - if release.title is not None and release.contribs is not None: names = " ".join([c.raw_name for c in release.contribs]) body = { @@ -178,7 +177,6 @@ def match_release_fuzzy( if es_compat_hits_total(resp) > 0: return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api) - # TODO: perform more queries on other fields. return [] diff --git a/fuzzycat/sandcrawler.py b/fuzzycat/sandcrawler.py new file mode 100644 index 0000000..958756a --- /dev/null +++ b/fuzzycat/sandcrawler.py @@ -0,0 +1,158 @@ +import regex +import unicodedata + +# from http://zderadicka.eu/removing-diacritics-marks-from-strings/ +SANDCRAWLER_CHAR_MAP = { + '\N{Latin capital letter AE}': 'AE', + '\N{Latin small letter ae}': 'ae', + '\N{Latin capital letter Eth}': 'D', + '\N{Latin small letter eth}': 'd', + '\N{Latin capital letter O with stroke}': 'O', + '\N{Latin small letter o with stroke}': 'o', + '\N{Latin capital letter Thorn}': 'Th', + '\N{Latin small letter thorn}': 'th', + '\N{Latin small letter sharp s}': 's', + '\N{Latin capital letter D with stroke}': 'D', + '\N{Latin small letter d with stroke}': 'd', + '\N{Latin capital letter H with stroke}': 'H', + '\N{Latin small letter h with stroke}': 'h', + '\N{Latin small letter dotless i}': 'i', + '\N{Latin small letter kra}': 'k', + '\N{Latin capital letter L with stroke}': 'L', + '\N{Latin small letter l with stroke}': 'l', + '\N{Latin capital letter Eng}': 'N', + '\N{Latin small letter eng}': 'n', + '\N{Latin capital ligature OE}': 'Oe', + '\N{Latin small ligature oe}': 'oe', + '\N{Latin capital letter T with stroke}': 'T', + '\N{Latin small letter t with stroke}': 't', + + # bnewbold additions; mostly Latin-ish OCR ambiguous + '\N{MICRO SIGN}': 'u', + '\N{LATIN SMALL LETTER C}': 'c', + '\N{LATIN SMALL LETTER F WITH HOOK}': 'f', + '\N{Greek Small Letter Alpha}': 'a', + '\N{Greek Small Letter Beta}': 'b', + '\N{Greek Small Letter Iota}': 'i', + '\N{Greek Small Letter Kappa}': 'k', + '\N{Greek Small Letter Chi}': 'x', + '\N{Greek Small Letter Upsilon}': 'u', + '\N{Greek Small Letter Nu}': 'v', + '\N{Greek Small Letter Gamma}': 'y', + '\N{Greek Small Letter Tau}': 't', + '\N{Greek Small Letter Omicron}': 'o', + # bnewbold map-to-null (for non-printing stuff not in the regex) + '\N{PARTIAL DIFFERENTIAL}': '', + '\N{LATIN LETTER INVERTED GLOTTAL STOP}': '', + '\N{N-ARY SUMMATION}': '', + '\N{N-ARY PRODUCT}': '', + '\N{MODIFIER LETTER CIRCUMFLEX ACCENT}': '', + '\N{SNOWMAN}': '', + '\N{CARON}': '', +} + +SANDCRAWLER_PREFIX_REMOVE = [ + "original article: ", + "original article ", + "article: ", + "title: ", +] + +# regex that matches all characters which should be removed +SANDCRAWLER_REMOVE_CHAR_REGEX = regex.compile( + 
r"[\s\p{Punctuation}\p{M}\p{InCombiningDiacriticalMarks}\u2000-\u206F\u2E00-\u2E7F’·“”‘’“”«»「」¿–±§_`°ʖ©®¤=<>|+$^~≈√∫≤≥÷ƒ∆¬£¢∞¥◊€]" +) + +def sandcrawler_slugify(raw: str) -> str: + """ + Python re-implementation of sandcrawler Scala code for string comparison + ("scorable" strings) + """ + slug = raw.strip().lower() + + # transforms before running regex + for prefix in SANDCRAWLER_PREFIX_REMOVE: + if slug.startswith(prefix): + slug = slug[:len(prefix)] + + slug = slug.replace("'", "'") + + # iterate over all chars and replace from map, if in map; then lower-case again + slug = ''.join([SANDCRAWLER_CHAR_MAP.get(c, c) for c in slug]).lower() + + # early bailout before executing regex + if not slug: + return "" + + slug = unicodedata.normalize('NFKD', slug) + slug = SANDCRAWLER_REMOVE_CHAR_REGEX.sub('', slug) + + return slug.lower() + + +def test_sandcrawler_slugify() -> None: + test_cases = [ + ("", ""), + ("asdf", "asdf"), + ("'Hello World!'", "helloworld"), + ("ASDF", "asdf"), + ("as\n df", "asdf"), + ("as\u0142 bb \u00f8", "aslbbo"), + ("`hello¿", "hello"), + ("علمية", "علمية"), + ("期刊的数字", "期刊的数字"), + ("les pré-impressions explorées à partir", "lespreimpressionsexploreesapartir"), + ("γ-Globulin", "yglobulin"), + + # "MICRO SIGN" + ("\xb5meter", "umeter"), + # "GREEK SMALL LETTER MU" + ("\u03bcmeter", "\u03bcmeter"), + + # TODO: ("salt ∧ pepper", "saltpepper"), + # TODO: ("new <b>and</b> improved", "newandimproved"), + + # some via https://github.com/minimaxir/big-list-of-naughty-strings/blob/master/blns.txt + ("-9223372036854775808/-1", "92233720368547758081"), + (r",./;'[]\-= <>?:\"{}|_+ !@#$%^&*()`~", ""), + (" \n\r \x85 \u1680\u2002\u2003\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u202f\u205f\u3000", + ""), + (r"Ω≈ç√∫˜≤≥÷", "ωc"), + (r"åß∂ƒ©˙∆˚¬…æ", "asfae"), + (r"œ∑´®†¥¨ˆøπ“‘", "oeoπ"), + (r"¡™£¢∞§¶•ªº–≠ ", "tmao"), + (r"¸˛Ç◊ı˜Â¯˘¿", "cia"), + (r"ÅÍÎÏ˝ÓÔÒÚÆ☃", "aiiiooouae"), + (r"Œ„´‰ˇÁ¨ˆØ∏”’", "oeao"), + (r"`⁄€‹›fifl‡°·‚—±", "fifl"), + (r"ЁЂЃЄЅІЇЈЉЊЋЌЍЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя", + "еђгєѕііјљњћкиуџабвгдежзииклмнопрстуфхцчшщъыьэюяабвгдежзииклмнопрстуфхцчшщъыьэюя"), + (r"⁰⁴⁵₀₁₂", "045012"), + (r"社會科學院語學研究所", "社會科學院語學研究所"), + # TODO: ("パーティーへ行かないか", "パーティーへ行かないか"), + # TODO: ("表ポあA鷗ŒéB逍Üߪąñ丂㐀𠀀", "表ポあa鷗oeebB逍usaan丂㐀𠀀"), + (r"( ͡° ͜ʖ ͡°)", ""), + # emoji ok? 
I guess + (r"👾 🙇 💁 🙅 🙆 🙋 🙎 🙍", "👾🙇💁🙅🙆🙋🙎🙍"), + (r"2️⃣ 3️⃣ 4️⃣ 5️⃣", "2345"), + (r"﷽ ", "﷽"), + (r"\"̗̺͖̹̯͓Ṯ̤͍̥͇͈h̲́e͏͓̼̗̙̼̣͔ ͇̜̱̠͓͍ͅN͕͠e̗̱z̘̝̜̺͙p̤̺̹͍̯͚e̠̻̠͜r̨̤͍̺̖͔̖̖d̠̟̭̬̝͟i̦͖̩͓͔̤a̠̗̬͉̙n͚͜ ̻̞̰͚ͅh̵͉i̳̞v̢͇ḙ͎͟-҉̭̩̼͔m̤̭̫i͕͇̝̦n̗͙ḍ̟ ̯̲͕͞ǫ̟̯̰̲͙̻̝f ̪̰̰̗̖̭̘͘c̦͍̲̞͍̩̙ḥ͚a̮͎̟̙͜ơ̩̹͎s̤.̝̝ ҉Z̡̖̜͖̰̣͉̜a͖̰͙̬͡l̲̫̳͍̩g̡̟̼̱͚̞̬ͅo̗͜.̟", + "thenezperdianhivemindofchaoszalgo"), + (r"The quick brown fox jumps over the lazy dog", "thequickbrownfoxjumpsoverthelazydog"), + (r"The quick brown fox jumps over the lazy dog", "thequickbrownfoxjumpsoverthelazydog"), + (r"𝕋𝕙𝕖 𝕢𝕦𝕚𝕔𝕜 𝕓𝕣𝕠𝕨𝕟 𝕗𝕠𝕩 𝕛𝕦𝕞𝕡𝕤 𝕠𝕧𝕖𝕣 𝕥𝕙𝕖 𝕝𝕒𝕫𝕪 𝕕𝕠𝕘 ", "thequickbrownfoxjumpsoverthelazydog"), + ] + + for in_str, out_str in test_cases: + if sandcrawler_slugify(in_str) != out_str: + for c in list(sandcrawler_slugify(in_str)): + try: + print(unicodedata.name(c)) + except ValueError: + print(ord(c)) + print("----") + for c in list(out_str): + print(unicodedata.name(c)) + print(in_str) + assert sandcrawler_slugify(in_str) == out_str + diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py index 303daf6..24e103a 100644 --- a/fuzzycat/utils.py +++ b/fuzzycat/utils.py @@ -26,12 +26,12 @@ def es_compat_hits_total(resp): https://www.elastic.co/guide/en/elasticsearch/reference/current/breaking-changes-7.0.html It is responsibility of the call site to set `track_total_hits` in ES7 to - get an exact number. + get an exact number (https://www.elastic.co/guide/en/elasticsearch/reference/master/search-your-data.html#track-total-hits). """ try: - return resp["hits"]["total"]["value"] + return resp["hits"]["total"]["value"] # ES7 except TypeError: - return resp["hits"]["total"] + return resp["hits"]["total"] # ES6 def parse_page_string(s): @@ -44,6 +44,8 @@ def parse_page_string(s): Does not handle lists of page numbers, roman numerals, and several other patterns. + + Returns a named tuple with start, end and count fields. """ if not s: raise ValueError('page parsing: empty string') @@ -69,7 +71,7 @@ def parse_page_string(s): return ParsedPages(start=a, end=b, count=count) -def dict_key_exists(doc, path): +def dict_has_key(doc, path): """ Return true, if key in a dictionary at a given path exists. XXX: probably already in glom. @@ -101,7 +103,10 @@ def doi_prefix(v): """ Return the prefix of a DOI. 
""" - return v.split("/")[0] + parts = v.split("/") + if len(parts) == 1: + raise ValueError("invalid doi: {}".format(v)) + return parts[0] def has_doi_prefix(v, prefix="10.1234"): diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index 5b90c47..9eb808b 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -91,7 +91,7 @@ from fuzzycat.data import (CONTAINER_NAME_BLACKLIST, PUBLISHER_BLACKLIST, TITLE_ TITLE_FRAGMENT_BLACKLIST) from fuzzycat.entities import entity_to_dict from fuzzycat.utils import (author_similarity_score, clean_doi, contains_chemical_formula, - dict_key_exists, doi_prefix, has_doi_prefix, jaccard, num_project, + dict_has_key, doi_prefix, has_doi_prefix, jaccard, num_project, parse_page_string, slugify_string) Verify = collections.namedtuple("Verify", "status reason") @@ -233,10 +233,10 @@ def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]: if has_doi_prefix(a_doi, "10.3403") and has_doi_prefix(b_doi, "10.3403"): if a_doi + "u" == b_doi or b_doi + "u" == a_doi: return Verify(Status.STRONG, Reason.CUSTOM_BSI_UNDATED) - if a_title == b_title and ((dict_key_exists(a, "extra.subtitle") - and not dict_key_exists(b, "extra.subtitle")) or - (dict_key_exists(b, "extra.subtitle") - and not dict_key_exists(a, "extra.subtitle"))): + if a_title == b_title and ((dict_has_key(a, "extra.subtitle") + and not dict_has_key(b, "extra.subtitle")) or + (dict_has_key(b, "extra.subtitle") + and not dict_has_key(a, "extra.subtitle"))): return Verify(Status.STRONG, Reason.CUSTOM_BSI_SUBDOC) except PathAccessError: pass @@ -301,7 +301,7 @@ def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]: # beware: we have versions and "isPartOf", e.g. # https://api.fatcat.wiki/v0/release/ybxygpeypbaq5pfrztu3z2itw4 # Datacite md schema: https://doi.org/10.14454/7xq3-zf69 - if dict_key_exists(a, "extra.datacite") and dict_key_exists(b, "extra.datacite"): + if dict_has_key(a, "extra.datacite") and dict_has_key(b, "extra.datacite"): whitelist = set([ "HasPart", "HasVersion", @@ -511,8 +511,8 @@ def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]: # if any([a_authors, b_authors]) and not (a_authors and b_authors): # Does not cover case, where both authors are empty. if a_release_year == b_release_year and a_title_lower == b_title_lower: - if ((dict_key_exists(a, "ext_ids.pmid") and dict_key_exists(b, "ext_ids.doi")) - or (dict_key_exists(b, "ext_ids.pmid") and dict_key_exists(a, "ext_ids.doi"))): + if ((dict_has_key(a, "ext_ids.pmid") and dict_has_key(b, "ext_ids.doi")) + or (dict_has_key(b, "ext_ids.pmid") and dict_has_key(a, "ext_ids.doi"))): return Verify(Status.STRONG, Reason.PMID_DOI_PAIR) # Two JSTOR items will probably be different. 
diff --git a/tests/test_cluster.py b/tests/test_cluster.py deleted file mode 100644 index 55b349a..0000000 --- a/tests/test_cluster.py +++ /dev/null @@ -1,189 +0,0 @@ -import collections -import io -import json -import os -import tempfile - -import pytest - -from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_normalized, - release_key_title_nysiis) - -Case = collections.namedtuple("Case", 'input output') - - -def test_release_key_title(): - with pytest.raises(KeyError): - release_key_title({}) - with pytest.raises(KeyError, match='title'): - release_key_title({'ident': '123'}) - with pytest.raises(KeyError, match='ident'): - release_key_title({'title': 'deep learning backdoor'}) - with pytest.raises(ValueError, match='title.*missing'): - release_key_title({'ident': '', 'title': ''}) - cases = ( - Case(input={ - 'ident': '', - 'title': 'simhash' - }, output=('', 'simhash')), - Case(input={ - 'ident': '', - 'title': 'Simhash' - }, output=('', 'Simhash')), - Case(input={ - 'ident': '', - 'title': 'Sim hash' - }, output=('', 'Sim hash')), - ) - for case in cases: - assert case.output == release_key_title(case.input) - - -def test_release_key_title_normalized(): - cases = ( - Case(input={ - 'ident': '', - 'title': 'simhash' - }, output=('', 'simhash')), - Case(input={ - 'ident': '', - 'title': 'Simhash' - }, output=('', 'simhash')), - Case(input={ - 'ident': '', - 'title': 'Sim hash' - }, output=('', 'simhash')), - Case(input={ - 'ident': '', - 'title': 'THE year 1929' - }, output=('', 'theyear1929')), - Case(input={ - 'ident': '', - 'title': '2019?' - }, output=('', '2019')), - Case(input={ - 'ident': '123', - 'title': 'H~~2019?' - }, output=('123', 'h2019')), - ) - for case in cases: - assert case.output == release_key_title_normalized(case.input), 'failed case {}'.format( - case.input) - - -def test_release_key_title_nysiis(): - cases = ( - Case(input={ - 'ident': '', - 'title': 'simhash' - }, output=('', 'SANM')), - Case(input={ - 'ident': '', - 'title': 'Simhash' - }, output=('', 'SANM')), - Case(input={ - 'ident': '', - 'title': 'Sim hash' - }, output=('', 'SAN')), - Case(input={ - 'ident': '', - 'title': 'THE year 1929' - }, output=('', 'T')), - Case(input={ - 'ident': '', - 'title': '2019?' - }, output=('', '2019?')), - Case(input={ - 'ident': '123', - 'title': 'H~~2019?' 
- }, output=('123', 'H~2019?')), - Case(input={ - 'ident': '123', - 'title': '世界' - }, output=('123', '世界')), - ) - for case in cases: - assert case.output == release_key_title_nysiis(case.input), 'failed case {}'.format( - case.input) - - -def test_cluster(): - sio = io.StringIO() - lines = [ - json.dumps(doc) for doc in [ - { - "title": "hello world", - "ident": 1, - }, - { - "title": "hello world!", - "ident": 2, - }, - ] - ] - cluster = Cluster(lines, release_key_title_normalized, output=sio) - stats = cluster.run() - assert stats == { - "key_fail": 0, - "key_ok": 2, - "key_empty": 0, - "key_denylist": 0, - "num_clusters": 1 - } - assert json.loads(sio.getvalue()) == { - "k": "helloworld", - "v": [{ - "title": "hello world!", - "ident": 2 - }, { - "title": "hello world", - "ident": 1 - }] - } - - sio = io.StringIO() - cluster = Cluster([ - json.dumps(line) for line in [ - { - "title": "hello world", - "ident": 1 - }, - { - "title": "hello world!", - "ident": 2 - }, - { - "title": "other", - "ident": 3 - }, - ] - ], - release_key_title_normalized, - min_cluster_size=1, - output=sio) - stats = cluster.run() - assert stats == { - "key_fail": 0, - "key_ok": 3, - "key_empty": 0, - "key_denylist": 0, - "num_clusters": 2 - } - assert [json.loads(line) for line in sio.getvalue().split("\n") if line] == [{ - "k": - "helloworld", - "v": [{ - "title": "hello world!", - "ident": 2 - }, { - "title": "hello world", - "ident": 1 - }] - }, { - 'k': - 'other', - 'v': [{ - 'ident': 3, - 'title': 'other' - }] - }] diff --git a/tests/test_matching.py b/tests/test_matching.py index 90d1fee..ad971a5 100644 --- a/tests/test_matching.py +++ b/tests/test_matching.py @@ -81,7 +81,9 @@ def test_match_release_fuzzy(es_client, caplog): }, 2), ({ "title": "", - "contribs": [{"raw_name": "Aristoteles"}], + "contribs": [{ + "raw_name": "Aristoteles" + }], "ext_ids": {} }, 5), # ({ @@ -102,5 +104,5 @@ def test_match_release_fuzzy(es_client, caplog): result = match_release_fuzzy(entity, es=es_client) with caplog.at_level(logging.INFO): logging.info("[{}] given title '{}', found {}, {}".format(i, entity.title, len(result), - [v.title for v in result])) + [v.title for v in result])) assert len(result) == count, doc diff --git a/tests/test_utils.py b/tests/test_utils.py index 957203f..b2242b8 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -2,7 +2,7 @@ import pytest import os from fuzzycat.utils import (author_similarity_score, cut, jaccard, nwise, slugify_string, - token_n_grams, tokenize_string, parse_page_string, dict_key_exists, + token_n_grams, tokenize_string, parse_page_string, dict_has_key, zstdlines, es_compat_hits_total, clean_doi) @@ -67,13 +67,13 @@ def test_nwise(): assert list(nwise([1, 2, 3, 4, 5], n=3)) == [(1, 2, 3), (4, 5)] -def test_dict_key_exists(): - assert dict_key_exists({}, "") is False - assert dict_key_exists({"a": "a"}, "a") == True - assert dict_key_exists({"a": "a"}, "b") == False - assert dict_key_exists({"a": {"b": "c"}}, "a.b") == True - assert dict_key_exists({"a": {"b": None}}, "a.b") == True - assert dict_key_exists({"a": {"b": "c"}}, "a.b.c") == False +def test_dict_has_key(): + assert dict_has_key({}, "") is False + assert dict_has_key({"a": "a"}, "a") == True + assert dict_has_key({"a": "a"}, "b") == False + assert dict_has_key({"a": {"b": "c"}}, "a.b") == True + assert dict_has_key({"a": {"b": None}}, "a.b") == True + assert dict_has_key({"a": {"b": "c"}}, "a.b.c") == False def test_page_page_string(): |
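Finally, a usage sketch for the newly extracted `fuzzycat.sandcrawler.sandcrawler_slugify` as a grouping key for small-scale matching. The grouping loop is illustrative; only the slugify function and the expected `helloworld` key come from this commit's code and tests.

```python
# Group titles by their sandcrawler slug; variants in case, whitespace
# and diacritics land under the same key.
from collections import defaultdict

from fuzzycat.sandcrawler import sandcrawler_slugify

titles = ["'Hello World!'", "hello  world", "Hello W\u00f6rld"]

groups = defaultdict(list)
for title in titles:
    key = sandcrawler_slugify(title)
    if key:  # empty slugs carry no matching signal
        groups[key].append(title)

# all three variants map to the key "helloworld"
print(dict(groups))
```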