2 files changed, 132 insertions, 3 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index 9a8d5db..23aebbb 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -69,15 +69,18 @@ import string
 import subprocess
 import sys
 import tempfile
+import unicodedata
 from typing import Any, Callable, Dict, Generator, List, Optional, Tuple
 
 import fuzzy
+import regex
 from pydantic import BaseModel
 
 __all__ = [
     "release_key_title",
     "release_key_title_normalized",
     "release_key_title_nysiis",
+    "release_key_title_sandcrawler",
     "sort_by_column",
     "group_by",
     "Cluster",
@@ -103,6 +106,7 @@ class KeyDoc(BaseModel):
     title: Optional[str]
     contribs: Optional[List[Contrib]]
 
+
 class ClusterResult(BaseModel):
     """
     Result of clustering.
@@ -155,6 +159,128 @@ def release_key_title_nysiis(doc: KeyDoc) -> Tuple[str, str]:
     return (ident, fuzzy.nysiis(title))
 
 
+# maps single characters to ASCII replacement strings (possibly empty); from http://zderadicka.eu/removing-diacritics-marks-from-strings/
+SANDCRAWLER_CHAR_MAP = {
+    '\N{Latin capital letter AE}': 'AE',
+    '\N{Latin small letter ae}': 'ae',
+    '\N{Latin capital letter Eth}': 'D',
+    '\N{Latin small letter eth}': 'd',
+    '\N{Latin capital letter O with stroke}': 'O',
+    '\N{Latin small letter o with stroke}': 'o',
+    '\N{Latin capital letter Thorn}': 'Th',
+    '\N{Latin small letter thorn}': 'th',
+    '\N{Latin small letter sharp s}': 's',
+    '\N{Latin capital letter D with stroke}': 'D',
+    '\N{Latin small letter d with stroke}': 'd',
+    '\N{Latin capital letter H with stroke}': 'H',
+    '\N{Latin small letter h with stroke}': 'h',
+    '\N{Latin small letter dotless i}': 'i',
+    '\N{Latin small letter kra}': 'k',
+    '\N{Latin capital letter L with stroke}': 'L',
+    '\N{Latin small letter l with stroke}': 'l',
+    '\N{Latin capital letter Eng}': 'N',
+    '\N{Latin small letter eng}': 'n',
+    '\N{Latin capital ligature OE}': 'Oe',
+    '\N{Latin small ligature oe}': 'oe',
+    '\N{Latin capital letter T with stroke}': 'T',
+    '\N{Latin small letter t with stroke}': 't',
+
+    # bnewbold additions
+    'μ': 'u',
+    '\N{LATIN LETTER INVERTED GLOTTAL STOP}': '',  # empty replacement string
+}
+
+SANDCRAWLER_PREFIX_REMOVE = [  # lower-case prefixes tested with startswith() in sandcrawler_slugify()
+    "original article: ",
+    "original article ",
+    "article: ",
+    "title: ",
+]
+
+# regex matching every character to strip: whitespace, punctuation, marks and the symbols listed literally
+SANDCRAWLER_REMOVE_CHAR_REGEX = regex.compile(
+    r"[\s\p{Punct}\p{M}\p{InCombiningDiacriticalMarks}’·“”‘’“”«»「」¿–±§_`°ʖ©®¤]")
+
+
+def sandcrawler_slugify(raw: str) -> str:
+    """
+    Python re-implementation of sandcrawler Scala code for string comparison
+    ("scorable" strings); returns the lower-cased slug, "" for empty input.
+    """
+    slug = raw.strip().lower()
+
+    # drop boilerplate prefixes, keeping the remainder of the title
+    for prefix in SANDCRAWLER_PREFIX_REMOVE:
+        if slug.startswith(prefix):
+            slug = slug[len(prefix):]
+
+    slug = slug.replace("&apos;", "'")
+
+    # replace chars found in the map; dict.get() (unlike and/or) also honors
+    # empty replacements; map values may be upper case, hence lower() at the end
+    slug = ''.join(SANDCRAWLER_CHAR_MAP.get(c, c) for c in slug)
+    # early bailout before executing regex
+    if not slug:
+        return ""
+
+    slug = unicodedata.normalize('NFKD', slug)
+    slug = SANDCRAWLER_REMOVE_CHAR_REGEX.sub('', slug)
+
+    return slug.lower()
+
+
+def test_sandcrawler_slugify() -> None:
+    """Table-driven check of sandcrawler_slugify() against (input, expected) pairs."""
+    test_cases = [
+        ("", ""),
+        ("asdf", "asdf"),
+        ("'Hello World!'", "helloworld"),
+        ("ASDF", "asdf"),
+        ("as\n  df", "asdf"),
+        ("as\u0142  bb \u00f8", "aslbbo"),
+        ("`hello¿", "hello"),
+        ("علمية", "علمية"),
+        ("期刊的数字", "期刊的数字"),
+        ("les pré-impressions explorées à partir", "lespreimpressionsexploreesapartir"),
+        ("μmeter", "umeter"),
+        # TODO: ("salt &and; pepper", "saltpepper"),
+        # TODO: ("new <b>and</b> improved", "newandimproved"),
+        # some via https://github.com/minimaxir/big-list-of-naughty-strings/blob/master/blns.txt
+        ("¡™£¢∞§¶•ªº–≠ ", "tm£¢∞ao="),
+        ("⁰⁴⁵₀₁₂", "045012"),
+        ("社會科學院語學研究所", "社會科學院語學研究所"),
+        # TODO: ("パーティーへ行かないか", "パーティーへ行かないか"),
+        # TODO: ("表ポあA鷗ŒéＢ逍Üßªąñ丂㐀𠀀", "表ポあa鷗oeebＢ逍usaan丂㐀𠀀"),
+        ("( ͡° ͜ʖ ͡°)", ""),
+        # emoji ok? I guess
+        ("👾 🙇 💁 🙅 🙆 🙋 🙎 🙍", "👾🙇💁🙅🙆🙋🙎🙍"),
+        ("2️⃣ 3️⃣ 4️⃣ 5️⃣", "2345"),
+        ("﷽ ", "﷽"),
+        ("̗̺͖̹̯͓Ṯ̤͍̥͇͈h̲́e͏͓̼̗̙̼̣͔ ͇̜̱̠͓͍ͅN͕͠e̗̱z̘̝̜̺͙p̤̺̹͍̯͚e̠̻̠͜r̨̤͍̺̖͔̖̖d̠̟̭̬̝͟i̦͖̩͓͔̤a̠̗̬͉̙n͚͜ ̻̞̰͚ͅh̵͉i̳̞v̢͇ḙ͎͟-҉̭̩̼͔m̤̭̫i͕͇̝̦n̗͙ḍ̟ ̯̲͕͞ǫ̟̯̰̲͙̻̝f ̪̰̰̗̖̭̘͘c̦͍̲̞͍̩̙ḥ͚a̮͎̟̙͜ơ̩̹͎s̤.̝̝ ҉Z̡̖̜͖̰̣͉̜a͖̰͙̬͡l̲̫̳͍̩g̡̟̼̱͚̞̬ͅo̗͜.̟",
+         "thenezperdianhivemindofchaoszalgo"),
+        ("Ｔｈｅ ｑｕｉｃｋ ｂｒｏｗｎ ｆｏｘ ｊｕｍｐｓ ｏｖｅｒ ｔｈｅ ｌａｚｙ ｄｏｇ", "thequickbrownfoxjumpsoverthelazydog"),
+        ("Ｔｈｅ ｑｕｉｃｋ ｂｒｏｗｎ ｆｏｘ ｊｕｍｐｓ ｏｖｅｒ ｔｈｅ ｌａｚｙ ｄｏｇ", "thequickbrownfoxjumpsoverthelazydog"),
+        ("𝕋𝕙𝕖 𝕢𝕦𝕚𝕔𝕜 𝕓𝕣𝕠𝕨𝕟 𝕗𝕠𝕩 𝕛𝕦𝕞𝕡𝕤 𝕠𝕧𝕖𝕣 𝕥𝕙𝕖 𝕝𝕒𝕫𝕪 𝕕𝕠𝕘 ", "thequickbrownfoxjumpsoverthelazydog"),
+    ]
+
+    for in_str, out_str in test_cases:
+        result = sandcrawler_slugify(in_str)
+        if result != out_str:  # mismatch: dump code point names of both sides for debugging
+            for c in result:
+                print(unicodedata.name(c, repr(c)))  # default avoids ValueError on unnamed chars
+            print("----")
+            for c in out_str:
+                print(unicodedata.name(c, repr(c)))
+            print(in_str)
+        assert result == out_str
+
+
+def release_key_title_sandcrawler(doc: KeyDoc) -> Tuple[str, str]:
+    # key is the title from release_key_title(), run through sandcrawler_slugify()
+    ident, raw_title = release_key_title(doc)
+    return (ident, sandcrawler_slugify(raw_title))
+
+
 def release_key_title_ngram(doc: KeyDoc, n=3) -> Tuple[str, str]:
     """
     Derive a key from title and authors. Authors in contribs list:
diff --git a/fuzzycat/main.py b/fuzzycat/main.py
index bfce68e..9216808 100644
--- a/fuzzycat/main.py
+++ b/fuzzycat/main.py
@@ -12,18 +12,20 @@ Run, e.g. fuzzycat cluster --help for more options. Example:
 
 import argparse
 import cProfile as profile
+import fileinput
 import io
 import logging
 import pstats
 import sys
 import tempfile
-import fileinput
 
 import orjson as json
 
 from fuzzycat.build import NgramLookup, TitleTokenList
-from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_normalized,
-                              release_key_title_nysiis, release_key_title_ngram)
+from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_ngram,
+                              release_key_title_normalized, release_key_title_nysiis,
+                              release_key_title_sandcrawler)
+
 
 def run_cluster(args):
     logger = logging.getLogger('main.run_cluster')
@@ -32,6 +34,7 @@ def run_cluster(args):
         'tnorm': release_key_title_normalized,
         'tnysi': release_key_title_nysiis,
         'tss': release_key_title_ngram,
+        'tsandcrawler': release_key_title_sandcrawler,
     }
     key_denylist = None
     if args.key_denylist: