about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- fuzzycat/cluster.py 126
-rw-r--r-- fuzzycat/main.py 9
2 files changed, 132 insertions, 3 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index 9a8d5db..23aebbb 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -69,15 +69,18 @@ import string
import subprocess
import sys
import tempfile
+import unicodedata
from typing import Any, Callable, Dict, Generator, List, Optional, Tuple
import fuzzy
+import regex
from pydantic import BaseModel
__all__ = [
"release_key_title",
"release_key_title_normalized",
"release_key_title_nysiis",
+ "release_key_title_sandcrawler",
"sort_by_column",
"group_by",
"Cluster",
@@ -103,6 +106,7 @@ class KeyDoc(BaseModel):
title: Optional[str]
contribs: Optional[List[Contrib]]
+
class ClusterResult(BaseModel):
"""
Result of clustering.
@@ -155,6 +159,128 @@ def release_key_title_nysiis(doc: KeyDoc) -> Tuple[str, str]:
return (ident, fuzzy.nysiis(title))
+# from http://zderadicka.eu/removing-diacritics-marks-from-strings/
+#
+# Per-character replacement table consulted by sandcrawler_slugify() for every
+# character of the input, before Unicode NFKD normalization is applied. Keys
+# are single characters; values are the ASCII replacement strings, which may
+# be longer than one character ('AE', 'Th', ...) or empty.
+SANDCRAWLER_CHAR_MAP = {
+ '\N{Latin capital letter AE}': 'AE',
+ '\N{Latin small letter ae}': 'ae',
+ '\N{Latin capital letter Eth}': 'D',
+ '\N{Latin small letter eth}': 'd',
+ '\N{Latin capital letter O with stroke}': 'O',
+ '\N{Latin small letter o with stroke}': 'o',
+ '\N{Latin capital letter Thorn}': 'Th',
+ '\N{Latin small letter thorn}': 'th',
+ '\N{Latin small letter sharp s}': 's',
+ '\N{Latin capital letter D with stroke}': 'D',
+ '\N{Latin small letter d with stroke}': 'd',
+ '\N{Latin capital letter H with stroke}': 'H',
+ '\N{Latin small letter h with stroke}': 'h',
+ '\N{Latin small letter dotless i}': 'i',
+ '\N{Latin small letter kra}': 'k',
+ '\N{Latin capital letter L with stroke}': 'L',
+ '\N{Latin small letter l with stroke}': 'l',
+ '\N{Latin capital letter Eng}': 'N',
+ '\N{Latin small letter eng}': 'n',
+ '\N{Latin capital ligature OE}': 'Oe',
+ '\N{Latin small ligature oe}': 'oe',
+ '\N{Latin capital letter T with stroke}': 'T',
+ '\N{Latin small letter t with stroke}': 't',
+
+ # bnewbold additions
+ # NOTE(review): the literal below may be GREEK SMALL LETTER MU or MICRO SIGN;
+ # the two look identical -- confirm which code point is intended.
+ 'μ': 'u',
+ # mapped to the empty string, i.e. meant to be dropped entirely
+ '\N{LATIN LETTER INVERTED GLOTTAL STOP}': '',
+}
+
+# Boilerplate title prefixes that sandcrawler_slugify() tests for with
+# str.startswith(). They are written in lower case because the input is
+# stripped and lower-cased before the check. The list is walked in order, so
+# the longer "original article" variants are tried before "article: ".
+SANDCRAWLER_PREFIX_REMOVE = [
+ "original article: ",
+ "original article ",
+ "article: ",
+ "title: ",
+]
+
+# regex that matches all characters which should be removed
+#
+# The character class covers whitespace (\s), punctuation (\p{Punct}),
+# combining marks (\p{M}, \p{InCombiningDiacriticalMarks}) and an explicit list
+# of quotation marks and symbols. It is compiled with the third-party `regex`
+# module rather than `re`, since it relies on \p{...} Unicode property classes.
+# sandcrawler_slugify() substitutes every match with the empty string after
+# NFKD normalization.
+SANDCRAWLER_REMOVE_CHAR_REGEX = regex.compile(
+ r"[\s\p{Punct}\p{M}\p{InCombiningDiacriticalMarks}’·“”‘’“”«»「」¿–±§_`°ʖ©®¤]")
+
+
+def sandcrawler_slugify(raw: str) -> str:
+    """
+    Python re-implementation of sandcrawler Scala code for string comparison
+    ("scorable" strings)
+
+    Steps, in order:
+
+    1. strip surrounding whitespace and lower-case the input
+    2. drop any boilerplate prefix listed in SANDCRAWLER_PREFIX_REMOVE
+    3. turn the HTML entity for the apostrophe into a plain apostrophe
+    4. replace single characters according to SANDCRAWLER_CHAR_MAP
+    5. NFKD-normalize and delete everything matched by
+       SANDCRAWLER_REMOVE_CHAR_REGEX (whitespace, punctuation, marks, ...)
+    6. lower-case once more, since steps 4 and 5 can produce upper-case
+       letters again
+
+    Returns the resulting slug; the empty string if nothing is left after
+    step 4.
+    """
+    slug = raw.strip().lower()
+
+    # transforms before running regex
+    for prefix in SANDCRAWLER_PREFIX_REMOVE:
+        if slug.startswith(prefix):
+            # keep what follows the prefix; slicing with [:len(prefix)] would
+            # keep only the prefix itself and discard the actual title
+            slug = slug[len(prefix):]
+
+    # NOTE(review): the rendered source showed a no-op replace of "'" with
+    # itself; presumably the HTML entity was un-escaped by the page renderer.
+    # Confirm against the upstream repository.
+    slug = slug.replace("&apos;", "'")
+
+    # replace mapped characters; dict.get also honours mappings to the empty
+    # string, which the `x in m and m[x] or x` idiom silently skips
+    slug = ''.join(SANDCRAWLER_CHAR_MAP.get(c, c) for c in slug)
+
+    # early bailout before executing regex
+    if not slug:
+        return ""
+
+    slug = unicodedata.normalize('NFKD', slug)
+    slug = SANDCRAWLER_REMOVE_CHAR_REGEX.sub('', slug)
+
+    return slug.lower()
+
+
+def test_sandcrawler_slugify() -> None:
+    """
+    Check sandcrawler_slugify against a table of (input, expected slug) pairs.
+
+    On a mismatch, the Unicode names of the characters in the actual and the
+    expected slug are printed before the assertion fires, so that differences
+    in invisible or combining characters can be spotted.
+    """
+    test_cases = [
+        ("", ""),
+        ("asdf", "asdf"),
+        ("'Hello World!'", "helloworld"),
+        ("ASDF", "asdf"),
+        ("as\n df", "asdf"),
+        ("as\u0142 bb \u00f8", "aslbbo"),
+        ("`hello¿", "hello"),
+        ("علمية", "علمية"),
+        ("期刊的数字", "期刊的数字"),
+        ("les pré-impressions explorées à partir", "lespreimpressionsexploreesapartir"),
+        ("μmeter", "umeter"),
+        # TODO: ("salt ∧ pepper", "saltpepper"),
+        # TODO: ("new <b>and</b> improved", "newandimproved"),
+
+        # some via https://github.com/minimaxir/big-list-of-naughty-strings/blob/master/blns.txt
+        ("¡™£¢∞§¶•ªº–≠ ", "tm£¢∞ao="),
+        ("⁰⁴⁵₀₁₂", "045012"),
+        ("社會科學院語學研究所", "社會科學院語學研究所"),
+        # TODO: ("パーティーへ行かないか", "パーティーへ行かないか"),
+        # TODO: ("表ポあA鷗ŒéB逍Üߪąñ丂㐀𠀀", "表ポあa鷗oeebB逍usaan丂㐀𠀀"),
+        ("( ͡° ͜ʖ ͡°)", ""),
+        # emoji ok? I guess
+        ("👾 🙇 💁 🙅 🙆 🙋 🙎 🙍", "👾🙇💁🙅🙆🙋🙎🙍"),
+        ("2️⃣ 3️⃣ 4️⃣ 5️⃣", "2345"),
+        ("﷽ ", "﷽"),
+        ("̗̺͖̹̯͓Ṯ̤͍̥͇͈h̲́e͏͓̼̗̙̼̣͔ ͇̜̱̠͓͍ͅN͕͠e̗̱z̘̝̜̺͙p̤̺̹͍̯͚e̠̻̠͜r̨̤͍̺̖͔̖̖d̠̟̭̬̝͟i̦͖̩͓͔̤a̠̗̬͉̙n͚͜ ̻̞̰͚ͅh̵͉i̳̞v̢͇ḙ͎͟-҉̭̩̼͔m̤̭̫i͕͇̝̦n̗͙ḍ̟ ̯̲͕͞ǫ̟̯̰̲͙̻̝f ̪̰̰̗̖̭̘͘c̦͍̲̞͍̩̙ḥ͚a̮͎̟̙͜ơ̩̹͎s̤.̝̝ ҉Z̡̖̜͖̰̣͉̜a͖̰͙̬͡l̲̫̳͍̩g̡̟̼̱͚̞̬ͅo̗͜.̟",
+         "thenezperdianhivemindofchaoszalgo"),
+        ("The quick brown fox jumps over the lazy dog", "thequickbrownfoxjumpsoverthelazydog"),
+        ("The quick brown fox jumps over the lazy dog", "thequickbrownfoxjumpsoverthelazydog"),
+        ("𝕋𝕙𝕖 𝕢𝕦𝕚𝕔𝕜 𝕓𝕣𝕠𝕨𝕟 𝕗𝕠𝕩 𝕛𝕦𝕞𝕡𝕤 𝕠𝕧𝕖𝕣 𝕥𝕙𝕖 𝕝𝕒𝕫𝕪 𝕕𝕠𝕘 ", "thequickbrownfoxjumpsoverthelazydog"),
+    ]
+
+    for in_str, out_str in test_cases:
+        # slugify once per case and reuse the result for diagnostics and assert
+        result = sandcrawler_slugify(in_str)
+        if result != out_str:
+            # unicodedata.name() raises ValueError for code points without a
+            # name unless a default is given; fall back to the U+XXXX form so
+            # the diagnostics cannot mask the assertion below
+            for c in result:
+                print(unicodedata.name(c, f"U+{ord(c):04X}"))
+            print("----")
+            for c in out_str:
+                print(unicodedata.name(c, f"U+{ord(c):04X}"))
+            print(in_str)
+        assert result == out_str
+
+
+def release_key_title_sandcrawler(doc: KeyDoc) -> Tuple[str, str]:
+    """
+    Derive a key from the title, slugified the sandcrawler way.
+
+    Takes the (ident, title) pair produced by release_key_title and returns
+    the ident together with sandcrawler_slugify applied to the title.
+    """
+    ident, title = release_key_title(doc)
+    return (ident, sandcrawler_slugify(title))
+
+
def release_key_title_ngram(doc: KeyDoc, n=3) -> Tuple[str, str]:
"""
Derive a key from title and authors. Authors in contribs list:
diff --git a/fuzzycat/main.py b/fuzzycat/main.py
index bfce68e..9216808 100644
--- a/fuzzycat/main.py
+++ b/fuzzycat/main.py
@@ -12,18 +12,20 @@ Run, e.g. fuzzycat cluster --help for more options. Example:
import argparse
import cProfile as profile
+import fileinput
import io
import logging
import pstats
import sys
import tempfile
-import fileinput
import orjson as json
from fuzzycat.build import NgramLookup, TitleTokenList
-from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_normalized,
- release_key_title_nysiis, release_key_title_ngram)
+from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_ngram,
+ release_key_title_normalized, release_key_title_nysiis,
+ release_key_title_sandcrawler)
+
def run_cluster(args):
logger = logging.getLogger('main.run_cluster')
@@ -32,6 +34,7 @@ def run_cluster(args):
'tnorm': release_key_title_normalized,
'tnysi': release_key_title_nysiis,
'tss': release_key_title_ngram,
+ 'tsandcrawler': release_key_title_sandcrawler,
}
key_denylist = None
if args.key_denylist: