diff options
-rw-r--r-- | fuzzycat/cluster.py | 126 | ||||
-rw-r--r-- | fuzzycat/main.py | 9 |
2 files changed, 132 insertions, 3 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py index 9a8d5db..23aebbb 100644 --- a/fuzzycat/cluster.py +++ b/fuzzycat/cluster.py @@ -69,15 +69,18 @@ import string import subprocess import sys import tempfile +import unicodedata from typing import Any, Callable, Dict, Generator, List, Optional, Tuple import fuzzy +import regex from pydantic import BaseModel __all__ = [ "release_key_title", "release_key_title_normalized", "release_key_title_nysiis", + "release_key_title_sandcrawler", "sort_by_column", "group_by", "Cluster", @@ -103,6 +106,7 @@ class KeyDoc(BaseModel): title: Optional[str] contribs: Optional[List[Contrib]] + class ClusterResult(BaseModel): """ Result of clustering. @@ -155,6 +159,128 @@ def release_key_title_nysiis(doc: KeyDoc) -> Tuple[str, str]: return (ident, fuzzy.nysiis(title)) +# from http://zderadicka.eu/removing-diacritics-marks-from-strings/ +SANDCRAWLER_CHAR_MAP = { + '\N{Latin capital letter AE}': 'AE', + '\N{Latin small letter ae}': 'ae', + '\N{Latin capital letter Eth}': 'D', + '\N{Latin small letter eth}': 'd', + '\N{Latin capital letter O with stroke}': 'O', + '\N{Latin small letter o with stroke}': 'o', + '\N{Latin capital letter Thorn}': 'Th', + '\N{Latin small letter thorn}': 'th', + '\N{Latin small letter sharp s}': 's', + '\N{Latin capital letter D with stroke}': 'D', + '\N{Latin small letter d with stroke}': 'd', + '\N{Latin capital letter H with stroke}': 'H', + '\N{Latin small letter h with stroke}': 'h', + '\N{Latin small letter dotless i}': 'i', + '\N{Latin small letter kra}': 'k', + '\N{Latin capital letter L with stroke}': 'L', + '\N{Latin small letter l with stroke}': 'l', + '\N{Latin capital letter Eng}': 'N', + '\N{Latin small letter eng}': 'n', + '\N{Latin capital ligature OE}': 'Oe', + '\N{Latin small ligature oe}': 'oe', + '\N{Latin capital letter T with stroke}': 'T', + '\N{Latin small letter t with stroke}': 't', + + # bnewbold additions + 'μ': 'u', + '\N{LATIN LETTER INVERTED GLOTTAL STOP}': '', +} + 
# Boilerplate prefixes that are stripped from the start of a (lower-cased)
# title before it is slugified.
SANDCRAWLER_PREFIX_REMOVE = [
    "original article: ",
    "original article ",
    "article: ",
    "title: ",
]

# regex that matches all characters which should be removed
SANDCRAWLER_REMOVE_CHAR_REGEX = regex.compile(
    r"[\s\p{Punct}\p{M}\p{InCombiningDiacriticalMarks}’·“”‘’“”«»「」¿–±§_`°ʖ©®¤]")


def sandcrawler_slugify(raw: str) -> str:
    """
    Python re-implementation of sandcrawler Scala code for string comparison
    ("scorable" strings).

    Steps, in order: strip and lower-case the input, drop any leading entry
    of SANDCRAWLER_PREFIX_REMOVE, substitute characters found in
    SANDCRAWLER_CHAR_MAP, apply NFKD normalization, delete everything matched
    by SANDCRAWLER_REMOVE_CHAR_REGEX and lower-case the result once more.

    Returns the slug; an empty string if nothing is left after the prefix
    and character-map steps.
    """
    slug = raw.strip().lower()

    # transforms before running regex
    for prefix in SANDCRAWLER_PREFIX_REMOVE:
        if slug.startswith(prefix):
            # keep what follows the prefix (slicing with [:len(prefix)] would
            # keep only the prefix and throw the actual title away)
            slug = slug[len(prefix):]

    # NOTE(review): the rendered source showed a no-op replacement of an
    # apostrophe with itself, which looks like an HTML-decoded "&apos;"
    # entity -- confirm against the sandcrawler Scala original.
    slug = slug.replace("&apos;", "'")

    # replace every char that has an entry in the map; dict.get() is used so
    # that entries mapping to the empty string really delete the character
    slug = ''.join(SANDCRAWLER_CHAR_MAP.get(c, c) for c in slug)

    # early bailout before executing regex
    if not slug:
        return ""

    slug = unicodedata.normalize('NFKD', slug)
    slug = SANDCRAWLER_REMOVE_CHAR_REGEX.sub('', slug)

    # NFKD can introduce upper-case letters (e.g. the trade mark sign), so
    # lower-case again
    return slug.lower()


def test_sandcrawler_slugify() -> None:
    """
    Check sandcrawler_slugify against a table of (input, expected slug)
    pairs; on a mismatch, print the code point names of both strings before
    failing, to make invisible differences debuggable.
    """
    test_cases = [
        ("", ""),
        ("asdf", "asdf"),
        ("'Hello World!'", "helloworld"),
        ("ASDF", "asdf"),
        ("as\n df", "asdf"),
        ("as\u0142 bb \u00f8", "aslbbo"),
        ("`hello¿", "hello"),
        ("علمية", "علمية"),
        ("期刊的数字", "期刊的数字"),
        ("les pré-impressions explorées à partir", "lespreimpressionsexploreesapartir"),
        ("μmeter", "umeter"),
        # prefixes are removed, the remainder of the title is kept
        ("Original Article: Hello World", "helloworld"),
        ("Title: asdf", "asdf"),
        ("it&apos;s", "its"),
        # TODO: ("salt ∧ pepper", "saltpepper"),
        # TODO: ("new <b>and</b> improved", "newandimproved"),

        # some via https://github.com/minimaxir/big-list-of-naughty-strings/blob/master/blns.txt
        ("¡™£¢∞§¶•ªº–≠ ", "tm£¢∞ao="),
        ("⁰⁴⁵₀₁₂", "045012"),
        ("社會科學院語學研究所", "社會科學院語學研究所"),
        # TODO: ("パーティーへ行かないか", "パーティーへ行かないか"),
        # TODO: ("表ポあA鷗ŒéB逍Üߪąñ丂㐀𠀀", "表ポあa鷗oeebB逍usaan丂㐀𠀀"),
        ("( ͡° ͜ʖ ͡°)", ""),
        # emoji ok? I guess
        ("👾 🙇 💁 🙅 🙆 🙋 🙎 🙍", "👾🙇💁🙅🙆🙋🙎🙍"),
        ("2️⃣ 3️⃣ 4️⃣ 5️⃣", "2345"),
        ("﷽ ", "﷽"),
        ("̗̺͖̹̯͓Ṯ̤͍̥͇͈h̲́e͏͓̼̗̙̼̣͔ ͇̜̱̠͓͍ͅN͕͠e̗̱z̘̝̜̺͙p̤̺̹͍̯͚e̠̻̠͜r̨̤͍̺̖͔̖̖d̠̟̭̬̝͟i̦͖̩͓͔̤a̠̗̬͉̙n͚͜ ̻̞̰͚ͅh̵͉i̳̞v̢͇ḙ͎͟-҉̭̩̼͔m̤̭̫i͕͇̝̦n̗͙ḍ̟ ̯̲͕͞ǫ̟̯̰̲͙̻̝f ̪̰̰̗̖̭̘͘c̦͍̲̞͍̩̙ḥ͚a̮͎̟̙͜ơ̩̹͎s̤.̝̝ ҉Z̡̖̜͖̰̣͉̜a͖̰͙̬͡l̲̫̳͍̩g̡̟̼̱͚̞̬ͅo̗͜.̟",
         "thenezperdianhivemindofchaoszalgo"),
        ("The quick brown fox jumps over the lazy dog", "thequickbrownfoxjumpsoverthelazydog"),
        ("The quick brown fox jumps over the lazy dog", "thequickbrownfoxjumpsoverthelazydog"),
        ("𝕋𝕙𝕖 𝕢𝕦𝕚𝕔𝕜 𝕓𝕣𝕠𝕨𝕟 𝕗𝕠𝕩 𝕛𝕦𝕞𝕡𝕤 𝕠𝕧𝕖𝕣 𝕥𝕙𝕖 𝕝𝕒𝕫𝕪 𝕕𝕠𝕘 ", "thequickbrownfoxjumpsoverthelazydog"),
    ]

    for in_str, out_str in test_cases:
        got = sandcrawler_slugify(in_str)
        if got != out_str:
            # a default is passed to unicodedata.name() so that unnamed code
            # points do not raise and hide the actual mismatch
            for c in got:
                print(unicodedata.name(c, "U+{:04X}".format(ord(c))))
            print("----")
            for c in out_str:
                print(unicodedata.name(c, "U+{:04X}".format(ord(c))))
            print(in_str)
        assert got == out_str


def release_key_title_sandcrawler(doc: KeyDoc) -> Tuple[str, str]:
    """
    Derive a key from the title: the (ident, title) pair produced by
    release_key_title, with the title passed through sandcrawler_slugify.
    """
    ident, title = release_key_title(doc)
    slug = sandcrawler_slugify(title)
    return (ident, slug)


# -- remainder of this span: diff context, not part of the definitions above --
# def release_key_title_ngram(doc: KeyDoc, n=3) -> Tuple[str, str]:
#     """ Derive a key from title and authors. Authors in contribs list:
#     (unchanged context; rest of the definition is not visible in this diff)
# diff --git a/fuzzycat/main.py b/fuzzycat/main.py
# index bfce68e..9216808 100644
# --- a/fuzzycat/main.py
# +++ b/fuzzycat/main.py
# @@ -12,18 +12,20 @@ Run, e.g. fuzzycat cluster --help for more options.
Example: import argparse import cProfile as profile +import fileinput import io import logging import pstats import sys import tempfile -import fileinput import orjson as json from fuzzycat.build import NgramLookup, TitleTokenList -from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_normalized, - release_key_title_nysiis, release_key_title_ngram) +from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_ngram, + release_key_title_normalized, release_key_title_nysiis, + release_key_title_sandcrawler) + def run_cluster(args): logger = logging.getLogger('main.run_cluster') @@ -32,6 +34,7 @@ def run_cluster(args): 'tnorm': release_key_title_normalized, 'tnysi': release_key_title_nysiis, 'tss': release_key_title_ngram, + 'tsandcrawler': release_key_title_sandcrawler, } key_denylist = None if args.key_denylist: |