From ba035d8641a5e94d93448bb9a0cd56c7756d7055 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 10 Nov 2020 19:27:17 -0800 Subject: add support for key denylist This is to filter out cluster rows where the resulting key is in a given text file (one key per line). The intent is to filter out records with either poor metadata, or very generic metadata, for fuzzy matching. E.g., in many cases it is better to just not try matching "Letter to the Editor" to any record. This won't always be the case; we might have journal, volume, issue, and page, which would allow a match. So the denylist is optional and is specified on the command line (--key-denylist). --- fuzzycat/cluster.py | 13 +- fuzzycat/main.py | 6 + fuzzycat/sandcrawler-title-denylist.txt | 559 ++++++++++++++++++++++++++++++++ 3 files changed, 574 insertions(+), 4 deletions(-) create mode 100644 fuzzycat/sandcrawler-title-denylist.txt diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py index e4a36bf..dde0688 100644 --- a/fuzzycat/cluster.py +++ b/fuzzycat/cluster.py @@ -246,6 +246,7 @@ class Cluster: output=sys.stdout, keyfunc=lambda v: v, prefix='fuzzycat-', + key_denylist=None, tmpdir=None): """ Files can be a list of files or "-" for stdin. 
@@ -256,6 +257,7 @@ class Cluster: self.prefix = prefix self.tmpdir = tmpdir self.logger = logging.getLogger('fuzzycat.cluster') + self.key_denylist = key_denylist def run(self): """ @@ -266,12 +268,15 @@ class Cluster: with tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=self.prefix) as tf: for line in fileinput.input(files=self.files): try: - id, key = keyfunc(json.loads(line)) - print("{}\t{}".format(id, key), file=tf) + ident, key = keyfunc(json.loads(line)) except (KeyError, ValueError): counter["key_extraction_failed"] += 1 - else: - counter["key_ok"] += 1 + continue + if self.key_denylist and key in self.key_denylist: + counter["key_denylist"] += 1 + continue + counter["key_ok"] += 1 + print("{}\t{}".format(ident, key), file=tf) sbc = sort_by_column(tf.name, opts='-k 2', prefix=self.prefix, tmpdir=self.tmpdir) with open(sbc) as f: comment = keyfunc.__name__ diff --git a/fuzzycat/main.py b/fuzzycat/main.py index c7ba23d..bfce68e 100644 --- a/fuzzycat/main.py +++ b/fuzzycat/main.py @@ -33,9 +33,14 @@ def run_cluster(args): 'tnysi': release_key_title_nysiis, 'tss': release_key_title_ngram, } + key_denylist = None + if args.key_denylist: + with open(args.key_denylist, 'r') as f: + key_denylist = [l.strip() for l in f.readlines()] cluster = Cluster(files=args.files, keyfunc=types.get(args.type), tmpdir=args.tmpdir, + key_denylist=key_denylist, prefix=args.prefix) stats = cluster.run() logger.debug(json.dumps(dict(stats))) @@ -83,6 +88,7 @@ if __name__ == '__main__': sub_cluster = subparsers.add_parser('cluster', help='group entities', parents=[parser]) sub_cluster.set_defaults(func=run_cluster) sub_cluster.add_argument('-f', '--files', default="-", help='input files') + sub_cluster.add_argument('--key-denylist', help='file path to key denylist') sub_cluster.add_argument('-t', '--type', default='title', diff --git a/fuzzycat/sandcrawler-title-denylist.txt b/fuzzycat/sandcrawler-title-denylist.txt new file mode 100644 index 0000000..ef575b4 --- /dev/null 
+++ b/fuzzycat/sandcrawler-title-denylist.txt @@ -0,0 +1,559 @@ +abbreviations +abbreviationsandacronyms +aboutauthors +abouttheauthor +abouttheauthors +aboutthecover +abouttheeditors +abreviations +abstract +abstractnotsubmittedforonlinepublication +abstractoriginalarticle +abstracts +abstractsofaapaposterandpodiumpresentations +abstractsofcommunications +abstractsofthesesfromthescandinaviancountries +abstractwithdrawn +acknowledgement +acknowledgements +acknowledgementsvii +acknowledgementtoreferees +acknowledgementtoreviewers +acknowledgment +acknowledgmentofreferees +acknowledgments +addendum +additionalresources +address +advertisement +advertisersindex +affect +affiliation +afterword +agenda +agradecimentos +agradecimientos +aimsandscope +analysis +annexa +announcement +announcements +annualacknowledgementofmanuscriptreviewers +anotefromtheeditor +appendices +appendix +appendix1 +appendixa +appendixb +appointmentsandstaffchanges +approximation +apresentacao +article +articlenumber +articles +articlesofsignificantinterestselectedfromthisissuebytheeditors +associationnews +ataglance +atribute +attention +authorguidelines +authorindex +authorindexforvolume81 +authorreply +authors +authorsreply +authorsresponse +avantpropos +award +awardsappointmentsannouncements +backcover +background +backmatter +berichtigung +besprechungen +bibliografia +bibliographie +bibliography +bigdata +blankpage +blood +boardoftrustees +booknotes +booknotices +bookofabstracts +bookreview +bookreviews +bookreviewsandnotices +bookreviewssection +booksreceived +buchbesprechung +buchbesprechungen +bulletin +calendar +calendarofevents +calendarofmeetings +callforarticles +callforpapers +casereport +casereports +casestudy +chairmansopeningremarks +changes +chaos +chapter1 +chapter10 +chapter1introduction +chapter2 +chapter7 +chapteri +chapterone +chapteroneintroduction +chaptertwo +chapterx +citation +classes +classified +classifieds +closingremarks +collaborateurs +comment +commentaries 
+commentary +commentaryon +commenton +comments +commentto +committee +communication +communications +communicationstotheeditor +communiquedepresse +community +components +comptesrendus +computerscience +concludingremarks +conclusion +conclusions +conferencereport +congratulations +congresscalendar +conservation +content +contents +context +continuingeducation +continuingmedicaleducation +contributors +copyright +copyrightform +copyrightnotice +correction +corrections +correspondence +corrigenda +corrigendum +councilminutes +cover +coverimage +currentresearch +curriculumvitae +danksagung +dearreaders +decisionmaking +dedication +dedicatoria +definition +description +discussion +diskussion +distribution +documents +ear +economics +editorial +editorialadvisoryboard +editorialannouncement +editorialboard +editorialcomment +editorialcomments +editorialconsultants +editoriale +editorialeditorial +editorialforeword +editorialinformation +editorialintroduction +editorialintroductions +editorialnote +editorialnotes +editorialpreface +editorials +editorialsoftwaresurveysection +editorialstaff +editorialstatement +editorinchief +editors +editorschoice +editorscomment +editorscomments +editorscorner +editorscorrespondence +editorsforeword +editorsintroduction +editorsletter +editorsnote +editorsnotes +editorspage +editorspicks +editorspreface +education +einfuhrung +einleitung +electrophoresis +employment +endnotes +entrevista +entscheidungsverzeichnis +epilogue +equipment +errata +erratum +essay +essays +executivesummary +exercises +expediente +extendedabstracts +feature +features +fichatecnica +figure3 +finalexam +finalreport +focus +foreward +foreword +forthcomingarticles +forthcomingevents +fortherecord +forum +frequentlyaskedquestions +fromtheeditor +fromtheeditorinchief +fromtheeditors +fromtheeditorsdesk +fromthepresident +frontmatter +furtherreadings +genealogy +generaldiscussion +generalinformation +generalintroduction +germany +gettingstarted +glosario +glossary 
+glossaryofterms +guesteditorial +guesteditorsforeword +guesteditorsintroduction +guideforauthors +guidelinesforcontributors +health +heartfailure +highlights +highlightsfromthisissue +highlightsofthisissue +history +home +homework +hypothesis +iii +imageofthemonth +impactfactor +importantnotice +impressum +inbrief +index +indexofauthors +indexofauthorsandtitles +indice +indicegeneral +informationforauthors +informationtoauthors +inhalt +inhaltsverzeichnis +inleiding +inmemoriam +inreply +inresponse +insidethisissue +institutenews +instructionsforauthors +instructionstoauthors +interview +inthestudy +inthisissue +introducao +introduccion +introduction +introductionandoverview +introductiongenerale +introductiontotheissue +introductiontothespecialissue +introductorycomments +introductoryremarks +introduzione +inventions +invitedcommentary +issuesandevents +jobdescription +journalclub +journalscan +keywords +kurzkommentiert +languageteaching +lecture +letter +letterfromtheeditor +letterfromtheeditorinchief +letterfromtheeditors +letterfromthepresident +letters +letterstotheeditor +letterstotheeditors +lettertotheeditor +lettertotheeditors +liminaire +linearalgebra +linearregression +links +listedestableaux +listofabbreviations +listofcontributors +listoffigures +listofparticipants +listofpublications +listofreferees +listofreviewers +listoftables +literacy +literatur +literature +literaturecited +literaturereview +literaturrundschau +literaturverzeichnis +litteraturverzeichniss +livresrecus +lucina +lungcancer +magazin +maintenance +materials +materialsafetydatasheet +materialsandmethods +medicinalchemistry +meetingabstracts +meetingreport +meetings +meetingsandconferences +meetingsofinterest +membershipapplication +memoranda +memorandum +messagefromgeneralcochairs +messagefromthechairs +messagefromtheeditor +messagefromtheeditorinchief +messagefromthepresident +messagefromtheprogramchairs +messagefromtheprogramcochairs +metaanalysis +miscellanea +miscellaneous 
+miscellany +missionstatement +motivation +mrsnews +name +newbooks +newlyelectedmembersofthecollege +newproducts +news +newsandnotes +newsandreviews +newsandviews +newsbriefs +newsinbrief +newsnotes +newsviews +noii +note +notefromtheeditor +notes +notesandcomments +notesandnews +notesdelecture +notesforcontributors +notesoncontributors +notice +noticeboard +notitle +notitleavailable +nr +obituaries +obituary +online +openaccess +openingaddress +openingremarks +oralabstracts +oralpresentations +organizingcommittee +originalarticle +originalarticles +other +outline +overview +panorama +papers +paperstoappearinforthcomingissues +partone +personalandmiscellaneous +perspective +perspectives +philosophy +pictureofthemonth +place +pointofview +positionsavailable +poster +posterpresentations +postscript +preface +prefaceandacknowledgements +prefacetothesecondedition +preliminarymaterial +presentacio +presentacion +presentation +presidentialaddress +presidentsmessage +presidentsreport +pressrelease +print +printing +proceedings +proceedingsofthenationalacademyofsciences +profile +programcommittee +projectmanagement +prologue +publication +publichealth +publishersnote +question +questionsandanswers +radiology +readersforum +recensiones +recensions +recentpublications +redaktorensforord +referate +references +referenciasbibliograficas +regression +rehabilitation +rejoinder +remerciements +reply +replybyauthors +researchresearchers +resenas +resources +response +responsetothelettertotheeditor +results +resume +resumen +resumes +resumo +retraction +review +reviewarticle +revieweracknowledgement +revieweracknowledgement2013 +reviewers +reviewessay +reviews +reviewsanddescriptionsoftablesandbooks +reviewsofbooks +rezension +rezensionen +safety +section +security +selectedbibliography +shortcommunication +shorternotices +shortnotices +socialengineering +sociology +sommaire +sommario +specialreport +specialsection +specifications +spistresci +subjectindex +subscriptions 
+suggestedreadings +sumario +summaries +summariesofkeyjournalarticles +summary +summaryofproceedings +summer +sun +supplementarymaterial +symposium +symptom +synthese +tabledesmatieres +tableofcontents +tableofcontentsandprologue +technicalreport +theauthors +theauthorsreply +thebasics +theeditorsdesk +thefirstauthorreplies +thelancet +theoreticalbackground +thetimes +theworldbank +theyearinreview +thismonthin +thismonthinthejournal +timemanagement +titeleiinhaltsverzeichnis +title +titlepage +titlepagei +tocorrespondents +totheeditor +unitedkingdom +unitednations +unitedstates +upcomingevents +vorwort +website +welcome +whatshappening +whatsnew +workscited +yourquestionsanswered +zudiesemheft +zusammenfassung -- cgit v1.2.3 From 1e413f8fb818bc211f128b63110327d1e3f88152 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 10 Nov 2020 21:53:03 -0800 Subject: make: run pytest over fuzzycat/ to catch inline tests --- Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 7a0490e..25efac0 100644 --- a/Makefile +++ b/Makefile @@ -16,7 +16,7 @@ deps: ## Install dependencies from setup.py into pipenv pipenv install --pre '-e .[dev]' .PHONY: style -style: ## Apply import sorting and black source formatting on all files +style: ## Apply import sorting and yapf source formatting on all files isort --atomic fuzzycat/* yapf -p -i -r fuzzycat/* yapf -p -i -r tests @@ -27,11 +27,11 @@ dist: ## Create source distribution and wheel .PHONY: cov cov: ## Run coverage report - pytest --cov=fuzzycat tests/ + pytest --cov=fuzzycat fuzzycat/*.py tests/ .PHONY: test test: ## Run coverage report - pytest -v tests/ + pytest -v fuzzycat/*.py tests/ .PHONY: lint lint: $(PY_FILES) -- cgit v1.2.3 From 2799c8e18f82e6a01ae15e257ed9e0de383bf806 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 10 Nov 2020 21:54:32 -0800 Subject: gitignore: add .swp (vim) --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore 
b/.gitignore index a1e72a2..d3e0e29 100644 --- a/.gitignore +++ b/.gitignore @@ -133,3 +133,7 @@ dmypy.json /names.db /tmp fixtures/cluster_title_normalized_dups_size_keylen.tsv + +# Text Editors +*~ +*.swp -- cgit v1.2.3 From b70a5a18d3091fd61c41391a937e9fef1b27cf9c Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 10 Nov 2020 21:54:54 -0800 Subject: pipenv: explicit regex dependency regex, unlike the stdlib 're' module, supports Unicode property classes (e.g. \p{Punct}, \p{M}). I couldn't get pipenv to lock after adding this dependency, even though Pipfile.lock already includes regex as a sub-dependency of something else. --- Pipfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Pipfile b/Pipfile index c3606c8..39dedee 100644 --- a/Pipfile +++ b/Pipfile @@ -23,6 +23,7 @@ pydantic = "*" tokenizers = "*" spacy = "*" nltk = "*" +regex = "*" [requires] python_version = "3.7" -- cgit v1.2.3 From d299104f3485c7e99738b521a78adc7e5452f995 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 10 Nov 2020 21:58:20 -0800 Subject: cluster: count empty keys (and don't return them) --- fuzzycat/cluster.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py index dde0688..9a8d5db 100644 --- a/fuzzycat/cluster.py +++ b/fuzzycat/cluster.py @@ -272,6 +272,9 @@ class Cluster: except (KeyError, ValueError): counter["key_extraction_failed"] += 1 continue + if not key: + counter["key_empty"] += 1 + continue if self.key_denylist and key in self.key_denylist: counter["key_denylist"] += 1 continue -- cgit v1.2.3 From 7dba5c40ad36515f96ccfd44624d508a91d00da0 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 10 Nov 2020 21:58:54 -0800 Subject: add sandcrawler-style title key method --- fuzzycat/cluster.py | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++++ fuzzycat/main.py | 9 ++-- 2 files changed, 132 insertions(+), 3 deletions(-) diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py index 9a8d5db..23aebbb 100644 --- a/fuzzycat/cluster.py +++ 
b/fuzzycat/cluster.py @@ -69,15 +69,18 @@ import string import subprocess import sys import tempfile +import unicodedata from typing import Any, Callable, Dict, Generator, List, Optional, Tuple import fuzzy +import regex from pydantic import BaseModel __all__ = [ "release_key_title", "release_key_title_normalized", "release_key_title_nysiis", + "release_key_title_sandcrawler", "sort_by_column", "group_by", "Cluster", @@ -103,6 +106,7 @@ class KeyDoc(BaseModel): title: Optional[str] contribs: Optional[List[Contrib]] + class ClusterResult(BaseModel): """ Result of clustering. @@ -155,6 +159,128 @@ def release_key_title_nysiis(doc: KeyDoc) -> Tuple[str, str]: return (ident, fuzzy.nysiis(title)) +# from http://zderadicka.eu/removing-diacritics-marks-from-strings/ +SANDCRAWLER_CHAR_MAP = { + '\N{Latin capital letter AE}': 'AE', + '\N{Latin small letter ae}': 'ae', + '\N{Latin capital letter Eth}': 'D', + '\N{Latin small letter eth}': 'd', + '\N{Latin capital letter O with stroke}': 'O', + '\N{Latin small letter o with stroke}': 'o', + '\N{Latin capital letter Thorn}': 'Th', + '\N{Latin small letter thorn}': 'th', + '\N{Latin small letter sharp s}': 's', + '\N{Latin capital letter D with stroke}': 'D', + '\N{Latin small letter d with stroke}': 'd', + '\N{Latin capital letter H with stroke}': 'H', + '\N{Latin small letter h with stroke}': 'h', + '\N{Latin small letter dotless i}': 'i', + '\N{Latin small letter kra}': 'k', + '\N{Latin capital letter L with stroke}': 'L', + '\N{Latin small letter l with stroke}': 'l', + '\N{Latin capital letter Eng}': 'N', + '\N{Latin small letter eng}': 'n', + '\N{Latin capital ligature OE}': 'Oe', + '\N{Latin small ligature oe}': 'oe', + '\N{Latin capital letter T with stroke}': 'T', + '\N{Latin small letter t with stroke}': 't', + + # bnewbold additions + 'μ': 'u', + '\N{LATIN LETTER INVERTED GLOTTAL STOP}': '', +} + +SANDCRAWLER_PREFIX_REMOVE = [ + "original article: ", + "original article ", + "article: ", + "title: ", +] + +# regex 
that matches all characters which should be removed +SANDCRAWLER_REMOVE_CHAR_REGEX = regex.compile( + r"[\s\p{Punct}\p{M}\p{InCombiningDiacriticalMarks}’·“”‘’“”«»「」¿–±§_`°ʖ©®¤]") + + +def sandcrawler_slugify(raw: str) -> str: + """ + Python re-implementation of sandcrawler Scala code for string comparison + ("scorable" strings) + """ + slug = raw.strip().lower() + + # transforms before running regex + for prefix in SANDCRAWLER_PREFIX_REMOVE: + if slug.startswith(prefix): + slug = slug[:len(prefix)] + + slug = slug.replace("'", "'") + + # iterate over all chars and replace from map, if in map; then lower-case again + slug = ''.join([(c in SANDCRAWLER_CHAR_MAP and SANDCRAWLER_CHAR_MAP[c]) or c for c in slug]) + + # early bailout before executing regex + if not slug: + return "" + + slug = unicodedata.normalize('NFKD', slug) + slug = SANDCRAWLER_REMOVE_CHAR_REGEX.sub('', slug) + + return slug.lower() + + +def test_sandcrawler_slugify() -> None: + test_cases = [ + ("", ""), + ("asdf", "asdf"), + ("'Hello World!'", "helloworld"), + ("ASDF", "asdf"), + ("as\n df", "asdf"), + ("as\u0142 bb \u00f8", "aslbbo"), + ("`hello¿", "hello"), + ("علمية", "علمية"), + ("期刊的数字", "期刊的数字"), + ("les pré-impressions explorées à partir", "lespreimpressionsexploreesapartir"), + ("μmeter", "umeter"), + # TODO: ("salt ∧ pepper", "saltpepper"), + # TODO: ("new and improved", "newandimproved"), + + # some via https://github.com/minimaxir/big-list-of-naughty-strings/blob/master/blns.txt + ("¡™£¢∞§¶•ªº–≠ ", "tm£¢∞ao="), + ("⁰⁴⁵₀₁₂", "045012"), + ("社會科學院語學研究所", "社會科學院語學研究所"), + # TODO: ("パーティーへ行かないか", "パーティーへ行かないか"), + # TODO: ("表ポあA鷗ŒéB逍Üߪąñ丂㐀𠀀", "表ポあa鷗oeebB逍usaan丂㐀𠀀"), + ("( ͡° ͜ʖ ͡°)", ""), + # emoji ok? 
I guess + ("👾 🙇 💁 🙅 🙆 🙋 🙎 🙍", "👾🙇💁🙅🙆🙋🙎🙍"), + ("2️⃣ 3️⃣ 4️⃣ 5️⃣", "2345"), + ("﷽ ", "﷽"), + ("̗̺͖̹̯͓Ṯ̤͍̥͇͈h̲́e͏͓̼̗̙̼̣͔ ͇̜̱̠͓͍ͅN͕͠e̗̱z̘̝̜̺͙p̤̺̹͍̯͚e̠̻̠͜r̨̤͍̺̖͔̖̖d̠̟̭̬̝͟i̦͖̩͓͔̤a̠̗̬͉̙n͚͜ ̻̞̰͚ͅh̵͉i̳̞v̢͇ḙ͎͟-҉̭̩̼͔m̤̭̫i͕͇̝̦n̗͙ḍ̟ ̯̲͕͞ǫ̟̯̰̲͙̻̝f ̪̰̰̗̖̭̘͘c̦͍̲̞͍̩̙ḥ͚a̮͎̟̙͜ơ̩̹͎s̤.̝̝ ҉Z̡̖̜͖̰̣͉̜a͖̰͙̬͡l̲̫̳͍̩g̡̟̼̱͚̞̬ͅo̗͜.̟", + "thenezperdianhivemindofchaoszalgo"), + ("The quick brown fox jumps over the lazy dog", "thequickbrownfoxjumpsoverthelazydog"), + ("The quick brown fox jumps over the lazy dog", "thequickbrownfoxjumpsoverthelazydog"), + ("𝕋𝕙𝕖 𝕢𝕦𝕚𝕔𝕜 𝕓𝕣𝕠𝕨𝕟 𝕗𝕠𝕩 𝕛𝕦𝕞𝕡𝕤 𝕠𝕧𝕖𝕣 𝕥𝕙𝕖 𝕝𝕒𝕫𝕪 𝕕𝕠𝕘 ", "thequickbrownfoxjumpsoverthelazydog"), + ] + + for in_str, out_str in test_cases: + if sandcrawler_slugify(in_str) != out_str: + for c in list(sandcrawler_slugify(in_str)): + print(unicodedata.name(c)) + #print(ord(c)) + print("----") + for c in list(out_str): + print(unicodedata.name(c)) + print(in_str) + assert sandcrawler_slugify(in_str) == out_str + + +def release_key_title_sandcrawler(doc: KeyDoc) -> Tuple[str, str]: + ident, title = release_key_title(doc) + slug = sandcrawler_slugify(title) + return (ident, slug) + + def release_key_title_ngram(doc: KeyDoc, n=3) -> Tuple[str, str]: """ Derive a key from title and authors. Authors in contribs list: diff --git a/fuzzycat/main.py b/fuzzycat/main.py index bfce68e..9216808 100644 --- a/fuzzycat/main.py +++ b/fuzzycat/main.py @@ -12,18 +12,20 @@ Run, e.g. fuzzycat cluster --help for more options. 
Example: import argparse import cProfile as profile +import fileinput import io import logging import pstats import sys import tempfile -import fileinput import orjson as json from fuzzycat.build import NgramLookup, TitleTokenList -from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_normalized, - release_key_title_nysiis, release_key_title_ngram) +from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_ngram, + release_key_title_normalized, release_key_title_nysiis, + release_key_title_sandcrawler) + def run_cluster(args): logger = logging.getLogger('main.run_cluster') @@ -32,6 +34,7 @@ def run_cluster(args): 'tnorm': release_key_title_normalized, 'tnysi': release_key_title_nysiis, 'tss': release_key_title_ngram, + 'tsandcrawler': release_key_title_sandcrawler, } key_denylist = None if args.key_denylist: -- cgit v1.2.3 From 30eab70787584a333714b18f1d64f362e4768730 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 10 Nov 2020 22:58:18 -0800 Subject: sandcrawler slugify: yet more unicode corner-cases --- fuzzycat/cluster.py | 63 +++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 47 insertions(+), 16 deletions(-) diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py index 23aebbb..289fd30 100644 --- a/fuzzycat/cluster.py +++ b/fuzzycat/cluster.py @@ -186,8 +186,17 @@ SANDCRAWLER_CHAR_MAP = { '\N{Latin small letter t with stroke}': 't', # bnewbold additions - 'μ': 'u', + '\N{MICRO SIGN}': 'u', + '\N{LATIN SMALL LETTER C}': 'c', + '\N{LATIN SMALL LETTER F WITH HOOK}': 'f', + # bnewbold map-to-null (for non-printing stuff not in the regex) + '\N{PARTIAL DIFFERENTIAL}': '', '\N{LATIN LETTER INVERTED GLOTTAL STOP}': '', + '\N{N-ARY SUMMATION}': '', + '\N{N-ARY PRODUCT}': '', + '\N{MODIFIER LETTER CIRCUMFLEX ACCENT}': '', + '\N{SNOWMAN}': '', + '\N{CARON}': '', } SANDCRAWLER_PREFIX_REMOVE = [ @@ -199,7 +208,8 @@ SANDCRAWLER_PREFIX_REMOVE = [ # regex that matches all characters which should be removed 
SANDCRAWLER_REMOVE_CHAR_REGEX = regex.compile( - r"[\s\p{Punct}\p{M}\p{InCombiningDiacriticalMarks}’·“”‘’“”«»「」¿–±§_`°ʖ©®¤]") + r"[\s\p{Punctuation}\p{M}\p{InCombiningDiacriticalMarks}\u2000-\u206F\u2E00-\u2E7F’·“”‘’“”«»「」¿–±§_`°ʖ©®¤=<>|+$^~≈√∫≤≥÷ƒ∆¬£¢∞¥◊€]" +) def sandcrawler_slugify(raw: str) -> str: @@ -217,7 +227,7 @@ def sandcrawler_slugify(raw: str) -> str: slug = slug.replace("'", "'") # iterate over all chars and replace from map, if in map; then lower-case again - slug = ''.join([(c in SANDCRAWLER_CHAR_MAP and SANDCRAWLER_CHAR_MAP[c]) or c for c in slug]) + slug = ''.join([SANDCRAWLER_CHAR_MAP.get(c, c) for c in slug]) # early bailout before executing regex if not slug: @@ -241,32 +251,53 @@ def test_sandcrawler_slugify() -> None: ("علمية", "علمية"), ("期刊的数字", "期刊的数字"), ("les pré-impressions explorées à partir", "lespreimpressionsexploreesapartir"), - ("μmeter", "umeter"), + + # "MICRO SIGN" + ("\xb5meter", "umeter"), + # "GREEK SMALL LETTER MU" + ("\u03bcmeter", "\u03bcmeter"), + # TODO: ("salt ∧ pepper", "saltpepper"), # TODO: ("new and improved", "newandimproved"), # some via https://github.com/minimaxir/big-list-of-naughty-strings/blob/master/blns.txt - ("¡™£¢∞§¶•ªº–≠ ", "tm£¢∞ao="), - ("⁰⁴⁵₀₁₂", "045012"), - ("社會科學院語學研究所", "社會科學院語學研究所"), + ("-9223372036854775808/-1", "92233720368547758081"), + (r",./;'[]\-= <>?:\"{}|_+ !@#$%^&*()`~", ""), + (" \n\r \x85 \u1680\u2002\u2003\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u202f\u205f\u3000", + ""), + (r"Ω≈ç√∫˜≤≥÷", "ωc"), + (r"åß∂ƒ©˙∆˚¬…æ", "asfae"), + (r"œ∑´®†¥¨ˆøπ“‘", "oeoπ"), + (r"¡™£¢∞§¶•ªº–≠ ", "tmao"), + (r"¸˛Ç◊ı˜Â¯˘¿", "cia"), + (r"ÅÍÎÏ˝ÓÔÒÚÆ☃", "aiiiooouae"), + (r"Œ„´‰ˇÁ¨ˆØ∏”’", "oeao"), + (r"`⁄€‹›fifl‡°·‚—±", "fifl"), + (r"ЁЂЃЄЅІЇЈЉЊЋЌЍЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя", + "еђгєѕііјљњћкиуџабвгдежзииклмнопрстуфхцчшщъыьэюяабвгдежзииклмнопрстуфхцчшщъыьэюя"), + (r"⁰⁴⁵₀₁₂", "045012"), + (r"社會科學院語學研究所", "社會科學院語學研究所"), # TODO: ("パーティーへ行かないか", "パーティーへ行かないか"), 
# TODO: ("表ポあA鷗ŒéB逍Üߪąñ丂㐀𠀀", "表ポあa鷗oeebB逍usaan丂㐀𠀀"), - ("( ͡° ͜ʖ ͡°)", ""), + (r"( ͡° ͜ʖ ͡°)", ""), # emoji ok? I guess - ("👾 🙇 💁 🙅 🙆 🙋 🙎 🙍", "👾🙇💁🙅🙆🙋🙎🙍"), - ("2️⃣ 3️⃣ 4️⃣ 5️⃣", "2345"), - ("﷽ ", "﷽"), - ("̗̺͖̹̯͓Ṯ̤͍̥͇͈h̲́e͏͓̼̗̙̼̣͔ ͇̜̱̠͓͍ͅN͕͠e̗̱z̘̝̜̺͙p̤̺̹͍̯͚e̠̻̠͜r̨̤͍̺̖͔̖̖d̠̟̭̬̝͟i̦͖̩͓͔̤a̠̗̬͉̙n͚͜ ̻̞̰͚ͅh̵͉i̳̞v̢͇ḙ͎͟-҉̭̩̼͔m̤̭̫i͕͇̝̦n̗͙ḍ̟ ̯̲͕͞ǫ̟̯̰̲͙̻̝f ̪̰̰̗̖̭̘͘c̦͍̲̞͍̩̙ḥ͚a̮͎̟̙͜ơ̩̹͎s̤.̝̝ ҉Z̡̖̜͖̰̣͉̜a͖̰͙̬͡l̲̫̳͍̩g̡̟̼̱͚̞̬ͅo̗͜.̟", + (r"👾 🙇 💁 🙅 🙆 🙋 🙎 🙍", "👾🙇💁🙅🙆🙋🙎🙍"), + (r"2️⃣ 3️⃣ 4️⃣ 5️⃣", "2345"), + (r"﷽ ", "﷽"), + (r"̗̺͖̹̯͓Ṯ̤͍̥͇͈h̲́e͏͓̼̗̙̼̣͔ ͇̜̱̠͓͍ͅN͕͠e̗̱z̘̝̜̺͙p̤̺̹͍̯͚e̠̻̠͜r̨̤͍̺̖͔̖̖d̠̟̭̬̝͟i̦͖̩͓͔̤a̠̗̬͉̙n͚͜ ̻̞̰͚ͅh̵͉i̳̞v̢͇ḙ͎͟-҉̭̩̼͔m̤̭̫i͕͇̝̦n̗͙ḍ̟ ̯̲͕͞ǫ̟̯̰̲͙̻̝f ̪̰̰̗̖̭̘͘c̦͍̲̞͍̩̙ḥ͚a̮͎̟̙͜ơ̩̹͎s̤.̝̝ ҉Z̡̖̜͖̰̣͉̜a͖̰͙̬͡l̲̫̳͍̩g̡̟̼̱͚̞̬ͅo̗͜.̟", "thenezperdianhivemindofchaoszalgo"), - ("The quick brown fox jumps over the lazy dog", "thequickbrownfoxjumpsoverthelazydog"), - ("The quick brown fox jumps over the lazy dog", "thequickbrownfoxjumpsoverthelazydog"), - ("𝕋𝕙𝕖 𝕢𝕦𝕚𝕔𝕜 𝕓𝕣𝕠𝕨𝕟 𝕗𝕠𝕩 𝕛𝕦𝕞𝕡𝕤 𝕠𝕧𝕖𝕣 𝕥𝕙𝕖 𝕝𝕒𝕫𝕪 𝕕𝕠𝕘 ", "thequickbrownfoxjumpsoverthelazydog"), + (r"The quick brown fox jumps over the lazy dog", "thequickbrownfoxjumpsoverthelazydog"), + (r"The quick brown fox jumps over the lazy dog", "thequickbrownfoxjumpsoverthelazydog"), + (r"𝕋𝕙𝕖 𝕢𝕦𝕚𝕔𝕜 𝕓𝕣𝕠𝕨𝕟 𝕗𝕠𝕩 𝕛𝕦𝕞𝕡𝕤 𝕠𝕧𝕖𝕣 𝕥𝕙𝕖 𝕝𝕒𝕫𝕪 𝕕𝕠𝕘 ", "thequickbrownfoxjumpsoverthelazydog"), ] for in_str, out_str in test_cases: if sandcrawler_slugify(in_str) != out_str: for c in list(sandcrawler_slugify(in_str)): - print(unicodedata.name(c)) + try: + print(unicodedata.name(c)) + except ValueError: + print(ord(c)) #print(ord(c)) print("----") for c in list(out_str): -- cgit v1.2.3