From ba035d8641a5e94d93448bb9a0cd56c7756d7055 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Tue, 10 Nov 2020 19:27:17 -0800
Subject: add support for key denylist

This is to filter out cluster rows where the resulting key is in a given
text file (one key per line).

The intent is to filter out records with either poor metadata, or very
generic metadata, for fuzzy matching. E.g., in many cases it is better to
just not try matching "Letter to the Editor" to any record. This won't
always be the case; we might have journal, volume, issue, and page,
which would allow a match. So the denylist is optional and is passed on
the command line (--key-denylist).
---
 fuzzycat/cluster.py                     |  13 +-
 fuzzycat/main.py                        |   6 +
 fuzzycat/sandcrawler-title-denylist.txt | 559 ++++++++++++++++++++++++++++++++
 3 files changed, 574 insertions(+), 4 deletions(-)
 create mode 100644 fuzzycat/sandcrawler-title-denylist.txt

diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index e4a36bf..dde0688 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -246,6 +246,7 @@ class Cluster:
                  output=sys.stdout,
                  keyfunc=lambda v: v,
                  prefix='fuzzycat-',
+                 key_denylist=None,
                  tmpdir=None):
         """
         Files can be a list of files or "-" for stdin.
@@ -256,6 +257,7 @@ class Cluster:
         self.prefix = prefix
         self.tmpdir = tmpdir
         self.logger = logging.getLogger('fuzzycat.cluster')
+        self.key_denylist = key_denylist
 
     def run(self):
         """
@@ -266,12 +268,15 @@ class Cluster:
         with tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=self.prefix) as tf:
             for line in fileinput.input(files=self.files):
                 try:
-                    id, key = keyfunc(json.loads(line))
-                    print("{}\t{}".format(id, key), file=tf)
+                    ident, key = keyfunc(json.loads(line))
                 except (KeyError, ValueError):
                     counter["key_extraction_failed"] += 1
-                else:
-                    counter["key_ok"] += 1
+                    continue
+                if self.key_denylist and key in self.key_denylist:
+                    counter["key_denylist"] += 1
+                    continue
+                counter["key_ok"] += 1
+                print("{}\t{}".format(ident, key), file=tf)
         sbc = sort_by_column(tf.name, opts='-k 2', prefix=self.prefix, tmpdir=self.tmpdir)
         with open(sbc) as f:
             comment = keyfunc.__name__
diff --git a/fuzzycat/main.py b/fuzzycat/main.py
index c7ba23d..bfce68e 100644
--- a/fuzzycat/main.py
+++ b/fuzzycat/main.py
@@ -33,9 +33,14 @@ def run_cluster(args):
         'tnysi': release_key_title_nysiis,
         'tss': release_key_title_ngram,
     }
+    key_denylist = None
+    if args.key_denylist:
+        with open(args.key_denylist, 'r') as f:
+            key_denylist = [l.strip() for l in f.readlines()]
     cluster = Cluster(files=args.files,
                       keyfunc=types.get(args.type),
                       tmpdir=args.tmpdir,
+                      key_denylist=key_denylist,
                       prefix=args.prefix)
     stats = cluster.run()
     logger.debug(json.dumps(dict(stats)))
@@ -83,6 +88,7 @@ if __name__ == '__main__':
     sub_cluster = subparsers.add_parser('cluster', help='group entities', parents=[parser])
     sub_cluster.set_defaults(func=run_cluster)
     sub_cluster.add_argument('-f', '--files', default="-", help='input files')
+    sub_cluster.add_argument('--key-denylist', help='file path to key denylist')
     sub_cluster.add_argument('-t',
                              '--type',
                              default='title',
diff --git a/fuzzycat/sandcrawler-title-denylist.txt b/fuzzycat/sandcrawler-title-denylist.txt
new file mode 100644
index 0000000..ef575b4
--- /dev/null
+++ b/fuzzycat/sandcrawler-title-denylist.txt
@@ -0,0 +1,559 @@
+abbreviations
+abbreviationsandacronyms
+aboutauthors
+abouttheauthor
+abouttheauthors
+aboutthecover
+abouttheeditors
+abreviations
+abstract
+abstractnotsubmittedforonlinepublication
+abstractoriginalarticle
+abstracts
+abstractsofaapaposterandpodiumpresentations
+abstractsofcommunications
+abstractsofthesesfromthescandinaviancountries
+abstractwithdrawn
+acknowledgement
+acknowledgements
+acknowledgementsvii
+acknowledgementtoreferees
+acknowledgementtoreviewers
+acknowledgment
+acknowledgmentofreferees
+acknowledgments
+addendum
+additionalresources
+address
+advertisement
+advertisersindex
+affect
+affiliation
+afterword
+agenda
+agradecimentos
+agradecimientos
+aimsandscope
+analysis
+annexa
+announcement
+announcements
+annualacknowledgementofmanuscriptreviewers
+anotefromtheeditor
+appendices
+appendix
+appendix1
+appendixa
+appendixb
+appointmentsandstaffchanges
+approximation
+apresentacao
+article
+articlenumber
+articles
+articlesofsignificantinterestselectedfromthisissuebytheeditors
+associationnews
+ataglance
+atribute
+attention
+authorguidelines
+authorindex
+authorindexforvolume81
+authorreply
+authors
+authorsreply
+authorsresponse
+avantpropos
+award
+awardsappointmentsannouncements
+backcover
+background
+backmatter
+berichtigung
+besprechungen
+bibliografia
+bibliographie
+bibliography
+bigdata
+blankpage
+blood
+boardoftrustees
+booknotes
+booknotices
+bookofabstracts
+bookreview
+bookreviews
+bookreviewsandnotices
+bookreviewssection
+booksreceived
+buchbesprechung
+buchbesprechungen
+bulletin
+calendar
+calendarofevents
+calendarofmeetings
+callforarticles
+callforpapers
+casereport
+casereports
+casestudy
+chairmansopeningremarks
+changes
+chaos
+chapter1
+chapter10
+chapter1introduction
+chapter2
+chapter7
+chapteri
+chapterone
+chapteroneintroduction
+chaptertwo
+chapterx
+citation
+classes
+classified
+classifieds
+closingremarks
+collaborateurs
+comment
+commentaries
+commentary
+commentaryon
+commenton
+comments
+commentto
+committee
+communication
+communications
+communicationstotheeditor
+communiquedepresse
+community
+components
+comptesrendus
+computerscience
+concludingremarks
+conclusion
+conclusions
+conferencereport
+congratulations
+congresscalendar
+conservation
+content
+contents
+context
+continuingeducation
+continuingmedicaleducation
+contributors
+copyright
+copyrightform
+copyrightnotice
+correction
+corrections
+correspondence
+corrigenda
+corrigendum
+councilminutes
+cover
+coverimage
+currentresearch
+curriculumvitae
+danksagung
+dearreaders
+decisionmaking
+dedication
+dedicatoria
+definition
+description
+discussion
+diskussion
+distribution
+documents
+ear
+economics
+editorial
+editorialadvisoryboard
+editorialannouncement
+editorialboard
+editorialcomment
+editorialcomments
+editorialconsultants
+editoriale
+editorialeditorial
+editorialforeword
+editorialinformation
+editorialintroduction
+editorialintroductions
+editorialnote
+editorialnotes
+editorialpreface
+editorials
+editorialsoftwaresurveysection
+editorialstaff
+editorialstatement
+editorinchief
+editors
+editorschoice
+editorscomment
+editorscomments
+editorscorner
+editorscorrespondence
+editorsforeword
+editorsintroduction
+editorsletter
+editorsnote
+editorsnotes
+editorspage
+editorspicks
+editorspreface
+education
+einfuhrung
+einleitung
+electrophoresis
+employment
+endnotes
+entrevista
+entscheidungsverzeichnis
+epilogue
+equipment
+errata
+erratum
+essay
+essays
+executivesummary
+exercises
+expediente
+extendedabstracts
+feature
+features
+fichatecnica
+figure3
+finalexam
+finalreport
+focus
+foreward
+foreword
+forthcomingarticles
+forthcomingevents
+fortherecord
+forum
+frequentlyaskedquestions
+fromtheeditor
+fromtheeditorinchief
+fromtheeditors
+fromtheeditorsdesk
+fromthepresident
+frontmatter
+furtherreadings
+genealogy
+generaldiscussion
+generalinformation
+generalintroduction
+germany
+gettingstarted
+glosario
+glossary
+glossaryofterms
+guesteditorial
+guesteditorsforeword
+guesteditorsintroduction
+guideforauthors
+guidelinesforcontributors
+health
+heartfailure
+highlights
+highlightsfromthisissue
+highlightsofthisissue
+history
+home
+homework
+hypothesis
+iii
+imageofthemonth
+impactfactor
+importantnotice
+impressum
+inbrief
+index
+indexofauthors
+indexofauthorsandtitles
+indice
+indicegeneral
+informationforauthors
+informationtoauthors
+inhalt
+inhaltsverzeichnis
+inleiding
+inmemoriam
+inreply
+inresponse
+insidethisissue
+institutenews
+instructionsforauthors
+instructionstoauthors
+interview
+inthestudy
+inthisissue
+introducao
+introduccion
+introduction
+introductionandoverview
+introductiongenerale
+introductiontotheissue
+introductiontothespecialissue
+introductorycomments
+introductoryremarks
+introduzione
+inventions
+invitedcommentary
+issuesandevents
+jobdescription
+journalclub
+journalscan
+keywords
+kurzkommentiert
+languageteaching
+lecture
+letter
+letterfromtheeditor
+letterfromtheeditorinchief
+letterfromtheeditors
+letterfromthepresident
+letters
+letterstotheeditor
+letterstotheeditors
+lettertotheeditor
+lettertotheeditors
+liminaire
+linearalgebra
+linearregression
+links
+listedestableaux
+listofabbreviations
+listofcontributors
+listoffigures
+listofparticipants
+listofpublications
+listofreferees
+listofreviewers
+listoftables
+literacy
+literatur
+literature
+literaturecited
+literaturereview
+literaturrundschau
+literaturverzeichnis
+litteraturverzeichniss
+livresrecus
+lucina
+lungcancer
+magazin
+maintenance
+materials
+materialsafetydatasheet
+materialsandmethods
+medicinalchemistry
+meetingabstracts
+meetingreport
+meetings
+meetingsandconferences
+meetingsofinterest
+membershipapplication
+memoranda
+memorandum
+messagefromgeneralcochairs
+messagefromthechairs
+messagefromtheeditor
+messagefromtheeditorinchief
+messagefromthepresident
+messagefromtheprogramchairs
+messagefromtheprogramcochairs
+metaanalysis
+miscellanea
+miscellaneous
+miscellany
+missionstatement
+motivation
+mrsnews
+name
+newbooks
+newlyelectedmembersofthecollege
+newproducts
+news
+newsandnotes
+newsandreviews
+newsandviews
+newsbriefs
+newsinbrief
+newsnotes
+newsviews
+noii
+note
+notefromtheeditor
+notes
+notesandcomments
+notesandnews
+notesdelecture
+notesforcontributors
+notesoncontributors
+notice
+noticeboard
+notitle
+notitleavailable
+nr
+obituaries
+obituary
+online
+openaccess
+openingaddress
+openingremarks
+oralabstracts
+oralpresentations
+organizingcommittee
+originalarticle
+originalarticles
+other
+outline
+overview
+panorama
+papers
+paperstoappearinforthcomingissues
+partone
+personalandmiscellaneous
+perspective
+perspectives
+philosophy
+pictureofthemonth
+place
+pointofview
+positionsavailable
+poster
+posterpresentations
+postscript
+preface
+prefaceandacknowledgements
+prefacetothesecondedition
+preliminarymaterial
+presentacio
+presentacion
+presentation
+presidentialaddress
+presidentsmessage
+presidentsreport
+pressrelease
+print
+printing
+proceedings
+proceedingsofthenationalacademyofsciences
+profile
+programcommittee
+projectmanagement
+prologue
+publication
+publichealth
+publishersnote
+question
+questionsandanswers
+radiology
+readersforum
+recensiones
+recensions
+recentpublications
+redaktorensforord
+referate
+references
+referenciasbibliograficas
+regression
+rehabilitation
+rejoinder
+remerciements
+reply
+replybyauthors
+researchresearchers
+resenas
+resources
+response
+responsetothelettertotheeditor
+results
+resume
+resumen
+resumes
+resumo
+retraction
+review
+reviewarticle
+revieweracknowledgement
+revieweracknowledgement2013
+reviewers
+reviewessay
+reviews
+reviewsanddescriptionsoftablesandbooks
+reviewsofbooks
+rezension
+rezensionen
+safety
+section
+security
+selectedbibliography
+shortcommunication
+shorternotices
+shortnotices
+socialengineering
+sociology
+sommaire
+sommario
+specialreport
+specialsection
+specifications
+spistresci
+subjectindex
+subscriptions
+suggestedreadings
+sumario
+summaries
+summariesofkeyjournalarticles
+summary
+summaryofproceedings
+summer
+sun
+supplementarymaterial
+symposium
+symptom
+synthese
+tabledesmatieres
+tableofcontents
+tableofcontentsandprologue
+technicalreport
+theauthors
+theauthorsreply
+thebasics
+theeditorsdesk
+thefirstauthorreplies
+thelancet
+theoreticalbackground
+thetimes
+theworldbank
+theyearinreview
+thismonthin
+thismonthinthejournal
+timemanagement
+titeleiinhaltsverzeichnis
+title
+titlepage
+titlepagei
+tocorrespondents
+totheeditor
+unitedkingdom
+unitednations
+unitedstates
+upcomingevents
+vorwort
+website
+welcome
+whatshappening
+whatsnew
+workscited
+yourquestionsanswered
+zudiesemheft
+zusammenfassung
-- 
cgit v1.2.3


From 1e413f8fb818bc211f128b63110327d1e3f88152 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Tue, 10 Nov 2020 21:53:03 -0800
Subject: make: run pytest over fuzzycat/ to catch inline tests

---
 Makefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index 7a0490e..25efac0 100644
--- a/Makefile
+++ b/Makefile
@@ -16,7 +16,7 @@ deps: ## Install dependencies from setup.py into pipenv
 	pipenv install --pre '-e .[dev]'
 
 .PHONY: style
-style: ## Apply import sorting and black source formatting on all files
+style: ## Apply import sorting and yapf source formatting on all files
 	isort --atomic fuzzycat/*
 	yapf -p -i -r fuzzycat/*
 	yapf -p -i -r tests
@@ -27,11 +27,11 @@ dist: ## Create source distribution and wheel
 
 .PHONY: cov
 cov: ## Run coverage report
-	pytest --cov=fuzzycat tests/
+	pytest --cov=fuzzycat fuzzycat/*.py tests/
 
 .PHONY: test
 test: ## Run coverage report
-	pytest -v tests/
+	pytest -v fuzzycat/*.py tests/
 
 .PHONY: lint
 lint: $(PY_FILES)
-- 
cgit v1.2.3


From 2799c8e18f82e6a01ae15e257ed9e0de383bf806 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Tue, 10 Nov 2020 21:54:32 -0800
Subject: gitignore: add .swp (vim)

---
 .gitignore | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.gitignore b/.gitignore
index a1e72a2..d3e0e29 100644
--- a/.gitignore
+++ b/.gitignore
@@ -133,3 +133,7 @@ dmypy.json
 /names.db
 /tmp
 fixtures/cluster_title_normalized_dups_size_keylen.tsv
+
+# Text Editors
+*~
+*.swp
-- 
cgit v1.2.3


From b70a5a18d3091fd61c41391a937e9fef1b27cf9c Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Tue, 10 Nov 2020 21:54:54 -0800
Subject: pipenv: explicit regex dependency

regex, unlike the stdlib 're' module, supports Unicode property classes
(e.g. \p{Punct}, \p{M}).

I couldn't get pipenv to lock after adding this dependency, even though
Pipfile.lock already includes regex as a sub-dependency of something
else.
---
 Pipfile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Pipfile b/Pipfile
index c3606c8..39dedee 100644
--- a/Pipfile
+++ b/Pipfile
@@ -23,6 +23,7 @@ pydantic = "*"
 tokenizers = "*"
 spacy = "*"
 nltk = "*"
+regex = "*"
 
 [requires]
 python_version = "3.7"
-- 
cgit v1.2.3


From d299104f3485c7e99738b521a78adc7e5452f995 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Tue, 10 Nov 2020 21:58:20 -0800
Subject: cluster: count empty keys (and don't return them)

---
 fuzzycat/cluster.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index dde0688..9a8d5db 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -272,6 +272,9 @@ class Cluster:
                 except (KeyError, ValueError):
                     counter["key_extraction_failed"] += 1
                     continue
+                if not key:
+                    counter["key_empty"] += 1
+                    continue
                 if self.key_denylist and key in self.key_denylist:
                     counter["key_denylist"] += 1
                     continue
-- 
cgit v1.2.3


From 7dba5c40ad36515f96ccfd44624d508a91d00da0 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Tue, 10 Nov 2020 21:58:54 -0800
Subject: add sandcrawler-style title key method

---
 fuzzycat/cluster.py | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 fuzzycat/main.py    |   9 ++--
 2 files changed, 132 insertions(+), 3 deletions(-)

diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index 9a8d5db..23aebbb 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -69,15 +69,18 @@ import string
 import subprocess
 import sys
 import tempfile
+import unicodedata
 from typing import Any, Callable, Dict, Generator, List, Optional, Tuple
 
 import fuzzy
+import regex
 from pydantic import BaseModel
 
 __all__ = [
     "release_key_title",
     "release_key_title_normalized",
     "release_key_title_nysiis",
+    "release_key_title_sandcrawler",
     "sort_by_column",
     "group_by",
     "Cluster",
@@ -103,6 +106,7 @@ class KeyDoc(BaseModel):
     title: Optional[str]
     contribs: Optional[List[Contrib]]
 
+
 class ClusterResult(BaseModel):
     """
     Result of clustering.
@@ -155,6 +159,128 @@ def release_key_title_nysiis(doc: KeyDoc) -> Tuple[str, str]:
     return (ident, fuzzy.nysiis(title))
 
 
+# from http://zderadicka.eu/removing-diacritics-marks-from-strings/
+SANDCRAWLER_CHAR_MAP = {
+    '\N{Latin capital letter AE}': 'AE',
+    '\N{Latin small letter ae}': 'ae',
+    '\N{Latin capital letter Eth}': 'D',
+    '\N{Latin small letter eth}': 'd',
+    '\N{Latin capital letter O with stroke}': 'O',
+    '\N{Latin small letter o with stroke}': 'o',
+    '\N{Latin capital letter Thorn}': 'Th',
+    '\N{Latin small letter thorn}': 'th',
+    '\N{Latin small letter sharp s}': 's',
+    '\N{Latin capital letter D with stroke}': 'D',
+    '\N{Latin small letter d with stroke}': 'd',
+    '\N{Latin capital letter H with stroke}': 'H',
+    '\N{Latin small letter h with stroke}': 'h',
+    '\N{Latin small letter dotless i}': 'i',
+    '\N{Latin small letter kra}': 'k',
+    '\N{Latin capital letter L with stroke}': 'L',
+    '\N{Latin small letter l with stroke}': 'l',
+    '\N{Latin capital letter Eng}': 'N',
+    '\N{Latin small letter eng}': 'n',
+    '\N{Latin capital ligature OE}': 'Oe',
+    '\N{Latin small ligature oe}': 'oe',
+    '\N{Latin capital letter T with stroke}': 'T',
+    '\N{Latin small letter t with stroke}': 't',
+
+    # bnewbold additions
+    'μ': 'u',
+    '\N{LATIN LETTER INVERTED GLOTTAL STOP}': '',
+}
+
+SANDCRAWLER_PREFIX_REMOVE = [
+    "original article: ",
+    "original article ",
+    "article: ",
+    "title: ",
+]
+
+# regex that matches all characters which should be removed
+SANDCRAWLER_REMOVE_CHAR_REGEX = regex.compile(
+    r"[\s\p{Punct}\p{M}\p{InCombiningDiacriticalMarks}’·“”‘’“”«»「」¿–±§_`°ʖ©®¤]")
+
+
+def sandcrawler_slugify(raw: str) -> str:
+    """
+    Python re-implementation of sandcrawler Scala code for string comparison
+    ("scorable" strings): lower-cases, strips prefixes/whitespace/punctuation.
+    """
+    slug = raw.strip().lower()
+
+    # drop boilerplate prefixes, keeping the remainder (not the prefix itself)
+    for prefix in SANDCRAWLER_PREFIX_REMOVE:
+        if slug.startswith(prefix):
+            slug = slug[len(prefix):]
+
+    slug = slug.replace("&apos;", "'")
+
+    # iterate over all chars and replace from map, if in map; then lower-case again
+    slug = ''.join([(c in SANDCRAWLER_CHAR_MAP and SANDCRAWLER_CHAR_MAP[c]) or c for c in slug])
+
+    # early bailout before executing regex
+    if not slug:
+        return ""
+
+    slug = unicodedata.normalize('NFKD', slug)
+    slug = SANDCRAWLER_REMOVE_CHAR_REGEX.sub('', slug)
+
+    return slug.lower()
+
+
+def test_sandcrawler_slugify() -> None:
+    test_cases = [
+        ("", ""),
+        ("asdf", "asdf"),
+        ("'Hello World!'", "helloworld"),
+        ("ASDF", "asdf"),
+        ("as\n  df", "asdf"),
+        ("as\u0142  bb \u00f8", "aslbbo"),
+        ("`hello¿", "hello"),
+        ("علمية", "علمية"),
+        ("期刊的数字", "期刊的数字"),
+        ("les pré-impressions explorées à partir", "lespreimpressionsexploreesapartir"),
+        ("μmeter", "umeter"),
+        # TODO: ("salt &and; pepper", "saltpepper"),
+        # TODO: ("new <b>and</b> improved", "newandimproved"),
+
+        # some via https://github.com/minimaxir/big-list-of-naughty-strings/blob/master/blns.txt
+        ("¡™£¢∞§¶•ªº–≠ ", "tm£¢∞ao="),
+        ("⁰⁴⁵₀₁₂", "045012"),
+        ("社會科學院語學研究所", "社會科學院語學研究所"),
+        # TODO: ("パーティーへ行かないか", "パーティーへ行かないか"),
+        # TODO: ("表ポあA鷗ŒéＢ逍Üßªąñ丂㐀𠀀", "表ポあa鷗oeebＢ逍usaan丂㐀𠀀"),
+        ("( ͡° ͜ʖ ͡°)", ""),
+        # emoji ok? I guess
+        ("👾 🙇 💁 🙅 🙆 🙋 🙎 🙍", "👾🙇💁🙅🙆🙋🙎🙍"),
+        ("2️⃣ 3️⃣ 4️⃣ 5️⃣", "2345"),
+        ("﷽ ", "﷽"),
+        ("̗̺͖̹̯͓Ṯ̤͍̥͇͈h̲́e͏͓̼̗̙̼̣͔ ͇̜̱̠͓͍ͅN͕͠e̗̱z̘̝̜̺͙p̤̺̹͍̯͚e̠̻̠͜r̨̤͍̺̖͔̖̖d̠̟̭̬̝͟i̦͖̩͓͔̤a̠̗̬͉̙n͚͜ ̻̞̰͚ͅh̵͉i̳̞v̢͇ḙ͎͟-҉̭̩̼͔m̤̭̫i͕͇̝̦n̗͙ḍ̟ ̯̲͕͞ǫ̟̯̰̲͙̻̝f ̪̰̰̗̖̭̘͘c̦͍̲̞͍̩̙ḥ͚a̮͎̟̙͜ơ̩̹͎s̤.̝̝ ҉Z̡̖̜͖̰̣͉̜a͖̰͙̬͡l̲̫̳͍̩g̡̟̼̱͚̞̬ͅo̗͜.̟",
+         "thenezperdianhivemindofchaoszalgo"),
+        ("Ｔｈｅ ｑｕｉｃｋ ｂｒｏｗｎ ｆｏｘ ｊｕｍｐｓ ｏｖｅｒ ｔｈｅ ｌａｚｙ ｄｏｇ", "thequickbrownfoxjumpsoverthelazydog"),
+        ("Ｔｈｅ ｑｕｉｃｋ ｂｒｏｗｎ ｆｏｘ ｊｕｍｐｓ ｏｖｅｒ ｔｈｅ ｌａｚｙ ｄｏｇ", "thequickbrownfoxjumpsoverthelazydog"),
+        ("𝕋𝕙𝕖 𝕢𝕦𝕚𝕔𝕜 𝕓𝕣𝕠𝕨𝕟 𝕗𝕠𝕩 𝕛𝕦𝕞𝕡𝕤 𝕠𝕧𝕖𝕣 𝕥𝕙𝕖 𝕝𝕒𝕫𝕪 𝕕𝕠𝕘 ", "thequickbrownfoxjumpsoverthelazydog"),
+    ]
+
+    for in_str, out_str in test_cases:
+        if sandcrawler_slugify(in_str) != out_str:
+            for c in list(sandcrawler_slugify(in_str)):
+                print(unicodedata.name(c))
+                #print(ord(c))
+            print("----")
+            for c in list(out_str):
+                print(unicodedata.name(c))
+            print(in_str)
+        assert sandcrawler_slugify(in_str) == out_str
+
+
+def release_key_title_sandcrawler(doc: KeyDoc) -> Tuple[str, str]:
+    """Return (ident, key), with key the sandcrawler slug of the release title."""
+    ident, title = release_key_title(doc)
+    return (ident, sandcrawler_slugify(title))
+
+
 def release_key_title_ngram(doc: KeyDoc, n=3) -> Tuple[str, str]:
     """
     Derive a key from title and authors. Authors in contribs list:
diff --git a/fuzzycat/main.py b/fuzzycat/main.py
index bfce68e..9216808 100644
--- a/fuzzycat/main.py
+++ b/fuzzycat/main.py
@@ -12,18 +12,20 @@ Run, e.g. fuzzycat cluster --help for more options. Example:
 
 import argparse
 import cProfile as profile
+import fileinput
 import io
 import logging
 import pstats
 import sys
 import tempfile
-import fileinput
 
 import orjson as json
 
 from fuzzycat.build import NgramLookup, TitleTokenList
-from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_normalized,
-                              release_key_title_nysiis, release_key_title_ngram)
+from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_ngram,
+                              release_key_title_normalized, release_key_title_nysiis,
+                              release_key_title_sandcrawler)
+
 
 def run_cluster(args):
     logger = logging.getLogger('main.run_cluster')
@@ -32,6 +34,7 @@ def run_cluster(args):
         'tnorm': release_key_title_normalized,
         'tnysi': release_key_title_nysiis,
         'tss': release_key_title_ngram,
+        'tsandcrawler': release_key_title_sandcrawler,
     }
     key_denylist = None
     if args.key_denylist:
-- 
cgit v1.2.3


From 30eab70787584a333714b18f1d64f362e4768730 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Tue, 10 Nov 2020 22:58:18 -0800
Subject: sandcrawler slugify: yet more unicode corner-cases

---
 fuzzycat/cluster.py | 63 +++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 47 insertions(+), 16 deletions(-)

diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index 23aebbb..289fd30 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -186,8 +186,17 @@ SANDCRAWLER_CHAR_MAP = {
     '\N{Latin small letter t with stroke}': 't',
 
     # bnewbold additions
-    'μ': 'u',
+    '\N{MICRO SIGN}': 'u',
+    '\N{LATIN SMALL LETTER C}': 'c',
+    '\N{LATIN SMALL LETTER F WITH HOOK}': 'f',
+    # bnewbold map-to-null (for non-printing stuff not in the regex)
+    '\N{PARTIAL DIFFERENTIAL}': '',
     '\N{LATIN LETTER INVERTED GLOTTAL STOP}': '',
+    '\N{N-ARY SUMMATION}': '',
+    '\N{N-ARY PRODUCT}': '',
+    '\N{MODIFIER LETTER CIRCUMFLEX ACCENT}': '',
+    '\N{SNOWMAN}': '',
+    '\N{CARON}': '',
 }
 
 SANDCRAWLER_PREFIX_REMOVE = [
@@ -199,7 +208,8 @@ SANDCRAWLER_PREFIX_REMOVE = [
 
 # regex that matches all characters which should be removed
 SANDCRAWLER_REMOVE_CHAR_REGEX = regex.compile(
-    r"[\s\p{Punct}\p{M}\p{InCombiningDiacriticalMarks}’·“”‘’“”«»「」¿–±§_`°ʖ©®¤]")
+    r"[\s\p{Punctuation}\p{M}\p{InCombiningDiacriticalMarks}\u2000-\u206F\u2E00-\u2E7F’·“”‘’“”«»「」¿–±§_`°ʖ©®¤=<>|+$^~≈√∫≤≥÷ƒ∆¬£¢∞¥◊€]"
+)
 
 
 def sandcrawler_slugify(raw: str) -> str:
@@ -217,7 +227,7 @@ def sandcrawler_slugify(raw: str) -> str:
     slug = slug.replace("&apos;", "'")
 
     # iterate over all chars and replace from map, if in map; then lower-case again
-    slug = ''.join([(c in SANDCRAWLER_CHAR_MAP and SANDCRAWLER_CHAR_MAP[c]) or c for c in slug])
+    slug = ''.join([SANDCRAWLER_CHAR_MAP.get(c, c) for c in slug])
 
     # early bailout before executing regex
     if not slug:
@@ -241,32 +251,53 @@ def test_sandcrawler_slugify() -> None:
         ("علمية", "علمية"),
         ("期刊的数字", "期刊的数字"),
         ("les pré-impressions explorées à partir", "lespreimpressionsexploreesapartir"),
-        ("μmeter", "umeter"),
+
+        # "MICRO SIGN"
+        ("\xb5meter", "umeter"),
+        # "GREEK SMALL LETTER MU"
+        ("\u03bcmeter", "\u03bcmeter"),
+
         # TODO: ("salt &and; pepper", "saltpepper"),
         # TODO: ("new <b>and</b> improved", "newandimproved"),
 
         # some via https://github.com/minimaxir/big-list-of-naughty-strings/blob/master/blns.txt
-        ("¡™£¢∞§¶•ªº–≠ ", "tm£¢∞ao="),
-        ("⁰⁴⁵₀₁₂", "045012"),
-        ("社會科學院語學研究所", "社會科學院語學研究所"),
+        ("-9223372036854775808/-1", "92233720368547758081"),
+        (r",./;'[]\-= <>?:\"{}|_+ !@#$%^&*()`~", ""),
+        (" \n\r \x85 \u1680\u2002\u2003\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u202f\u205f\u3000",
+         ""),
+        (r"Ω≈ç√∫˜≤≥÷", "ωc"),
+        (r"åß∂ƒ©˙∆˚¬…æ", "asfae"),
+        (r"œ∑´®†¥¨ˆøπ“‘", "oeoπ"),
+        (r"¡™£¢∞§¶•ªº–≠ ", "tmao"),
+        (r"¸˛Ç◊ı˜Â¯˘¿", "cia"),
+        (r"ÅÍÎÏ˝ÓÔÒÚÆ☃", "aiiiooouae"),
+        (r"Œ„´‰ˇÁ¨ˆØ∏”’", "oeao"),
+        (r"`⁄€‹›ﬁﬂ‡°·‚—±", "fifl"),
+        (r"ЁЂЃЄЅІЇЈЉЊЋЌЍЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя",
+         "еђгєѕііјљњћкиуџабвгдежзииклмнопрстуфхцчшщъыьэюяабвгдежзииклмнопрстуфхцчшщъыьэюя"),
+        (r"⁰⁴⁵₀₁₂", "045012"),
+        (r"社會科學院語學研究所", "社會科學院語學研究所"),
         # TODO: ("パーティーへ行かないか", "パーティーへ行かないか"),
         # TODO: ("表ポあA鷗ŒéＢ逍Üßªąñ丂㐀𠀀", "表ポあa鷗oeebＢ逍usaan丂㐀𠀀"),
-        ("( ͡° ͜ʖ ͡°)", ""),
+        (r"( ͡° ͜ʖ ͡°)", ""),
         # emoji ok? I guess
-        ("👾 🙇 💁 🙅 🙆 🙋 🙎 🙍", "👾🙇💁🙅🙆🙋🙎🙍"),
-        ("2️⃣ 3️⃣ 4️⃣ 5️⃣", "2345"),
-        ("﷽ ", "﷽"),
-        ("̗̺͖̹̯͓Ṯ̤͍̥͇͈h̲́e͏͓̼̗̙̼̣͔ ͇̜̱̠͓͍ͅN͕͠e̗̱z̘̝̜̺͙p̤̺̹͍̯͚e̠̻̠͜r̨̤͍̺̖͔̖̖d̠̟̭̬̝͟i̦͖̩͓͔̤a̠̗̬͉̙n͚͜ ̻̞̰͚ͅh̵͉i̳̞v̢͇ḙ͎͟-҉̭̩̼͔m̤̭̫i͕͇̝̦n̗͙ḍ̟ ̯̲͕͞ǫ̟̯̰̲͙̻̝f ̪̰̰̗̖̭̘͘c̦͍̲̞͍̩̙ḥ͚a̮͎̟̙͜ơ̩̹͎s̤.̝̝ ҉Z̡̖̜͖̰̣͉̜a͖̰͙̬͡l̲̫̳͍̩g̡̟̼̱͚̞̬ͅo̗͜.̟",
+        (r"👾 🙇 💁 🙅 🙆 🙋 🙎 🙍", "👾🙇💁🙅🙆🙋🙎🙍"),
+        (r"2️⃣ 3️⃣ 4️⃣ 5️⃣", "2345"),
+        (r"﷽ ", "﷽"),
+        (r"̗̺͖̹̯͓Ṯ̤͍̥͇͈h̲́e͏͓̼̗̙̼̣͔ ͇̜̱̠͓͍ͅN͕͠e̗̱z̘̝̜̺͙p̤̺̹͍̯͚e̠̻̠͜r̨̤͍̺̖͔̖̖d̠̟̭̬̝͟i̦͖̩͓͔̤a̠̗̬͉̙n͚͜ ̻̞̰͚ͅh̵͉i̳̞v̢͇ḙ͎͟-҉̭̩̼͔m̤̭̫i͕͇̝̦n̗͙ḍ̟ ̯̲͕͞ǫ̟̯̰̲͙̻̝f ̪̰̰̗̖̭̘͘c̦͍̲̞͍̩̙ḥ͚a̮͎̟̙͜ơ̩̹͎s̤.̝̝ ҉Z̡̖̜͖̰̣͉̜a͖̰͙̬͡l̲̫̳͍̩g̡̟̼̱͚̞̬ͅo̗͜.̟",
          "thenezperdianhivemindofchaoszalgo"),
-        ("Ｔｈｅ ｑｕｉｃｋ ｂｒｏｗｎ ｆｏｘ ｊｕｍｐｓ ｏｖｅｒ ｔｈｅ ｌａｚｙ ｄｏｇ", "thequickbrownfoxjumpsoverthelazydog"),
-        ("Ｔｈｅ ｑｕｉｃｋ ｂｒｏｗｎ ｆｏｘ ｊｕｍｐｓ ｏｖｅｒ ｔｈｅ ｌａｚｙ ｄｏｇ", "thequickbrownfoxjumpsoverthelazydog"),
-        ("𝕋𝕙𝕖 𝕢𝕦𝕚𝕔𝕜 𝕓𝕣𝕠𝕨𝕟 𝕗𝕠𝕩 𝕛𝕦𝕞𝕡𝕤 𝕠𝕧𝕖𝕣 𝕥𝕙𝕖 𝕝𝕒𝕫𝕪 𝕕𝕠𝕘 ", "thequickbrownfoxjumpsoverthelazydog"),
+        (r"Ｔｈｅ ｑｕｉｃｋ ｂｒｏｗｎ ｆｏｘ ｊｕｍｐｓ ｏｖｅｒ ｔｈｅ ｌａｚｙ ｄｏｇ", "thequickbrownfoxjumpsoverthelazydog"),
+        (r"Ｔｈｅ ｑｕｉｃｋ ｂｒｏｗｎ ｆｏｘ ｊｕｍｐｓ ｏｖｅｒ ｔｈｅ ｌａｚｙ ｄｏｇ", "thequickbrownfoxjumpsoverthelazydog"),
+        (r"𝕋𝕙𝕖 𝕢𝕦𝕚𝕔𝕜 𝕓𝕣𝕠𝕨𝕟 𝕗𝕠𝕩 𝕛𝕦𝕞𝕡𝕤 𝕠𝕧𝕖𝕣 𝕥𝕙𝕖 𝕝𝕒𝕫𝕪 𝕕𝕠𝕘 ", "thequickbrownfoxjumpsoverthelazydog"),
     ]
 
     for in_str, out_str in test_cases:
         if sandcrawler_slugify(in_str) != out_str:
             for c in list(sandcrawler_slugify(in_str)):
-                print(unicodedata.name(c))
+                try:
+                    print(unicodedata.name(c))
+                except ValueError:
+                    print(ord(c))
                 #print(ord(c))
             print("----")
             for c in list(out_str):
-- 
cgit v1.2.3