diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-08-15 20:23:39 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-08-15 20:23:39 -0700 |
commit | 3c42a789d121445fdc7608bc642129189bee07f5 (patch) | |
tree | 3c7fbf1c2ea02fda56bd2910dd79b1bbd2aee800 /scalding/src/main/scala | |
parent | 4ca3d5088520d219eccbc5921928c5b67d8e998a (diff) | |
download | sandcrawler-3c42a789d121445fdc7608bc642129189bee07f5.tar.gz sandcrawler-3c42a789d121445fdc7608bc642129189bee07f5.zip |
comment about possible slugification process
Diffstat (limited to 'scalding/src/main/scala')
-rw-r--r-- | scalding/src/main/scala/sandcrawler/StringUtilities.scala | 9 |
1 files changed, 9 insertions, 0 deletions
```diff
diff --git a/scalding/src/main/scala/sandcrawler/StringUtilities.scala b/scalding/src/main/scala/sandcrawler/StringUtilities.scala
index b6e5554..6eeff7e 100644
--- a/scalding/src/main/scala/sandcrawler/StringUtilities.scala
+++ b/scalding/src/main/scala/sandcrawler/StringUtilities.scala
@@ -4,6 +4,15 @@ import java.text.Normalizer
 import java.util.regex.Pattern
 
 object StringUtilities {
+  // bnewbold: I propose that we:
+  //   1. keep only \p{Ideographic}, \p{Alphabetic}, and \p{Digit}
+  //   2. strip accents
+  //   3. "lower-case" (unicode-aware)
+  //   4. do any final custom/manual mappings
+  //
+  // We should check (test) that null bytes are handled, in addition to other
+  // more obvious characters
+
   // Adapted from https://git-wip-us.apache.org/repos/asf?p=commons-lang.git;a=blob;f=src/main/java/org/apache/commons/lang3/StringUtils.java;h=1d7b9b99335865a88c509339f700ce71ce2c71f2;hb=HEAD#l934
   def removeAccents(s : String) : String = {
     val replacements = Map(
```