aboutsummaryrefslogtreecommitdiffstats
path: root/scalding/src/main/scala
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2018-08-15 20:23:39 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2018-08-15 20:23:39 -0700
commit: 3c42a789d121445fdc7608bc642129189bee07f5 (patch)
tree: 3c7fbf1c2ea02fda56bd2910dd79b1bbd2aee800 /scalding/src/main/scala
parent: 4ca3d5088520d219eccbc5921928c5b67d8e998a (diff)
download: sandcrawler-3c42a789d121445fdc7608bc642129189bee07f5.tar.gz
sandcrawler-3c42a789d121445fdc7608bc642129189bee07f5.zip
comment about possible slugification process
Diffstat (limited to 'scalding/src/main/scala')
-rw-r--r-- scalding/src/main/scala/sandcrawler/StringUtilities.scala | 9
1 files changed, 9 insertions, 0 deletions
diff --git a/scalding/src/main/scala/sandcrawler/StringUtilities.scala b/scalding/src/main/scala/sandcrawler/StringUtilities.scala
index b6e5554..6eeff7e 100644
--- a/scalding/src/main/scala/sandcrawler/StringUtilities.scala
+++ b/scalding/src/main/scala/sandcrawler/StringUtilities.scala
@@ -4,6 +4,15 @@ import java.text.Normalizer
import java.util.regex.Pattern
object StringUtilities {
+ // bnewbold: I propose that we:
+ // 1. keep only \p{Ideographic}, \p{Alphabetic}, and \p{Digit}
+ // 2. strip accents
+ // 3. "lower-case" (unicode-aware)
+ // 4. do any final custom/manual mappings
+ //
+ // We should check (test) that null bytes are handled, in addition to other
+ // more obvious characters
+
// Adapted from https://git-wip-us.apache.org/repos/asf?p=commons-lang.git;a=blob;f=src/main/java/org/apache/commons/lang3/StringUtils.java;h=1d7b9b99335865a88c509339f700ce71ce2c71f2;hb=HEAD#l934
def removeAccents(s : String) : String = {
val replacements = Map(