aboutsummaryrefslogtreecommitdiffstats
path: root/scalding/src
diff options
context:
space:
mode:
Diffstat (limited to 'scalding/src')
-rw-r--r--scalding/src/main/scala/sandcrawler/StringUtilities.scala9
1 files changed, 9 insertions, 0 deletions
diff --git a/scalding/src/main/scala/sandcrawler/StringUtilities.scala b/scalding/src/main/scala/sandcrawler/StringUtilities.scala
index b6e5554..6eeff7e 100644
--- a/scalding/src/main/scala/sandcrawler/StringUtilities.scala
+++ b/scalding/src/main/scala/sandcrawler/StringUtilities.scala
@@ -4,6 +4,15 @@ import java.text.Normalizer
import java.util.regex.Pattern
object StringUtilities {
+ // bnewbold: I propose that we:
+ // 1. keep only \p{Ideographic}, \p{Alphabetic}, and \p{Digit}
+ // 2. strip accents
+ // 3. "lower-case" (unicode-aware)
+ // 4. do any final custom/manual mappings
+ //
+ // We should check (test) that null bytes are handled, in addition to other
+ // more obvious characters
+
// Adapted from https://git-wip-us.apache.org/repos/asf?p=commons-lang.git;a=blob;f=src/main/java/org/apache/commons/lang3/StringUtils.java;h=1d7b9b99335865a88c509339f700ce71ce2c71f2;hb=HEAD#l934
def removeAccents(s : String) : String = {
val replacements = Map(