diff options
Diffstat (limited to 'scalding/src/main')
-rw-r--r-- | scalding/src/main/scala/sandcrawler/Scorable.scala | 3 | ||||
-rw-r--r-- | scalding/src/main/scala/sandcrawler/StringUtilities.scala | 8 |
2 files changed, 9 insertions, 2 deletions
diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala index 77bb7ae..736c175 100644 --- a/scalding/src/main/scala/sandcrawler/Scorable.scala +++ b/scalding/src/main/scala/sandcrawler/Scorable.scala @@ -45,7 +45,8 @@ object Scorable { } def titleToSlug(title : String) : String = { - val slug = StringUtilities.removeAccents(title).split(":")(0).toLowerCase() + val slug = StringUtilities.removePunctuation( + StringUtilities.removeAccents(title).split(":")(0).toLowerCase()) if (slug.isEmpty) { NoSlug } else { diff --git a/scalding/src/main/scala/sandcrawler/StringUtilities.scala b/scalding/src/main/scala/sandcrawler/StringUtilities.scala index 1ae6db3..3058f15 100644 --- a/scalding/src/main/scala/sandcrawler/StringUtilities.scala +++ b/scalding/src/main/scala/sandcrawler/StringUtilities.scala @@ -25,9 +25,15 @@ object StringUtilities { pattern.matcher(sb).replaceAll("") } + // Source: https://stackoverflow.com/a/30076541/631051 + def removePunctuation(s: String) : String = { + s.replaceAll("""[\p{Punct}&&[^.]]""", "") + } + // Adapted from: https://stackoverflow.com/a/16018452/631051 def similarity(s1a : String, s2a : String) : Double = { - val (s1, s2) = (removeAccents(s1a), removeAccents(s2a)) + val (s1, s2) = (removeAccents(removePunctuation(s1a)), + removeAccents(removePunctuation(s2a))) val longer : String = if (s1.length > s2.length) s1 else s2 val shorter : String = if (s1.length > s2.length) s2 else s1 if (longer.length == 0) { |