From 71b8d527da73f99ffb1b09ec1044031e772d1db6 Mon Sep 17 00:00:00 2001 From: Ellen Spertus Date: Tue, 7 Aug 2018 11:24:06 -0700 Subject: Added punctuation removal to slug creation and similarity comparisons --- scalding/src/main/scala/sandcrawler/Scorable.scala | 3 ++- scalding/src/main/scala/sandcrawler/StringUtilities.scala | 8 +++++++- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'scalding/src/main/scala/sandcrawler') diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala index 77bb7ae..736c175 100644 --- a/scalding/src/main/scala/sandcrawler/Scorable.scala +++ b/scalding/src/main/scala/sandcrawler/Scorable.scala @@ -45,7 +45,8 @@ object Scorable { } def titleToSlug(title : String) : String = { - val slug = StringUtilities.removeAccents(title).split(":")(0).toLowerCase() + val slug = StringUtilities.removePunctuation( + StringUtilities.removeAccents(title).split(":")(0).toLowerCase()) if (slug.isEmpty) { NoSlug } else { diff --git a/scalding/src/main/scala/sandcrawler/StringUtilities.scala b/scalding/src/main/scala/sandcrawler/StringUtilities.scala index 1ae6db3..3058f15 100644 --- a/scalding/src/main/scala/sandcrawler/StringUtilities.scala +++ b/scalding/src/main/scala/sandcrawler/StringUtilities.scala @@ -25,9 +25,15 @@ object StringUtilities { pattern.matcher(sb).replaceAll("") } + // Source: https://stackoverflow.com/a/30076541/631051 + def removePunctuation(s: String) : String = { + s.replaceAll("""[\p{Punct}&&[^.]]""", "") + } + // Adapted from: https://stackoverflow.com/a/16018452/631051 def similarity(s1a : String, s2a : String) : Double = { - val (s1, s2) = (removeAccents(s1a), removeAccents(s2a)) + val (s1, s2) = (removeAccents(removePunctuation(s1a)), + removeAccents(removePunctuation(s2a))) val longer : String = if (s1.length > s2.length) s1 else s2 val shorter : String = if (s1.length > s2.length) s2 else s1 if (longer.length == 0) { -- cgit v1.2.3