about | summary | refs | log | tree | commit | diff | stats
path: root/scalding/src/main
diff options
context:
space:
mode:
author: Ellen Spertus <ellen.spertus@gmail.com> 2018-08-07 11:24:06 -0700
committer: Ellen Spertus <ellen.spertus@gmail.com> 2018-08-07 11:24:06 -0700
commit: 71b8d527da73f99ffb1b09ec1044031e772d1db6 (patch)
tree: 1bf8d713130fec986dbb084f542f143f0005ab62 /scalding/src/main
parent: 8dc3bf5c6f68d1fffa9f940ba1024ed95e76ed64 (diff)
download: sandcrawler-71b8d527da73f99ffb1b09ec1044031e772d1db6.tar.gz
download: sandcrawler-71b8d527da73f99ffb1b09ec1044031e772d1db6.zip
Added punctuation removal to slug creation and similarity comparisons
Diffstat (limited to 'scalding/src/main')
-rw-r--r-- scalding/src/main/scala/sandcrawler/Scorable.scala 3
-rw-r--r-- scalding/src/main/scala/sandcrawler/StringUtilities.scala 8
2 files changed, 9 insertions, 2 deletions
diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala
index 77bb7ae..736c175 100644
--- a/scalding/src/main/scala/sandcrawler/Scorable.scala
+++ b/scalding/src/main/scala/sandcrawler/Scorable.scala
@@ -45,7 +45,8 @@ object Scorable {
}
def titleToSlug(title : String) : String = {
- val slug = StringUtilities.removeAccents(title).split(":")(0).toLowerCase()
+ val slug = StringUtilities.removePunctuation(
+ StringUtilities.removeAccents(title).split(":")(0).toLowerCase())
if (slug.isEmpty) {
NoSlug
} else {
diff --git a/scalding/src/main/scala/sandcrawler/StringUtilities.scala b/scalding/src/main/scala/sandcrawler/StringUtilities.scala
index 1ae6db3..3058f15 100644
--- a/scalding/src/main/scala/sandcrawler/StringUtilities.scala
+++ b/scalding/src/main/scala/sandcrawler/StringUtilities.scala
@@ -25,9 +25,15 @@ object StringUtilities {
pattern.matcher(sb).replaceAll("")
}
+ // Source: https://stackoverflow.com/a/30076541/631051
+ def removePunctuation(s: String) : String = {
+ s.replaceAll("""[\p{Punct}&&[^.]]""", "")
+ }
+
// Adapted from: https://stackoverflow.com/a/16018452/631051
def similarity(s1a : String, s2a : String) : Double = {
- val (s1, s2) = (removeAccents(s1a), removeAccents(s2a))
+ val (s1, s2) = (removeAccents(removePunctuation(s1a)),
+ removeAccents(removePunctuation(s2a)))
val longer : String = if (s1.length > s2.length) s1 else s2
val shorter : String = if (s1.length > s2.length) s2 else s1
if (longer.length == 0) {