about | summary | refs | log | tree | commit | diff | stats
path: root/scalding
diff options
context:
space:
mode:
author: Ellen Spertus <ellen.spertus@gmail.com> 2018-08-07 11:24:06 -0700
committer: Ellen Spertus <ellen.spertus@gmail.com> 2018-08-07 11:24:06 -0700
commit: 71b8d527da73f99ffb1b09ec1044031e772d1db6 (patch)
tree: 1bf8d713130fec986dbb084f542f143f0005ab62 /scalding
parent: 8dc3bf5c6f68d1fffa9f940ba1024ed95e76ed64 (diff)
download: sandcrawler-71b8d527da73f99ffb1b09ec1044031e772d1db6.tar.gz
sandcrawler-71b8d527da73f99ffb1b09ec1044031e772d1db6.zip
Added punctuation removal to slug creation and similarity comparisons
Diffstat (limited to 'scalding')
-rw-r--r-- scalding/src/main/scala/sandcrawler/Scorable.scala 3
-rw-r--r-- scalding/src/main/scala/sandcrawler/StringUtilities.scala 8
-rw-r--r-- scalding/src/test/scala/sandcrawler/ScorableTest.scala 7
-rw-r--r-- scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala 10
4 files changed, 26 insertions, 2 deletions
diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala
index 77bb7ae..736c175 100644
--- a/scalding/src/main/scala/sandcrawler/Scorable.scala
+++ b/scalding/src/main/scala/sandcrawler/Scorable.scala
@@ -45,7 +45,8 @@ object Scorable {
}
def titleToSlug(title : String) : String = {
- val slug = StringUtilities.removeAccents(title).split(":")(0).toLowerCase()
+ val slug = StringUtilities.removePunctuation(
+ StringUtilities.removeAccents(title).split(":")(0).toLowerCase())
if (slug.isEmpty) {
NoSlug
} else {
diff --git a/scalding/src/main/scala/sandcrawler/StringUtilities.scala b/scalding/src/main/scala/sandcrawler/StringUtilities.scala
index 1ae6db3..3058f15 100644
--- a/scalding/src/main/scala/sandcrawler/StringUtilities.scala
+++ b/scalding/src/main/scala/sandcrawler/StringUtilities.scala
@@ -25,9 +25,15 @@ object StringUtilities {
pattern.matcher(sb).replaceAll("")
}
+ // Source: https://stackoverflow.com/a/30076541/631051
+ def removePunctuation(s: String) : String = { // Returns s with every \p{Punct} character deleted, except the period, which is kept.
+ s.replaceAll("""[\p{Punct}&&[^.]]""", "") // The && intersection subtracts the period from the class. No regex flags are set, so \p{Punct} is presumably the ASCII-only POSIX class and non-ASCII punctuation stays in place -- TODO confirm that is intended.
+ }
+
// Adapted from: https://stackoverflow.com/a/16018452/631051
def similarity(s1a : String, s2a : String) : Double = {
- val (s1, s2) = (removeAccents(s1a), removeAccents(s2a))
+ val (s1, s2) = (removeAccents(removePunctuation(s1a)),
+ removeAccents(removePunctuation(s2a)))
val longer : String = if (s1.length > s2.length) s1 else s2
val shorter : String = if (s1.length > s2.length) s2 else s1
if (longer.length == 0) {
diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
index 8445073..713a7e5 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
@@ -71,6 +71,13 @@ class ScorableTest extends FlatSpec with Matchers {
Scorable.titleToSlug("") shouldBe Scorable.NoSlug
}
+ "titleToSlug()" should "strip punctuation" in { // Checks that slugs come out lower-cased, cut at the first colon, and free of punctuation.
+ Scorable.titleToSlug("HELLO!:the:re") shouldBe "hello" // Only the text before the first colon is kept; the exclamation mark is then stripped.
+ Scorable.titleToSlug("a:b:c") shouldBe "a" // Nothing but the first colon-separated segment survives.
+ Scorable.titleToSlug(
+ "If you're happy and you know it, clap your hands!") shouldBe "if youre happy and you know it clap your hands" // Apostrophe, comma and exclamation mark are removed; spaces are preserved.
+ }
+
"jsonToMap()" should "return a map, given a legal JSON string" in {
Scorable.jsonToMap(JsonString) should not be (None)
}
diff --git a/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala b/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala
index 2df5a22..410819b 100644
--- a/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala
+++ b/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala
@@ -26,6 +26,16 @@ class StringUtilitiesTest extends FlatSpec with Matchers {
StringUtilities.removeAccents("SØREN") shouldBe "SOREN"
}
+ "removePunctuation" should "work on the empty string" in { // Edge case: there is nothing to strip.
+ StringUtilities.removePunctuation("") shouldBe "" // Empty input yields empty output.
+ }
+
+ it should "work on non-empty text strings" in { // NOTE(review): no case here exercises the period that the regex in removePunctuation deliberately keeps.
+ StringUtilities.removePunctuation("Hello, world!") shouldBe "Hello world" // Comma and exclamation mark go; letters and the space stay.
+ StringUtilities.removePunctuation(":-)") shouldBe "" // A string made only of punctuation collapses to the empty string.
+ StringUtilities.removePunctuation("<<---a---b--->") shouldBe "ab" // Angle brackets and hyphens are treated as punctuation too.
+ }
+
// Tests adapted from https://oldfashionedsoftware.com/2009/11/19/string-distance-and-refactoring-in-scala/
"stringDistance" should "work on empty strings" in {
StringUtilities.stringDistance("", "") shouldBe 0