about | summary | refs | log | tree | commit | diff | stats
path: root/scalding/src/test
diff options
context:
space:
mode:
author: Ellen Spertus <ellen.spertus@gmail.com> 2018-08-07 11:24:06 -0700
committer: Ellen Spertus <ellen.spertus@gmail.com> 2018-08-07 11:24:06 -0700
commit: 71b8d527da73f99ffb1b09ec1044031e772d1db6 (patch)
tree: 1bf8d713130fec986dbb084f542f143f0005ab62 /scalding/src/test
parent: 8dc3bf5c6f68d1fffa9f940ba1024ed95e76ed64 (diff)
download: sandcrawler-71b8d527da73f99ffb1b09ec1044031e772d1db6.tar.gz
sandcrawler-71b8d527da73f99ffb1b09ec1044031e772d1db6.zip
Added punctuation removal to slug creation and similarity comparisons
Diffstat (limited to 'scalding/src/test')
-rw-r--r-- scalding/src/test/scala/sandcrawler/ScorableTest.scala 7
-rw-r--r-- scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala 10
2 files changed, 17 insertions, 0 deletions
diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
index 8445073..713a7e5 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
@@ -71,6 +71,13 @@ class ScorableTest extends FlatSpec with Matchers {
Scorable.titleToSlug("") shouldBe Scorable.NoSlug
}
+ "titleToSlug()" should "strip punctuation" in {
+ Scorable.titleToSlug("HELLO!:the:re") shouldBe "hello"
+ Scorable.titleToSlug("a:b:c") shouldBe "a"
+ Scorable.titleToSlug(
+ "If you're happy and you know it, clap your hands!") shouldBe "if youre happy and you know it clap your hands"
+ }
+
"jsonToMap()" should "return a map, given a legal JSON string" in {
Scorable.jsonToMap(JsonString) should not be (None)
}
diff --git a/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala b/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala
index 2df5a22..410819b 100644
--- a/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala
+++ b/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala
@@ -26,6 +26,16 @@ class StringUtilitiesTest extends FlatSpec with Matchers {
StringUtilities.removeAccents("SØREN") shouldBe "SOREN"
}
+ "removePunctuation" should "work on the empty string" in {
+ StringUtilities.removePunctuation("") shouldBe ""
+ }
+
+ it should "work on non-empty text strings" in {
+ StringUtilities.removePunctuation("Hello, world!") shouldBe "Hello world"
+ StringUtilities.removePunctuation(":-)") shouldBe ""
+ StringUtilities.removePunctuation("<<---a---b--->") shouldBe "ab"
+ }
+
// Tests adapted from https://oldfashionedsoftware.com/2009/11/19/string-distance-and-refactoring-in-scala/
"stringDistance" should "work on empty strings" in {
StringUtilities.stringDistance("", "") shouldBe 0