about | summary | refs | log | tree | commit | diff | stats
path: root/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala
diff options
context:
space:
mode:
author: Ellen Spertus <ellen.spertus@gmail.com> 2018-08-07 11:24:06 -0700
committer: Ellen Spertus <ellen.spertus@gmail.com> 2018-08-07 11:24:06 -0700
commit: 71b8d527da73f99ffb1b09ec1044031e772d1db6 (patch)
tree: 1bf8d713130fec986dbb084f542f143f0005ab62 /scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala
parent: 8dc3bf5c6f68d1fffa9f940ba1024ed95e76ed64 (diff)
download: sandcrawler-71b8d527da73f99ffb1b09ec1044031e772d1db6.tar.gz
sandcrawler-71b8d527da73f99ffb1b09ec1044031e772d1db6.zip
Added punctuation removal to slug creation and similarity comparisons
Diffstat (limited to 'scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala')
-rw-r--r-- scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala 10
1 files changed, 10 insertions, 0 deletions
diff --git a/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala b/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala
index 2df5a22..410819b 100644
--- a/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala
+++ b/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala
@@ -26,6 +26,16 @@ class StringUtilitiesTest extends FlatSpec with Matchers {
StringUtilities.removeAccents("SØREN") shouldBe "SOREN"
}
+ "removePunctuation" should "work on the empty string" in {
+ StringUtilities.removePunctuation("") shouldBe ""
+ }
+
+ it should "work on non-empty text strings" in {
+ StringUtilities.removePunctuation("Hello, world!") shouldBe "Hello world"
+ StringUtilities.removePunctuation(":-)") shouldBe ""
+ StringUtilities.removePunctuation("<<---a---b--->") shouldBe "ab"
+ }
+
// Tests adapted from https://oldfashionedsoftware.com/2009/11/19/string-distance-and-refactoring-in-scala/
"stringDistance" should "work on empty strings" in {
StringUtilities.stringDistance("", "") shouldBe 0