diff options
author | Ellen Spertus <ellen.spertus@gmail.com> | 2018-08-07 11:24:06 -0700 |
---|---|---|
committer | Ellen Spertus <ellen.spertus@gmail.com> | 2018-08-07 11:24:06 -0700 |
commit | 71b8d527da73f99ffb1b09ec1044031e772d1db6 (patch) | |
tree | 1bf8d713130fec986dbb084f542f143f0005ab62 /scalding/src/test | |
parent | 8dc3bf5c6f68d1fffa9f940ba1024ed95e76ed64 (diff) | |
download | sandcrawler-71b8d527da73f99ffb1b09ec1044031e772d1db6.tar.gz sandcrawler-71b8d527da73f99ffb1b09ec1044031e772d1db6.zip |
Added punctuation removal to slug creation and similarity comparisons
Diffstat (limited to 'scalding/src/test')
-rw-r--r-- | scalding/src/test/scala/sandcrawler/ScorableTest.scala | 7 | ||||
-rw-r--r-- | scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala | 10 |
2 files changed, 17 insertions, 0 deletions
diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
index 8445073..713a7e5 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
@@ -71,6 +71,13 @@ class ScorableTest extends FlatSpec with Matchers {
     Scorable.titleToSlug("") shouldBe Scorable.NoSlug
   }
 
+  "titleToSlug()" should "strip punctuation" in {
+    Scorable.titleToSlug("HELLO!:the:re") shouldBe "hello"
+    Scorable.titleToSlug("a:b:c") shouldBe "a"
+    Scorable.titleToSlug(
+      "If you're happy and you know it, clap your hands!") shouldBe "if youre happy and you know it clap your hands"
+  }
+
   "jsonToMap()" should "return a map, given a legal JSON string" in {
     Scorable.jsonToMap(JsonString) should not be (None)
   }
diff --git a/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala b/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala
index 2df5a22..410819b 100644
--- a/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala
+++ b/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala
@@ -26,6 +26,16 @@ class StringUtilitiesTest extends FlatSpec with Matchers {
     StringUtilities.removeAccents("SØREN") shouldBe "SOREN"
   }
 
+  "removePunctuation" should "work on the empty string" in {
+    StringUtilities.removePunctuation("") shouldBe ""
+  }
+
+  it should "work on non-empty text strings" in {
+    StringUtilities.removePunctuation("Hello, world!") shouldBe "Hello world"
+    StringUtilities.removePunctuation(":-)") shouldBe ""
+    StringUtilities.removePunctuation("<<---a---b--->") shouldBe "ab"
+  }
+
   // Tests adapted from https://oldfashionedsoftware.com/2009/11/19/string-distance-and-refactoring-in-scala/
   "stringDistance" should "work on empty strings" in {
     StringUtilities.stringDistance("", "") shouldBe 0