From 71b8d527da73f99ffb1b09ec1044031e772d1db6 Mon Sep 17 00:00:00 2001 From: Ellen Spertus Date: Tue, 7 Aug 2018 11:24:06 -0700 Subject: Added punctuation removal to slug creation and similarity comparisons --- scalding/src/test/scala/sandcrawler/ScorableTest.scala | 7 +++++++ scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala | 10 ++++++++++ 2 files changed, 17 insertions(+) (limited to 'scalding/src/test') diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala index 8445073..713a7e5 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala @@ -71,6 +71,13 @@ class ScorableTest extends FlatSpec with Matchers { Scorable.titleToSlug("") shouldBe Scorable.NoSlug } + "titleToSlug()" should "strip punctuation" in { + Scorable.titleToSlug("HELLO!:the:re") shouldBe "hello" + Scorable.titleToSlug("a:b:c") shouldBe "a" + Scorable.titleToSlug( + "If you're happy and you know it, clap your hands!") shouldBe "if youre happy and you know it clap your hands" + } + "jsonToMap()" should "return a map, given a legal JSON string" in { Scorable.jsonToMap(JsonString) should not be (None) } diff --git a/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala b/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala index 2df5a22..410819b 100644 --- a/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala +++ b/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala @@ -26,6 +26,16 @@ class StringUtilitiesTest extends FlatSpec with Matchers { StringUtilities.removeAccents("SØREN") shouldBe "SOREN" } + "removePunctuation" should "work on the empty string" in { + StringUtilities.removePunctuation("") shouldBe "" + } + + it should "work on non-empty text strings" in { + StringUtilities.removePunctuation("Hello, world!") shouldBe "Hello world" + StringUtilities.removePunctuation(":-)") shouldBe "" + StringUtilities.removePunctuation("<<---a---b--->") shouldBe "ab" + } + // Tests adapted from https://oldfashionedsoftware.com/2009/11/19/string-distance-and-refactoring-in-scala/ "stringDistance" should "work on empty strings" in { StringUtilities.stringDistance("", "") shouldBe 0 -- cgit v1.2.3