From 7eed53615e3a106d1cbf7cc451b74674fd2c3daa Mon Sep 17 00:00:00 2001 From: Ellen Spertus Date: Tue, 7 Aug 2018 09:56:19 -0700 Subject: Added StringUtilitiesTest.scala, which passes. --- .../scala/sandcrawler/StringUtilitiesTest.scala | 75 ++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala (limited to 'scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala') diff --git a/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala b/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala new file mode 100644 index 0000000..2df5a22 --- /dev/null +++ b/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala @@ -0,0 +1,75 @@ +package sandcrawler + +import org.scalatest._ + +class StringUtilitiesTest extends FlatSpec with Matchers { + "removeAccents()" should "handle the empty string" in { + StringUtilities.removeAccents("") shouldBe "" + } + + it should "not change a string with unaccented characters" in { + StringUtilities.removeAccents("abc123") shouldBe "abc123" + } + + it should "remove accents from Ls" in { + StringUtilities.removeAccents("E\u0141\u0142en") shouldBe "ELlen" + } + + it should "remove accents from Es without changing case" in { + val result = StringUtilities.removeAccents("\u00e9") + result should have length 1 + result shouldBe "e" + } + + it should "convert the ø in Soren" in { + StringUtilities.removeAccents("Søren") shouldBe "Soren" + StringUtilities.removeAccents("SØREN") shouldBe "SOREN" + } + + // Tests adapted from https://oldfashionedsoftware.com/2009/11/19/string-distance-and-refactoring-in-scala/ + "stringDistance" should "work on empty strings" in { + StringUtilities.stringDistance("", "") shouldBe 0 + StringUtilities.stringDistance("a", "") shouldBe 1 + StringUtilities.stringDistance("", "a") shouldBe 1 + StringUtilities.stringDistance("abc", "") shouldBe 3 + StringUtilities.stringDistance("", "abc") shouldBe 3 + } + + it should "work on equal strings" in { + StringUtilities.stringDistance("", "") shouldBe 0 + StringUtilities.stringDistance("a", "a") shouldBe 0 + StringUtilities.stringDistance("abc", "abc") shouldBe 0 + } + + it should "work where only inserts are needed" in { + StringUtilities.stringDistance("", "a") shouldBe 1 + StringUtilities.stringDistance("a", "ab") shouldBe 1 + StringUtilities.stringDistance("b", "ab") shouldBe 1 + StringUtilities.stringDistance("ac", "abc") shouldBe 1 + StringUtilities.stringDistance("abcdefg", "xabxcdxxefxgx") shouldBe 6 + } + + it should "work where only deletes are needed" in { + StringUtilities.stringDistance( "a", "") shouldBe 1 + StringUtilities.stringDistance( "ab", "a") shouldBe 1 + StringUtilities.stringDistance( "ab", "b") shouldBe 1 + StringUtilities.stringDistance("abc", "ac") shouldBe 1 + StringUtilities.stringDistance("xabxcdxxefxgx", "abcdefg") shouldBe 6 + } + + it should "work where only substitutions are needed" in { + StringUtilities.stringDistance( "a", "b") shouldBe 1 + StringUtilities.stringDistance( "ab", "ac") shouldBe 1 + StringUtilities.stringDistance( "ac", "bc") shouldBe 1 + StringUtilities.stringDistance("abc", "axc") shouldBe 1 + StringUtilities.stringDistance("xabxcdxxefxgx", "1ab2cd34ef5g6") shouldBe 6 + } + + it should "work where many operations are needed" in { + StringUtilities.stringDistance("example", "samples") shouldBe 3 + StringUtilities.stringDistance("sturgeon", "urgently") shouldBe 6 + StringUtilities.stringDistance("levenshtein", "frankenstein") shouldBe 6 + StringUtilities.stringDistance("distance", "difference") shouldBe 5 + StringUtilities.stringDistance("java was neat", "scala is great") shouldBe 7 + } +} -- cgit v1.2.3 From 71b8d527da73f99ffb1b09ec1044031e772d1db6 Mon Sep 17 00:00:00 2001 From: Ellen Spertus Date: Tue, 7 Aug 2018 11:24:06 -0700 Subject: Added punctuation removal to slug creation and similarity comparisons --- scalding/src/main/scala/sandcrawler/Scorable.scala | 3 ++- scalding/src/main/scala/sandcrawler/StringUtilities.scala | 8 +++++++- scalding/src/test/scala/sandcrawler/ScorableTest.scala | 7 +++++++ scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala | 10 ++++++++++ 4 files changed, 26 insertions(+), 2 deletions(-) (limited to 'scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala') diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala index 77bb7ae..736c175 100644 --- a/scalding/src/main/scala/sandcrawler/Scorable.scala +++ b/scalding/src/main/scala/sandcrawler/Scorable.scala @@ -45,7 +45,8 @@ object Scorable { } def titleToSlug(title : String) : String = { - val slug = StringUtilities.removeAccents(title).split(":")(0).toLowerCase() + val slug = StringUtilities.removePunctuation( + StringUtilities.removeAccents(title).split(":")(0).toLowerCase()) if (slug.isEmpty) { NoSlug } else { diff --git a/scalding/src/main/scala/sandcrawler/StringUtilities.scala b/scalding/src/main/scala/sandcrawler/StringUtilities.scala index 1ae6db3..3058f15 100644 --- a/scalding/src/main/scala/sandcrawler/StringUtilities.scala +++ b/scalding/src/main/scala/sandcrawler/StringUtilities.scala @@ -25,9 +25,15 @@ object StringUtilities { pattern.matcher(sb).replaceAll("") } + // Source: https://stackoverflow.com/a/30076541/631051 + def removePunctuation(s: String) : String = { + s.replaceAll("""[\p{Punct}&&[^.]]""", "") + } + // Adapted from: https://stackoverflow.com/a/16018452/631051 def similarity(s1a : String, s2a : String) : Double = { - val (s1, s2) = (removeAccents(s1a), removeAccents(s2a)) + val (s1, s2) = (removeAccents(removePunctuation(s1a)), + removeAccents(removePunctuation(s2a))) val longer : String = if (s1.length > s2.length) s1 else s2 val shorter : String = if (s1.length > s2.length) s2 else s1 if (longer.length == 0) { diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala index 8445073..713a7e5 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala @@ -71,6 +71,13 @@ class ScorableTest extends FlatSpec with Matchers { Scorable.titleToSlug("") shouldBe Scorable.NoSlug } + "titleToSlug()" should "strip punctuation" in { + Scorable.titleToSlug("HELLO!:the:re") shouldBe "hello" + Scorable.titleToSlug("a:b:c") shouldBe "a" + Scorable.titleToSlug( + "If you're happy and you know it, clap your hands!") shouldBe "if youre happy and you know it clap your hands" + } + "jsonToMap()" should "return a map, given a legal JSON string" in { Scorable.jsonToMap(JsonString) should not be (None) } diff --git a/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala b/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala index 2df5a22..410819b 100644 --- a/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala +++ b/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala @@ -26,6 +26,16 @@ class StringUtilitiesTest extends FlatSpec with Matchers { StringUtilities.removeAccents("SØREN") shouldBe "SOREN" } + "removePunctuation" should "work on the empty string" in { + StringUtilities.removePunctuation("") shouldBe "" + } + + it should "work on non-empty text strings" in { + StringUtilities.removePunctuation("Hello, world!") shouldBe "Hello world" + StringUtilities.removePunctuation(":-)") shouldBe "" + StringUtilities.removePunctuation("<<---a---b--->") shouldBe "ab" + } + // Tests adapted from https://oldfashionedsoftware.com/2009/11/19/string-distance-and-refactoring-in-scala/ "stringDistance" should "work on empty strings" in { StringUtilities.stringDistance("", "") shouldBe 0 -- cgit v1.2.3