diff options
| author | Ellen Spertus <ellen.spertus@gmail.com> | 2018-08-07 09:56:19 -0700 | 
|---|---|---|
| committer | Ellen Spertus <ellen.spertus@gmail.com> | 2018-08-07 09:56:19 -0700 | 
| commit | 7eed53615e3a106d1cbf7cc451b74674fd2c3daa (patch) | |
| tree | aae47e30d973317e3ea68c37a10089e3af2e22cd /scalding | |
| parent | 713b8316d9170ec595f71d4f27df8d3184350921 (diff) | |
| download | sandcrawler-7eed53615e3a106d1cbf7cc451b74674fd2c3daa.tar.gz sandcrawler-7eed53615e3a106d1cbf7cc451b74674fd2c3daa.zip | |
Added StringUtilitiesTest.scala, which passes.
Diffstat (limited to 'scalding')
| -rw-r--r-- | scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala | 75 | 
1 file changed, 75 insertions, 0 deletions
// Source path: scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala
package sandcrawler

import org.scalatest._

/**
 * Unit tests for `StringUtilities.removeAccents` and `StringUtilities.stringDistance`.
 *
 * The `removeAccents` cases check that accented or stroked letters are replaced by
 * their plain counterparts with case preserved. The `stringDistance` cases pin the
 * distance reported for pairs that differ by inserts, deletes, substitutions, or a
 * mix of them.
 */
class StringUtilitiesTest extends FlatSpec with Matchers {
  import StringUtilities.removeAccents

  /**
   * Asserts, in the order given, that `StringUtilities.stringDistance(from, to)`
   * is `expected` for every `(from, to) -> expected` entry.
   */
  private def checkDistances(expectations: ((String, String), Int)*): Unit =
    expectations.foreach { case ((from, to), expected) =>
      StringUtilities.stringDistance(from, to) shouldBe expected
    }

  "removeAccents()" should "handle the empty string" in {
    removeAccents("") shouldBe ""
  }

  it should "not change a string with unaccented characters" in {
    val plain = "abc123"
    removeAccents(plain) shouldBe "abc123"
  }

  it should "remove accents from Ls" in {
    // \u0141 / \u0142 are the upper- and lower-case L with stroke.
    val stroked = "E\u0141\u0142en"
    removeAccents(stroked) shouldBe "ELlen"
  }

  it should "remove accents from Es without changing case" in {
    // \u00e9 is a single precomposed code point, so the output must stay one char long.
    val stripped = removeAccents("\u00e9")
    stripped should have length 1
    stripped shouldBe "e"
  }

  it should "convert the ø in Soren" in {
    // Lower-case first, then upper-case.
    Seq("Søren" -> "Soren", "SØREN" -> "SOREN").foreach { case (accented, expected) =>
      removeAccents(accented) shouldBe expected
    }
  }

  // Tests adapted from https://oldfashionedsoftware.com/2009/11/19/string-distance-and-refactoring-in-scala/
  "stringDistance" should "work on empty strings" in {
    checkDistances(
      ("", "") -> 0,
      ("a", "") -> 1,
      ("", "a") -> 1,
      ("abc", "") -> 3,
      ("", "abc") -> 3
    )
  }

  it should "work on equal strings" in {
    checkDistances(
      ("", "") -> 0,
      ("a", "a") -> 0,
      ("abc", "abc") -> 0
    )
  }

  it should "work where only inserts are needed" in {
    checkDistances(
      ("", "a") -> 1,
      ("a", "ab") -> 1,
      ("b", "ab") -> 1,
      ("ac", "abc") -> 1,
      ("abcdefg", "xabxcdxxefxgx") -> 6
    )
  }

  it should "work where only deletes are needed" in {
    checkDistances(
      ("a", "") -> 1,
      ("ab", "a") -> 1,
      ("ab", "b") -> 1,
      ("abc", "ac") -> 1,
      ("xabxcdxxefxgx", "abcdefg") -> 6
    )
  }

  it should "work where only substitutions are needed" in {
    checkDistances(
      ("a", "b") -> 1,
      ("ab", "ac") -> 1,
      ("ac", "bc") -> 1,
      ("abc", "axc") -> 1,
      ("xabxcdxxefxgx", "1ab2cd34ef5g6") -> 6
    )
  }

  it should "work where many operations are needed" in {
    checkDistances(
      ("example", "samples") -> 3,
      ("sturgeon", "urgently") -> 6,
      ("levenshtein", "frankenstein") -> 6,
      ("distance", "difference") -> 5,
      ("java was neat", "scala is great") -> 7
    )
  }
}
