diff options
| author | Ellen Spertus <ellen.spertus@gmail.com> | 2018-07-30 11:55:19 -0700 | 
|---|---|---|
| committer | Ellen Spertus <ellen.spertus@gmail.com> | 2018-07-30 11:55:19 -0700 | 
| commit | 81dbd0e05653682dccb8bc74b39067b4ee7ac1f2 (patch) | |
| tree | 657118763cc81f1f1ae5538bed1b18c8d82f8f6f /scalding/src/test | |
| parent | dd0df0fe3574352011d6a0fe3c12e59b0a4b8259 (diff) | |
| download | sandcrawler-81dbd0e05653682dccb8bc74b39067b4ee7ac1f2.tar.gz sandcrawler-81dbd0e05653682dccb8bc74b39067b4ee7ac1f2.zip | |
Changed scoring, including adding code to compute string differences. Turned off line length checking.
New scores:
['(583,sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q,DOI-0,'title 1','title 1: tng')']
['(500,sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q,DOI-0.5,'title 1','title 1: tng 2')']
['(500,sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q,DOI-0.75,'title 1','title 1: tng 3')']
['(588,sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU,DOI-1,'title 2: tng','title 2: rebooted')']
Diffstat (limited to 'scalding/src/test')
| -rw-r--r-- | scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala | 49 | 
1 file changed, 48 insertions, 1 deletion
| diff --git a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala index 655dda1..e6ff4a8 100644 --- a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala +++ b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala @@ -188,6 +188,53 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers {      HBaseCrossrefScore.removeAccents("SØREN") shouldBe "SOREN"    } +  // Tests adapted from https://oldfashionedsoftware.com/2009/11/19/string-distance-and-refactoring-in-scala/ +  "stringDistance" should "work on empty strings" in { +    HBaseCrossrefScore.stringDistance("", "") shouldBe 0 +    HBaseCrossrefScore.stringDistance("a", "") shouldBe 1 +    HBaseCrossrefScore.stringDistance("", "a") shouldBe 1 +    HBaseCrossrefScore.stringDistance("abc", "") shouldBe 3 +    HBaseCrossrefScore.stringDistance("", "abc") shouldBe 3 +  } + +  it should "work on equal strings" in { +    HBaseCrossrefScore.stringDistance("", "") shouldBe 0 +    HBaseCrossrefScore.stringDistance("a", "a") shouldBe 0 +    HBaseCrossrefScore.stringDistance("abc", "abc") shouldBe 0 +  } + +  it should "work where only inserts are needed" in { +    HBaseCrossrefScore.stringDistance("", "a") shouldBe 1 +    HBaseCrossrefScore.stringDistance("a", "ab") shouldBe 1 +    HBaseCrossrefScore.stringDistance("b", "ab") shouldBe 1 +    HBaseCrossrefScore.stringDistance("ac", "abc") shouldBe 1 +    HBaseCrossrefScore.stringDistance("abcdefg", "xabxcdxxefxgx") shouldBe 6 +  } + +  it should "work where only deletes are needed" in { +    HBaseCrossrefScore.stringDistance( "a", "") shouldBe 1 +    HBaseCrossrefScore.stringDistance( "ab", "a") shouldBe 1 +    HBaseCrossrefScore.stringDistance( "ab", "b") shouldBe 1 +    HBaseCrossrefScore.stringDistance("abc", "ac") shouldBe 1 +    HBaseCrossrefScore.stringDistance("xabxcdxxefxgx", "abcdefg") shouldBe 6 +  } + +  it should "work where only substitutions are 
needed" in { +    HBaseCrossrefScore.stringDistance(  "a",   "b") shouldBe 1 +    HBaseCrossrefScore.stringDistance( "ab",  "ac") shouldBe 1 +    HBaseCrossrefScore.stringDistance( "ac",  "bc") shouldBe 1 +    HBaseCrossrefScore.stringDistance("abc", "axc") shouldBe 1 +    HBaseCrossrefScore.stringDistance("xabxcdxxefxgx", "1ab2cd34ef5g6") shouldBe 6 +  } + +  it should "work where many operations are needed" in { +    HBaseCrossrefScore.stringDistance("example", "samples") shouldBe 3 +    HBaseCrossrefScore.stringDistance("sturgeon", "urgently") shouldBe 6 +    HBaseCrossrefScore.stringDistance("levenshtein", "frankenstein") shouldBe 6 +    HBaseCrossrefScore.stringDistance("distance", "difference") shouldBe 5 +    HBaseCrossrefScore.stringDistance("java was neat", "scala is great") shouldBe 7 +  } +    //  Pipeline tests    val output = "/tmp/testOutput" @@ -227,7 +274,7 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers {        //   "Title 1: TNG", "Title 1: TNG 2", "Title 1: TNG 3", "Title 2 Rebooted"        // Join should have 3 "Title  1" slugs and 1 "Title 2" slug        outputBuffer => -      it should "return a 4-element list" in { +      "The pipeline" should "return a 4-element list" in {          outputBuffer should have length 4        } | 
