diff options
Diffstat (limited to 'scalding/src/test/scala/sandcrawler/ScorableTest.scala')
-rw-r--r-- | scalding/src/test/scala/sandcrawler/ScorableTest.scala | 112 |
1 files changed, 28 insertions, 84 deletions
diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala index 40801a0..2f80492 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala @@ -9,7 +9,7 @@ import org.scalatest._ import parallelai.spyglass.hbase.HBaseConstants.SourceMode class ScorableTest extends FlatSpec with Matchers { - val JsonString = """ + val JsonString = """ { "title": "<<TITLE>>", "authors": [ @@ -55,96 +55,40 @@ class ScorableTest extends FlatSpec with Matchers { } """ - performUnitTests() - performPipelineTests() - - def performUnitTests() { - "titleToSlug()" should "extract the parts of titles before a colon" in { - Scorable.titleToSlug("HELLO:there") shouldBe "hello" - } - - it should "extract an entire colon-less string" in { - Scorable.titleToSlug("hello THERE") shouldBe "hello there" - } - - it should "return Scorable.NoSlug if given empty string" in { - Scorable.titleToSlug("") shouldBe Scorable.NoSlug - } - - it should "return Scorable.NoSlug if given null" in { - Scorable.titleToSlug(null) shouldBe Scorable.NoSlug - } - - "titleToSlug()" should "strip punctuation" in { - Scorable.titleToSlug("HELLO!:the:re") shouldBe "hello" - Scorable.titleToSlug("a:b:c") shouldBe "a" - Scorable.titleToSlug( - "If you're happy and you know it, clap your hands!") shouldBe "if youre happy and you know it clap your hands" - } + "titleToSlug()" should "extract the parts of titles before a colon" in { + Scorable.titleToSlug("HELLO:there") shouldBe "hello" + } - "jsonToMap()" should "return a map, given a legal JSON string" in { - Scorable.jsonToMap(JsonString) should not be (None) - } + it should "extract an entire colon-less string" in { + Scorable.titleToSlug("hello THERE") shouldBe "hello there" + } - it should "return None, given illegal JSON" in { - Scorable.jsonToMap("illegal{,json{{") should be (None) - } + it should "return Scorable.NoSlug if given empty string" in { + Scorable.titleToSlug("") shouldBe Scorable.NoSlug + } - "computeOutput()" should "return Scorable.MaxScore if given identical ReduceFeatures" in { - val score = Scorable.computeSimilarity( - new ReduceFeatures(JsonString), new ReduceFeatures(JsonString)) - score shouldBe Scorable.MaxScore - } + it should "return Scorable.NoSlug if given null" in { + Scorable.titleToSlug(null) shouldBe Scorable.NoSlug } - def performPipelineTests() { - /* + "titleToSlug()" should "strip punctuation" in { + Scorable.titleToSlug("HELLO!:the:re") shouldBe "hello" + Scorable.titleToSlug("a:b:c") shouldBe "a" + Scorable.titleToSlug( + "If you're happy and you know it, clap your hands!") shouldBe "if youre happy and you know it clap your hands" + } - val output = "/tmp/testOutput" - val input = "/tmp/testInput" - val (testTable, testHost) = ("test-table", "dummy-host:2181") + "jsonToMap()" should "return a map, given a legal JSON string" in { + Scorable.jsonToMap(JsonString) should not be (None) + } - val grobidSampleData = List( - List(Bytes.toBytes("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q"), - Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 1"))), - List(Bytes.toBytes("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"), - Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 2: TNG"))), - List(Bytes.toBytes("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT"), - Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 3: The Sequel"))), - List(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56"), - Bytes.toBytes(MalformedGrobidString))) + it should "return None, given illegal JSON" in { + Scorable.jsonToMap("illegal{,json{{") should be (None) + } - JobTest("sandcrawler.HBaseCrossrefScoreJob") - .arg("test", "") - .arg("app.conf.path", "app.conf") - .arg("output", output) - .arg("hbase-table", testTable) - .arg("zookeeper-hosts", testHost) - .arg("crossref-input", input) - .arg("debug", "true") - .source[Tuple](HBaseCrossrefScore.getHBaseSource(testTable, testHost), - grobidSampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*))) - .source(TextLine(input), List( - 0 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", "DOI-0"), - 1 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2").replace("<<DOI>>", "DOI-0.5"), - 2 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"), - 3 -> CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1"))) - .sink[(Int, String, String, String, String)](TypedTsv[(Int, - String, String, String, String)](output)) { - // Grobid titles: - // "Title 1", "Title 2: TNG", "Title 3: The Sequel" - // crossref slugs: - // "Title 1: TNG", "Title 1: TNG 2", "Title 1: TNG 3", "Title 2 Rebooted" - // Join should have 3 "Title 1" slugs and 1 "Title 2" slug - outputBuffer => - "The pipeline" should "return a 4-element list" in { - outputBuffer should have length 4 - } - } - .run - .finish -} - */ + "computeOutput()" should "return Scorable.MaxScore if given identical ReduceFeatures" in { + val score = Scorable.computeSimilarity( + new ReduceFeatures(JsonString), new ReduceFeatures(JsonString)) + score shouldBe Scorable.MaxScore } } - |