From 6d2bb4787150682236f4c349f8e469026fe3d490 Mon Sep 17 00:00:00 2001 From: Ellen Spertus Date: Thu, 26 Jul 2018 04:36:43 -0700 Subject: Computes and outputs (score, sha1, doi, grobidTitle, crossrefTitle). --- .../scala/sandcrawler/HBaseCrossrefScoreTest.scala | 31 +++++++++++++++------- 1 file changed, 22 insertions(+), 9 deletions(-) (limited to 'scalding/src/test/scala') diff --git a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala index bd9dcd3..e6211a2 100644 --- a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala +++ b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala @@ -163,10 +163,14 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers { val (testTable, testHost) = ("test-table", "dummy-host:2181") val grobidSampleData = List( - List(Bytes.toBytes("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q"), Bytes.toBytes(GrobidString.replace("<>", "Title1"))), - List(Bytes.toBytes("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"), Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title2: TNG"))), - List(Bytes.toBytes("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT"), Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title3: The Sequel"))), - List(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56"), Bytes.toBytes(MalformedGrobidString))) + List(Bytes.toBytes("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q"), + Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 1"))), + List(Bytes.toBytes("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"), + Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 2: TNG"))), + List(Bytes.toBytes("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT"), + Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 3: The Sequel"))), + List(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56"), + Bytes.toBytes(MalformedGrobidString))) JobTest("sandcrawler.HBaseCrossrefScoreJob") .arg("test", "") @@ -180,18 +184,27 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers { grobidSampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*))) .source(TextLine(input), List( 0 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", "DOI-0"), - 1 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", "DOI-0.5"), - 2 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", "DOI-0.75"), + 1 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2").replace("<<DOI>>", "DOI-0.5"), + 2 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"), 3 -> CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1"))) - .sink[String](TypedTsv[String](output)) { + .sink[(Int, String, String, String, String)](TypedTsv[(Int, + String, String, String, String)](output)) { + // Grobid titles: + // "Title 1", "Title 2: TNG", "Title 3: The Sequel" + // crossref slugs: + // "Title 1: TNG", "Title 1: TNG 2", "Title 1: TNG 3", "Title 2 Rebooted" + // Join should have 3 "Title 1" slugs and 1 "Title 2" slug outputBuffer => it should "return a 4-element list" in { - outputBuffer should have length 3 + outputBuffer should have length 4 } + /* it should "return the right first entry" in { val (slug, slug0, slug1, sha1, grobidJson, crossrefJson) = outputBuffer(0) - slug shouldBe "title1" + slug shouldBe "title 1" + slug shouldBe slug0 + slug shouldBe slug1 sha1 shouldBe new String(grobidSampleData(0)(0), "UTF-8") grobidJson shouldBe new String(grobidSampleData(0)(1), "UTF-8") } -- cgit v1.2.3