diff options
Diffstat (limited to 'scalding/src/test/scala')
| -rw-r--r-- | scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala | 31 | 
1 files changed, 22 insertions, 9 deletions
| diff --git a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala index bd9dcd3..e6211a2 100644 --- a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala +++ b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala @@ -163,10 +163,14 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers {    val (testTable, testHost) = ("test-table", "dummy-host:2181")    val grobidSampleData = List( -    List(Bytes.toBytes("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q"), Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title1"))), -    List(Bytes.toBytes("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"), Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title2: TNG"))), -    List(Bytes.toBytes("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT"), Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title3: The Sequel"))), -    List(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56"), Bytes.toBytes(MalformedGrobidString))) +    List(Bytes.toBytes("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q"), +      Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 1"))), +    List(Bytes.toBytes("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"), +      Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 2: TNG"))), +    List(Bytes.toBytes("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT"), +      Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 3: The Sequel"))), +    List(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56"),  +      Bytes.toBytes(MalformedGrobidString)))    JobTest("sandcrawler.HBaseCrossrefScoreJob")      .arg("test", "") @@ -180,18 +184,27 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers {        grobidSampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*)))      .source(TextLine(input), List(        0 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", "DOI-0"), -      1 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", "DOI-0.5"), -      2 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", "DOI-0.75"), +      1 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2").replace("<<DOI>>", "DOI-0.5"), +      2 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"),        3 -> CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1"))) -    .sink[String](TypedTsv[String](output)) { +    .sink[(Int, String, String, String, String)](TypedTsv[(Int, +    String, String, String, String)](output)) { +      // Grobid titles:  +      //   "Title 1", "Title 2: TNG", "Title 3: The Sequel" +      // crossref slugs:  +      //   "Title 1: TNG", "Title 1: TNG 2", "Title 1: TNG 3", "Title 2 Rebooted" +      // Join should have 3 "Title  1" slugs and 1 "Title 2" slug        outputBuffer =>        it should "return a 4-element list" in { -        outputBuffer should have length 3 +        outputBuffer should have length 4        } +        /*        it should "return the right first entry" in {          val (slug, slug0, slug1, sha1, grobidJson, crossrefJson) = outputBuffer(0) -        slug shouldBe "title1" +        slug shouldBe "title 1" +        slug shouldBe slug0 +        slug shouldBe slug1          sha1 shouldBe new String(grobidSampleData(0)(0), "UTF-8")          grobidJson shouldBe new String(grobidSampleData(0)(1), "UTF-8")        } | 
