diff options
| author | Ellen Spertus <ellen.spertus@gmail.com> | 2018-07-26 04:36:43 -0700 | 
|---|---|---|
| committer | Ellen Spertus <ellen.spertus@gmail.com> | 2018-07-26 04:36:43 -0700 | 
| commit | 6d2bb4787150682236f4c349f8e469026fe3d490 (patch) | |
| tree | 9c86515c4280c87c2f382d92213a8ef3cd8e18eb /scalding/src/test/scala | |
| parent | 15ae7006cd8238bb9453f27be6aa5388a6002ce8 (diff) | |
| download | sandcrawler-6d2bb4787150682236f4c349f8e469026fe3d490.tar.gz sandcrawler-6d2bb4787150682236f4c349f8e469026fe3d490.zip | |
Computes and outputs (score, sha1, doi, grobidTitle, crossrefTitle).
Diffstat (limited to 'scalding/src/test/scala')
| -rw-r--r-- | scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala | 31 | 
1 file changed, 22 insertions, 9 deletions
| diff --git a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala index bd9dcd3..e6211a2 100644 --- a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala +++ b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala @@ -163,10 +163,14 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers {    val (testTable, testHost) = ("test-table", "dummy-host:2181")    val grobidSampleData = List( -    List(Bytes.toBytes("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q"), Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title1"))), -    List(Bytes.toBytes("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"), Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title2: TNG"))), -    List(Bytes.toBytes("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT"), Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title3: The Sequel"))), -    List(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56"), Bytes.toBytes(MalformedGrobidString))) +    List(Bytes.toBytes("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q"), +      Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 1"))), +    List(Bytes.toBytes("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"), +      Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 2: TNG"))), +    List(Bytes.toBytes("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT"), +      Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 3: The Sequel"))), +    List(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56"),  +      Bytes.toBytes(MalformedGrobidString)))    JobTest("sandcrawler.HBaseCrossrefScoreJob")      .arg("test", "") @@ -180,18 +184,27 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers {        grobidSampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*)))      .source(TextLine(input), List(        0 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", "DOI-0"), -      1 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", 
"DOI-0.5"), -      2 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", "DOI-0.75"), +      1 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2").replace("<<DOI>>", "DOI-0.5"), +      2 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"),        3 -> CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1"))) -    .sink[String](TypedTsv[String](output)) { +    .sink[(Int, String, String, String, String)](TypedTsv[(Int, +    String, String, String, String)](output)) { +      // Grobid titles:  +      //   "Title 1", "Title 2: TNG", "Title 3: The Sequel" +      // crossref slugs:  +      //   "Title 1: TNG", "Title 1: TNG 2", "Title 1: TNG 3", "Title 2 Rebooted" +      // Join should have 3 "Title  1" slugs and 1 "Title 2" slug        outputBuffer =>        it should "return a 4-element list" in { -        outputBuffer should have length 3 +        outputBuffer should have length 4        } +        /*        it should "return the right first entry" in {          val (slug, slug0, slug1, sha1, grobidJson, crossrefJson) = outputBuffer(0) -        slug shouldBe "title1" +        slug shouldBe "title 1" +        slug shouldBe slug0 +        slug shouldBe slug1          sha1 shouldBe new String(grobidSampleData(0)(0), "UTF-8")          grobidJson shouldBe new String(grobidSampleData(0)(1), "UTF-8")        } | 
