1 files changed, 44 insertions, 5 deletions
diff --git a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
index a59b278..f52c5b4 100644
--- a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
+++ b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
@@ -1,13 +1,17 @@
 package sandcrawler
 
 import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions}
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
 import org.scalatest._
 import parallelai.spyglass.hbase.HBaseConstants.SourceMode
 
-class HBaseCrossrefScoreTest extends FlatSpec with Matchers {
+class HBaseCrossrefScoreTest extends FunSpec with TupleConversions {
   val GrobidString = """
 {
-  "title": "Dummy Example File",
+  "title": "<<TITLE>>",
   "authors": [
     {"name": "Brewster Kahle"},
     {"name": "J Doe"}
@@ -50,6 +54,7 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers {
   "annex": null
 }
 """
+  val GrobidStringWithTitle = GrobidString.replace("<<TITLE>>", "Dummy Example File")
   val GrobidStringWithoutTitle = GrobidString.replace("title", "nottitle")
   val MalformedGrobidString = GrobidString.replace("}", "")
 
@@ -69,7 +74,7 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers {
                                 "delay-in-days" : 0, "content-version" : "tdm" }],
   "content-domain" : { "domain" : [], "crossmark-restriction" : false }, 
   "published-print" : { "date-parts" : [ [ 1996 ] ] }, 
-  "DOI" : "10.1016/0987-7983(96)87729-2", 
+  "DOI" : "<<DOI>>",
   "type" : "journal-article", 
   "created" : { "date-parts" : [ [ 2002, 7, 25 ] ], 
     "date-time" : "2002-07-25T15:09:41Z", 
@@ -77,7 +82,7 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers {
   "page" : "186-187", 
   "source" : "Crossref", 
   "is-referenced-by-count" : 0, 
-  "title" : [ "les ferments lactiques: classification, propriÃ©tÃ©s, utilisations agroalimentaires" ], 
+  "title" : [ "<<TITLE>>" ],
   "prefix" : "10.1016", 
   "volume" : "9", 
   "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ], 
@@ -105,9 +110,10 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers {
   "subject" : [ "Pediatrics, Perinatology, and Child Health" ]
 }
 """
+  val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "SomeTitle")
   val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle")
   val MalformedCrossrefString = CrossrefString.replace("}", "")
-
+/*
   "titleToSlug()" should "extract the parts of titles before a colon" in {
     val slug = HBaseCrossrefScore.titleToSlug("HELLO:there")
     slug should contain ("hello")
@@ -147,4 +153,37 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers {
     val slug = HBaseCrossrefScore.grobidToSlug(MalformedCrossrefString)
      slug shouldBe None
   }
+ */
+  
+  val output = "/tmp/testOutput"
+  val input = "/tmp/testInput"
+  val (testTable, testHost) = ("test-table", "dummy-host:2181")
+
+  val grobidSampleData = List(
+    List(Bytes.toBytes("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q"), Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title1"))),
+    List(Bytes.toBytes("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"), Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title2: TNG"))),
+    List(Bytes.toBytes("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT"), Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title3: The Sequel"))),
+    List(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56"), Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title4"))))
+
+  JobTest("sandcrawler.HBaseCrossrefScoreJob")
+    .arg("test", "")
+    .arg("app.conf.path", "app.conf")
+    .arg("output", output)
+    .arg("hbase-table", testTable)
+    .arg("zookeeper-hosts", testHost)
+    .arg("crossref-input", input)
+    .arg("debug", "true")
+    .source[Tuple](HBaseCrossrefScore.getHBaseSource(testTable, testHost),
+      grobidSampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*)))
+    .source(TextLine(input), List((
+      "0" -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", "DOI-0"),
+      "1" -> CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1"))))
+    .sink[Tuple](TypedTsv[(String, String, String)](output)) {
+      outputBuffer =>
+      it("should return a 2-element list.") {
+        assert(outputBuffer.size === 2)
+      }
+    }
+    .run
+    .finish
 }