15 files changed, 1497 insertions, 35 deletions
diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
new file mode 100644
index 0000000..8302b8f
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
@@ -0,0 +1,172 @@
+package sandcrawler
+
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.TextLine
+import com.twitter.scalding.TupleConversions
+import com.twitter.scalding.TypedTsv
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import org.scalatest._
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+
+class CrossrefScorableTest extends FlatSpec with Matchers {
+  // scalastyle:off
+  val CrossrefString =
+"""
+{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" },
+  "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ],
+    "date-time" : "2017-10-23T17:19:16Z",
+    "timestamp" : { "$numberLong" : "1508779156477" } },
+  "reference-count" : 0,
+  "publisher" : "Elsevier BV",
+  "issue" : "3",
+  "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/",
+                    "start" : { "date-parts" : [ [ 1996, 1, 1 ] ],
+                                "date-time" : "1996-01-01T00:00:00Z",
+                                "timestamp" : { "$numberLong" : "820454400000" } },
+                                "delay-in-days" : 0, "content-version" : "tdm" }],
+  "content-domain" : { "domain" : [], "crossmark-restriction" : false },
+  "published-print" : { "date-parts" : [ [ 1996 ] ] },
+  "DOI" : "<<DOI>>",
+  "type" : "journal-article",
+  "created" : { "date-parts" : [ [ 2002, 7, 25 ] ],
+    "date-time" : "2002-07-25T15:09:41Z",
+    "timestamp" : { "$numberLong" : "1027609781000" } },
+  "page" : "186-187",
+  "source" : "Crossref",
+  "is-referenced-by-count" : 0,
+  "title" : [ "<<TITLE>>" ],
+  "prefix" : "10.1016",
+  "volume" : "9",
+  "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ],
+  "member" : "78",
+  "container-title" : [ "Journal de Pédiatrie et de Puériculture" ],
+  "link" : [ { "URL" :  "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/xml",
+               "content-type" : "text/xml",
+                 "content-version" : "vor",
+                 "intended-application" : "text-mining" },
+               { "URL" :
+  "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/plain",
+                 "content-type" : "text/plain",
+                 "content-version" : "vor",
+                 "intended-application" : "text-mining" } ],
+  "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ],
+                  "date-time" : "2015-09-03T10:03:43Z",
+                  "timestamp" : { "$numberLong" : "1441274623000" } },
+  "score" : 1,
+  "issued" : { "date-parts" : [ [ 1996 ] ] },
+  "references-count" : 0,
+  "alternative-id" : [ "0987-7983(96)87729-2" ],
+  "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2",
+  "ISSN" : [ "0987-7983" ],
+  "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ],
+  "subject" : [ "Pediatrics, Perinatology, and Child Health" ]
+}
+""".replace("<<DOI>>", "10.123/aBc")
+  // scalastyle:on
+  val CrossrefStringWithGoodTitle = CrossrefString.replace("<<TITLE>>", "Some Title")
+  val CrossrefStringWithMaximumTitle = CrossrefString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength)
+  val CrossrefStringWithExcessiveTitle = CrossrefString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength + "0")
+  val CrossrefStringWithNullTitle = CrossrefString.replace("\"<<TITLE>>\"", "null")
+  val CrossrefStringWithEmptyTitle = CrossrefString.replace("<<TITLE>>", "")
+  val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle")
+  val MalformedCrossrefString = CrossrefString.replace("}", "")
+  val CrossrefStringWithNoAuthors = CrossrefString.replace("<<TITLE>>", "Some Valid Title").replace("author", "no-author")
+  val CrossrefStringWrongType = CrossrefString.replace("<<TITLE>>", "Some Valid Title").replace("journal-article", "other")
+  val CrossrefStringNoType = CrossrefString.replace("<<TITLE>>", "Some Valid Title").replace("type", "not-type")
+
+  // Unit tests
+  "CrossrefScorable.jsonToMapFeatures()" should "handle invalid JSON" in {
+    CrossrefScorable.jsonToMapFeatures(MalformedCrossrefString) should be (None)
+  }
+
+  it should "handle missing title" in {
+    CrossrefScorable.jsonToMapFeatures(CrossrefStringWithoutTitle) should be (None)
+  }
+
+  it should "handle null title" in {
+    CrossrefScorable.jsonToMapFeatures(CrossrefStringWithNullTitle) should be (None)
+  }
+
+  it should "handle empty title" in {
+    CrossrefScorable.jsonToMapFeatures(CrossrefStringWithEmptyTitle) should be (None)
+  }
+
+  it should "handle subtitle" in {
+    CrossrefScorable.jsonToMapFeatures(
+      """{"title": ["short but not too short"], "subtitle": ["just right!"], "DOI": "10.123/asdf", "type":"journal-article","author":[{ "given" : "W", "family" : "Gaier"}]}""") match {
+      case None => fail()
+      case Some(result) => result.slug shouldBe "shortbutnottooshortjustright"
+    }
+  }
+
+  it should "handle empty subtitle" in {
+    CrossrefScorable.jsonToMapFeatures(
+      """{"title": ["short but not too short"], "subtitle": [""], "DOI": "10.123/asdf", "type":"journal-article", "author":[{ "given" : "W", "family" : "Gaier"}]}""") match {
+      case None => fail()
+      case Some(result) => result.slug shouldBe "shortbutnottooshort"
+    }
+  }
+
+  it should "handle null subtitle" in {
+    CrossrefScorable.jsonToMapFeatures(
+      """{"title": ["short but not too short"], "subtitle": [null], "DOI": "10.123/asdf", "type":"journal-article", "author":[{ "given" : "W", "family" : "Gaier"}]}""") match {
+      case None => fail()
+      case Some(result) => result.slug shouldBe "shortbutnottooshort"
+    }
+  }
+
+  it should "handle missing authors" in {
+    CrossrefScorable.jsonToMapFeatures(CrossrefStringWithNoAuthors) should be (None)
+  }
+
+  it should "handle valid input" in {
+    CrossrefScorable.jsonToMapFeatures(CrossrefStringWithGoodTitle) match {
+      case None => fail()
+      case Some(result) => {
+        result.slug shouldBe "sometitle"
+        Scorable.jsonToMap(result.json) match {
+          case None => fail()
+          case Some(map) => {
+            map("title").asInstanceOf[String] shouldBe "Some Title"
+            map("doi").asInstanceOf[String] shouldBe "10.123/abc"
+            // TODO: full name? not just a string?
+            map("authors").asInstanceOf[List[String]] shouldBe List("Gaier")
+            map("year").asInstanceOf[Double].toInt shouldBe 2002
+          }
+        }
+      }
+    }
+  }
+
+  "CrossrefScorable.keepRecord()" should "return true for valid JSON with title" in {
+    CrossrefScorable.keepRecord(CrossrefStringWithGoodTitle) shouldBe true
+  }
+
+  it should "return true for valid JSON with a title of maximum permitted length" in {
+    CrossrefScorable.keepRecord(CrossrefStringWithMaximumTitle) shouldBe true
+  }
+
+  it should "return false for valid JSON with excessively long title" in {
+    CrossrefScorable.keepRecord(CrossrefStringWithExcessiveTitle) shouldBe false
+  }
+
+  it should "return false for valid JSON with null title" in {
+    CrossrefScorable.keepRecord(CrossrefStringWithNullTitle) shouldBe false
+  }
+
+  it should "return false for valid JSON with no title" in {
+    CrossrefScorable.keepRecord(CrossrefStringWithoutTitle) shouldBe false
+  }
+
+  it should "return false for invalid JSON" in {
+    CrossrefScorable.keepRecord(MalformedCrossrefString) shouldBe false // unparseable JSON, not just a missing title
+  }
+
+  "CrossrefScorable.jsonToMapFeatures()" should "handle content types" in { // explicit subject: `it` would file this under keepRecord()
+    CrossrefScorable.jsonToMapFeatures(CrossrefStringWrongType) should be (None)
+    CrossrefScorable.jsonToMapFeatures(CrossrefStringNoType) should be (None)
+  }
+}
diff --git a/scalding/src/test/scala/sandcrawler/DumpUnGrobidedJobTest.scala b/scalding/src/test/scala/sandcrawler/DumpUnGrobidedJobTest.scala
new file mode 100644
index 0000000..8dda5c8
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/DumpUnGrobidedJobTest.scala
@@ -0,0 +1,72 @@
+package sandcrawler
+
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.Tsv
+import com.twitter.scalding.TupleConversions
+import com.twitter.scalding.TypedTsv
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import org.junit.runner.RunWith
+import org.scalatest.FunSpec
+import org.scalatest.junit.JUnitRunner
+import org.slf4j.LoggerFactory
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBaseSource
+import scala._
+
+@RunWith(classOf[JUnitRunner])
+class DumpUnGrobidedJobTest extends FunSpec with TupleConversions {
+
+  val output = "/tmp/testOutput"
+  val (testTable, testHost) = ("test-table", "dummy-host:2181")
+
+  val log = LoggerFactory.getLogger(this.getClass.getName)
+
+  val statusCode: Long = 200
+  val statusBytes = Bytes.toBytes(statusCode)
+
+  val sampleDataGrobid : List[List[Array[Byte]]] = List(
+    ("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT", statusBytes),
+    ("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56", statusBytes),
+    ("sha1:885C3YNNEGH5WAG5ZAAXWA8BNXJWT6CZ", statusBytes),
+    ("sha1:00904C3YNNEGH5WAG5ZA9XWAEBNXJWT6", statusBytes),
+    ("sha1:249C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ", statusBytes),
+    ("sha1:095893C3YNNEGH5WAG5ZAAXWAEBNXJWT", statusBytes))
+      .map(pair => List(Bytes.toBytes(pair._1), pair._2))
+
+  val sampleDataFile : List[List[Array[Byte]]] = List(
+    ("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q", """{c-json-data}""", "application/pdf", """{cdx-json-data}"""),
+    ("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU", """{c-json-data}""", "application/pdf", """{cdx-json-data}"""),
+    ("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT", """{c-json-data}""", "application/pdf", """{cdx-json-data}"""),
+    ("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56", """{c-json-data}""", "application/pdf", """{cdx-json-data}"""),
+    ("sha1:885C3YNNEGH5WAG5ZAAXWA8BNXJWT6CZ", """{c-json-data}""", "application/pdf", """{cdx-json-data}"""),
+    ("sha1:00904C3YNNEGH5WAG5ZA9XWAEBNXJWT6", """{c-json-data}""", "application/pdf", """{cdx-json-data}"""),
+    ("sha1:249C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ", """{c-json-data}""", "application/pdf", """{cdx-json-data}"""),
+    ("sha1:095893C3YNNEGH5WAG5ZAAXWAEBNXJWT", """{c-json-data}""", "application/pdf", """{cdx-json-data}"""))
+      .map(pair => List(Bytes.toBytes(pair._1),
+                        Bytes.toBytes(pair._2),
+                        Bytes.toBytes(pair._3),
+                        Bytes.toBytes(pair._4)))
+
+  JobTest("sandcrawler.DumpUnGrobidedJob")
+    .arg("test", "")
+    .arg("app.conf.path", "app.conf")
+    .arg("output", output)
+    .arg("hbase-table", testTable)
+    .arg("zookeeper-hosts", testHost)
+    .arg("debug", "true")
+    .source[Tuple](DumpUnGrobidedJob.getHBaseColSource(testTable, testHost),
+      sampleDataGrobid.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*)))
+    .source[Tuple](DumpUnGrobidedJob.getHBaseKeySource(testTable, testHost),
+      sampleDataFile.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*)))
+    .sink[Tuple](TypedTsv[(String,String,String,String)](output)) {
+      outputBuffer =>
+      it("should return correct-length list.") {
+        assert(outputBuffer.size === 2)
+      }
+    }
+    .run
+    .finish
+}
diff --git a/scalding/src/test/scala/sandcrawler/FatcatScorableTest.scala b/scalding/src/test/scala/sandcrawler/FatcatScorableTest.scala
new file mode 100644
index 0000000..823e14a
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/FatcatScorableTest.scala
@@ -0,0 +1,160 @@
+package sandcrawler
+
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.TextLine
+import com.twitter.scalding.TupleConversions
+import com.twitter.scalding.TypedTsv
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import org.scalatest._
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+
+class FatcatScorableTest extends FlatSpec with Matchers {
+  // scalastyle:off
+  val FatcatString =
+"""
+{
+  "abstracts": [],
+  "refs": [],
+  "contribs": [
+    {
+      "index": 0,
+      "raw_name": "W Gaier",
+      "surname": "Gaier",
+      "role": "author",
+      "extra": {
+        "seq": "first"
+      }
+    }
+  ],
+  "publisher": "Elsevier BV",
+  "pages": "186-187",
+  "ext_ids": {
+    "doi": "<<DOI>>"
+  },
+  "release_year": 1996,
+  "release_stage": "published",
+  "release_type": "article-journal",
+  "container_id": "3nccslsn5jez3ixrp5skjyjxu4",
+  "title": "<<TITLE>>",
+  "state": "active",
+  "ident": "pnri57u66ffytigdmyybbmouni",
+  "work_id": "tdmqnfzm2nggrhfwzasyegvpyu",
+  "revision": "e50bd04e-d0d4-4ee7-b7a4-6b4f079de154",
+  "extra": {
+    "crossref": {
+      "alternative-id": [
+        "0987-7983(96)87729-2"
+      ],
+      "type": "journal-article"
+    }
+  }
+}
+""".replace("<<DOI>>", "10.123/aBc")
+  // scalastyle:on
+  val FatcatStringWithGoodTitle = FatcatString.replace("<<TITLE>>", "Some Title")
+  val FatcatStringWithMaximumTitle = FatcatString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength)
+  val FatcatStringWithExcessiveTitle = FatcatString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength + "0")
+  val FatcatStringWithNullTitle = FatcatString.replace("\"<<TITLE>>\"", "null")
+  val FatcatStringWithEmptyTitle = FatcatString.replace("<<TITLE>>", "")
+  val FatcatStringWithoutTitle = FatcatString.replace("title", "nottitle")
+  val MalformedFatcatString = FatcatString.replace("}", "")
+  val FatcatStringWithNoAuthors = FatcatString.replace("<<TITLE>>", "Some Valid Title").replace("contribs", "no-contribs")
+  //val FatcatStringWrongType = FatcatString.replace("<<TITLE>>", "Some Valid Title").replace("journal-article", "other")
+  //val FatcatStringNoType = FatcatString.replace("<<TITLE>>", "Some Valid Title").replace("type", "not-type")
+
+  // Unit tests
+  "FatcatScorable.jsonToMapFeatures()" should "handle invalid JSON" in {
+    FatcatScorable.jsonToMapFeatures(MalformedFatcatString) should be (None)
+  }
+
+  it should "handle missing title" in {
+    FatcatScorable.jsonToMapFeatures(FatcatStringWithoutTitle) should be (None)
+  }
+
+  it should "handle null title" in {
+    FatcatScorable.jsonToMapFeatures(FatcatStringWithNullTitle) should be (None)
+  }
+
+  it should "handle empty title" in {
+    FatcatScorable.jsonToMapFeatures(FatcatStringWithEmptyTitle) should be (None)
+  }
+
+  it should "handle subtitle" in {
+    FatcatScorable.jsonToMapFeatures(
+      """{"title": "short but not too short", "subtitle": "just right!", "ident": "pnri57u66ffytigdmyybbmouni", "work_id": "tdmqnfzm2nggrhfwzasyegvpyu", "DOI": "10.123/asdf", "type":"journal-article","contribs":[{ "raw_name" : "W Gaier", "surname" : "Gaier"}]}""") match {
+      case None => fail()
+      case Some(result) => result.slug shouldBe "shortbutnottooshortjustright"
+    }
+  }
+
+  it should "handle empty subtitle" in {
+    FatcatScorable.jsonToMapFeatures(
+      """{"title": "short but not too short", "subtitle": "", "ident": "pnri57u66ffytigdmyybbmouni", "work_id": "tdmqnfzm2nggrhfwzasyegvpyu", "DOI": "10.123/asdf", "type":"journal-article", "contribs":[{ "raw_name" : "W Gaier", "surname" : "Gaier"}]}""") match {
+      case None => fail()
+      case Some(result) => result.slug shouldBe "shortbutnottooshort"
+    }
+  }
+
+  it should "handle null subtitle" in {
+    FatcatScorable.jsonToMapFeatures(
+      """{"title": "short but not too short", "subtitle": null, "ident": "pnri57u66ffytigdmyybbmouni", "work_id": "tdmqnfzm2nggrhfwzasyegvpyu", "DOI": "10.123/asdf", "type":"journal-article", "contribs":[{ "raw_name" : "W Gaier", "surname" : "Gaier"}]}""") match {
+      case None => fail()
+      case Some(result) => result.slug shouldBe "shortbutnottooshort"
+    }
+  }
+
+  it should "handle missing authors" in {
+    // TODO: author-less records are apparently not rejected yet, so the None assertion below stays disabled.
+    //FatcatScorable.jsonToMapFeatures(FatcatStringWithNoAuthors) should be (None)
+    FatcatScorable.jsonToMapFeatures(FatcatStringWithNoAuthors) // no assertion: only checks the call does not throw
+  }
+
+  it should "handle valid input" in {
+    FatcatScorable.jsonToMapFeatures(FatcatStringWithGoodTitle) match {
+      case None => fail()
+      case Some(result) => {
+        result.slug shouldBe "sometitle"
+        Scorable.jsonToMap(result.json) match {
+          case None => fail()
+          case Some(map) => {
+            map("title").asInstanceOf[String] shouldBe "Some Title"
+            //map("doi").asInstanceOf[String] shouldBe "10.123/abc"
+            map("fatcat_release").asInstanceOf[String] shouldBe "pnri57u66ffytigdmyybbmouni"
+            map("fatcat_work").asInstanceOf[String] shouldBe "tdmqnfzm2nggrhfwzasyegvpyu"
+            // TODO: full name? not just a string?
+            map("authors").asInstanceOf[List[String]] shouldBe List("W Gaier")
+            map("year").asInstanceOf[Double].toInt shouldBe 1996
+          }
+        }
+      }
+    }
+  }
+
+  "FatcatScorable.keepRecord()" should "return true for valid JSON with title" in {
+    FatcatScorable.keepRecord(FatcatStringWithGoodTitle) shouldBe true
+  }
+
+  it should "return true for valid JSON with a title of maximum permitted length" in {
+    FatcatScorable.keepRecord(FatcatStringWithMaximumTitle) shouldBe true
+  }
+
+  it should "return false for valid JSON with excessively long title" in {
+    FatcatScorable.keepRecord(FatcatStringWithExcessiveTitle) shouldBe false
+  }
+
+  it should "return false for valid JSON with null title" in {
+    FatcatScorable.keepRecord(FatcatStringWithNullTitle) shouldBe false
+  }
+
+  it should "return false for valid JSON with no title" in {
+    FatcatScorable.keepRecord(FatcatStringWithoutTitle) shouldBe false
+  }
+
+  it should "return false for invalid JSON" in {
+    FatcatScorable.keepRecord(MalformedFatcatString) shouldBe false // unparseable JSON, not just a missing title
+  }
+
+}
diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableDumpJobTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableDumpJobTest.scala
new file mode 100644
index 0000000..bf9343b
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/GrobidScorableDumpJobTest.scala
@@ -0,0 +1,124 @@
+
+package sandcrawler
+
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.TextLine
+import com.twitter.scalding.TupleConversions
+import com.twitter.scalding.TypedTsv
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import org.scalatest._
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+
+class GrobidScorableDumpJobTest extends FlatSpec with Matchers {
+  //scalastyle:off
+  val JsonString = """
+{
+  "title": "<<TITLE>>",
+  "authors": [
+    {"name": "Brewster Kahle"},
+    {"name": "J Doe"}
+  ],
+  "journal": {
+    "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
+    "eissn": null,
+    "issn": null,
+    "issue": null,
+    "publisher": null,
+    "volume": null
+  },
+  "date": "2000",
+  "doi": null,
+  "citations": [
+    { "authors": [{"name": "A Seaperson"}],
+      "date": "2001",
+      "id": "b0",
+      "index": 0,
+      "issue": null,
+      "journal": "Letters in the Alphabet",
+      "publisher": null,
+      "title": "Everything is Wonderful",
+      "url": null,
+      "volume": "20"},
+    { "authors": [],
+      "date": "2011-03-28",
+      "id": "b1",
+      "index": 1,
+      "issue": null,
+      "journal": "The Dictionary",
+      "publisher": null,
+      "title": "All about Facts",
+      "url": null,
+      "volume": "14"}
+  ],
+  "abstract": "Everything you ever wanted to know about nothing",
+  "body": "Introduction \nEverything starts somewhere, as somebody [1]  once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.",
+  "acknowledgement": null,
+  "annex": null
+}
+"""
+  // scalastyle:on
+  val JsonStringWithTitle = JsonString.replace("<<TITLE>>", "Dummy Example File")
+  val JsonStringWithoutTitle = JsonString.replace("title", "nottitle")
+  val MalformedJsonString = JsonString.replace("}", "")
+
+  //  Pipeline tests
+  val output = "/tmp/testOutput"
+  val input = "/tmp/testInput"
+  val (testTable, testHost) = ("test-table", "dummy-host:2181")
+
+  val Sha1Strings : List[String] = List(
+    "sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q",  // good
+    "sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU",  // good
+    "sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT",  // good
+    "sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56",  // bad status
+    "sha1:93187A85273589347598473894839443",  // malformed
+    "sha1:024937534094897039547e9824382943")  // bad status
+
+  val JsonStrings : List[String] = List(
+    JsonString.replace("<<TITLE>>", "Title 1: The Classic"),
+    JsonString.replace("<<TITLE>>", "Title 2: TNG"),
+    JsonString.replace("<<TITLE>>", "Title 3: The Sequel"),
+    // This will have bad status.
+    JsonString.replace("<<TITLE>>", "Title 1: The Classic"),
+    MalformedJsonString,
+    // This will have bad status.
+    JsonString.replace("<<TITLE>>", "Title 2: Not TNG")
+  )
+
+  // bnewbold: status codes aren't strings, they are uint64
+  val Ok : Long = 200
+  val Bad : Long = 400
+  val StatusCodes = List(Ok, Ok, Ok, Bad, Ok, Bad)
+
+  val SampleDataHead : List[Tuple] = (Sha1Strings, JsonStrings, StatusCodes)
+    .zipped
+    .toList
+    .map { case (sha, json, status) => List(Bytes.toBytes(sha), Bytes.toBytes(json), Bytes.toBytes(status)) }
+    .map { l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*) }
+
+  // scalastyle:off null
+  // Add example of lines without GROBID data
+  val SampleData = SampleDataHead :+ new Tuple(
+    new ImmutableBytesWritable(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAA88888888888")), null, null)
+  // scalastyle:on null
+
+  JobTest("sandcrawler.GrobidScorableDumpJob")
+    .arg("test", "")
+    .arg("app.conf.path", "app.conf")
+    .arg("output", output)
+    .arg("hbase-table", testTable)
+    .arg("zookeeper-hosts", testHost)
+    .arg("debug", "true")
+    .source[Tuple](GrobidScorable.getHBaseSource(testTable, testHost), SampleData)
+    .sink[(String, String)](TypedTsv[(String, String)](output)) {
+      outputBuffer =>
+      "The pipeline" should "return correct-length list" in {
+        outputBuffer should have length 3
+      }
+    }
+    .run
+    .finish
+}
diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
new file mode 100644
index 0000000..b395a64
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
@@ -0,0 +1,122 @@
+package sandcrawler
+
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.TextLine
+import com.twitter.scalding.TupleConversions
+import com.twitter.scalding.TypedTsv
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import org.scalatest._
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+
+class GrobidScorableTest extends FlatSpec with Matchers {
+  val GrobidString = """
+{
+  "title": "<<TITLE>>",
+  "authors": [
+    {"name": "Brewster Kahle"},
+    {"name": "J Doe"}
+  ],
+  "journal": {
+    "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
+    "eissn": null,
+    "issn": null,
+    "issue": null,
+    "publisher": null,
+    "volume": null
+  },
+  "date": "2000",
+  "doi": null,
+  "citations": [
+    { "authors": [{"name": "A Seaperson"}],
+      "date": "2001",
+      "id": "b0",
+      "index": 0,
+      "issue": null,
+      "journal": "Letters in the Alphabet",
+      "publisher": null,
+      "title": "Everything is Wonderful",
+      "url": null,
+      "volume": "20"},
+    { "authors": [],
+      "date": "2011-03-28",
+      "id": "b1",
+      "index": 1,
+      "issue": null,
+      "journal": "The Dictionary",
+      "publisher": null,
+      "title": "All about Facts",
+      "url": null,
+      "volume": "14"}
+  ],
+  "abstract": "Everything you ever wanted to know about nothing",
+  "body": "Introduction \nEverything starts somewhere, as somebody [1]  once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.",
+  "acknowledgement": null,
+  "annex": null
+}
+"""
+  val GrobidStringWithGoodTitle = GrobidString.replace("<<TITLE>>", "Dummy Example File")
+  val GrobidStringWithMaximumTitle = GrobidString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength)
+  val GrobidStringWithExcessiveTitle = GrobidString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength + "0")
+  val GrobidStringWithNullTitle = GrobidString.replace("\"<<TITLE>>\"", "null")
+  val GrobidStringWithoutTitle = GrobidString.replace("title", "nottitle")
+  val MalformedGrobidString = GrobidString.replace("}", "")
+  val Key = "Dummy Key"
+
+  // Unit tests
+
+  "GrobidScorable.jsonToMapFeatures()" should "handle invalid JSON" in {
+    GrobidScorable.jsonToMapFeatures(Key, MalformedGrobidString) should be (None)
+  }
+
+  it should "handle null title" in {
+    GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithNullTitle) should be (None)
+  }
+
+  it should "handle missing title" in {
+    GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithoutTitle) should be (None)
+  }
+
+  it should "handle valid input" in {
+    GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithGoodTitle) match {
+      case None => fail()
+      case Some(result) => {
+        result.slug shouldBe "dummyexamplefile"
+        Scorable.jsonToMap(result.json) match {
+          case None => fail()
+          case Some(map) => {
+            map should contain key "title"
+            map("title").asInstanceOf[String] shouldBe "Dummy Example File"
+            map("authors").asInstanceOf[List[String]] shouldBe List("Brewster Kahle", "J Doe")
+          }
+        }
+      }
+    }
+  }
+
+  "GrobidScorable.keepRecord()" should "return true for valid JSON with title" in {
+    GrobidScorable.keepRecord(GrobidStringWithGoodTitle) shouldBe true
+  }
+
+  it should "return true for valid JSON with a title of maximum permitted length" in {
+    GrobidScorable.keepRecord(GrobidStringWithMaximumTitle) shouldBe true
+  }
+
+  it should "return false for valid JSON with excessively long title" in {
+    GrobidScorable.keepRecord(GrobidStringWithExcessiveTitle) shouldBe false
+  }
+
+  it should "return false for valid JSON with null title" in {
+    GrobidScorable.keepRecord(GrobidStringWithNullTitle) shouldBe false
+  }
+
+  it should "return false for valid JSON with no title" in {
+    GrobidScorable.keepRecord(GrobidStringWithoutTitle) shouldBe false
+  }
+
+  it should "return false for invalid JSON" in {
+    GrobidScorable.keepRecord(MalformedGrobidString) shouldBe false // unparseable JSON, not just a missing title
+  }
+}
diff --git a/scalding/src/test/scala/sandcrawler/HBaseBuilderTest.scala b/scalding/src/test/scala/sandcrawler/HBaseBuilderTest.scala
index 603a4c7..c61cb22 100644
--- a/scalding/src/test/scala/sandcrawler/HBaseBuilderTest.scala
+++ b/scalding/src/test/scala/sandcrawler/HBaseBuilderTest.scala
@@ -22,6 +22,7 @@ class HBaseBuilderTest extends FlatSpec with Matchers {
     fields should have length 0
   }
 
+  //scalastyle:off no.whitespace.before.left.bracket
   it should "throw IllegalArgumentException on malformed input" in {
     a [IllegalArgumentException] should be thrownBy {
       HBaseBuilder.parseColSpecs(List("file_size"))
diff --git a/scalding/src/test/scala/sandcrawler/HBaseMimeCountTest.scala b/scalding/src/test/scala/sandcrawler/HBaseMimeCountTest.scala
index fde2290..d6d283f 100644
--- a/scalding/src/test/scala/sandcrawler/HBaseMimeCountTest.scala
+++ b/scalding/src/test/scala/sandcrawler/HBaseMimeCountTest.scala
@@ -1,15 +1,18 @@
 package sandcrawler
 
-import cascading.tuple.{Tuple, Fields}
-import com.twitter.scalding.{JobTest, Tsv, TupleConversions}
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.Tsv
+import com.twitter.scalding.TupleConversions
 import org.apache.hadoop.hbase.io.ImmutableBytesWritable
 import org.apache.hadoop.hbase.util.Bytes
 import org.junit.runner.RunWith
 import org.scalatest.FunSpec
 import org.scalatest.junit.JUnitRunner
 import org.slf4j.LoggerFactory
-import parallelai.spyglass.hbase.HBaseSource
 import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBaseSource
 import scala._
 
 @RunWith(classOf[JUnitRunner])
diff --git a/scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala b/scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala
index 3424a36..c4ca5aa 100644
--- a/scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala
+++ b/scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala
@@ -1,15 +1,18 @@
 package sandcrawler
 
-import cascading.tuple.{Tuple, Fields}
-import com.twitter.scalding.{JobTest, Tsv, TupleConversions}
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.Tsv
+import com.twitter.scalding.TupleConversions
 import org.apache.hadoop.hbase.io.ImmutableBytesWritable
 import org.apache.hadoop.hbase.util.Bytes
 import org.junit.runner.RunWith
 import org.scalatest.FunSpec
 import org.scalatest.junit.JUnitRunner
 import org.slf4j.LoggerFactory
-import parallelai.spyglass.hbase.HBaseSource
 import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBaseSource
 import scala._
 
 /**
@@ -47,12 +50,10 @@ class HBaseRowCountTest extends FunSpec with TupleConversions {
       outputBuffer =>
 
         it("should return the test data provided.") {
-          println("outputBuffer.size => " + outputBuffer.size)
           assert(outputBuffer.size === 1)
         }
 
         it("should return the correct count") {
-          println("raw output => " + outputBuffer)
           assert(outputBuffer(0).getObject(0) === 8)
         }
     }
diff --git a/scalding/src/test/scala/sandcrawler/HBaseStatusCodeCountTest.scala b/scalding/src/test/scala/sandcrawler/HBaseStatusCodeCountTest.scala
new file mode 100644
index 0000000..d2cf9de
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/HBaseStatusCodeCountTest.scala
@@ -0,0 +1,79 @@
+package sandcrawler
+
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.Tsv
+import com.twitter.scalding.TupleConversions
+import com.twitter.scalding.TypedTsv
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import org.junit.runner.RunWith
+import org.scalatest.FunSpec
+import org.scalatest.junit.JUnitRunner
+import org.slf4j.LoggerFactory
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBaseSource
+import scala._
+
+/**
+ * Runs `sandcrawler.HBaseStatusCodeCountJob` through Scalding's `JobTest`, feeding it
+ * eight sample rows (three with status 200, five with status 404) in place of the HBase
+ * source, and checks the (status code, count) pairs written to the sink.
+ */
+@RunWith(classOf[JUnitRunner])
+class HBaseStatusCodeCountTest extends FunSpec with TupleConversions {
+
+  val output = "/tmp/testOutput"
+  val (testTable, testHost) = ("test-table", "dummy-host:2181")
+
+  val log = LoggerFactory.getLogger(this.getClass.getName)
+
+  val statusType1 : Long = 200
+  val statusType2 : Long = 404
+  val statusType1Bytes = Bytes.toBytes(statusType1)
+  val statusType2Bytes = Bytes.toBytes(statusType2)
+
+  // Each row is (sha1 identifier, status code), both as raw bytes.
+  // TODO(bnewbold): how to express a null (empty value) in this list?
+  val sampleData : List[List[Array[Byte]]] = List(
+    ("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q", statusType1Bytes),
+    ("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU", statusType1Bytes),
+    ("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT", statusType2Bytes),
+    ("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56", statusType2Bytes),
+    ("sha1:885C3YNNEGH5WAG5ZAAXWA8BNXJWT6CZ", statusType2Bytes),
+    ("sha1:00904C3YNNEGH5WAG5ZA9XWAEBNXJWT6", statusType2Bytes),
+    ("sha1:249C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ", statusType1Bytes),
+    ("sha1:095893C3YNNEGH5WAG5ZAAXWAEBNXJWT", statusType2Bytes))
+      .map(pair => List(Bytes.toBytes(pair._1), pair._2))
+
+  // Expected counts, derived from the fixture. Array `==` is reference equality on the
+  // JVM, so compare contents explicitly instead of relying on shared array instances.
+  val statusType1Count = sampleData.count(lst => java.util.Arrays.equals(lst(1), statusType1Bytes))
+  val statusType2Count = sampleData.count(lst => java.util.Arrays.equals(lst(1), statusType2Bytes))
+
+  JobTest("sandcrawler.HBaseStatusCodeCountJob")
+    .arg("test", "")
+    .arg("app.conf.path", "app.conf")
+    .arg("output", output)
+    .arg("hbase-table", testTable)
+    .arg("zookeeper-hosts", testHost)
+    .arg("debug", "true")
+    .source[Tuple](HBaseCountJob.getHBaseSource(testTable, testHost, "grobid0:status_code"),
+      sampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*)))
+    .sink[Tuple](TypedTsv[(Long, Long)](output)) {
+      outputBuffer =>
+      it("should return a correct number of elements.") {
+        assert(outputBuffer.size === 2)
+      }
+
+      // Convert List[Tuple] to Map[Long, Long].
+      val counts = outputBuffer.map(t => (t.getLong(0), t.getLong(1))).toMap
+      it("should have the appropriate number of each status type") {
+        assert(counts(statusType1) == statusType1Count)
+        assert(counts(statusType2) == statusType2Count)
+      }
+    }
+    .run
+    .finish
+}
diff --git a/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala b/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala
index d7689cd..7e91af3 100644
--- a/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala
+++ b/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala
@@ -1,15 +1,19 @@
 package sandcrawler
 
-import cascading.tuple.{Tuple, Fields}
-import com.twitter.scalding.{JobTest, Tsv, TypedTsv, TupleConversions}
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.Tsv
+import com.twitter.scalding.TupleConversions
+import com.twitter.scalding.TypedTsv
 import org.apache.hadoop.hbase.io.ImmutableBytesWritable
 import org.apache.hadoop.hbase.util.Bytes
 import org.junit.runner.RunWith
 import org.scalatest.FunSpec
 import org.scalatest.junit.JUnitRunner
 import org.slf4j.LoggerFactory
-import parallelai.spyglass.hbase.HBaseSource
 import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBaseSource
 import scala._
 
 @RunWith(classOf[JUnitRunner])
@@ -20,21 +24,20 @@ class HBaseStatusCountTest extends FunSpec with TupleConversions {
 
   val log = LoggerFactory.getLogger(this.getClass.getName)
 
-  val statusType1 : Long = 200
-  val statusType2 : Long = 404
-  val statusType1Bytes = Bytes.toBytes(statusType1)
-  val statusType2Bytes = Bytes.toBytes(statusType2)
-
-  val sampleData = List(
-    List(Bytes.toBytes("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q"), statusType1Bytes),
-    List(Bytes.toBytes("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"), statusType1Bytes),
-    List(Bytes.toBytes("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT"), statusType2Bytes),
-    List(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56"), statusType2Bytes),
-    List(Bytes.toBytes("sha1:885C3YNNEGH5WAG5ZAAXWA8BNXJWT6CZ"), statusType2Bytes),
-    List(Bytes.toBytes("sha1:00904C3YNNEGH5WAG5ZA9XWAEBNXJWT6"), statusType2Bytes),
-    List(Bytes.toBytes("sha1:249C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ"), statusType1Bytes),
-    List(Bytes.toBytes("sha1:095893C3YNNEGH5WAG5ZAAXWAEBNXJWT"), statusType2Bytes)
-  )
+  val statusType1Bytes = Bytes.toBytes("""{"status": "success"}""")
+  val statusType2Bytes = Bytes.toBytes("""{"status": "partial"}""")
+
+  // TODO(bnewbold): how to express a null (empty value) in this list?
+    val sampleData : List[List[Array[Byte]]] = List(
+      ("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q", statusType1Bytes),
+      ("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU", statusType1Bytes),
+      ("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT", statusType2Bytes),
+      ("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56", statusType2Bytes),
+      ("sha1:885C3YNNEGH5WAG5ZAAXWA8BNXJWT6CZ", statusType2Bytes),
+      ("sha1:00904C3YNNEGH5WAG5ZA9XWAEBNXJWT6", statusType2Bytes),
+      ("sha1:249C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ", statusType1Bytes),
+      ("sha1:095893C3YNNEGH5WAG5ZAAXWAEBNXJWT", statusType2Bytes))
+        .map(pair => List(Bytes.toBytes(pair._1), pair._2))
 
   val statusType1Count = sampleData.count(lst => lst(1) == statusType1Bytes)
   val statusType2Count = sampleData.count(lst => lst(1) == statusType2Bytes)
@@ -46,20 +49,13 @@ class HBaseStatusCountTest extends FunSpec with TupleConversions {
     .arg("hbase-table", testTable)
     .arg("zookeeper-hosts", testHost)
     .arg("debug", "true")
-    .source[Tuple](HBaseCountJob.getHBaseSource(testTable, testHost, "grobid0:status_code"),
+    .source[Tuple](HBaseCountJob.getHBaseSource(testTable, testHost, "grobid0:status"),
       sampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*)))
-    .sink[Tuple](TypedTsv[(Long, Long)](output)) {
+    .sink[Tuple](TypedTsv[(String, Long)](output)) {
       outputBuffer =>
       it("should return a 2-element list.") {
         assert(outputBuffer.size === 2)
       }
-
-      // Convert List[Tuple] to Map[Long, Long].
-      val counts = outputBuffer.map(t => (t.getLong(0), t.getLong(1))).toMap
-      it("should have the appropriate number of each status type") {
-        assert(counts(statusType1) == statusType1Count)
-        assert(counts(statusType2) == statusType2Count)
-      }
     }
     .run
     .finish
diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
new file mode 100644
index 0000000..c847296
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
@@ -0,0 +1,70 @@
+package sandcrawler
+
+import java.io.InputStream
+
+import scala.io.Source
+
+import org.scalatest._
+
+// scalastyle:off null
+/**
+ * Unit tests for `ScorableFeatures`: building map features from awkward inputs and
+ * turning titles into slugs via `toSlug`.
+ */
+class ScorableFeaturesTest extends FlatSpec with Matchers {
+  // No assertions here: this passes as long as neither call throws.
+  "toMapFeatures()" should "work with gnarly inputs" in {
+    ScorableFeatures.create(title = null).toMapFeatures
+    ScorableFeatures.create(title = "something", doi = null, sha1 = null, year = 123).toMapFeatures
+  }
+
+  // Builds features from just a title and returns the resulting slug, if any.
+  private def titleToSlug(s : String) : Option[String] = ScorableFeatures.create(title = s).toSlug
+
+  "toSlug()" should "lowercase titles and drop colons" in {
+    titleToSlug("HELLO:there") shouldBe Some("hellothere")
+  }
+
+  it should "extract an entire colon-less string" in {
+    titleToSlug("hello THERE") shouldBe Some("hellothere")
+  }
+
+  it should "return None if given empty string" in {
+    titleToSlug("") shouldBe (None)
+  }
+
+  it should "return None if given null" in {
+    titleToSlug(null) shouldBe (None)
+  }
+
+  it should "strip punctuation" in {
+    titleToSlug("HELLO!:the:re") shouldBe Some("hellothere")
+    titleToSlug("a:b:cdefgh") shouldBe Some("abcdefgh")
+    titleToSlug(
+      "If you're happy and you know it, clap your hands!") shouldBe Some("ifyourehappyandyouknowitclapyourhands")
+    titleToSlug(":;\"\'") shouldBe (None)
+  }
+
+  it should "filter stub titles" in {
+    titleToSlug("abstract") shouldBe (None)
+    titleToSlug("title!") shouldBe (None)
+    titleToSlug("a real title which is not on denylist") shouldBe Some("arealtitlewhichisnotondenylist")
+  }
+
+  it should "strip special characters" in {
+    titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_’·“”‘’“”«»「」¿–±§ʿ") shouldBe (None)
+    // TODO: titleToSlug("©™₨№…") shouldBe (None)
+    // TODO: titleToSlug("πµΣσ") shouldBe (None)
+  }
+
+  it should "remove whitespace" in {
+    titleToSlug("foo bar : baz ::") shouldBe Some("foobarbaz")
+    titleToSlug("\na\t:b:cdefghi") shouldBe Some("abcdefghi")
+    titleToSlug("\n \t \r  ") shouldBe (None)
+  }
+
+  it should "skip very short slugs" in {
+    titleToSlug("short") shouldBe (None)
+    titleToSlug("a longer, more in depth title") shouldBe Some("alongermoreindepthtitle")
+  }
+}
diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
new file mode 100644
index 0000000..2094543
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
@@ -0,0 +1,86 @@
+package sandcrawler
+
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.TextLine
+import com.twitter.scalding.TupleConversions
+import com.twitter.scalding.TypedTsv
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import org.scalatest._
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+
+/**
+ * Unit tests for the JSON parsing and similarity-scoring helpers on `Scorable`.
+ */
+class ScorableTest extends FlatSpec with Matchers {
+  // scalastyle:off
+  val JsonString = """
+{
+  "title": "<<TITLE>>",
+  "authors": [
+    {"name": "Brewster Kahle"},
+    {"name": "J Doe"}
+  ],
+  "journal": {
+    "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
+    "eissn": null,
+    "issn": null,
+    "issue": null,
+    "publisher": null,
+    "volume": null
+  },
+  "date": "2000",
+  "doi": null,
+  "citations": [
+    { "authors": [{"name": "A Seaperson"}],
+      "date": "2001",
+      "id": "b0",
+      "index": 0,
+      "issue": null,
+      "journal": "Letters in the Alphabet",
+      "publisher": null,
+      "title": "Everything is Wonderful",
+      "url": null,
+      "volume": "20"},
+    { "authors": [],
+      "date": "2011-03-28",
+      "id": "b1",
+      "index": 1,
+      "issue": null,
+      "journal": "The Dictionary",
+      "publisher": null,
+      "title": "All about Facts",
+      "url": null,
+      "volume": "14"}
+  ],
+  "abstract": "Everything you ever wanted to know about nothing",
+  "body": "Introduction \nEverything starts somewhere, as somebody [1]  once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.",
+  "acknowledgement": null,
+  "annex": null
+}
+"""
+  // scalastyle:on
+  "jsonToMap()" should "return a map, given a legal JSON string" in {
+    Scorable.jsonToMap(JsonString) should not be (None)
+  }
+
+  it should "return None, given illegal JSON" in {
+    Scorable.jsonToMap("illegal{,json{{") should be (None)
+  }
+
+  "computeSimilarity()" should "return Scorable.MaxScore if given identical ReduceFeatures" in {
+    val score = Scorable.computeSimilarity(
+      new ReduceFeatures(JsonString), new ReduceFeatures(JsonString))
+    score shouldBe Scorable.MaxScore
+  }
+
+  it should "be case-insensitive" in {
+    val left = JsonString.replace("<<TITLE>>", "A TITLE UPPER CASE")
+    val right = JsonString.replace("<<TITLE>>", "a title upper case")
+    val score = Scorable.computeSimilarity(
+      new ReduceFeatures(left), new ReduceFeatures(right))
+    score shouldBe Scorable.MaxScore
+  }
+}
diff --git a/scalding/src/test/scala/sandcrawler/ScoreInsertableJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreInsertableJobTest.scala
new file mode 100644
index 0000000..5393f10
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/ScoreInsertableJobTest.scala
@@ -0,0 +1,269 @@
+package sandcrawler
+
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.TextLine
+import com.twitter.scalding.TupleConversions
+import com.twitter.scalding.TypedTsv
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import org.scalatest._
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+
+/**
+ * Runs `sandcrawler.ScoreInsertableJob` through Scalding's `JobTest` with in-memory
+ * GROBID and CDX tuples standing in for the HBase sources and Crossref JSON lines as
+ * text input, then checks the joined output.
+ */
+class ScoreInsertableJobTest extends FlatSpec with Matchers {
+  //scalastyle:off
+  val JsonString = """
+{
+  "title": "<<TITLE>>",
+  "authors": [
+    {"name": "Brewster Kahle"},
+    {"name": "J Doe"}
+  ],
+  "journal": {
+    "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
+    "eissn": null,
+    "issn": null,
+    "issue": null,
+    "publisher": null,
+    "volume": null
+  },
+  "date": "2000",
+  "doi": null,
+  "citations": [
+    { "authors": [{"name": "A Seaperson"}],
+      "date": "2001",
+      "id": "b0",
+      "index": 0,
+      "issue": null,
+      "journal": "Letters in the Alphabet",
+      "publisher": null,
+      "title": "Everything is Wonderful",
+      "url": null,
+      "volume": "20"},
+    { "authors": [],
+      "date": "2011-03-28",
+      "id": "b1",
+      "index": 1,
+      "issue": null,
+      "journal": "The Dictionary",
+      "publisher": null,
+      "title": "All about Facts",
+      "url": null,
+      "volume": "14"}
+  ],
+  "abstract": "Everything you ever wanted to know about nothing",
+  "body": "Introduction \nEverything starts somewhere, as somebody [1]  once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.",
+  "acknowledgement": null,
+  "annex": null
+}
+"""
+  // scalastyle:on
+  val JsonStringWithTitle = JsonString.replace("<<TITLE>>", "Dummy Example File")
+  val JsonStringWithoutTitle = JsonString.replace("title", "nottitle")
+  val MalformedJsonString = JsonString.replace("}", "")
+
+  // scalastyle:off
+  val CrossrefString =
+"""
+{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" },
+  "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ],
+    "date-time" : "2017-10-23T17:19:16Z",
+    "timestamp" : { "$numberLong" : "1508779156477" } },
+  "reference-count" : 0,
+  "publisher" : "Elsevier BV",
+  "issue" : "3",
+  "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/",
+                    "start" : { "date-parts" : [ [ 1996, 1, 1 ] ],
+                                "date-time" : "1996-01-01T00:00:00Z",
+                                "timestamp" : { "$numberLong" : "820454400000" } },
+                                "delay-in-days" : 0, "content-version" : "tdm" }],
+  "content-domain" : { "domain" : [], "crossmark-restriction" : false },
+  "published-print" : { "date-parts" : [ [ 1996 ] ] },
+  "DOI" : "<<DOI>>",
+  "type" : "journal-article",
+  "created" : { "date-parts" : [ [ 2002, 7, 25 ] ],
+    "date-time" : "2002-07-25T15:09:41Z",
+    "timestamp" : { "$numberLong" : "1027609781000" } },
+  "page" : "186-187",
+  "source" : "Crossref",
+  "is-referenced-by-count" : 0,
+  "title" : [ "<<TITLE>>" ],
+  "prefix" : "10.1016",
+  "volume" : "9",
+  "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ],
+  "member" : "78",
+  "container-title" : [ "Journal de PÃ©diatrie et de PuÃ©riculture" ],
+  "link" : [ { "URL" :  "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/xml",
+               "content-type" : "text/xml",
+               "content-version" : "vor",
+               "intended-application" : "text-mining" },
+               { "URL" :
+  "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/plain",
+                 "content-type" : "text/plain",
+                 "content-version" : "vor",
+                 "intended-application" : "text-mining" } ],
+  "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ],
+                  "date-time" : "2015-09-03T10:03:43Z",
+                  "timestamp" : { "$numberLong" : "1441274623000" } },
+  "score" : 1,
+  "issued" : { "date-parts" : [ [ 1996 ] ] },
+  "references-count" : 0,
+  "alternative-id" : [ "0987-7983(96)87729-2" ],
+  "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2",
+  "ISSN" : [ "0987-7983" ],
+  "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ],
+  "subject" : [ "Pediatrics, Perinatology, and Child Health" ]
+}
+"""
+  // scalastyle:on
+  val TooLongOfTitle = "X" * Scorable.MaxTitleLength + "Y"  // arbitrary long string
+  val TooShortOfTitle = "X" * (ScorableFeatures.MinSlugLength - 1)
+  val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "SomeTitle")
+  val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle")
+  val MalformedCrossrefString = CrossrefString.replace("}", "")
+  val CrossrefStrings = List(
+    CrossrefString.replace("<<TITLE>>", "Title 2: TNG").replace("<<DOI>>", "DOI-0"),
+    CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2A").replace("<<DOI>>", "DOI-0.5"),
+    CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"),
+    CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1"),
+    CrossrefString.replace("<<TITLE>>", TooLongOfTitle).replace("<<DOI>>", "DOI-1"),
+    CrossrefString.replace("<<TITLE>>", TooShortOfTitle).replace("<<DOI>>", "DOI-1"))
+
+  //  Pipeline tests
+  val output = "/tmp/testOutput"
+  val input = "/tmp/testInput"
+  val (testTable, testHost) = ("test-table", "dummy-host:2181")
+
+  val Sha1Strings : List[String] = List(
+    "sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q",
+    "sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU",
+    "sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT",
+    "sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56",
+    "sha1:93187A85273589347598473894839443",
+    "sha1:024937534094897039547e9824382943",
+    "sha1:93229759932857982837892347893892",
+    "sha1:83229759932857982837892347893892")
+
+  val JsonStrings : List[String] = List(
+    JsonString.replace("<<TITLE>>", "Title 1: The Original"),
+    JsonString.replace("<<TITLE>>", "Title 2: TNG"),
+    JsonString.replace("<<TITLE>>", "Title 3: The Sequel"),
+    // This will have bad status.
+    JsonString.replace("<<TITLE>>", "Title 1: The Original"),
+    MalformedJsonString,
+    // This will have bad status.
+    JsonString.replace("<<TITLE>>", "Title 2: Not TNG"),
+    // These are in both sources but have bad titles
+    JsonString.replace("<<TITLE>>", TooLongOfTitle),
+    JsonString.replace("<<TITLE>>", TooShortOfTitle)
+  )
+
+  // bnewbold: status codes aren't strings, they are uint64
+  val Ok : Long = 200
+  val Bad : Long = 400
+  val StatusCodes = List(Ok, Ok, Ok, Bad, Ok, Bad, Ok, Ok)
+
+  val SampleDataHead : List[Tuple] = (Sha1Strings, JsonStrings, StatusCodes)
+    .zipped
+    .toList
+    .map { case (sha, json, status) => List(Bytes.toBytes(sha), Bytes.toBytes(json), Bytes.toBytes(status)) }
+    .map { l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*) }
+
+  // Add example of lines without GROBID data
+  // scalastyle:off null
+  val SampleData = SampleDataHead :+ new Tuple(
+    new ImmutableBytesWritable(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAA88888888888")), null, null)
+  // scalastyle:on null
+
+  val CdxList: List[String] = List("{}", "{}",  "{}",  "{}",  "{}",  "{}",  "{}",  "{}" )
+  val MimeList: List[String] = List("application/pdf", "application/pdf", "application/pdf",
+    "application/pdf", "application/pdf", "application/pdf", "application/pdf",
+    "application/pdf")
+  val SizeList: List[Long] = List(1,2,3,4,5,6,7,8)
+
+  // Can zip 3 lists, but not 4... so we recursively zip
+  val SampleCdxData : List[Tuple] = ((Sha1Strings, CdxList).zipped.toList, (MimeList, SizeList).zipped.toList)
+    .zipped
+    .toList
+    .map { case ((sha: String, cdx: String), (mime: String, size: Long)) => List(Bytes.toBytes(sha), Bytes.toBytes(cdx), Bytes.toBytes(mime), Bytes.toBytes(size)) }
+    .map { l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*) }
+
+  JobTest("sandcrawler.ScoreInsertableJob")
+    .arg("test", "")
+    .arg("app.conf.path", "app.conf")
+    .arg("output", output)
+    .arg("hbase-table", testTable)
+    .arg("zookeeper-hosts", testHost)
+    .arg("crossref-input", input)
+    .arg("debug", "true")
+    .source[Tuple](GrobidScorable.getHBaseSource(testTable, testHost), SampleData)
+    .source[Tuple](ScoreInsertableJob.getHBaseCdxSource(testTable, testHost), SampleCdxData)
+    .source(TextLine(input), List(
+      0 -> CrossrefStrings(0),
+      1 -> CrossrefStrings(1),
+      2 -> CrossrefStrings(2),
+      3 -> CrossrefStrings(3),
+      4 -> CrossrefStrings(4),
+      5 -> CrossrefStrings(5)))
+    .sink[(String, ReduceFeatures)](TypedTsv[(String, ReduceFeatures)](output + ".trapped")) { _ => () }
+    .sink[(String, String, Int, String, String, String, String, Long)](TypedTsv[(String, String, Int, String, String, String, String, Long)](output)) {
+      // Grobid titles and slugs (in parentheses):
+      //   Title 1                       (title1)
+      //   Title 2: TNG                  (title2tng)
+      //   Title 3: The Sequel           (title3thesequel)
+      //   <too long of a title>
+      //   <too short of a title>
+      // crossref titles and slugs (in parentheses):
+      //   Title 2: TNG                  (title2tng)
+      //   Title 1: TNG 2A               (title1tng2a)
+      //   Title 1: TNG 3                (title1tng3)
+      //   Title 2: Rebooted             (title2rebooted)
+      //   <too long of a title>
+      //   <too short of a title>
+      // XXX: Join should have 3 "title1" slugs and 1 "title2tng" slug
+      outputBuffer =>
+      "The pipeline" should "return a 1-element list" in {
+        outputBuffer should have length 1
+      }
+
+      it should "has right # of entries with each slug" in {
+        val slugs = outputBuffer.map(_._2)
+        val countMap : Map[String, Int] = slugs.groupBy(identity).mapValues(_.size)
+        // XXX: countMap("title1") shouldBe 3
+        countMap("title2tng") shouldBe 1
+      }
+
+      def bundle(slug : String, grobidIndex : Int, crossrefIndex : Int) : (String, Int, String, String) = {
+        val mfg : Option[MapFeatures] = GrobidScorable.jsonToMapFeatures(
+          Sha1Strings(grobidIndex),
+          JsonStrings(grobidIndex))
+        val mfc : Option[MapFeatures] = CrossrefScorable.jsonToMapFeatures(CrossrefStrings(crossrefIndex))
+        (mfg, mfc) match {
+          case (Some(grobid), Some(crossref)) =>
+            val score = Scorable.computeSimilarity(
+              ReduceFeatures(grobid.json),
+              ReduceFeatures(crossref.json))
+            (slug, score, grobid.json, crossref.json)
+          case _ => fail()
+        }
+      }
+
+      it should "have right output values" in {
+        // NOTE(review): the result of `exists` below is discarded, so this test asserts
+        // nothing. `bundle` also builds a 4-tuple while this sink yields 8-tuples, so the
+        // comparison cannot be true as written; it needs reworking before being asserted.
+        //outputBuffer.exists(_ == bundle("title1", 0, 0))
+        //outputBuffer.exists(_ == bundle("title1", 0, 2))
+        //outputBuffer.exists(_ == bundle("title1", 0, 1))
+        outputBuffer.exists(_ == bundle("title2tng", 1, 3))
+      }
+    }
+    .run
+    .finish
+}
diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
new file mode 100644
index 0000000..fbc0ee5
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
@@ -0,0 +1,255 @@
+package sandcrawler
+
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.TextLine
+import com.twitter.scalding.TupleConversions
+import com.twitter.scalding.TypedTsv
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import org.scalatest._
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+
+/**
+ * Runs `sandcrawler.ScoreJob` through Scalding's `JobTest` with in-memory GROBID tuples
+ * standing in for the HBase source and Crossref JSON lines as text input, then checks
+ * the joined output.
+ */
+class ScoreJobTest extends FlatSpec with Matchers {
+  //scalastyle:off
+  val JsonString = """
+{
+  "title": "<<TITLE>>",
+  "authors": [
+    {"name": "Brewster Kahle"},
+    {"name": "J Doe"}
+  ],
+  "journal": {
+    "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
+    "eissn": null,
+    "issn": null,
+    "issue": null,
+    "publisher": null,
+    "volume": null
+  },
+  "date": "2000",
+  "doi": null,
+  "citations": [
+    { "authors": [{"name": "A Seaperson"}],
+      "date": "2001",
+      "id": "b0",
+      "index": 0,
+      "issue": null,
+      "journal": "Letters in the Alphabet",
+      "publisher": null,
+      "title": "Everything is Wonderful",
+      "url": null,
+      "volume": "20"},
+    { "authors": [],
+      "date": "2011-03-28",
+      "id": "b1",
+      "index": 1,
+      "issue": null,
+      "journal": "The Dictionary",
+      "publisher": null,
+      "title": "All about Facts",
+      "url": null,
+      "volume": "14"}
+  ],
+  "abstract": "Everything you ever wanted to know about nothing",
+  "body": "Introduction \nEverything starts somewhere, as somebody [1]  once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.",
+  "acknowledgement": null,
+  "annex": null
+}
+"""
+  // scalastyle:on
+  val JsonStringWithTitle = JsonString.replace("<<TITLE>>", "Dummy Example File")
+  val JsonStringWithoutTitle = JsonString.replace("title", "nottitle")
+  val MalformedJsonString = JsonString.replace("}", "")
+
+  // scalastyle:off
+  val CrossrefString =
+"""
+{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" },
+  "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ],
+    "date-time" : "2017-10-23T17:19:16Z",
+    "timestamp" : { "$numberLong" : "1508779156477" } },
+  "reference-count" : 0,
+  "publisher" : "Elsevier BV",
+  "issue" : "3",
+  "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/",
+                    "start" : { "date-parts" : [ [ 1996, 1, 1 ] ],
+                                "date-time" : "1996-01-01T00:00:00Z",
+                                "timestamp" : { "$numberLong" : "820454400000" } },
+                                "delay-in-days" : 0, "content-version" : "tdm" }],
+  "content-domain" : { "domain" : [], "crossmark-restriction" : false },
+  "published-print" : { "date-parts" : [ [ 1996 ] ] },
+  "DOI" : "<<DOI>>",
+  "type" : "journal-article",
+  "created" : { "date-parts" : [ [ 2002, 7, 25 ] ],
+    "date-time" : "2002-07-25T15:09:41Z",
+    "timestamp" : { "$numberLong" : "1027609781000" } },
+  "page" : "186-187",
+  "source" : "Crossref",
+  "is-referenced-by-count" : 0,
+  "title" : [ "<<TITLE>>" ],
+  "prefix" : "10.1016",
+  "volume" : "9",
+  "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ],
+  "member" : "78",
+  "container-title" : [ "Journal de PÃ©diatrie et de PuÃ©riculture" ],
+  "link" : [ { "URL" :  "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/xml",
+               "content-type" : "text/xml",
+               "content-version" : "vor",
+               "intended-application" : "text-mining" },
+               { "URL" :
+  "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/plain",
+                 "content-type" : "text/plain",
+                 "content-version" : "vor",
+                 "intended-application" : "text-mining" } ],
+  "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ],
+                  "date-time" : "2015-09-03T10:03:43Z",
+                  "timestamp" : { "$numberLong" : "1441274623000" } },
+  "score" : 1,
+  "issued" : { "date-parts" : [ [ 1996 ] ] },
+  "references-count" : 0,
+  "alternative-id" : [ "0987-7983(96)87729-2" ],
+  "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2",
+  "ISSN" : [ "0987-7983" ],
+  "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ],
+  "subject" : [ "Pediatrics, Perinatology, and Child Health" ]
+}
+"""
+  // scalastyle:on
+  val TooLongOfTitle = "X" * Scorable.MaxTitleLength + "Y"  // arbitrary long string
+  val TooShortOfTitle = "X" * (ScorableFeatures.MinSlugLength - 1)
+  val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "SomeTitle")
+  val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle")
+  val MalformedCrossrefString = CrossrefString.replace("}", "")
+  val CrossrefStrings = List(
+    CrossrefString.replace("<<TITLE>>", "Title 2: TNG").replace("<<DOI>>", "DOI-0"),
+    CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2A").replace("<<DOI>>", "DOI-0.5"),
+    CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"),
+    CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1"),
+    CrossrefString.replace("<<TITLE>>", TooLongOfTitle).replace("<<DOI>>", "DOI-1"),
+    CrossrefString.replace("<<TITLE>>", TooShortOfTitle).replace("<<DOI>>", "DOI-1"))
+
+  //  Pipeline tests
+  val output = "/tmp/testOutput"
+  val input = "/tmp/testInput"
+  val (testTable, testHost) = ("test-table", "dummy-host:2181")
+
+  val Sha1Strings : List[String] = List(
+    "sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q",
+    "sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU",
+    "sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT",
+    "sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56",
+    "sha1:93187A85273589347598473894839443",
+    "sha1:024937534094897039547e9824382943",
+    "sha1:93229759932857982837892347893892",
+    "sha1:83229759932857982837892347893892")
+
+  val JsonStrings : List[String] = List(
+    JsonString.replace("<<TITLE>>", "Title 1: The Original"),
+    JsonString.replace("<<TITLE>>", "Title 2: TNG"),
+    JsonString.replace("<<TITLE>>", "Title 3: The Sequel"),
+    // This will have bad status.
+    JsonString.replace("<<TITLE>>", "Title 1: The Original"),
+    MalformedJsonString,
+    // This will have bad status.
+    JsonString.replace("<<TITLE>>", "Title 2: Not TNG"),
+    // These are in both sources but have bad titles
+    JsonString.replace("<<TITLE>>", TooLongOfTitle),
+    JsonString.replace("<<TITLE>>", TooShortOfTitle)
+  )
+
+  // bnewbold: status codes aren't strings, they are uint64
+  val Ok : Long = 200
+  val Bad : Long = 400
+  val StatusCodes = List(Ok, Ok, Ok, Bad, Ok, Bad, Ok, Ok)
+
+  val SampleDataHead : List[Tuple] = (Sha1Strings, JsonStrings, StatusCodes)
+    .zipped
+    .toList
+    .map { case (sha, json, status) => List(Bytes.toBytes(sha), Bytes.toBytes(json), Bytes.toBytes(status)) }
+    .map { l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*) }
+
+  // Add example of lines without GROBID data
+  // scalastyle:off null
+  val SampleData = SampleDataHead :+ new Tuple(
+    new ImmutableBytesWritable(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAA88888888888")), null, null)
+  // scalastyle:on null
+
+  JobTest("sandcrawler.ScoreJob")
+    .arg("test", "")
+    .arg("app.conf.path", "app.conf")
+    .arg("output", output)
+    .arg("hbase-table", testTable)
+    .arg("zookeeper-hosts", testHost)
+    .arg("crossref-input", input)
+    .arg("debug", "true")
+    .source[Tuple](GrobidScorable.getHBaseSource(testTable, testHost), SampleData)
+    .source(TextLine(input), List(
+      0 -> CrossrefStrings(0),
+      1 -> CrossrefStrings(1),
+      2 -> CrossrefStrings(2),
+      3 -> CrossrefStrings(3),
+      4 -> CrossrefStrings(4),
+      5 -> CrossrefStrings(5)))
+    .sink[(String, ReduceFeatures)](TypedTsv[(String, ReduceFeatures)](output + ".trapped")) { _ => () }
+    .sink[(String, Int, String, String)](TypedTsv[(String, Int, String, String)](output)) {
+      // Grobid titles and slugs (in parentheses):
+      //   Title 1                       (title1)
+      //   Title 2: TNG                  (title2tng)
+      //   Title 3: The Sequel           (title3thesequel)
+      //   <too long of a title>
+      //   <too short of a title>
+      // crossref titles and slugs (in parentheses):
+      //   Title 2: TNG                  (title2tng)
+      //   Title 1: TNG 2A               (title1tng2a)
+      //   Title 1: TNG 3                (title1tng3)
+      //   Title 2: Rebooted             (title2rebooted)
+      //   <too long of a title>
+      //   <too short of a title>
+      // XXX: Join should have 3 "title1" slugs and 1 "title2tng" slug
+      outputBuffer =>
+      "The pipeline" should "return a 1-element list" in {
+        outputBuffer should have length 1
+      }
+
+      it should "has right # of entries with each slug" in {
+        val slugs = outputBuffer.map(_._1)
+        val countMap : Map[String, Int] = slugs.groupBy(identity).mapValues(_.size)
+        // XXX: countMap("title1") shouldBe 3
+        countMap("title2tng") shouldBe 1
+      }
+
+      def bundle(slug : String, grobidIndex : Int, crossrefIndex : Int) : (String, Int, String, String) = {
+        val mfg : Option[MapFeatures] = GrobidScorable.jsonToMapFeatures(
+          Sha1Strings(grobidIndex),
+          JsonStrings(grobidIndex))
+        val mfc : Option[MapFeatures] = CrossrefScorable.jsonToMapFeatures(CrossrefStrings(crossrefIndex))
+        (mfg, mfc) match {
+          case (Some(grobid), Some(crossref)) =>
+            val score = Scorable.computeSimilarity(
+              ReduceFeatures(grobid.json),
+              ReduceFeatures(crossref.json))
+            (slug, score, grobid.json, crossref.json)
+          case _ => fail()
+        }
+      }
+
+      it should "have right output values" in {
+        // NOTE(review): the result of `exists` below is discarded, so this test asserts
+        // nothing. Crossref index 3 is "Title 2: Rebooted" while "Title 2: TNG" is index 0;
+        // confirm the intended index before turning this into a real assertion.
+        //outputBuffer.exists(_ == bundle("title1", 0, 0))
+        //outputBuffer.exists(_ == bundle("title1", 0, 2))
+        //outputBuffer.exists(_ == bundle("title1", 0, 1))
+        outputBuffer.exists(_ == bundle("title2tng", 1, 3))
+      }
+    }
+    .run
+    .finish
+}
diff --git a/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala b/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala
new file mode 100644
index 0000000..410819b
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala
@@ -0,0 +1,85 @@
+package sandcrawler
+
+import org.scalatest._
+
+class StringUtilitiesTest extends FlatSpec with Matchers {
+  import StringUtilities.{removeAccents, removePunctuation, stringDistance} // helpers under test
+
+  "removeAccents()" should "handle the empty string" in {
+    removeAccents("") shouldBe ""
+  }
+
+  it should "not change a string with unaccented characters" in {
+    removeAccents("abc123") shouldBe "abc123"
+  }
+
+  it should "remove accents from Ls" in {
+    removeAccents("E\u0141\u0142en") shouldBe "ELlen"
+  }
+
+  it should "remove accents from Es without changing case" in {
+    val stripped = removeAccents("\u00e9")
+    stripped should have length 1
+    stripped shouldBe "e"
+  }
+
+  it should "convert the ø in Soren" in {
+    removeAccents("Søren") shouldBe "Soren"
+    removeAccents("SØREN") shouldBe "SOREN"
+  }
+
+  "removePunctuation" should "work on the empty string" in {
+    removePunctuation("") shouldBe ""
+  }
+
+  it should "work on non-empty text strings" in {
+    removePunctuation("Hello, world!") shouldBe "Hello world"
+    removePunctuation(":-)") shouldBe ""
+    removePunctuation("<<---a---b--->") shouldBe "ab"
+  }
+
+  // Edit-distance cases adapted from https://oldfashionedsoftware.com/2009/11/19/string-distance-and-refactoring-in-scala/
+  "stringDistance" should "work on empty strings" in {
+    stringDistance("", "") shouldBe 0
+    stringDistance("a", "") shouldBe 1
+    stringDistance("", "a") shouldBe 1
+    stringDistance("abc", "") shouldBe 3
+    stringDistance("", "abc") shouldBe 3
+  }
+
+  it should "work on equal strings" in {
+    for (s <- Seq("", "a", "abc")) stringDistance(s, s) shouldBe 0
+  }
+
+  it should "work where only inserts are needed" in {
+    stringDistance("", "a") shouldBe 1
+    stringDistance("a", "ab") shouldBe 1
+    stringDistance("b", "ab") shouldBe 1
+    stringDistance("ac", "abc") shouldBe 1
+    stringDistance("abcdefg", "xabxcdxxefxgx") shouldBe 6
+  }
+
+  it should "work where only deletes are needed" in {
+    stringDistance("a", "") shouldBe 1
+    stringDistance("ab", "a") shouldBe 1
+    stringDistance("ab", "b") shouldBe 1
+    stringDistance("abc", "ac") shouldBe 1
+    stringDistance("xabxcdxxefxgx", "abcdefg") shouldBe 6
+  }
+
+  it should "work where only substitutions are needed" in {
+    stringDistance("a", "b") shouldBe 1
+    stringDistance("ab", "ac") shouldBe 1
+    stringDistance("ac", "bc") shouldBe 1
+    stringDistance("abc", "axc") shouldBe 1
+    stringDistance("xabxcdxxefxgx", "1ab2cd34ef5g6") shouldBe 6
+  }
+
+  it should "work where many operations are needed" in {
+    stringDistance("example", "samples") shouldBe 3
+    stringDistance("sturgeon", "urgently") shouldBe 6
+    stringDistance("levenshtein", "frankenstein") shouldBe 6
+    stringDistance("distance", "difference") shouldBe 5
+    stringDistance("java was neat", "scala is great") shouldBe 7
+  }
+}