7 files changed, 319 insertions, 44 deletions
diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
index 1789d1a..f598cae 100644
--- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
@@ -64,12 +64,18 @@ class CrossrefScorableTest extends FlatSpec with Matchers {
   "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ],
   "subject" : [ "Pediatrics, Perinatology, and Child Health" ]
 }
-"""
+""".replace("<<DOI>>", "10.123/aBc")
   // scalastyle:on
-  val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "Some Title")
+  val CrossrefStringWithGoodTitle = CrossrefString.replace("<<TITLE>>", "Some Title")
+  val CrossrefStringWithMaximumTitle = CrossrefString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength)
+  val CrossrefStringWithExcessiveTitle = CrossrefString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength + "0")
+  val CrossrefStringWithNullTitle = CrossrefString.replace("\"<<TITLE>>\"", "null")
   val CrossrefStringWithEmptyTitle = CrossrefString.replace("<<TITLE>>", "")
   val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle")
   val MalformedCrossrefString = CrossrefString.replace("}", "")
+  val CrossrefStringWithNoAuthors = CrossrefString.replace("<<TITLE>>", "Some Valid Title").replace("author", "no-author")
+  val CrossrefStringWrongType = CrossrefString.replace("<<TITLE>>", "Some Valid Title").replace("journal-article", "other")
+  val CrossrefStringNoType = CrossrefString.replace("<<TITLE>>", "Some Valid Title").replace("type", "not-type")
 
   // Unit tests
   "CrossrefScorable.jsonToMapFeatures()" should "handle invalid JSON" in {
@@ -82,19 +88,64 @@ class CrossrefScorableTest extends FlatSpec with Matchers {
     result.slug shouldBe Scorable.NoSlug
   }
 
+  it should "handle null title" in {
+    val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithNullTitle)
+    result.slug shouldBe Scorable.NoSlug
+  }
+
   it should "handle empty title" in {
     val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithEmptyTitle)
     result.slug shouldBe Scorable.NoSlug
   }
 
+  it should "handle missing authors" in {
+    val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithNoAuthors)
+    result.slug shouldBe Scorable.NoSlug
+  }
+
   it should "handle valid input" in {
-    val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithTitle)
+    val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithGoodTitle)
     result.slug shouldBe "sometitle"
     Scorable.jsonToMap(result.json) match {
       case None => fail()
       case Some(map) => {
         map("title").asInstanceOf[String] shouldBe "Some Title"
+        map("doi").asInstanceOf[String] shouldBe "10.123/abc"
+        // TODO: full name? not just a string?
+        map("authors").asInstanceOf[List[String]] shouldBe List("Gaier")
+        map("year").asInstanceOf[Double].toInt shouldBe 2002
       }
     }
   }
+
+  "CrossrefScorable.keepRecord()" should "return true for valid JSON with title" in {
+    CrossrefScorable.keepRecord(CrossrefStringWithGoodTitle) shouldBe true
+  }
+
+  it should "return true for valid JSON with a title of maximum permitted length" in {
+    CrossrefScorable.keepRecord(CrossrefStringWithMaximumTitle) shouldBe true
+  }
+
+  it should "return false for valid JSON with excessively long title" in {
+    CrossrefScorable.keepRecord(CrossrefStringWithExcessiveTitle) shouldBe false
+  }
+
+  it should "return false for valid JSON with null title" in {
+    CrossrefScorable.keepRecord(CrossrefStringWithNullTitle) shouldBe false
+  }
+
+  it should "return false for valid JSON with no title" in {
+    CrossrefScorable.keepRecord(CrossrefStringWithoutTitle) shouldBe false
+  }
+
+  it should "return false for invalid JSON" in {
+    CrossrefScorable.keepRecord(MalformedCrossrefString) shouldBe false  // every "}" stripped
+  }
+
+  it should "handle content types" in {
+    val resultWrong = CrossrefScorable.jsonToMapFeatures(CrossrefStringWrongType)
+    resultWrong.slug shouldBe Scorable.NoSlug
+    val resultMissing = CrossrefScorable.jsonToMapFeatures(CrossrefStringNoType)
+    resultMissing.slug shouldBe Scorable.NoSlug
+  }
 }
diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableDumpJobTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableDumpJobTest.scala
new file mode 100644
index 0000000..bf9343b
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/GrobidScorableDumpJobTest.scala
@@ -0,0 +1,124 @@
+
+package sandcrawler
+
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.TextLine
+import com.twitter.scalding.TupleConversions
+import com.twitter.scalding.TypedTsv
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import org.scalatest._
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+
+class GrobidScorableDumpJobTest extends FlatSpec with Matchers {
+  //scalastyle:off
+  val JsonString = """
+{
+  "title": "<<TITLE>>",
+  "authors": [
+    {"name": "Brewster Kahle"},
+    {"name": "J Doe"}
+  ],
+  "journal": {
+    "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
+    "eissn": null,
+    "issn": null,
+    "issue": null,
+    "publisher": null,
+    "volume": null
+  },
+  "date": "2000",
+  "doi": null,
+  "citations": [
+    { "authors": [{"name": "A Seaperson"}],
+      "date": "2001",
+      "id": "b0",
+      "index": 0,
+      "issue": null,
+      "journal": "Letters in the Alphabet",
+      "publisher": null,
+      "title": "Everything is Wonderful",
+      "url": null,
+      "volume": "20"},
+    { "authors": [],
+      "date": "2011-03-28",
+      "id": "b1",
+      "index": 1,
+      "issue": null,
+      "journal": "The Dictionary",
+      "publisher": null,
+      "title": "All about Facts",
+      "url": null,
+      "volume": "14"}
+  ],
+  "abstract": "Everything you ever wanted to know about nothing",
+  "body": "Introduction \nEverything starts somewhere, as somebody [1]  once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.",
+  "acknowledgement": null,
+  "annex": null
+}
+"""
+  // scalastyle:on
+  val JsonStringWithTitle = JsonString.replace("<<TITLE>>", "Dummy Example File")
+  val JsonStringWithoutTitle = JsonString.replace("title", "nottitle")
+  val MalformedJsonString = JsonString.replace("}", "")
+
+  //  Pipeline tests
+  val output = "/tmp/testOutput"
+  val input = "/tmp/testInput"
+  val (testTable, testHost) = ("test-table", "dummy-host:2181")
+
+  val Sha1Strings : List[String] = List(
+    "sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q",  // good
+    "sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU",  // good
+    "sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT",  // good
+    "sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56",  // bad status
+    "sha1:93187A85273589347598473894839443",  // malformed
+    "sha1:024937534094897039547e9824382943")  // bad status
+
+  val JsonStrings : List[String] = List(
+    JsonString.replace("<<TITLE>>", "Title 1: The Classic"),
+    JsonString.replace("<<TITLE>>", "Title 2: TNG"),
+    JsonString.replace("<<TITLE>>", "Title 3: The Sequel"),
+    // This will have bad status.
+    JsonString.replace("<<TITLE>>", "Title 1: The Classic"),
+    MalformedJsonString,
+    // This will have bad status.
+    JsonString.replace("<<TITLE>>", "Title 2: Not TNG")
+  )
+
+  // bnewbold: status codes aren't strings, they are uint64
+  val Ok : Long = 200
+  val Bad : Long = 400
+  val StatusCodes = List(Ok, Ok, Ok, Bad, Ok, Bad)
+
+  val SampleDataHead : List[Tuple] = (Sha1Strings, JsonStrings, StatusCodes)
+    .zipped
+    .toList
+    .map { case (sha, json, status) => List(Bytes.toBytes(sha), Bytes.toBytes(json), Bytes.toBytes(status)) }
+    .map { l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*) }
+
+  // scalastyle:off null
+  // Add example of lines without GROBID data
+  val SampleData = SampleDataHead :+ new Tuple(
+    new ImmutableBytesWritable(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAA88888888888")), null, null)
+  // scalastyle:on null
+
+  JobTest("sandcrawler.GrobidScorableDumpJob")
+    .arg("test", "")
+    .arg("app.conf.path", "app.conf")
+    .arg("output", output)
+    .arg("hbase-table", testTable)
+    .arg("zookeeper-hosts", testHost)
+    .arg("debug", "true")
+    .source[Tuple](GrobidScorable.getHBaseSource(testTable, testHost), SampleData)
+    .sink[(String, String)](TypedTsv[(String, String)](output)) {
+      outputBuffer =>
+      "The pipeline" should "return correct-length list" in {
+        outputBuffer should have length 3
+      }
+    }
+    .run
+    .finish
+}
diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
index 661824b..119cf90 100644
--- a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
@@ -57,7 +57,10 @@ class GrobidScorableTest extends FlatSpec with Matchers {
   "annex": null
 }
 """
-  val GrobidStringWithTitle = GrobidString.replace("<<TITLE>>", "Dummy Example File")
+  val GrobidStringWithGoodTitle = GrobidString.replace("<<TITLE>>", "Dummy Example File")
+  val GrobidStringWithMaximumTitle = GrobidString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength)
+  val GrobidStringWithExcessiveTitle = GrobidString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength + "0")
+  val GrobidStringWithNullTitle = GrobidString.replace("\"<<TITLE>>\"", "null")
   val GrobidStringWithoutTitle = GrobidString.replace("title", "nottitle")
   val MalformedGrobidString = GrobidString.replace("}", "")
   val Key = "Dummy Key"
@@ -69,20 +72,50 @@ class GrobidScorableTest extends FlatSpec with Matchers {
     result.slug shouldBe Scorable.NoSlug
   }
 
+  it should "handle null title" in {
+    val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithNullTitle)
+    result.slug shouldBe Scorable.NoSlug
+  }
+
   it should "handle missing title" in {
     val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithoutTitle)
     result.slug shouldBe Scorable.NoSlug
   }
 
   it should "handle valid input" in {
-    val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithTitle)
+    val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithGoodTitle)
     result.slug shouldBe "dummyexamplefile"
     Scorable.jsonToMap(result.json) match {
       case None => fail()
       case Some(map) => {
         map should contain key "title"
         map("title").asInstanceOf[String] shouldBe "Dummy Example File"
+        map("authors").asInstanceOf[List[String]] shouldBe List("Brewster Kahle", "J Doe")
       }
     }
   }
+
+  "GrobidScorable.keepRecord()" should "return true for valid JSON with title" in {
+    GrobidScorable.keepRecord(GrobidStringWithGoodTitle) shouldBe true
+  }
+
+  it should "return true for valid JSON with a title of maximum permitted length" in {
+    GrobidScorable.keepRecord(GrobidStringWithMaximumTitle) shouldBe true
+  }
+
+  it should "return false for valid JSON with excessively long title" in {
+    GrobidScorable.keepRecord(GrobidStringWithExcessiveTitle) shouldBe false
+  }
+
+  it should "return false for valid JSON with null title" in {
+    GrobidScorable.keepRecord(GrobidStringWithNullTitle) shouldBe false
+  }
+
+  it should "return false for valid JSON with no title" in {
+    GrobidScorable.keepRecord(GrobidStringWithoutTitle) shouldBe false
+  }
+
+  it should "return false for invalid JSON" in {
+    GrobidScorable.keepRecord(MalformedGrobidString) shouldBe false  // every "}" stripped
+  }
 }
diff --git a/scalding/src/test/scala/sandcrawler/HBaseStatusCodeCountTest.scala b/scalding/src/test/scala/sandcrawler/HBaseStatusCodeCountTest.scala
new file mode 100644
index 0000000..d2cf9de
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/HBaseStatusCodeCountTest.scala
@@ -0,0 +1,71 @@
+package sandcrawler
+
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.Tsv
+import com.twitter.scalding.TupleConversions
+import com.twitter.scalding.TypedTsv
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import org.junit.runner.RunWith
+import org.scalatest.FunSpec
+import org.scalatest.junit.JUnitRunner
+import org.slf4j.LoggerFactory
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBaseSource
+import scala._
+
+@RunWith(classOf[JUnitRunner])
+class HBaseStatusCodeCountTest extends FunSpec with TupleConversions {
+
+  val output = "/tmp/testOutput"
+  val (testTable, testHost) = ("test-table", "dummy-host:2181")
+
+  val log = LoggerFactory.getLogger(this.getClass.getName)
+
+  val statusType1 : Long = 200
+  val statusType2 : Long = 404
+  val statusType1Bytes = Bytes.toBytes(statusType1)
+  val statusType2Bytes = Bytes.toBytes(statusType2)
+
+  // TODO(bnewbold): how to express a null (empty value) in this list?
+  val sampleData : List[List[Array[Byte]]] = List(
+    ("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q", statusType1Bytes),
+    ("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU", statusType1Bytes),
+    ("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT", statusType2Bytes),
+    ("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56", statusType2Bytes),
+    ("sha1:885C3YNNEGH5WAG5ZAAXWA8BNXJWT6CZ", statusType2Bytes),
+    ("sha1:00904C3YNNEGH5WAG5ZA9XWAEBNXJWT6", statusType2Bytes),
+    ("sha1:249C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ", statusType1Bytes),
+    ("sha1:095893C3YNNEGH5WAG5ZAAXWAEBNXJWT", statusType2Bytes))
+    .map { case (rowKey, statusBytes) => List(Bytes.toBytes(rowKey), statusBytes) }
+
+  val statusType1Count = sampleData.count(row => java.util.Arrays.equals(row(1), statusType1Bytes))
+  val statusType2Count = sampleData.count(row => java.util.Arrays.equals(row(1), statusType2Bytes))
+
+  JobTest("sandcrawler.HBaseStatusCodeCountJob")
+    .arg("test", "")
+    .arg("app.conf.path", "app.conf")
+    .arg("output", output)
+    .arg("hbase-table", testTable)
+    .arg("zookeeper-hosts", testHost)
+    .arg("debug", "true")
+    .source[Tuple](HBaseCountJob.getHBaseSource(testTable, testHost, "grobid0:status_code"),
+      sampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*)))
+    .sink[Tuple](TypedTsv[(Long, Long)](output)) {
+      outputBuffer =>
+      it("should return a correct number of elements.") {
+        assert(outputBuffer.size === 2)
+      }
+
+      // Convert List[Tuple] to Map[Long, Long].
+      val counts = outputBuffer.map(t => (t.getLong(0), t.getLong(1))).toMap
+      it("should have the appropriate number of each status type") {
+        assert(counts(statusType1) == statusType1Count)
+        assert(counts(statusType2) == statusType2Count)
+      }
+    }
+    .run
+    .finish
+}
diff --git a/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala b/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala
index 3291670..7e91af3 100644
--- a/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala
+++ b/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala
@@ -24,10 +24,8 @@ class HBaseStatusCountTest extends FunSpec with TupleConversions {
 
   val log = LoggerFactory.getLogger(this.getClass.getName)
 
-  val statusType1 : Long = 200
-  val statusType2 : Long = 404
-  val statusType1Bytes = Bytes.toBytes(statusType1)
-  val statusType2Bytes = Bytes.toBytes(statusType2)
+  val statusType1Bytes = Bytes.toBytes("""{"status": "success"}""")
+  val statusType2Bytes = Bytes.toBytes("""{"status": "partial"}""")
 
   // TODO(bnewbold): now to express a null (empty value) in this list?
     val sampleData : List[List[Array[Byte]]] = List(
@@ -51,20 +49,13 @@ class HBaseStatusCountTest extends FunSpec with TupleConversions {
     .arg("hbase-table", testTable)
     .arg("zookeeper-hosts", testHost)
     .arg("debug", "true")
-    .source[Tuple](HBaseCountJob.getHBaseSource(testTable, testHost, "grobid0:status_code"),
+    .source[Tuple](HBaseCountJob.getHBaseSource(testTable, testHost, "grobid0:status"),
       sampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*)))
-    .sink[Tuple](TypedTsv[(Long, Long)](output)) {
+    .sink[Tuple](TypedTsv[(String, Long)](output)) {
       outputBuffer =>
       it("should return a 2-element list.") {
         assert(outputBuffer.size === 2)
       }
-
-      // Convert List[Tuple] to Map[Long, Long].
-      val counts = outputBuffer.map(t => (t.getLong(0), t.getLong(1))).toMap
-      it("should have the appropriate number of each status type") {
-        assert(counts(statusType1) == statusType1Count)
-        assert(counts(statusType2) == statusType2Count)
-      }
     }
     .run
     .finish
diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
index 5a22ef8..474f69a 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
@@ -9,22 +9,6 @@ import org.scalatest._
 // scalastyle:off null
 class ScorableFeaturesTest extends FlatSpec with Matchers {
 
-  // TODO: Remove this when we're convinced that our file-reading code
-  // works. (I'm already convinced. --Ellen)
-  "read slugs" should "work" in {
-    val SlugBlacklist = Set( "abbreviations", "abstract", "acknowledgements",
-      "article", "authorreply", "authorsreply", "bookreview", "bookreviews",
-      "casereport", "commentary", "commentaryon", "commenton", "commentto",
-      "contents", "correspondence", "dedication", "editorialadvisoryboard",
-      "focus", "hypothesis", "inbrief", "introduction", "introductiontotheissue",
-      "lettertotheeditor", "listofabbreviations", "note", "overview", "preface",
-      "references", "results", "review", "reviewarticle", "summary", "title",
-      "name")
-
-    ScorableFeatures.SlugBlacklist.size shouldBe SlugBlacklist.size
-    for (s <- ScorableFeatures.SlugBlacklist) SlugBlacklist should contain (s)
-  }
-
   private def titleToSlug(s : String) : String = {
     ScorableFeatures.create(title = s).toSlug
   }
@@ -52,7 +36,7 @@ class ScorableFeaturesTest extends FlatSpec with Matchers {
 
   it should "strip punctuation" in {
     titleToSlug("HELLO!:the:re") shouldBe "hellothere"
-    titleToSlug("a:b:c") shouldBe "abc"
+    titleToSlug("a:b:cdefgh") shouldBe "abcdefgh"
     titleToSlug(
       "If you're happy and you know it, clap your hands!") shouldBe "ifyourehappyandyouknowitclapyourhands"
     titleToSlug(":;\"\'") shouldBe Scorable.NoSlug
@@ -65,14 +49,19 @@ class ScorableFeaturesTest extends FlatSpec with Matchers {
   }
 
   it should "strip special characters" in {
-    titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_") shouldBe Scorable.NoSlug
+    titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_’·“”‘’“”«»「」") shouldBe Scorable.NoSlug
     // TODO: titleToSlug("©™₨№…") shouldBe Scorable.NoSlug
     // TODO: titleToSlug("πµΣσ") shouldBe Scorable.NoSlug
   }
 
   it should "remove whitespace" in {
     titleToSlug("foo bar : baz ::") shouldBe "foobarbaz"
-    titleToSlug("\na\t:b:c") shouldBe "abc"
+    titleToSlug("\na\t:b:cdefghi") shouldBe "abcdefghi"
     titleToSlug("\n \t \r  ") shouldBe Scorable.NoSlug
   }
+
+  it should "skip very short slugs" in {
+    titleToSlug("short") shouldBe Scorable.NoSlug
+    titleToSlug("a longer, more in depth title") shouldBe "alongermoreindepthtitle"
+  }
 }
diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
index 00e4659..32fb16c 100644
--- a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
@@ -117,6 +117,8 @@ class ScoreJobTest extends FlatSpec with Matchers {
 }
 """
   // scalastyle:on
+  val TooLongOfTitle = "X" * Scorable.MaxTitleLength + "Y"  // arbitrary long string
+  val TooShortOfTitle = "X" * (ScorableFeatures.MinSlugLength - 1)
   val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "SomeTitle")
   val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle")
   val MalformedCrossrefString = CrossrefString.replace("}", "")
@@ -124,7 +126,9 @@ class ScoreJobTest extends FlatSpec with Matchers {
     CrossrefString.replace("<<TITLE>>", "Title 2: TNG").replace("<<DOI>>", "DOI-0"),
     CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2A").replace("<<DOI>>", "DOI-0.5"),
     CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"),
-    CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1"))
+    CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1"),
+    CrossrefString.replace("<<TITLE>>", TooLongOfTitle).replace("<<DOI>>", "DOI-1"),
+    CrossrefString.replace("<<TITLE>>", TooShortOfTitle).replace("<<DOI>>", "DOI-1"))
 
   //  Pipeline tests
   val output = "/tmp/testOutput"
@@ -137,23 +141,28 @@ class ScoreJobTest extends FlatSpec with Matchers {
     "sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT",
     "sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56",
     "sha1:93187A85273589347598473894839443",
-    "sha1:024937534094897039547e9824382943")
+    "sha1:024937534094897039547e9824382943",
+    "sha1:93229759932857982837892347893892",
+    "sha1:83229759932857982837892347893892")
 
   val JsonStrings : List[String] = List(
-    JsonString.replace("<<TITLE>>", "Title 1"),
+    JsonString.replace("<<TITLE>>", "Title 1: The Original"),
     JsonString.replace("<<TITLE>>", "Title 2: TNG"),
     JsonString.replace("<<TITLE>>", "Title 3: The Sequel"),
     // This will have bad status.
-    JsonString.replace("<<TITLE>>", "Title 1"),
+    JsonString.replace("<<TITLE>>", "Title 1: The Original"),
     MalformedJsonString,
     // This will have bad status.
-    JsonString.replace("<<TITLE>>", "Title 2")
+    JsonString.replace("<<TITLE>>", "Title 2: Not TNG"),
+    // These are in both sources but have bad titles
+    JsonString.replace("<<TITLE>>", TooLongOfTitle),
+    JsonString.replace("<<TITLE>>", TooShortOfTitle)
   )
 
   // bnewbold: status codes aren't strings, they are uint64
   val Ok : Long = 200
   val Bad : Long = 400
-  val StatusCodes = List(Ok, Ok, Ok, Bad, Ok, Bad)
+  val StatusCodes = List(Ok, Ok, Ok, Bad, Ok, Bad, Ok, Ok)
 
   val SampleDataHead : List[Tuple] = (Sha1Strings, JsonStrings, StatusCodes)
     .zipped
@@ -181,17 +190,24 @@ class ScoreJobTest extends FlatSpec with Matchers {
       0 -> CrossrefStrings(0),
       1 -> CrossrefStrings(1),
       2 -> CrossrefStrings(2),
-      3 -> CrossrefStrings(3)))
+      3 -> CrossrefStrings(3),
+      4 -> CrossrefStrings(4),
+      5 -> CrossrefStrings(5)))
+    .sink[(String, ReduceFeatures)](TypedTsv[(String, ReduceFeatures)](output + ".trapped")) { _ => () }
     .sink[(String, Int, String, String)](TypedTsv[(String, Int, String, String)](output)) {
       // Grobid titles and slugs (in parentheses):
       //   Title 1                       (title1)
       //   Title 2: TNG                  (title2tng)
       //   Title 3: The Sequel           (title3thesequel)
+      //   <too long of a title>
+      //   <too short of a title>
       // crossref titles and slugs (in parentheses):
       //   Title 2: TNG                  (title2tng)
       //   Title 1: TNG 2A               (title1tng2a)
       //   Title 1: TNG 3                (title1tng3)
       //   Title 2: Rebooted             (title2rebooted)
+      //   <too long of a title>
+      //   <too short of a title>
       // XXX: Join should have 3 "title1" slugs and 1 "title2tng" slug
       outputBuffer =>
       "The pipeline" should "return a 1-element list" in {