3 files changed, 15 insertions, 5 deletions
diff --git a/please b/please
index e328410..b32dd79 100755
--- a/please
+++ b/please
@@ -147,10 +147,15 @@ def run_matchcrossref(args):
         datetime.strftime(datetime.now(), "%Y-%m-%d-%H%M.%S"))
     # Notes: -D options must come after Tool but before class name
     # https://github.com/twitter/scalding/wiki/Frequently-asked-questions#how-do-i-pass-parameters-to-my-hadoop-job-number-of-reducers--memory-options--etc-
+    # Compression: changed due to errors in production
+    # https://stackoverflow.com/a/11336820/4682349
     cmd = """hadoop jar \
         scalding/target/scala-2.11/sandcrawler-assembly-0.2.0-SNAPSHOT.jar \
         com.twitter.scalding.Tool \
         -Dmapred.reduce.tasks={reducers} \
+        -Dcascading.spill.list.threshold=500000 \
+        -D mapred.output.compress=false \
+        -Dmapred.compress.map.output=true\
         sandcrawler.ScoreJob \
         --hdfs \
         --app.conf.path scalding/ia_cluster.conf \
diff --git a/scalding/src/main/scala/sandcrawler/ScoreJob.scala b/scalding/src/main/scala/sandcrawler/ScoreJob.scala
index 75d45e9..28e9132 100644
--- a/scalding/src/main/scala/sandcrawler/ScoreJob.scala
+++ b/scalding/src/main/scala/sandcrawler/ScoreJob.scala
@@ -13,15 +13,18 @@ class ScoreJob(args: Args) extends JobBase(args) {
   val pipe1 : TypedPipe[(String, ReduceFeatures)] = sc1.getInputPipe(args)
   val pipe2 : TypedPipe[(String, ReduceFeatures)] = sc2.getInputPipe(args)
 
-  pipe1.join(pipe2).map { entry =>
-    val (slug : String, (features1 : ReduceFeatures, features2 : ReduceFeatures)) = entry
-    new ReduceOutput(
+  pipe1
+    .addTrap(TypedTsv(args("output") + ".trapped"))
+    .join(pipe2)
+    .map { entry =>
+      val (slug : String, (features1 : ReduceFeatures, features2 : ReduceFeatures)) = entry
+      new ReduceOutput(
       slug,
       Scorable.computeSimilarity(features1, features2),
       features1.json,
       features2.json)
-  }
-  //TypedTsv doesn't work over case classes.
+    }
+    //TypedTsv doesn't work over case classes.
     .map { entry => (entry.slug, entry.score, entry.json1, entry.json2) }
     .write(TypedTsv[(String, Int, String, String)](args("output")))
 }
diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
index 00e4659..35c31e5 100644
--- a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
@@ -182,6 +182,8 @@ class ScoreJobTest extends FlatSpec with Matchers {
       1 -> CrossrefStrings(1),
       2 -> CrossrefStrings(2),
       3 -> CrossrefStrings(3)))
+    .sink[(String, ReduceFeatures)](TypedTsv[(String, ReduceFeatures)](output + ".trapped")) {
+        _ => () }
     .sink[(String, Int, String, String)](TypedTsv[(String, Int, String, String)](output)) {
       // Grobid titles and slugs (in parentheses):
       //   Title 1                       (title1)