From cbd6433af7949df7c4433468bf99eefe9973e864 Mon Sep 17 00:00:00 2001
From: Ellen Spertus <ellen.spertus@gmail.com>
Date: Tue, 7 Aug 2018 10:11:54 -0700
Subject: Removed commented-out code.

---
 .../src/test/scala/sandcrawler/ScorableTest.scala  | 108 +++++++++++++++++++++
 1 file changed, 108 insertions(+)
 create mode 100644 scalding/src/test/scala/sandcrawler/ScorableTest.scala

(limited to 'scalding/src/test/scala/sandcrawler/ScorableTest.scala')
diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
new file mode 100644
index 0000000..0375b6a
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
@@ -0,0 +1,108 @@
+package sandcrawler
+
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions}
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import org.scalatest._
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+
+class HBaseCrossrefScoreTest extends FlatSpec with Matchers {
+  val JsonString = """
+{
+  "title": "<<TITLE>>",
+  "authors": [
+    {"name": "Brewster Kahle"},
+    {"name": "J Doe"}
+  ],
+  "journal": {
+    "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
+    "eissn": null,
+    "issn": null,
+    "issue": null,
+    "publisher": null,
+    "volume": null
+  },
+  "date": "2000",
+  "doi": null,
+  "citations": [
+    { "authors": [{"name": "A Seaperson"}],
+      "date": "2001",
+      "id": "b0",
+      "index": 0,
+      "issue": null,
+      "journal": "Letters in the Alphabet",
+      "publisher": null,
+      "title": "Everything is Wonderful",
+      "url": null,
+      "volume": "20"},
+    { "authors": [],
+      "date": "2011-03-28",
+      "id": "b1",
+      "index": 1,
+      "issue": null,
+      "journal": "The Dictionary",
+      "publisher": null,
+      "title": "All about Facts",
+      "url": null,
+      "volume": "14"}
+  ],
+  "abstract": "Everything you ever wanted to know about nothing",
+  "body": "Introduction \nEverything starts somewhere, as somebody [1]  once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.",
+  "acknowledgement": null,
+  "annex": null
+}
+"""
+  val MalformedJsonString = JsonString.replace("}", "")
+
+  "titleToSlug()" should "extract the parts of titles before a colon" in {
+    val slug = Scorable.titleToSlug("HELLO:there")
+    slug should contain ("hello")
+  }
+
+  it should "extract an entire colon-less string" in {
+    val slug = Scorable.titleToSlug("hello THERE")
+    slug should contain ("hello there")
+  }
+
+  it should "return None if given empty string" in {
+    Scorable.titleToSlug("") shouldBe None
+  }
+
+  "jsonToMap()" should "return a map, given a legal JSON string" in {
+    Scorable.jsonToMap(jsonString) should be (Some(_))
+  }
+
+  it should "return None, given illegal JSON" in {
+    Scorable.jsonToMap("illegal{,json{{") should be (None))
+  }
+
+/*
+  it should "return None if given a malformed json string" in {
+    val slug = Scorable.grobidToSlug(MalformedGrobidString)
+    slug shouldBe None
+  }
+
+  it should "return None if given an empty json string" in {
+    val slug = Scorable.grobidToSlug("")
+    slug shouldBe None
+  }
+
+  "crossrefToSlug()" should "get the right slug for a crossref json string" in {
+    val slug = Scorable.crossrefToSlug(CrossrefStringWithTitle)
+    slug should contain ("sometitle")
+  }
+
+  it should "return None if given json string without title" in {
+    val slug = Scorable.grobidToSlug(CrossrefStringWithoutTitle)
+    slug shouldBe None
+  }
+
+  it should "return None if given a malformed json string" in {
+    val slug = Scorable.grobidToSlug(MalformedCrossrefString)
+    slug shouldBe None
+  }
+ */
+}
+  
-- 
cgit v1.2.3


From 6cdea0ec0950c8f12c362b6521a1bbbabc3db379 Mon Sep 17 00:00:00 2001
From: Ellen Spertus <ellen.spertus@gmail.com>
Date: Tue, 7 Aug 2018 10:12:12 -0700
Subject: Added ScorableTest, which passes.

---
 scalding/src/test/scala/sandcrawler/ScorableTest.scala | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'scalding/src/test/scala/sandcrawler/ScorableTest.scala')

diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
index 0375b6a..78cd358 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
@@ -8,7 +8,7 @@ import org.apache.hadoop.hbase.util.Bytes
 import org.scalatest._
 import parallelai.spyglass.hbase.HBaseConstants.SourceMode
 
-class HBaseCrossrefScoreTest extends FlatSpec with Matchers {
+class ScorableTest extends FlatSpec with Matchers {
   val JsonString = """
 {
   "title": "<<TITLE>>",
@@ -58,24 +58,24 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers {
 
   "titleToSlug()" should "extract the parts of titles before a colon" in {
     val slug = Scorable.titleToSlug("HELLO:there")
-    slug should contain ("hello")
+    slug shouldBe "hello"
   }
 
   it should "extract an entire colon-less string" in {
     val slug = Scorable.titleToSlug("hello THERE")
-    slug should contain ("hello there")
+    slug shouldBe "hello there"
   }
 
-  it should "return None if given empty string" in {
-    Scorable.titleToSlug("") shouldBe None
+  it should "return Scorable.NoSlug if given empty string" in {
+    Scorable.titleToSlug("") shouldBe Scorable.NoSlug
   }
 
   "jsonToMap()" should "return a map, given a legal JSON string" in {
-    Scorable.jsonToMap(jsonString) should be (Some(_))
+    Scorable.jsonToMap(JsonString) should not be (None)
   }
 
   it should "return None, given illegal JSON" in {
-    Scorable.jsonToMap("illegal{,json{{") should be (None))
+    Scorable.jsonToMap("illegal{,json{{") should be (None)
   }
 
 /*
-- 
cgit v1.2.3


From dddb7ed410bdd542ca12756d3e97aca6beea5532 Mon Sep 17 00:00:00 2001
From: Ellen Spertus <ellen.spertus@gmail.com>
Date: Tue, 7 Aug 2018 10:22:02 -0700
Subject: Added test, which passes.

---
 scalding/src/test/scala/sandcrawler/ScorableTest.scala | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'scalding/src/test/scala/sandcrawler/ScorableTest.scala')

diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
index 78cd358..535b8f6 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
@@ -57,13 +57,11 @@ class ScorableTest extends FlatSpec with Matchers {
   val MalformedJsonString = JsonString.replace("}", "")
 
   "titleToSlug()" should "extract the parts of titles before a colon" in {
-    val slug = Scorable.titleToSlug("HELLO:there")
-    slug shouldBe "hello"
+    Scorable.titleToSlug("HELLO:there") shouldBe "hello"
   }
 
   it should "extract an entire colon-less string" in {
-    val slug = Scorable.titleToSlug("hello THERE")
-    slug shouldBe "hello there"
+    Scorable.titleToSlug("hello THERE") shouldBe "hello there"
   }
 
   it should "return Scorable.NoSlug if given empty string" in {
@@ -78,7 +76,12 @@ class ScorableTest extends FlatSpec with Matchers {
     Scorable.jsonToMap("illegal{,json{{") should be (None)
   }
 
-/*
+  "computeOutput()" should "return Scorable.MaxScore if given identical ReduceFeatures" in { 
+    val output = Scorable.computeOutput(new ReduceFeatures(JsonString), new ReduceFeatures(JsonString))
+    output.score shouldBe Scorable.MaxScore
+  }
+
+  /*
   it should "return None if given a malformed json string" in {
     val slug = Scorable.grobidToSlug(MalformedGrobidString)
     slug shouldBe None
-- 
cgit v1.2.3


From 4981a98358aae098714d2266404f7b167993bf0c Mon Sep 17 00:00:00 2001
From: Ellen Spertus <ellen.spertus@gmail.com>
Date: Tue, 7 Aug 2018 10:28:48 -0700
Subject: Minor refactoring. Added test.

---
 scalding/src/main/scala/sandcrawler/Scorable.scala     | 15 ++++++---------
 scalding/src/main/scala/sandcrawler/ScoreJob.scala     |  4 +++-
 scalding/src/test/scala/sandcrawler/ScorableTest.scala |  5 +++--
 3 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'scalding/src/test/scala/sandcrawler/ScorableTest.scala')

diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala
index 948002b..77bb7ae 100644
--- a/scalding/src/main/scala/sandcrawler/Scorable.scala
+++ b/scalding/src/main/scala/sandcrawler/Scorable.scala
@@ -69,19 +69,16 @@ object Scorable {
 
   val MaxScore = 1000
 
-  def computeOutput(feature1 : ReduceFeatures, feature2 : ReduceFeatures) :
-      ReduceOutput = {
-    val json1 = jsonToMap(feature1.json)
-    val json2 = jsonToMap(feature2.json)
+  def computeSimilarity(features1 : ReduceFeatures, features2 : ReduceFeatures) : Int = {
+    val json1 = jsonToMap(features1.json)
+    val json2 = jsonToMap(features2.json)
     getStringOption(json1, "title") match {
-      case None => ReduceOutput(0, "No title", feature1.json)
+      case None => 0
       case Some(title1) => {
         getStringOption(json2, "title") match {
-          case None => ReduceOutput(0, "No title", feature2.json)
+          case None => 0
           case Some(title2) => 
-            ReduceOutput(
-              (StringUtilities.similarity(title1, title2) * MaxScore).toInt,
-              feature1.json, feature2.json)
+            (StringUtilities.similarity(title1, title2) * MaxScore).toInt
         }
       }
     }
diff --git a/scalding/src/main/scala/sandcrawler/ScoreJob.scala b/scalding/src/main/scala/sandcrawler/ScoreJob.scala
index 22cc9e9..e6a5dc1 100644
--- a/scalding/src/main/scala/sandcrawler/ScoreJob.scala
+++ b/scalding/src/main/scala/sandcrawler/ScoreJob.scala
@@ -17,7 +17,9 @@ class ScoreJob(args: Args, sc1 : Scorable, sc2 : Scorable)(implicit flowDef : Fl
 
   pipe1.join(pipe2).map { entry =>
     val (slug : String, (features1 : ReduceFeatures, features2 : ReduceFeatures)) = entry
-    Scorable.computeOutput(features1, features2)
+    new ReduceOutput(Scorable.computeSimilarity(features1, features2),
+      features1.json,
+      features2.json)
   }
     .write(TypedTsv[ReduceOutput](args("output")))
 }
diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
index 535b8f6..9437fe6 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
@@ -77,8 +77,9 @@ class ScorableTest extends FlatSpec with Matchers {
   }
 
   "computeOutput()" should "return Scorable.MaxScore if given identical ReduceFeatures" in { 
-    val output = Scorable.computeOutput(new ReduceFeatures(JsonString), new ReduceFeatures(JsonString))
-    output.score shouldBe Scorable.MaxScore
+    val score = Scorable.computeSimilarity(
+      new ReduceFeatures(JsonString), new ReduceFeatures(JsonString))
+    score shouldBe Scorable.MaxScore
   }
 
   /*
-- 
cgit v1.2.3


From 8dc3bf5c6f68d1fffa9f940ba1024ed95e76ed64 Mon Sep 17 00:00:00 2001
From: Ellen Spertus <ellen.spertus@gmail.com>
Date: Tue, 7 Aug 2018 11:05:23 -0700
Subject: Added GrobidScorableTest, minor improvements.

---
 .../main/scala/sandcrawler/CrossrefScorable.scala  |  19 +++-
 .../main/scala/sandcrawler/GrobidScorable.scala    |  24 +++--
 .../scala/sandcrawler/GrobidScorableTest.scala     |  77 ++++++++++++++
 .../src/test/scala/sandcrawler/ScorableTest.scala  | 111 +++++++++++++--------
 4 files changed, 179 insertions(+), 52 deletions(-)
 create mode 100644 scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala

(limited to 'scalding/src/test/scala/sandcrawler/ScorableTest.scala')

diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
index 0849aff..cf5849c 100644
--- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
@@ -15,10 +15,27 @@ class CrossrefScorable extends Scorable {
       .read
       .toTypedPipe[String](new Fields("line"))
       .map{ json : String =>
-        HBaseCrossrefScore.crossrefToSlug(json) match {
+        CrossrefScorable.crossrefToSlug(json) match {
           case Some(slug) => new MapFeatures(slug, json)
           case None => new MapFeatures(Scorable.NoSlug, json)
         }
       }
   }
 }
+
+object CrossrefScorable {
+  def crossrefToSlug(json : String) : Option[String] = {
+    Scorable.jsonToMap(json) match {
+      case None => None
+      case Some(map) => {
+        if (map contains "title") {
+          // TODO: Don't ignore titles after the first.
+          val title = map("title").asInstanceOf[List[String]](0)
+          Some(Scorable.titleToSlug(title))
+        } else {
+          None
+        }
+      }
+    }
+  }
+}
diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
index 8da7708..25e5985 100644
--- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
@@ -32,14 +32,20 @@ class GrobidScorable extends Scorable with HBasePipeConversions {
       }
     }
   }
-/*
-  def fromBytesWritableLocal(f: Fields): Pipe = {
-	asList(f)
-	  .foldLeft(pipe) { (p, fld) => {
-	    p.map(fld.toString -> fld.toString) { from: org.apache.hadoop.hbase.io.ImmutableBytesWritable =>
-            Option(from).map(x => Bytes.toString(x.get)).getOrElse(null)
-          }
-      }}
+}
+
+object GrobidScorable {
+  def grobidToSlug(json : String) : Option[String] = {
+    Scorable.jsonToMap(json) match {
+      case None => None
+      case Some(map) => {
+        if (map contains "title") {
+          Some(Scorable.titleToSlug(map("title").asInstanceOf[String]))
+        } else {
+          None
+        }
+      }
+    }
   }
- */
 }
+
diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
new file mode 100644
index 0000000..7777610
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
@@ -0,0 +1,77 @@
+package sandcrawler
+
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions}
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import org.scalatest._
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+
+class GrobidScorableTest extends FlatSpec with Matchers {
+  val GrobidString = """
+{
+  "title": "<<TITLE>>",
+  "authors": [
+    {"name": "Brewster Kahle"},
+    {"name": "J Doe"}
+  ],
+  "journal": {
+    "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
+    "eissn": null,
+    "issn": null,
+    "issue": null,
+    "publisher": null,
+    "volume": null
+  },
+  "date": "2000",
+  "doi": null,
+  "citations": [
+    { "authors": [{"name": "A Seaperson"}],
+      "date": "2001",
+      "id": "b0",
+      "index": 0,
+      "issue": null,
+      "journal": "Letters in the Alphabet",
+      "publisher": null,
+      "title": "Everything is Wonderful",
+      "url": null,
+      "volume": "20"},
+    { "authors": [],
+      "date": "2011-03-28",
+      "id": "b1",
+      "index": 1,
+      "issue": null,
+      "journal": "The Dictionary",
+      "publisher": null,
+      "title": "All about Facts",
+      "url": null,
+      "volume": "14"}
+  ],
+  "abstract": "Everything you ever wanted to know about nothing",
+  "body": "Introduction \nEverything starts somewhere, as somebody [1]  once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.",
+  "acknowledgement": null,
+  "annex": null
+}
+"""
+  val GrobidStringWithTitle = GrobidString.replace("<<TITLE>>", "Dummy Example File")
+  val GrobidStringWithoutTitle = GrobidString.replace("title", "nottitle")
+  val MalformedGrobidString = GrobidString.replace("}", "")
+
+  // Unit tests
+
+  "grobidToSlug()" should "get the right slug for a grobid json string" in {
+    val slug = GrobidScorable.grobidToSlug(GrobidStringWithTitle)
+    slug should contain ("dummy example file")
+  }
+
+  it should "return None if given json string without title" in {
+    val slug = GrobidScorable.grobidToSlug(GrobidStringWithoutTitle)
+    slug shouldBe None
+  }
+
+  it should "return None if given a malformed json string" in {
+    val slug = GrobidScorable.grobidToSlug(MalformedGrobidString)
+    slug shouldBe None
+  }
+}
diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
index 9437fe6..8445073 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
@@ -9,7 +9,7 @@ import org.scalatest._
 import parallelai.spyglass.hbase.HBaseConstants.SourceMode
 
 class ScorableTest extends FlatSpec with Matchers {
-  val JsonString = """
+      val JsonString = """
 {
   "title": "<<TITLE>>",
   "authors": [
@@ -54,59 +54,86 @@ class ScorableTest extends FlatSpec with Matchers {
   "annex": null
 }
 """
-  val MalformedJsonString = JsonString.replace("}", "")
 
-  "titleToSlug()" should "extract the parts of titles before a colon" in {
-    Scorable.titleToSlug("HELLO:there") shouldBe "hello"
-  }
+  performUnitTests()
+  performPipelineTests()
 
-  it should "extract an entire colon-less string" in {
-    Scorable.titleToSlug("hello THERE") shouldBe "hello there"
-  }
+  def performUnitTests() {
+    "titleToSlug()" should "extract the parts of titles before a colon" in {
+      Scorable.titleToSlug("HELLO:there") shouldBe "hello"
+    }
 
-  it should "return Scorable.NoSlug if given empty string" in {
-    Scorable.titleToSlug("") shouldBe Scorable.NoSlug
-  }
+    it should "extract an entire colon-less string" in {
+      Scorable.titleToSlug("hello THERE") shouldBe "hello there"
+    }
 
-  "jsonToMap()" should "return a map, given a legal JSON string" in {
-    Scorable.jsonToMap(JsonString) should not be (None)
-  }
+    it should "return Scorable.NoSlug if given empty string" in {
+      Scorable.titleToSlug("") shouldBe Scorable.NoSlug
+    }
 
-  it should "return None, given illegal JSON" in {
-    Scorable.jsonToMap("illegal{,json{{") should be (None)
-  }
+    "jsonToMap()" should "return a map, given a legal JSON string" in {
+      Scorable.jsonToMap(JsonString) should not be (None)
+    }
 
-  "computeOutput()" should "return Scorable.MaxScore if given identical ReduceFeatures" in { 
-    val score = Scorable.computeSimilarity(
-      new ReduceFeatures(JsonString), new ReduceFeatures(JsonString))
-    score shouldBe Scorable.MaxScore
-  }
+    it should "return None, given illegal JSON" in {
+      Scorable.jsonToMap("illegal{,json{{") should be (None)
+    }
 
-  /*
-  it should "return None if given a malformed json string" in {
-    val slug = Scorable.grobidToSlug(MalformedGrobidString)
-    slug shouldBe None
+    "computeOutput()" should "return Scorable.MaxScore if given identical ReduceFeatures" in {
+      val score = Scorable.computeSimilarity(
+        new ReduceFeatures(JsonString), new ReduceFeatures(JsonString))
+      score shouldBe Scorable.MaxScore
+    }
   }
 
-  it should "return None if given an empty json string" in {
-    val slug = Scorable.grobidToSlug("")
-    slug shouldBe None
-  }
+  def performPipelineTests() {
+      /*
 
-  "crossrefToSlug()" should "get the right slug for a crossref json string" in {
-    val slug = Scorable.crossrefToSlug(CrossrefStringWithTitle)
-    slug should contain ("sometitle")
-  }
+    val output = "/tmp/testOutput"
+    val input = "/tmp/testInput"
+    val (testTable, testHost) = ("test-table", "dummy-host:2181")
 
-  it should "return None if given json string without title" in {
-    val slug = Scorable.grobidToSlug(CrossrefStringWithoutTitle)
-    slug shouldBe None
-  }
+  val grobidSampleData = List(
+    List(Bytes.toBytes("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q"),
+      Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 1"))),
+    List(Bytes.toBytes("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"),
+      Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 2: TNG"))),
+    List(Bytes.toBytes("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT"),
+      Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 3: The Sequel"))),
+    List(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56"), 
+      Bytes.toBytes(MalformedGrobidString)))
 
-  it should "return None if given a malformed json string" in {
-    val slug = Scorable.grobidToSlug(MalformedCrossrefString)
-    slug shouldBe None
+  JobTest("sandcrawler.HBaseCrossrefScoreJob")
+    .arg("test", "")
+    .arg("app.conf.path", "app.conf")
+    .arg("output", output)
+    .arg("hbase-table", testTable)
+    .arg("zookeeper-hosts", testHost)
+    .arg("crossref-input", input)
+    .arg("debug", "true")
+    .source[Tuple](HBaseCrossrefScore.getHBaseSource(testTable, testHost),
+      grobidSampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*)))
+    .source(TextLine(input), List(
+      0 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", "DOI-0"),
+      1 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2").replace("<<DOI>>", "DOI-0.5"),
+      2 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"),
+      3 -> CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1")))
+    .sink[(Int, String, String, String, String)](TypedTsv[(Int,
+    String, String, String, String)](output)) {
+      // Grobid titles: 
+      //   "Title 1", "Title 2: TNG", "Title 3: The Sequel"
+      // crossref slugs: 
+      //   "Title 1: TNG", "Title 1: TNG 2", "Title 1: TNG 3", "Title 2 Rebooted"
+      // Join should have 3 "Title  1" slugs and 1 "Title 2" slug
+      outputBuffer =>
+      "The pipeline" should "return a 4-element list" in {
+        outputBuffer should have length 4
+      }
+    }
+    .run
+    .finish
+}
+       */
   }
- */
 }
   
-- 
cgit v1.2.3


From 71b8d527da73f99ffb1b09ec1044031e772d1db6 Mon Sep 17 00:00:00 2001
From: Ellen Spertus <ellen.spertus@gmail.com>
Date: Tue, 7 Aug 2018 11:24:06 -0700
Subject: Added punctuation removal to slug creation and similarity comparisons

---
 scalding/src/main/scala/sandcrawler/Scorable.scala            |  3 ++-
 scalding/src/main/scala/sandcrawler/StringUtilities.scala     |  8 +++++++-
 scalding/src/test/scala/sandcrawler/ScorableTest.scala        |  7 +++++++
 scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala | 10 ++++++++++
 4 files changed, 26 insertions(+), 2 deletions(-)

(limited to 'scalding/src/test/scala/sandcrawler/ScorableTest.scala')

diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala
index 77bb7ae..736c175 100644
--- a/scalding/src/main/scala/sandcrawler/Scorable.scala
+++ b/scalding/src/main/scala/sandcrawler/Scorable.scala
@@ -45,7 +45,8 @@ object Scorable {
   }
 
   def titleToSlug(title : String) : String = {
-    val slug = StringUtilities.removeAccents(title).split(":")(0).toLowerCase()
+    val slug = StringUtilities.removePunctuation(
+      StringUtilities.removeAccents(title).split(":")(0).toLowerCase())
     if (slug.isEmpty) {
       NoSlug
     } else {
diff --git a/scalding/src/main/scala/sandcrawler/StringUtilities.scala b/scalding/src/main/scala/sandcrawler/StringUtilities.scala
index 1ae6db3..3058f15 100644
--- a/scalding/src/main/scala/sandcrawler/StringUtilities.scala
+++ b/scalding/src/main/scala/sandcrawler/StringUtilities.scala
@@ -25,9 +25,15 @@ object StringUtilities {
     pattern.matcher(sb).replaceAll("")
   }
 
+  // Source: https://stackoverflow.com/a/30076541/631051
+  def removePunctuation(s: String) : String = {
+    s.replaceAll("""[\p{Punct}&&[^.]]""", "")
+  }
+
   // Adapted from: https://stackoverflow.com/a/16018452/631051
   def similarity(s1a : String, s2a : String) : Double = {
-    val (s1, s2) = (removeAccents(s1a), removeAccents(s2a))
+    val (s1, s2) = (removeAccents(removePunctuation(s1a)), 
+      removeAccents(removePunctuation(s2a)))
     val longer : String = if (s1.length > s2.length) s1 else s2
     val shorter : String = if (s1.length > s2.length) s2 else s1
     if (longer.length == 0) {
diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
index 8445073..713a7e5 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
@@ -71,6 +71,13 @@ class ScorableTest extends FlatSpec with Matchers {
       Scorable.titleToSlug("") shouldBe Scorable.NoSlug
     }
 
+    "titleToSlug()" should "strip punctuation" in {
+      Scorable.titleToSlug("HELLO!:the:re") shouldBe "hello"
+      Scorable.titleToSlug("a:b:c") shouldBe "a"
+      Scorable.titleToSlug(
+        "If you're happy and you know it, clap your hands!") shouldBe "if youre happy and you know it clap your hands"
+    }
+
     "jsonToMap()" should "return a map, given a legal JSON string" in {
       Scorable.jsonToMap(JsonString) should not be (None)
     }
diff --git a/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala b/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala
index 2df5a22..410819b 100644
--- a/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala
+++ b/scalding/src/test/scala/sandcrawler/StringUtilitiesTest.scala
@@ -26,6 +26,16 @@ class StringUtilitiesTest extends FlatSpec with Matchers {
     StringUtilities.removeAccents("SØREN") shouldBe "SOREN"
   }
 
+  "removePunctuation" should "work on the empty string" in {
+    StringUtilities.removePunctuation("") shouldBe ""
+  }
+
+  it should "work on non-empty text strings" in {
+    StringUtilities.removePunctuation("Hello, world!") shouldBe "Hello world"
+    StringUtilities.removePunctuation(":-)") shouldBe ""
+    StringUtilities.removePunctuation("<<---a---b--->") shouldBe "ab"
+  }
+
   // Tests adapted from https://oldfashionedsoftware.com/2009/11/19/string-distance-and-refactoring-in-scala/
   "stringDistance" should "work on empty strings" in {
     StringUtilities.stringDistance("", "") shouldBe 0
-- 
cgit v1.2.3


From 6d64c5d4e1527c7277527132efa858def2589486 Mon Sep 17 00:00:00 2001
From: Ellen Spertus <ellen.spertus@gmail.com>
Date: Thu, 9 Aug 2018 11:30:44 -0700
Subject: Added test for null argument to titleToSlug()

---
 scalding/src/main/scala/sandcrawler/Scorable.scala     | 13 +++++++++----
 scalding/src/test/scala/sandcrawler/ScorableTest.scala |  4 ++++
 2 files changed, 13 insertions(+), 4 deletions(-)

(limited to 'scalding/src/test/scala/sandcrawler/ScorableTest.scala')

diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala
index 736c175..ce4fdca 100644
--- a/scalding/src/main/scala/sandcrawler/Scorable.scala
+++ b/scalding/src/main/scala/sandcrawler/Scorable.scala
@@ -45,12 +45,17 @@ object Scorable {
   }
 
   def titleToSlug(title : String) : String = {
-    val slug = StringUtilities.removePunctuation(
-      StringUtilities.removeAccents(title).split(":")(0).toLowerCase())
-    if (slug.isEmpty) {
+    if (title == null || title.isEmpty) {
       NoSlug
     } else {
-      slug
+      val unaccented = StringUtilities.removeAccents(title)
+      // Remove punctuation after splitting on colon.
+      val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase()))
+      if (slug.isEmpty || slug == null) {
+        NoSlug
+      } else {
+        slug
+      }
     }
   }
 
diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
index 713a7e5..40801a0 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
@@ -71,6 +71,10 @@ class ScorableTest extends FlatSpec with Matchers {
       Scorable.titleToSlug("") shouldBe Scorable.NoSlug
     }
 
+    it should "return Scorable.NoSlug if given null" in {
+      Scorable.titleToSlug(null) shouldBe Scorable.NoSlug
+    }
+
     "titleToSlug()" should "strip punctuation" in {
       Scorable.titleToSlug("HELLO!:the:re") shouldBe "hello"
       Scorable.titleToSlug("a:b:c") shouldBe "a"
-- 
cgit v1.2.3


From 9d7adc94ad63e85ffb2b459d4a8c2ed0ed46d8c8 Mon Sep 17 00:00:00 2001
From: Ellen Spertus <ellen.spertus@gmail.com>
Date: Thu, 9 Aug 2018 19:03:01 -0700
Subject: WIP

---
 .../main/scala/sandcrawler/CrossrefScorable.scala  |   1 +
 .../main/scala/sandcrawler/GrobidScorable.scala    |  15 +-
 scalding/src/main/scala/sandcrawler/Scorable.scala |   2 +-
 scalding/src/main/scala/sandcrawler/ScoreJob.scala |  46 ++++--
 .../src/test/scala/sandcrawler/ScorableTest.scala  | 112 ++++---------
 .../src/test/scala/sandcrawler/ScoreJobTest.scala  | 177 +++++++++++++++++++++
 6 files changed, 251 insertions(+), 102 deletions(-)
 create mode 100644 scalding/src/test/scala/sandcrawler/ScoreJobTest.scala

(limited to 'scalding/src/test/scala/sandcrawler/ScorableTest.scala')

diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
index ee4cc54..d5da845 100644
--- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
@@ -11,6 +11,7 @@ import parallelai.spyglass.hbase.HBaseSource
 
 class CrossrefScorable extends Scorable {
   def getFeaturesPipe(args : Args)(implicit flowDef : FlowDef, mode : Mode) : TypedPipe[MapFeatures] = {
+    // TODO: Generalize args so there can be multiple Grobid pipes in one job.
     TextLine(args("crossref-input"))
       .read
       .toTypedPipe[String](new Fields("line"))
diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
index 95d6dae..4c67074 100644
--- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
@@ -11,14 +11,9 @@ import parallelai.spyglass.hbase.HBaseSource
 
 class GrobidScorable extends Scorable with HBasePipeConversions {
   def getFeaturesPipe(args : Args)(implicit flowDef : FlowDef, mode : Mode) : TypedPipe[MapFeatures] = {
-    // TODO: Clean up code after debugging.
-    val grobidSource = HBaseBuilder.build(
-      args("hbase-table"),
-      args("zookeeper-hosts"),
-      List("grobid0:tei_json"),
-      SourceMode.SCAN_ALL)
-
-    grobidSource.read
+    // TODO: Generalize args so there can be multiple grobid pipes in one job.
+    GrobidScorable.getHBaseSource(args("hbase-table"), args("zookeeper-hosts"))
+      .read
       .fromBytesWritable(new Fields("key", "tei_json"))
     // TODO: Figure out why this line (used in HBaseCrossrefScoreJob.scala)
     // didn't work here: .toTypedPipe[(String, String)]('key, 'tei_json)
@@ -34,6 +29,10 @@ class GrobidScorable extends Scorable with HBasePipeConversions {
 }
 
 object GrobidScorable {
+  def getHBaseSource(table : String, host : String) : HBaseSource = {
+    HBaseBuilder.build(table, host, List("grobid0:tei_json"), SourceMode.SCAN_ALL)
+  }
+
   def grobidToSlug(json : String) : Option[String] = {
     Scorable.jsonToMap(json) match {
       case None => None
diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala
index 86336cb..cfdc192 100644
--- a/scalding/src/main/scala/sandcrawler/Scorable.scala
+++ b/scalding/src/main/scala/sandcrawler/Scorable.scala
@@ -9,7 +9,7 @@ import com.twitter.scalding.typed.TDsl._
 
 case class MapFeatures(slug : String, json : String)
 case class ReduceFeatures(json : String)
-case class ReduceOutput(val score : Int, json1 : String, json2 : String)
+case class ReduceOutput(slug : String, score : Int, json1 : String, json2 : String)
 
 abstract class Scorable {
   def getInputPipe(args : Args, flowDef : FlowDef, mode : Mode) : TypedPipe[(String, ReduceFeatures)] =
diff --git a/scalding/src/main/scala/sandcrawler/ScoreJob.scala b/scalding/src/main/scala/sandcrawler/ScoreJob.scala
index e6a5dc1..aa20d0f 100644
--- a/scalding/src/main/scala/sandcrawler/ScoreJob.scala
+++ b/scalding/src/main/scala/sandcrawler/ScoreJob.scala
@@ -1,25 +1,53 @@
 package sandcrawler
 
-import java.text.Normalizer
-
-import scala.math
-import scala.util.parsing.json.JSON
-
 import cascading.flow.FlowDef
 import com.twitter.scalding._
 import com.twitter.scalding.typed.TDsl._
 import parallelai.spyglass.base.JobBase
 import parallelai.spyglass.hbase.HBasePipeConversions
 
-class ScoreJob(args: Args, sc1 : Scorable, sc2 : Scorable)(implicit flowDef : FlowDef, mode: Mode) extends JobBase(args) with HBasePipeConversions {
-  val pipe1 : TypedPipe[(String, ReduceFeatures)] = sc1.getInputPipe(args, flowDef, mode)
-  val pipe2 : TypedPipe[(String, ReduceFeatures)] = sc2.getInputPipe(args, flowDef, mode)
+class ScoreJob(args: Args)(implicit flowDef : FlowDef, mode: Mode) extends JobBase(args) with
+    HBasePipeConversions {
+  /*
+  val pipe1 : TypedPipe[(String, ReduceFeatures)] = ScoreJob.getScorable1().getInputPipe(args, flowDef, mode)
+  val pipe2 : TypedPipe[(String, ReduceFeatures)] = ScoreJob.getScorable2().getInputPipe(args, flowDef, mode)
 
   pipe1.join(pipe2).map { entry =>
     val (slug : String, (features1 : ReduceFeatures, features2 : ReduceFeatures)) = entry
-    new ReduceOutput(Scorable.computeSimilarity(features1, features2),
+    new ReduceOutput(
+      slug,
+      Scorable.computeSimilarity(features1, features2),
       features1.json,
       features2.json)
   }
     .write(TypedTsv[ReduceOutput](args("output")))
+   */
+}
+
+// Ugly hack to get non-String information into ScoreJob above.
+// The two Scorables live in mutable global state; callers (see ScoreJobTest)
+// set them before running the job, and the getters return null when unset.
+object ScoreJob {
+  var scorable1 : Option[Scorable] = None
+  var scorable2 : Option[Scorable] = None
+
+  // Option(s) stores None rather than Some(null) for a null argument.
+  def setScorable1(s : Scorable) : Unit = {
+    scorable1 = Option(s)
+  }
+
+  // Null (not an exception) when setScorable1 was never called.
+  def getScorable1() : Scorable = {
+    scorable1.orNull
+  }
+
+  // Option(s) stores None rather than Some(null) for a null argument.
+  def setScorable2(s : Scorable) : Unit = {
+    scorable2 = Option(s)
+  }
+
+  // Null (not an exception) when setScorable2 was never called.
+  def getScorable2() : Scorable = {
+    scorable2.orNull
+  }
 }
diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
index 40801a0..2f80492 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
@@ -9,7 +9,7 @@ import org.scalatest._
 import parallelai.spyglass.hbase.HBaseConstants.SourceMode
 
 class ScorableTest extends FlatSpec with Matchers {
-      val JsonString = """
+  val JsonString = """
 {
   "title": "<<TITLE>>",
   "authors": [
@@ -55,96 +55,40 @@ class ScorableTest extends FlatSpec with Matchers {
 }
 """
 
-  performUnitTests()
-  performPipelineTests()
-
-  def performUnitTests() {
-    "titleToSlug()" should "extract the parts of titles before a colon" in {
-      Scorable.titleToSlug("HELLO:there") shouldBe "hello"
-    }
-
-    it should "extract an entire colon-less string" in {
-      Scorable.titleToSlug("hello THERE") shouldBe "hello there"
-    }
-
-    it should "return Scorable.NoSlug if given empty string" in {
-      Scorable.titleToSlug("") shouldBe Scorable.NoSlug
-    }
-
-    it should "return Scorable.NoSlug if given null" in {
-      Scorable.titleToSlug(null) shouldBe Scorable.NoSlug
-    }
-
-    "titleToSlug()" should "strip punctuation" in {
-      Scorable.titleToSlug("HELLO!:the:re") shouldBe "hello"
-      Scorable.titleToSlug("a:b:c") shouldBe "a"
-      Scorable.titleToSlug(
-        "If you're happy and you know it, clap your hands!") shouldBe "if youre happy and you know it clap your hands"
-    }
+  "titleToSlug()" should "extract the parts of titles before a colon" in {
+    Scorable.titleToSlug("HELLO:there") shouldBe "hello"
+  }
 
-    "jsonToMap()" should "return a map, given a legal JSON string" in {
-      Scorable.jsonToMap(JsonString) should not be (None)
-    }
+  it should "extract an entire colon-less string" in {
+    Scorable.titleToSlug("hello THERE") shouldBe "hello there"
+  }
 
-    it should "return None, given illegal JSON" in {
-      Scorable.jsonToMap("illegal{,json{{") should be (None)
-    }
+  it should "return Scorable.NoSlug if given empty string" in {
+    Scorable.titleToSlug("") shouldBe Scorable.NoSlug
+  }
 
-    "computeOutput()" should "return Scorable.MaxScore if given identical ReduceFeatures" in {
-      val score = Scorable.computeSimilarity(
-        new ReduceFeatures(JsonString), new ReduceFeatures(JsonString))
-      score shouldBe Scorable.MaxScore
-    }
+  it should "return Scorable.NoSlug if given null" in {
+    Scorable.titleToSlug(null) shouldBe Scorable.NoSlug
   }
 
-  def performPipelineTests() {
-      /*
+  "titleToSlug()" should "strip punctuation" in {
+    Scorable.titleToSlug("HELLO!:the:re") shouldBe "hello"
+    Scorable.titleToSlug("a:b:c") shouldBe "a"
+    Scorable.titleToSlug(
+      "If you're happy and you know it, clap your hands!") shouldBe "if youre happy and you know it clap your hands"
+  }
 
-    val output = "/tmp/testOutput"
-    val input = "/tmp/testInput"
-    val (testTable, testHost) = ("test-table", "dummy-host:2181")
+  "jsonToMap()" should "return a map, given a legal JSON string" in {
+    Scorable.jsonToMap(JsonString) should not be (None)
+  }
 
-  val grobidSampleData = List(
-    List(Bytes.toBytes("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q"),
-      Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 1"))),
-    List(Bytes.toBytes("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"),
-      Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 2: TNG"))),
-    List(Bytes.toBytes("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT"),
-      Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 3: The Sequel"))),
-    List(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56"), 
-      Bytes.toBytes(MalformedGrobidString)))
+  it should "return None, given illegal JSON" in {
+    Scorable.jsonToMap("illegal{,json{{") should be (None)
+  }
 
-  JobTest("sandcrawler.HBaseCrossrefScoreJob")
-    .arg("test", "")
-    .arg("app.conf.path", "app.conf")
-    .arg("output", output)
-    .arg("hbase-table", testTable)
-    .arg("zookeeper-hosts", testHost)
-    .arg("crossref-input", input)
-    .arg("debug", "true")
-    .source[Tuple](HBaseCrossrefScore.getHBaseSource(testTable, testHost),
-      grobidSampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*)))
-    .source(TextLine(input), List(
-      0 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", "DOI-0"),
-      1 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2").replace("<<DOI>>", "DOI-0.5"),
-      2 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"),
-      3 -> CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1")))
-    .sink[(Int, String, String, String, String)](TypedTsv[(Int,
-    String, String, String, String)](output)) {
-      // Grobid titles: 
-      //   "Title 1", "Title 2: TNG", "Title 3: The Sequel"
-      // crossref slugs: 
-      //   "Title 1: TNG", "Title 1: TNG 2", "Title 1: TNG 3", "Title 2 Rebooted"
-      // Join should have 3 "Title  1" slugs and 1 "Title 2" slug
-      outputBuffer =>
-      "The pipeline" should "return a 4-element list" in {
-        outputBuffer should have length 4
-      }
-    }
-    .run
-    .finish
-}
-       */
+  "computeOutput()" should "return Scorable.MaxScore if given identical ReduceFeatures" in {
+    val score = Scorable.computeSimilarity(
+      new ReduceFeatures(JsonString), new ReduceFeatures(JsonString))
+    score shouldBe Scorable.MaxScore
   }
 }
-  
diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
new file mode 100644
index 0000000..22cbdb8
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
@@ -0,0 +1,177 @@
+package sandcrawler
+
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions}
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import org.scalatest._
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+
+class ScoreJobTest extends FlatSpec with Matchers {
+  val GrobidString = """
+{
+  "title": "<<TITLE>>",
+  "authors": [
+    {"name": "Brewster Kahle"},
+    {"name": "J Doe"}
+  ],
+  "journal": {
+    "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
+    "eissn": null,
+    "issn": null,
+    "issue": null,
+    "publisher": null,
+    "volume": null
+  },
+  "date": "2000",
+  "doi": null,
+  "citations": [
+    { "authors": [{"name": "A Seaperson"}],
+      "date": "2001",
+      "id": "b0",
+      "index": 0,
+      "issue": null,
+      "journal": "Letters in the Alphabet",
+      "publisher": null,
+      "title": "Everything is Wonderful",
+      "url": null,
+      "volume": "20"},
+    { "authors": [],
+      "date": "2011-03-28",
+      "id": "b1",
+      "index": 1,
+      "issue": null,
+      "journal": "The Dictionary",
+      "publisher": null,
+      "title": "All about Facts",
+      "url": null,
+      "volume": "14"}
+  ],
+  "abstract": "Everything you ever wanted to know about nothing",
+  "body": "Introduction \nEverything starts somewhere, as somebody [1]  once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.",
+  "acknowledgement": null,
+  "annex": null
+}
+"""
+  val GrobidStringWithTitle = GrobidString.replace("<<TITLE>>", "Dummy Example File")
+  val GrobidStringWithoutTitle = GrobidString.replace("title", "nottitle")
+  val MalformedGrobidString = GrobidString.replace("}", "")
+
+  val CrossrefString =
+"""
+{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" }, 
+  "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ], 
+    "date-time" : "2017-10-23T17:19:16Z", 
+    "timestamp" : { "$numberLong" : "1508779156477" } }, 
+  "reference-count" : 0, 
+  "publisher" : "Elsevier BV", 
+  "issue" : "3", 
+  "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/", 
+                    "start" : { "date-parts" : [ [ 1996, 1, 1 ] ], 
+                                "date-time" : "1996-01-01T00:00:00Z", 
+                                "timestamp" : { "$numberLong" : "820454400000" } }, 
+                                "delay-in-days" : 0, "content-version" : "tdm" }],
+  "content-domain" : { "domain" : [], "crossmark-restriction" : false }, 
+  "published-print" : { "date-parts" : [ [ 1996 ] ] }, 
+  "DOI" : "<<DOI>>",
+  "type" : "journal-article", 
+  "created" : { "date-parts" : [ [ 2002, 7, 25 ] ], 
+    "date-time" : "2002-07-25T15:09:41Z", 
+    "timestamp" : { "$numberLong" : "1027609781000" } }, 
+  "page" : "186-187", 
+  "source" : "Crossref", 
+  "is-referenced-by-count" : 0, 
+  "title" : [ "<<TITLE>>" ],
+  "prefix" : "10.1016", 
+  "volume" : "9", 
+  "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ], 
+  "member" : "78", 
+  "container-title" : [ "Journal de PÃ©diatrie et de PuÃ©riculture" ], 
+  "link" : [ { "URL" :  "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/xml",
+               "content-type" : "text/xml", 
+                 "content-version" : "vor",
+                 "intended-application" : "text-mining" }, 
+               { "URL" :
+  "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/plain",
+                 "content-type" : "text/plain", 
+                 "content-version" : "vor",
+                 "intended-application" : "text-mining" } ], 
+  "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ], 
+                  "date-time" : "2015-09-03T10:03:43Z", 
+                  "timestamp" : { "$numberLong" : "1441274623000" } }, 
+  "score" : 1, 
+  "issued" : { "date-parts" : [ [ 1996 ] ] }, 
+  "references-count" : 0, 
+  "alternative-id" : [ "0987-7983(96)87729-2" ], 
+  "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2", 
+  "ISSN" : [ "0987-7983" ], 
+  "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ], 
+  "subject" : [ "Pediatrics, Perinatology, and Child Health" ]
+}
+"""
+  val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "SomeTitle")
+  val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle")
+  val MalformedCrossrefString = CrossrefString.replace("}", "")
+
+  //  Pipeline tests
+  val output = "/tmp/testOutput"
+  val input = "/tmp/testInput"
+  val (testTable, testHost) = ("test-table", "dummy-host:2181")
+
+  val grobidSampleData = List(
+    List(Bytes.toBytes("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q"),
+      Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 1"))),
+    List(Bytes.toBytes("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"),
+      Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 2: TNG"))),
+    List(Bytes.toBytes("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT"),
+      Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 3: The Sequel"))),
+    List(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56"), 
+      Bytes.toBytes(MalformedGrobidString)))
+
+  // TODO: Make less yucky.
+  ScoreJob.setScorable1(new CrossrefScorable())
+  ScoreJob.setScorable2(new GrobidScorable())
+
+  JobTest("sandcrawler.ScoreJob")
+    .arg("test", "")
+    .arg("app.conf.path", "app.conf")
+    .arg("output", output)
+    .arg("hbase-table", testTable)
+    .arg("zookeeper-hosts", testHost)
+    .arg("crossref-input", input)
+    .arg("debug", "true")
+    .source[Tuple](GrobidScorable.getHBaseSource(testTable, testHost),
+      grobidSampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*)))
+    .source(TextLine(input), List(
+      0 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", "DOI-0"),
+      1 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2").replace("<<DOI>>", "DOI-0.5"),
+      2 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"),
+      3 -> CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1")))
+    .sink[ReduceOutput](TypedTsv[ReduceOutput](output)) {
+      // Grobid titles:
+      //   "Title 1", "Title 2: TNG", "Title 3: The Sequel"
+      // Crossref titles:
+      //   "Title 1: TNG", "Title 1: TNG 2", "Title 1: TNG 3", "Title 2: Rebooted"
+      // Join should have 3 "Title 1" slugs and 1 "Title 2" slug
+      outputBuffer => 
+      "The pipeline" should "return a 4-element list" in {
+        outputBuffer should have length 4
+      }
+
+              /*
+      it should "return the right first entry" in {
+        outputBuffer(0) shouldBe ReduceOutput("slug", 50, "",
+          "")
+        val (slug, slug0, slug1, sha1, grobidJson, crossrefJson) = outputBuffer(0)
+        slug shouldBe "title 1"
+        slug shouldBe slug0
+        slug shouldBe slug1
+        sha1 shouldBe new String(grobidSampleData(0)(0), "UTF-8")
+        grobidJson shouldBe new String(grobidSampleData(0)(1), "UTF-8")
+      }
+        */
+    }
+    .run
+    .finish
+}
-- 
cgit v1.2.3


From 31354b1a6062c5c56a30610f68fa48c82a7e83f0 Mon Sep 17 00:00:00 2001
From: Ellen Spertus <ellen.spertus@gmail.com>
Date: Sun, 12 Aug 2018 18:08:51 -0700
Subject: Tests pass.

---
 scalding/src/main/scala/sandcrawler/Scorable.scala | 11 +--
 .../scala/sandcrawler/CrossrefScorableTest.scala   | 89 ----------------------
 .../scala/sandcrawler/GrobidScorableTest.scala     | 20 +++--
 .../src/test/scala/sandcrawler/ScorableTest.scala  | 28 ++++---
 4 files changed, 39 insertions(+), 109 deletions(-)
 delete mode 100644 scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala

(limited to 'scalding/src/test/scala/sandcrawler/ScorableTest.scala')

diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala
index 9c8da69..929461b 100644
--- a/scalding/src/main/scala/sandcrawler/Scorable.scala
+++ b/scalding/src/main/scala/sandcrawler/Scorable.scala
@@ -66,13 +66,14 @@ object Scorable {
   // This guarantees it will have all of the fields needed to compute
   // the ultimate score, which are a superset of those needed for a slug.
   def mapToSlug(map : Map[String, Any]) : String = {
-    val unaccented = StringUtilities.removeAccents(getString(map, "title"))
-    // Remove punctuation after splitting on colon.
-    val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase()))
-    if (slug.isEmpty || slug == null) {
+    val title = getString(map, "title")
+    if (title == null) {
       NoSlug
     } else {
-      slug
+      val unaccented = StringUtilities.removeAccents(title)
+      // Remove punctuation after splitting on colon.
+      val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase())).replaceAll("\\s", "")
+      if (slug.isEmpty || slug == null) NoSlug else slug
     }
   }
 
diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
deleted file mode 100644
index 1c35d66..0000000
--- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
+++ /dev/null
@@ -1,89 +0,0 @@
-package sandcrawler
-
-import cascading.tuple.Fields
-import cascading.tuple.Tuple
-import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions}
-import org.apache.hadoop.hbase.io.ImmutableBytesWritable
-import org.apache.hadoop.hbase.util.Bytes
-import org.scalatest._
-import parallelai.spyglass.hbase.HBaseConstants.SourceMode
-
-class CrossrefScorableTest extends FlatSpec with Matchers {
-  val CrossrefString =
-"""
-{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" }, 
-  "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ], 
-    "date-time" : "2017-10-23T17:19:16Z", 
-    "timestamp" : { "$numberLong" : "1508779156477" } }, 
-  "reference-count" : 0, 
-  "publisher" : "Elsevier BV", 
-  "issue" : "3", 
-  "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/", 
-                    "start" : { "date-parts" : [ [ 1996, 1, 1 ] ], 
-                                "date-time" : "1996-01-01T00:00:00Z", 
-                                "timestamp" : { "$numberLong" : "820454400000" } }, 
-                                "delay-in-days" : 0, "content-version" : "tdm" }],
-  "content-domain" : { "domain" : [], "crossmark-restriction" : false }, 
-  "published-print" : { "date-parts" : [ [ 1996 ] ] }, 
-  "DOI" : "<<DOI>>",
-  "type" : "journal-article", 
-  "created" : { "date-parts" : [ [ 2002, 7, 25 ] ], 
-    "date-time" : "2002-07-25T15:09:41Z", 
-    "timestamp" : { "$numberLong" : "1027609781000" } }, 
-  "page" : "186-187", 
-  "source" : "Crossref", 
-  "is-referenced-by-count" : 0, 
-  "title" : [ "<<TITLE>>" ],
-  "prefix" : "10.1016", 
-  "volume" : "9", 
-  "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ], 
-  "member" : "78", 
-  "container-title" : [ "Journal de PÃ©diatrie et de PuÃ©riculture" ], 
-  "link" : [ { "URL" :  "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/xml",
-               "content-type" : "text/xml", 
-                 "content-version" : "vor",
-                 "intended-application" : "text-mining" }, 
-               { "URL" :
-  "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/plain",
-                 "content-type" : "text/plain", 
-                 "content-version" : "vor",
-                 "intended-application" : "text-mining" } ], 
-  "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ], 
-                  "date-time" : "2015-09-03T10:03:43Z", 
-                  "timestamp" : { "$numberLong" : "1441274623000" } }, 
-  "score" : 1, 
-  "issued" : { "date-parts" : [ [ 1996 ] ] }, 
-  "references-count" : 0, 
-  "alternative-id" : [ "0987-7983(96)87729-2" ], 
-  "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2", 
-  "ISSN" : [ "0987-7983" ], 
-  "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ], 
-  "subject" : [ "Pediatrics, Perinatology, and Child Health" ]
-}
-"""
-  val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "SomeTitle")
-  val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle")
-  val MalformedCrossrefString = CrossrefString.replace("}", "")
-
-  // Unit tests
-  "simplifyJson()" should "return None for bad JSON" in {
-    CrossrefScorable.simplifyJson("") shouldBe None
-    CrossrefScorable.simplifyJson(MalformedCrossrefString) shouldBe None
-  }
-
-  it should "return None for JSON lacking title" in {
-    CrossrefScorable.simplifyJson(CrossrefStringWithoutTitle) shouldBe None
-  }
-
-  it should "return appropriate result for valid JSON" in {
-    CrossrefScorable.simplifyJson(CrossrefStringWithTitle) match {
-      case None => fail("None unexpectedly returned by simplifyJson")
-      case Some(map) => {
-        Scorable.isScorableMap(map) shouldBe true
-        map.size shouldBe 1
-        map.keys should contain ("title")
-        map("title") shouldBe "SomeTitle"
-      }
-    }
-  }
-}
diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
index 5bb955a..3fcd856 100644
--- a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
@@ -57,18 +57,28 @@ class GrobidScorableTest extends FlatSpec with Matchers {
   val GrobidStringWithTitle = GrobidString.replace("<<TITLE>>", "Dummy Example File")
   val GrobidStringWithoutTitle = GrobidString.replace("title", "nottitle")
   val MalformedGrobidString = GrobidString.replace("}", "")
+  val Key = "Dummy Key"
 
   // Unit tests
 
   "GrobidScorable.jsonToMapFeatures()" should "handle invalid JSON" in {
-    val result = GrobidScorable.jsonToMapFeatures(MalformedGrobidString) shouldBe None
+    val result = GrobidScorable.jsonToMapFeatures(Key, MalformedGrobidString) 
     result.slug shouldBe Scorable.NoSlug
-    result.json shouldBe MalformedGrobidString
   }
 
-  "GrobidScorable.jsonToMapFeatures()" should "handle missing title" in {
-    val result = GrobidScorable.jsonToMapFeatures(GrobidStringWithoutTitle) shouldBe None
+  it should "handle missing title" in {
+    val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithoutTitle)
     result.slug shouldBe Scorable.NoSlug
-    result.json shouldBe GrobidStringWithoutTitle
+  }
+
+  it should "handle valid input" in {
+    val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithTitle)
+    result.slug shouldBe "dummyexamplefile"
+    Scorable.jsonToMap(result.json) match {
+      case None => fail()
+      case Some(map) => {
+        map("title").asInstanceOf[String] shouldBe "Dummy Example File"
+      }
+    }
   }
 }
diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
index 2f80492..95faacc 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
@@ -54,28 +54,36 @@ class ScorableTest extends FlatSpec with Matchers {
   "annex": null
 }
 """
+  private def titleToSlug(s : String) : String = {
+    Scorable.mapToSlug(Scorable.toScorableMap(title = s))
+  }
 
-  "titleToSlug()" should "extract the parts of titles before a colon" in {
-    Scorable.titleToSlug("HELLO:there") shouldBe "hello"
+  "mapToSlug()" should "extract the parts of titles before a colon" in {
+    titleToSlug("HELLO:there") shouldBe "hello"
   }
 
   it should "extract an entire colon-less string" in {
-    Scorable.titleToSlug("hello THERE") shouldBe "hello there"
+    titleToSlug("hello THERE") shouldBe "hellothere"
   }
 
   it should "return Scorable.NoSlug if given empty string" in {
-    Scorable.titleToSlug("") shouldBe Scorable.NoSlug
+    titleToSlug("") shouldBe Scorable.NoSlug
   }
 
   it should "return Scorable.NoSlug if given null" in {
-    Scorable.titleToSlug(null) shouldBe Scorable.NoSlug
+    titleToSlug(null) shouldBe Scorable.NoSlug
+  }
+
+  it should "strip punctuation" in {
+    titleToSlug("HELLO!:the:re") shouldBe "hello"
+    titleToSlug("a:b:c") shouldBe "a"
+    titleToSlug(
+      "If you're happy and you know it, clap your hands!") shouldBe "ifyourehappyandyouknowitclapyourhands"
   }
 
-  "titleToSlug()" should "strip punctuation" in {
-    Scorable.titleToSlug("HELLO!:the:re") shouldBe "hello"
-    Scorable.titleToSlug("a:b:c") shouldBe "a"
-    Scorable.titleToSlug(
-      "If you're happy and you know it, clap your hands!") shouldBe "if youre happy and you know it clap your hands"
+  it should "remove whitespace" in {
+    titleToSlug("foo bar : baz ::") shouldBe "foobar"
+    titleToSlug("\na\t:b:c") shouldBe "a"
   }
 
   "jsonToMap()" should "return a map, given a legal JSON string" in {
-- 
cgit v1.2.3


From b4f1acce5eccbb56291f82906d9c01534c7f1506 Mon Sep 17 00:00:00 2001
From: Ellen Spertus <ellen.spertus@gmail.com>
Date: Mon, 13 Aug 2018 10:27:48 -0700
Subject: Factored out ScorableFeatures.

---
 .../main/scala/sandcrawler/CrossrefScorable.scala  |  7 ++--
 .../main/scala/sandcrawler/GrobidScorable.scala    |  6 +---
 scalding/src/main/scala/sandcrawler/Scorable.scala | 30 ------------------
 .../main/scala/sandcrawler/ScorableFeatures.scala  | 30 ++++++++++++++++++
 .../scala/sandcrawler/ScorableFeaturesTest.scala   | 37 ++++++++++++++++++++++
 .../src/test/scala/sandcrawler/ScorableTest.scala  | 32 -------------------
 6 files changed, 70 insertions(+), 72 deletions(-)
 create mode 100644 scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
 create mode 100644 scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala

(limited to 'scalding/src/test/scala/sandcrawler/ScorableTest.scala')

diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
index 4558ee6..4897b1c 100644
--- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
@@ -34,11 +34,8 @@ object CrossrefScorable {
           if (titles.isEmpty || titles == null || doi.isEmpty || doi == null) {
             new MapFeatures(Scorable.NoSlug, json)
           } else {
-            val title = titles(0)
-            val map2 = Scorable.toScorableMap(title=title, doi=doi)
-            new MapFeatures(
-              Scorable.mapToSlug(map2),
-              JSONObject(map2).toString)
+            val sf : ScorableFeatures = new ScorableFeatures(title=titles(0), doi=doi)
+            new MapFeatures(sf.toSlug, sf.toString)
           }
         } else {
           new MapFeatures(Scorable.NoSlug, json)
diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
index 94b3494..5ba7d58 100644
--- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
@@ -35,11 +35,7 @@ object GrobidScorable {
       case None => MapFeatures(Scorable.NoSlug, json)
       case Some(map) => {
         if (map contains "title") {
-          val map2 = Scorable.toScorableMap(Scorable.getString(map, "title"),
-            sha1=key)
-          new MapFeatures(
-            Scorable.mapToSlug(map2),
-            JSONObject(map2).toString)
+          new ScorableFeatures(Scorable.getString(map, "title"), sha1=key).toMapFeatures
         } else {
           MapFeatures(Scorable.NoSlug, json)
         }
diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala
index 717b2d5..9b9c633 100644
--- a/scalding/src/main/scala/sandcrawler/Scorable.scala
+++ b/scalding/src/main/scala/sandcrawler/Scorable.scala
@@ -36,21 +36,6 @@ object Scorable {
     slug != NoSlug
   }
 
-  // NOTE: I could go all out and make ScorableMap a type.
-  // TODO: Require year. Other features will get added here.
-  def toScorableMap(title : String, year : Int = 0, doi : String = "", sha1 : String = "") : Map[String, Any] = {
-    Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1)
-  }
-
-  def toScorableJson(title : String, year : Int, doi : String = "", sha1 : String = "") : String = {
-    JSONObject(toScorableMap(title=title, year=year, doi=doi, sha1=sha1)).toString
-  }
-
-  // TODO: Score on more fields than "title".
-  def isScorableMap(map : Map[String, Any]) : Boolean = {
-    map.contains("title")
-  }
-
   def jsonToMap(json : String) : Option[Map[String, Any]] = {
     // https://stackoverflow.com/a/32717262/631051
     val jsonObject = JSON.parseFull(json)
@@ -61,21 +46,6 @@ object Scorable {
     }
   }
 
-  // Map should have been produced by toScorableMap.
-  // This guarantees it will have all of the fields needed to compute
-  // the ultimate score, which are a superset of those needed for a slug.
-  def mapToSlug(map : Map[String, Any]) : String = {
-    val title = getString(map, "title")
-    if (title == null) {
-      NoSlug
-    } else {
-      val unaccented = StringUtilities.removeAccents(title)
-      // Remove punctuation after splitting on colon.
-      val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase())).replaceAll("\\s", "")
-      if (slug.isEmpty || slug == null) NoSlug else slug
-    }
-  }
-
   def getStringOption(optionalMap : Option[Map[String, Any]], key : String) : Option[String] = {
     optionalMap match {
       case None => None
diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
new file mode 100644
index 0000000..5d6dea0
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
@@ -0,0 +1,30 @@
+package sandcrawler
+
+import scala.util.parsing.json.JSONObject
+
+// Contains features needed to make slug and to score (in combination
+// with a second ScorableFeatures).
+class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: String = "") {
+  def toMap() : Map[String, Any] =
+    Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1)
+
+  override def toString() : String = JSONObject(toMap()).toString
+
+  // Slug = the title text before the first colon, run through removeAccents,
+  // lowercased, then removePunctuation and whitespace removal; NoSlug if empty.
+  def toSlug() : String = {
+    if (title == null) {
+      Scorable.NoSlug
+    } else {
+      val unaccented = StringUtilities.removeAccents(title)
+      // takeWhile, unlike split(":")(0), cannot throw when the title is
+      // only colons (":".split(":") is an empty array).
+      val head = unaccented.takeWhile(_ != ':').toLowerCase()
+      val slug = StringUtilities.removePunctuation(head).replaceAll("\\s", "")
+      if (slug.isEmpty) Scorable.NoSlug else slug
+    }
+  }
+
+  // Pairs the slug with the JSON form of these features.
+  def toMapFeatures : MapFeatures = MapFeatures(toSlug, toString)
+}
diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
new file mode 100644
index 0000000..7ec0c4d
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
@@ -0,0 +1,40 @@
+package sandcrawler
+
+import org.scalatest._
+
+class ScorableFeaturesTest extends FlatSpec with Matchers {
+  private def titleToSlug(s : String) : String = {
+    new ScorableFeatures(title = s).toSlug
+  }
+
+  // Every case below builds a ScorableFeatures from a title alone (see titleToSlug)
+  // and checks the slug returned by toSlug.
+  "toSlug()" should "extract the parts of titles before a colon" in {
+    titleToSlug("HELLO:there") shouldBe "hello"
+  }
+
+  it should "extract an entire colon-less string" in {
+    titleToSlug("hello THERE") shouldBe "hellothere"
+  }
+
+  it should "return Scorable.NoSlug if given empty string" in {
+    titleToSlug("") shouldBe Scorable.NoSlug
+  }
+
+  it should "return Scorable.NoSlug if given null" in {
+    titleToSlug(null) shouldBe Scorable.NoSlug
+  }
+
+  it should "strip punctuation" in {
+    // Anything after the first colon is dropped, punctuation or not.
+    titleToSlug("HELLO!:the:re") shouldBe "hello"
+    titleToSlug("a:b:c") shouldBe "a"
+    titleToSlug(
+      "If you're happy and you know it, clap your hands!") shouldBe "ifyourehappyandyouknowitclapyourhands"
+  }
+
+  it should "remove whitespace" in {
+    titleToSlug("foo bar : baz ::") shouldBe "foobar"
+    titleToSlug("\na\t:b:c") shouldBe "a"
+  }
+}
diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
index 95faacc..fd44f57 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
@@ -54,38 +54,6 @@ class ScorableTest extends FlatSpec with Matchers {
   "annex": null
 }
 """
-  private def titleToSlug(s : String) : String = {
-    Scorable.mapToSlug(Scorable.toScorableMap(title = s))
-  }
-
-  "mapToSlug()" should "extract the parts of titles before a colon" in {
-    titleToSlug("HELLO:there") shouldBe "hello"
-  }
-
-  it should "extract an entire colon-less string" in {
-    titleToSlug("hello THERE") shouldBe "hellothere"
-  }
-
-  it should "return Scorable.NoSlug if given empty string" in {
-    titleToSlug("") shouldBe Scorable.NoSlug
-  }
-
-  it should "return Scorable.NoSlug if given null" in {
-    titleToSlug(null) shouldBe Scorable.NoSlug
-  }
-
-  it should "strip punctuation" in {
-    titleToSlug("HELLO!:the:re") shouldBe "hello"
-    titleToSlug("a:b:c") shouldBe "a"
-    titleToSlug(
-      "If you're happy and you know it, clap your hands!") shouldBe "ifyourehappyandyouknowitclapyourhands"
-  }
-
-  it should "remove whitespace" in {
-    titleToSlug("foo bar : baz ::") shouldBe "foobar"
-    titleToSlug("\na\t:b:c") shouldBe "a"
-  }
-
   "jsonToMap()" should "return a map, given a legal JSON string" in {
     Scorable.jsonToMap(JsonString) should not be (None)
   }
-- 
cgit v1.2.3


From 3ff30c8f20d36f8e47ec5478c10c3348d2f45fa6 Mon Sep 17 00:00:00 2001
From: Ellen Spertus <ellen.spertus@gmail.com>
Date: Tue, 14 Aug 2018 20:38:29 -0700
Subject: Fixed style problems (or disabled warning when appropriate) for
 tests.

---
 scalding/build.sbt                                 |  7 ++
 .../scala/sandcrawler/CrossrefScorableTest.scala   | 87 ++++++++++---------
 .../scala/sandcrawler/GrobidScorableTest.scala     |  7 +-
 .../test/scala/sandcrawler/HBaseBuilderTest.scala  |  1 +
 .../scala/sandcrawler/HBaseMimeCountTest.scala     |  9 +-
 .../test/scala/sandcrawler/HBaseRowCountTest.scala | 11 +--
 .../scala/sandcrawler/HBaseStatusCountTest.scala   | 10 ++-
 .../scala/sandcrawler/ScorableFeaturesTest.scala   |  1 +
 .../src/test/scala/sandcrawler/ScorableTest.scala  |  5 +-
 .../src/test/scala/sandcrawler/ScoreJobTest.scala  | 97 ++++++++++++----------
 10 files changed, 135 insertions(+), 100 deletions(-)

(limited to 'scalding/src/test/scala/sandcrawler/ScorableTest.scala')

diff --git a/scalding/build.sbt b/scalding/build.sbt
index 2addd60..d477399 100644
--- a/scalding/build.sbt
+++ b/scalding/build.sbt
@@ -20,6 +20,13 @@ lazy val root = (project in file(".")).
       scalaSourceFiles.filterNot(_.getAbsolutePath.contains(dirNameToExclude))
     },
 
+    (scalastyleSources in Test) := {
+      // Style-check every .scala file under the test source root, except "/example/" paths.
+      val excludedDir = "/example/"
+      val testSources = ((scalaSource in Test).value ** "*.scala").get
+      testSources.filterNot(file => file.getAbsolutePath.contains(excludedDir))
+    },
+
     name := "sandcrawler",
 
     resolvers += "conjars.org" at "http://conjars.org/repo",
diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
index 75be03e..e171dba 100644
--- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
@@ -2,72 +2,77 @@ package sandcrawler
 
 import cascading.tuple.Fields
 import cascading.tuple.Tuple
-import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions}
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.TextLine
+import com.twitter.scalding.TupleConversions
+import com.twitter.scalding.TypedTsv
 import org.apache.hadoop.hbase.io.ImmutableBytesWritable
 import org.apache.hadoop.hbase.util.Bytes
 import org.scalatest._
 import parallelai.spyglass.hbase.HBaseConstants.SourceMode
 
 class CrossrefScorableTest extends FlatSpec with Matchers {
+  // scalastyle:off
   val CrossrefString =
 """
-{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" }, 
-  "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ], 
-    "date-time" : "2017-10-23T17:19:16Z", 
-    "timestamp" : { "$numberLong" : "1508779156477" } }, 
-  "reference-count" : 0, 
-  "publisher" : "Elsevier BV", 
-  "issue" : "3", 
-  "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/", 
-                    "start" : { "date-parts" : [ [ 1996, 1, 1 ] ], 
-                                "date-time" : "1996-01-01T00:00:00Z", 
-                                "timestamp" : { "$numberLong" : "820454400000" } }, 
+{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" },
+  "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ],
+    "date-time" : "2017-10-23T17:19:16Z",
+    "timestamp" : { "$numberLong" : "1508779156477" } },
+  "reference-count" : 0,
+  "publisher" : "Elsevier BV",
+  "issue" : "3",
+  "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/",
+                    "start" : { "date-parts" : [ [ 1996, 1, 1 ] ],
+                                "date-time" : "1996-01-01T00:00:00Z",
+                                "timestamp" : { "$numberLong" : "820454400000" } },
                                 "delay-in-days" : 0, "content-version" : "tdm" }],
-  "content-domain" : { "domain" : [], "crossmark-restriction" : false }, 
-  "published-print" : { "date-parts" : [ [ 1996 ] ] }, 
+  "content-domain" : { "domain" : [], "crossmark-restriction" : false },
+  "published-print" : { "date-parts" : [ [ 1996 ] ] },
   "DOI" : "<<DOI>>",
-  "type" : "journal-article", 
-  "created" : { "date-parts" : [ [ 2002, 7, 25 ] ], 
-    "date-time" : "2002-07-25T15:09:41Z", 
-    "timestamp" : { "$numberLong" : "1027609781000" } }, 
-  "page" : "186-187", 
-  "source" : "Crossref", 
-  "is-referenced-by-count" : 0, 
+  "type" : "journal-article",
+  "created" : { "date-parts" : [ [ 2002, 7, 25 ] ],
+    "date-time" : "2002-07-25T15:09:41Z",
+    "timestamp" : { "$numberLong" : "1027609781000" } },
+  "page" : "186-187",
+  "source" : "Crossref",
+  "is-referenced-by-count" : 0,
   "title" : [ "<<TITLE>>" ],
-  "prefix" : "10.1016", 
-  "volume" : "9", 
-  "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ], 
-  "member" : "78", 
-  "container-title" : [ "Journal de PÃ©diatrie et de PuÃ©riculture" ], 
+  "prefix" : "10.1016",
+  "volume" : "9",
+  "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ],
+  "member" : "78",
+  "container-title" : [ "Journal de PÃ©diatrie et de PuÃ©riculture" ],
   "link" : [ { "URL" :  "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/xml",
-               "content-type" : "text/xml", 
+               "content-type" : "text/xml",
                  "content-version" : "vor",
-                 "intended-application" : "text-mining" }, 
+                 "intended-application" : "text-mining" },
                { "URL" :
   "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/plain",
-                 "content-type" : "text/plain", 
+                 "content-type" : "text/plain",
                  "content-version" : "vor",
-                 "intended-application" : "text-mining" } ], 
-  "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ], 
-                  "date-time" : "2015-09-03T10:03:43Z", 
-                  "timestamp" : { "$numberLong" : "1441274623000" } }, 
-  "score" : 1, 
-  "issued" : { "date-parts" : [ [ 1996 ] ] }, 
-  "references-count" : 0, 
-  "alternative-id" : [ "0987-7983(96)87729-2" ], 
-  "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2", 
-  "ISSN" : [ "0987-7983" ], 
-  "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ], 
+                 "intended-application" : "text-mining" } ],
+  "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ],
+                  "date-time" : "2015-09-03T10:03:43Z",
+                  "timestamp" : { "$numberLong" : "1441274623000" } },
+  "score" : 1,
+  "issued" : { "date-parts" : [ [ 1996 ] ] },
+  "references-count" : 0,
+  "alternative-id" : [ "0987-7983(96)87729-2" ],
+  "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2",
+  "ISSN" : [ "0987-7983" ],
+  "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ],
   "subject" : [ "Pediatrics, Perinatology, and Child Health" ]
 }
 """
+  // scalastyle:on
   val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "Some Title")
   val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle")
   val MalformedCrossrefString = CrossrefString.replace("}", "")
 
   // Unit tests
   "CrossrefScorable.jsonToMapFeatures()" should "handle invalid JSON" in {
-    val result = CrossrefScorable.jsonToMapFeatures(MalformedCrossrefString) 
+    val result = CrossrefScorable.jsonToMapFeatures(MalformedCrossrefString)
     result.slug shouldBe Scorable.NoSlug
   }
 
diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
index 4b958b9..661824b 100644
--- a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
@@ -2,7 +2,10 @@ package sandcrawler
 
 import cascading.tuple.Fields
 import cascading.tuple.Tuple
-import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions}
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.TextLine
+import com.twitter.scalding.TupleConversions
+import com.twitter.scalding.TypedTsv
 import org.apache.hadoop.hbase.io.ImmutableBytesWritable
 import org.apache.hadoop.hbase.util.Bytes
 import org.scalatest._
@@ -62,7 +65,7 @@ class GrobidScorableTest extends FlatSpec with Matchers {
   // Unit tests
 
   "GrobidScorable.jsonToMapFeatures()" should "handle invalid JSON" in {
-    val result = GrobidScorable.jsonToMapFeatures(Key, MalformedGrobidString) 
+    val result = GrobidScorable.jsonToMapFeatures(Key, MalformedGrobidString)
     result.slug shouldBe Scorable.NoSlug
   }
 
diff --git a/scalding/src/test/scala/sandcrawler/HBaseBuilderTest.scala b/scalding/src/test/scala/sandcrawler/HBaseBuilderTest.scala
index 603a4c7..c61cb22 100644
--- a/scalding/src/test/scala/sandcrawler/HBaseBuilderTest.scala
+++ b/scalding/src/test/scala/sandcrawler/HBaseBuilderTest.scala
@@ -22,6 +22,7 @@ class HBaseBuilderTest extends FlatSpec with Matchers {
     fields should have length 0
   }
 
+  //scalastyle:off no.whitespace.before.left.bracket
   it should "throw IllegalArgumentException on malformed input" in {
     a [IllegalArgumentException] should be thrownBy {
       HBaseBuilder.parseColSpecs(List("file_size"))
diff --git a/scalding/src/test/scala/sandcrawler/HBaseMimeCountTest.scala b/scalding/src/test/scala/sandcrawler/HBaseMimeCountTest.scala
index fde2290..d6d283f 100644
--- a/scalding/src/test/scala/sandcrawler/HBaseMimeCountTest.scala
+++ b/scalding/src/test/scala/sandcrawler/HBaseMimeCountTest.scala
@@ -1,15 +1,18 @@
 package sandcrawler
 
-import cascading.tuple.{Tuple, Fields}
-import com.twitter.scalding.{JobTest, Tsv, TupleConversions}
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.Tsv
+import com.twitter.scalding.TupleConversions
 import org.apache.hadoop.hbase.io.ImmutableBytesWritable
 import org.apache.hadoop.hbase.util.Bytes
 import org.junit.runner.RunWith
 import org.scalatest.FunSpec
 import org.scalatest.junit.JUnitRunner
 import org.slf4j.LoggerFactory
-import parallelai.spyglass.hbase.HBaseSource
 import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBaseSource
 import scala._
 
 @RunWith(classOf[JUnitRunner])
diff --git a/scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala b/scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala
index 3424a36..c4ca5aa 100644
--- a/scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala
+++ b/scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala
@@ -1,15 +1,18 @@
 package sandcrawler
 
-import cascading.tuple.{Tuple, Fields}
-import com.twitter.scalding.{JobTest, Tsv, TupleConversions}
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.Tsv
+import com.twitter.scalding.TupleConversions
 import org.apache.hadoop.hbase.io.ImmutableBytesWritable
 import org.apache.hadoop.hbase.util.Bytes
 import org.junit.runner.RunWith
 import org.scalatest.FunSpec
 import org.scalatest.junit.JUnitRunner
 import org.slf4j.LoggerFactory
-import parallelai.spyglass.hbase.HBaseSource
 import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBaseSource
 import scala._
 
 /**
@@ -47,12 +50,10 @@ class HBaseRowCountTest extends FunSpec with TupleConversions {
       outputBuffer =>
 
         it("should return the test data provided.") {
-          println("outputBuffer.size => " + outputBuffer.size)
           assert(outputBuffer.size === 1)
         }
 
         it("should return the correct count") {
-          println("raw output => " + outputBuffer)
           assert(outputBuffer(0).getObject(0) === 8)
         }
     }
diff --git a/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala b/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala
index 8a71f31..fe3ff21 100644
--- a/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala
+++ b/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala
@@ -1,15 +1,19 @@
 package sandcrawler
 
-import cascading.tuple.{Tuple, Fields}
-import com.twitter.scalding.{JobTest, Tsv, TypedTsv, TupleConversions}
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.Tsv
+import com.twitter.scalding.TupleConversions
+import com.twitter.scalding.TypedTsv
 import org.apache.hadoop.hbase.io.ImmutableBytesWritable
 import org.apache.hadoop.hbase.util.Bytes
 import org.junit.runner.RunWith
 import org.scalatest.FunSpec
 import org.scalatest.junit.JUnitRunner
 import org.slf4j.LoggerFactory
-import parallelai.spyglass.hbase.HBaseSource
 import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBaseSource
 import scala._
 
 @RunWith(classOf[JUnitRunner])
diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
index 7ec0c4d..f9c30a2 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
@@ -2,6 +2,7 @@ package sandcrawler
 
 import org.scalatest._
 
+// scalastyle:off null
 class ScorableFeaturesTest extends FlatSpec with Matchers {
   private def titleToSlug(s : String) : String = {
     new ScorableFeatures(title = s).toSlug
diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
index fd44f57..f63bef8 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
@@ -2,7 +2,10 @@ package sandcrawler
 
 import cascading.tuple.Fields
 import cascading.tuple.Tuple
-import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions}
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.TextLine
+import com.twitter.scalding.TupleConversions
+import com.twitter.scalding.TypedTsv
 import org.apache.hadoop.hbase.io.ImmutableBytesWritable
 import org.apache.hadoop.hbase.util.Bytes
 import org.scalatest._
diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
index 1c6ae83..34081a5 100644
--- a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
@@ -2,13 +2,17 @@ package sandcrawler
 
 import cascading.tuple.Fields
 import cascading.tuple.Tuple
-import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions}
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.TextLine
+import com.twitter.scalding.TupleConversions
+import com.twitter.scalding.TypedTsv
 import org.apache.hadoop.hbase.io.ImmutableBytesWritable
 import org.apache.hadoop.hbase.util.Bytes
 import org.scalatest._
 import parallelai.spyglass.hbase.HBaseConstants.SourceMode
 
 class ScoreJobTest extends FlatSpec with Matchers {
+  //scalastyle:off
   val JsonString = """
 {
   "title": "<<TITLE>>",
@@ -54,62 +58,65 @@ class ScoreJobTest extends FlatSpec with Matchers {
   "annex": null
 }
 """
+  // scalastyle:on
   val JsonStringWithTitle = JsonString.replace("<<TITLE>>", "Dummy Example File")
   val JsonStringWithoutTitle = JsonString.replace("title", "nottitle")
   val MalformedJsonString = JsonString.replace("}", "")
 
+  // scalastyle:off
   val CrossrefString =
 """
-{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" }, 
-  "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ], 
-    "date-time" : "2017-10-23T17:19:16Z", 
-    "timestamp" : { "$numberLong" : "1508779156477" } }, 
-  "reference-count" : 0, 
-  "publisher" : "Elsevier BV", 
-  "issue" : "3", 
-  "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/", 
-                    "start" : { "date-parts" : [ [ 1996, 1, 1 ] ], 
-                                "date-time" : "1996-01-01T00:00:00Z", 
-                                "timestamp" : { "$numberLong" : "820454400000" } }, 
+{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" },
+  "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ],
+    "date-time" : "2017-10-23T17:19:16Z",
+    "timestamp" : { "$numberLong" : "1508779156477" } },
+  "reference-count" : 0,
+  "publisher" : "Elsevier BV",
+  "issue" : "3",
+  "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/",
+                    "start" : { "date-parts" : [ [ 1996, 1, 1 ] ],
+                                "date-time" : "1996-01-01T00:00:00Z",
+                                "timestamp" : { "$numberLong" : "820454400000" } },
                                 "delay-in-days" : 0, "content-version" : "tdm" }],
-  "content-domain" : { "domain" : [], "crossmark-restriction" : false }, 
-  "published-print" : { "date-parts" : [ [ 1996 ] ] }, 
+  "content-domain" : { "domain" : [], "crossmark-restriction" : false },
+  "published-print" : { "date-parts" : [ [ 1996 ] ] },
   "DOI" : "<<DOI>>",
-  "type" : "journal-article", 
-  "created" : { "date-parts" : [ [ 2002, 7, 25 ] ], 
-    "date-time" : "2002-07-25T15:09:41Z", 
-    "timestamp" : { "$numberLong" : "1027609781000" } }, 
-  "page" : "186-187", 
-  "source" : "Crossref", 
-  "is-referenced-by-count" : 0, 
+  "type" : "journal-article",
+  "created" : { "date-parts" : [ [ 2002, 7, 25 ] ],
+    "date-time" : "2002-07-25T15:09:41Z",
+    "timestamp" : { "$numberLong" : "1027609781000" } },
+  "page" : "186-187",
+  "source" : "Crossref",
+  "is-referenced-by-count" : 0,
   "title" : [ "<<TITLE>>" ],
-  "prefix" : "10.1016", 
-  "volume" : "9", 
-  "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ], 
-  "member" : "78", 
-  "container-title" : [ "Journal de PÃ©diatrie et de PuÃ©riculture" ], 
+  "prefix" : "10.1016",
+  "volume" : "9",
+  "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ],
+  "member" : "78",
+  "container-title" : [ "Journal de PÃ©diatrie et de PuÃ©riculture" ],
   "link" : [ { "URL" :  "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/xml",
-               "content-type" : "text/xml", 
-                 "content-version" : "vor",
-                 "intended-application" : "text-mining" }, 
+               "content-type" : "text/xml",
+               "content-version" : "vor",
+               "intended-application" : "text-mining" },
                { "URL" :
   "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/plain",
-                 "content-type" : "text/plain", 
+                 "content-type" : "text/plain",
                  "content-version" : "vor",
-                 "intended-application" : "text-mining" } ], 
-  "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ], 
-                  "date-time" : "2015-09-03T10:03:43Z", 
-                  "timestamp" : { "$numberLong" : "1441274623000" } }, 
-  "score" : 1, 
-  "issued" : { "date-parts" : [ [ 1996 ] ] }, 
-  "references-count" : 0, 
-  "alternative-id" : [ "0987-7983(96)87729-2" ], 
-  "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2", 
-  "ISSN" : [ "0987-7983" ], 
-  "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ], 
+                 "intended-application" : "text-mining" } ],
+  "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ],
+                  "date-time" : "2015-09-03T10:03:43Z",
+                  "timestamp" : { "$numberLong" : "1441274623000" } },
+  "score" : 1,
+  "issued" : { "date-parts" : [ [ 1996 ] ] },
+  "references-count" : 0,
+  "alternative-id" : [ "0987-7983(96)87729-2" ],
+  "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2",
+  "ISSN" : [ "0987-7983" ],
+  "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ],
   "subject" : [ "Pediatrics, Perinatology, and Child Health" ]
 }
 """
+  // scalastyle:on
   val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "SomeTitle")
   val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle")
   val MalformedCrossrefString = CrossrefString.replace("}", "")
@@ -168,7 +175,7 @@ class ScoreJobTest extends FlatSpec with Matchers {
       2 -> CrossrefStrings(2),
       3 -> CrossrefStrings(3)))
     .sink[(String, Int, String, String)](TypedTsv[(String, Int, String, String)](output)) {
-      // Grobid titles and slugs (in parentheses): 
+      // Grobid titles and slugs (in parentheses):
       //   Title 1                       (title1)
       //   Title 2: TNG                  (title2)
       //   Title 3: The Sequel           (title3)
@@ -178,7 +185,7 @@ class ScoreJobTest extends FlatSpec with Matchers {
       //   Title 1: TNG 3                (title1)
       //   Title 2: Rebooted             (title2)
       // Join should have 3 "title1" slugs and 1 "title2" slug
-      outputBuffer => 
+      outputBuffer =>
       "The pipeline" should "return a 4-element list" in {
         outputBuffer should have length 4
       }
@@ -190,9 +197,9 @@ class ScoreJobTest extends FlatSpec with Matchers {
         countMap("title2") shouldBe 1
       }
 
-      def bundle(slug : String, grobidIndex : Int, crossrefIndex : Int) = {
+      def bundle(slug : String, grobidIndex : Int, crossrefIndex : Int) : (String, Int, String, String) = {
         val mf1 : MapFeatures = GrobidScorable.jsonToMapFeatures(
-          Sha1Strings(grobidIndex), 
+          Sha1Strings(grobidIndex),
           JsonStrings(grobidIndex))
         val mf2 : MapFeatures = CrossrefScorable.jsonToMapFeatures(
           CrossrefStrings(crossrefIndex))
-- 
cgit v1.2.3