From 408123177b9e8afd145ea0f0fa1d6bb449f1bd20 Mon Sep 17 00:00:00 2001
From: Ellen Spertus <ellen.spertus@gmail.com>
Date: Tue, 7 Aug 2018 10:57:10 -0700
Subject: Added CrossrefScorableTest, minor cleanups.

---
 .../scala/sandcrawler/CrossrefScorableTest.scala   | 84 ++++++++++++++++++++++
 1 file changed, 84 insertions(+)
 create mode 100644 scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala

(limited to 'scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala')
diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
new file mode 100644
index 0000000..5973ce5
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
@@ -0,0 +1,84 @@
+package sandcrawler
+
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions}
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import org.scalatest._
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+
+class CrossrefScorableTest extends FlatSpec with Matchers {
+  val CrossrefString =
+"""
+{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" }, 
+  "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ], 
+    "date-time" : "2017-10-23T17:19:16Z", 
+    "timestamp" : { "$numberLong" : "1508779156477" } }, 
+  "reference-count" : 0, 
+  "publisher" : "Elsevier BV", 
+  "issue" : "3", 
+  "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/", 
+                    "start" : { "date-parts" : [ [ 1996, 1, 1 ] ], 
+                                "date-time" : "1996-01-01T00:00:00Z", 
+                                "timestamp" : { "$numberLong" : "820454400000" } }, 
+                                "delay-in-days" : 0, "content-version" : "tdm" }],
+  "content-domain" : { "domain" : [], "crossmark-restriction" : false }, 
+  "published-print" : { "date-parts" : [ [ 1996 ] ] }, 
+  "DOI" : "<<DOI>>",
+  "type" : "journal-article", 
+  "created" : { "date-parts" : [ [ 2002, 7, 25 ] ], 
+    "date-time" : "2002-07-25T15:09:41Z", 
+    "timestamp" : { "$numberLong" : "1027609781000" } }, 
+  "page" : "186-187", 
+  "source" : "Crossref", 
+  "is-referenced-by-count" : 0, 
+  "title" : [ "<<TITLE>>" ],
+  "prefix" : "10.1016", 
+  "volume" : "9", 
+  "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ], 
+  "member" : "78", 
+  "container-title" : [ "Journal de PÃ©diatrie et de PuÃ©riculture" ], 
+  "link" : [ { "URL" :  "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/xml",
+               "content-type" : "text/xml", 
+                 "content-version" : "vor",
+                 "intended-application" : "text-mining" }, 
+               { "URL" :
+  "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/plain",
+                 "content-type" : "text/plain", 
+                 "content-version" : "vor",
+                 "intended-application" : "text-mining" } ], 
+  "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ], 
+                  "date-time" : "2015-09-03T10:03:43Z", 
+                  "timestamp" : { "$numberLong" : "1441274623000" } }, 
+  "score" : 1, 
+  "issued" : { "date-parts" : [ [ 1996 ] ] }, 
+  "references-count" : 0, 
+  "alternative-id" : [ "0987-7983(96)87729-2" ], 
+  "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2", 
+  "ISSN" : [ "0987-7983" ], 
+  "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ], 
+  "subject" : [ "Pediatrics, Perinatology, and Child Health" ]
+}
+"""
+  val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "SomeTitle")
+  val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle")
+  val MalformedCrossrefString = CrossrefString.replace("}", "")
+
+  // Unit tests
+
+  "crossrefToSlug()" should "get the right slug for a crossref json string" in {
+    val slug = CrossrefScorable.crossrefToSlug(CrossrefStringWithTitle)
+    slug should contain ("sometitle")
+  }
+
+  it should "return None if given json string without title" in {
+    val slug = CrossrefScorable.crossrefToSlug(CrossrefStringWithoutTitle)
+    slug shouldBe None
+  }
+
+  it should "return None if given a malformed json string" in {
+    val slug = CrossrefScorable.crossrefToSlug(MalformedCrossrefString)
+    slug shouldBe None
+  }
+}
-- 
cgit v1.2.3


From 768e7ef0d127cf55119543be6e656751704ca5b2 Mon Sep 17 00:00:00 2001
From: Ellen Spertus <ellen.spertus@gmail.com>
Date: Fri, 10 Aug 2018 20:49:44 -0700
Subject: Tests pass. Still have changes to do but made huge progress.

---
 .../main/scala/sandcrawler/CrossrefScorable.scala  | 38 +++++++++++--------
 scalding/src/main/scala/sandcrawler/ScoreJob.scala | 44 +++-------------------
 .../scala/sandcrawler/CrossrefScorableTest.scala   |  3 +-
 .../src/test/scala/sandcrawler/ScoreJobTest.scala  |  2 +-
 4 files changed, 30 insertions(+), 57 deletions(-)

(limited to 'scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala')

diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
index 817bee5..b2f6537 100644
--- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
@@ -9,6 +9,7 @@ import parallelai.spyglass.hbase.HBaseConstants.SourceMode
 import parallelai.spyglass.hbase.HBasePipeConversions
 import parallelai.spyglass.hbase.HBaseSource
 import TDsl._
+import scala.util.parsing.json.JSONObject
 
 import java.text.Normalizer
 import java.util.Arrays
@@ -31,7 +32,7 @@ import parallelai.spyglass.hbase.HBasePipeConversions
 import parallelai.spyglass.hbase.HBaseSource
 
 class CrossrefScorable extends Scorable with HBasePipeConversions {
-  // TODO: Generalize args so there can be multiple Grobid pipes in one job.
+  // TODO: Generalize args so there can be multiple Crossref pipes in one job.
   def getSource(args : Args) : Source = {
     TextLine(args("crossref-input"))
   }
@@ -39,26 +40,31 @@ class CrossrefScorable extends Scorable with HBasePipeConversions {
   def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures] = {
     getSource(args).read
       .toTypedPipe[String](new Fields("line"))
-      .map{ json : String =>
-        CrossrefScorable.crossrefToSlug(json) match {
-          case Some(slug) => new MapFeatures(slug, json)
+      .map{ json : String => 
+        CrossrefScorable.simplifyJson(json) match {
           case None => new MapFeatures(Scorable.NoSlug, json)
+          case Some(map) => new MapFeatures(
+            Scorable.titleToSlug(map("title").asInstanceOf[String]), 
+            JSONObject(map).toString)
         }
       }
   }
-}
 
-object CrossrefScorable {
-  def crossrefToSlug(json : String) : Option[String] = {
-    Scorable.jsonToMap(json) match {
-      case None => None
-      case Some(map) => {
-        if (map contains "title") {
-          // TODO: Don't ignore titles after the first.
-          val title = map("title").asInstanceOf[List[String]](0)
-          Some(Scorable.titleToSlug(title))
-        } else {
-          None
+  object CrossrefScorable {
+    def simplifyJson(json : String) : Option[Map[String, Any]] = {
+      Scorable.jsonToMap(json) match {
+        case None => None
+        case Some(map) => {
+          if (map contains "title") {
+            val titles = map("title").asInstanceOf[List[String]]
+            if (titles.isEmpty) {
+              None
+            } else {
+              Some(Map("title" -> titles(0)))
+            }
+          } else {
+            None
+          }
         }
       }
     }
diff --git a/scalding/src/main/scala/sandcrawler/ScoreJob.scala b/scalding/src/main/scala/sandcrawler/ScoreJob.scala
index bc5bf87..386b367 100644
--- a/scalding/src/main/scala/sandcrawler/ScoreJob.scala
+++ b/scalding/src/main/scala/sandcrawler/ScoreJob.scala
@@ -3,7 +3,7 @@ package sandcrawler
 import cascading.flow.FlowDef
 import cascading.tuple.Fields
 import com.twitter.scalding.{Args,Source,TextLine,TypedPipe, TypedTsv}
-//import com.twitter.scalding.typed.TDsl._
+//import com.twitter.scalding.source.TypedText
 import parallelai.spyglass.base.JobBase
 import parallelai.spyglass.hbase.HBasePipeConversions
 import parallelai.spyglass.hbase.HBaseSource
@@ -13,7 +13,7 @@ import cascading.pipe.Pipe
 class ScoreJob(args: Args) extends JobBase(args) { //with HBasePipeConversions {
   // TODO: Instantiate any subclass of Scorable specified in args.
   val sc1 : Scorable = new GrobidScorable()
-  val sc2 : Scorable = new GrobidScorable()
+  val sc2 : Scorable = new CrossrefScorable()
   val pipe1 : TypedPipe[(String, ReduceFeatures)] = sc1.getInputPipe(args)
   val pipe2 : TypedPipe[(String, ReduceFeatures)] = sc2.getInputPipe(args)
 
@@ -25,44 +25,10 @@ class ScoreJob(args: Args) extends JobBase(args) { //with HBasePipeConversions {
       features1.json,
       features2.json)
   }
-    .write(TypedTsv[ReduceOutput](args("output")))
-
-  /*
-  val grobidSource = HBaseCrossrefScore.getHBaseSource(
-    args("hbase-table"),
-    args("zookeeper-hosts"))
-
-  val source0 : Source = TextLine("foo")
-  val pipe0 : cascading.pipe.Pipe = source0.read
-  // This compiles:
-  val pipe00 : TypedPipe[String] = getFeaturesPipe0(pipe0)
-
-  // Calling a method within ScoreJob compiles fine.
-  def getFeaturesPipe0(pipe : cascading.pipe.Pipe) : TypedPipe[String] = {
-    pipe
-    // This compiles:
-      .toTypedPipe[String](new Fields("line"))
-  }
-
-  // Calling a function in a ScoreJob object leads to a compiler error.
-  val source1 : Source = TextLine("foo")
-  val pipe1 : cascading.pipe.Pipe = source1.read
-  // This leads to a compile error:
-  val pipe11 : TypedPipe[String] = ScoreJob.getFeaturesPipe1(pipe0)
-
-  val pipe : cascading.pipe.Pipe = grobidSource
-    .read
-  val grobidPipe : TypedPipe[(String, String)] = pipe
-    .fromBytesWritable(new Fields("key", "tei_json"))
-  // Here I CAN call Pipe.toTypedPipe()
-    .toTypedPipe[(String, String)]('key, 'tei_json)
-    .write(TypedTsv[(String, String)](args("output")))
-
-  // Let's try making a method call.
-//  ScoreJob.etFeaturesPipe(pipe)
-
-   */
+  // TypedTsv can't write case classes, so flatten each ReduceOutput into a plain tuple first.
+    .map { entry => (entry.slug, entry.score, entry.json1, entry.json2) }
 
+    .write(TypedTsv[(String, Int, String, String)](args("output")))
 }
 
 // Ugly hack to get non-String information into ScoreJob above.
diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
index 5973ce5..67a8bfe 100644
--- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
@@ -66,7 +66,7 @@ class CrossrefScorableTest extends FlatSpec with Matchers {
   val MalformedCrossrefString = CrossrefString.replace("}", "")
 
   // Unit tests
-
+/*
   "crossrefToSlug()" should "get the right slug for a crossref json string" in {
     val slug = CrossrefScorable.crossrefToSlug(CrossrefStringWithTitle)
     slug should contain ("sometitle")
@@ -81,4 +81,5 @@ class CrossrefScorableTest extends FlatSpec with Matchers {
     val slug = CrossrefScorable.crossrefToSlug(MalformedCrossrefString)
     slug shouldBe None
   }
+ */
 }
diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
index 22cbdb8..8acb454 100644
--- a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
@@ -148,7 +148,7 @@ class ScoreJobTest extends FlatSpec with Matchers {
       1 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2").replace("<<DOI>>", "DOI-0.5"),
       2 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"),
       3 -> CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1")))
-    .sink[ReduceOutput](TypedTsv[ReduceOutput](output)) {
+    .sink[(String, Int, String, String)](TypedTsv[(String, Int, String, String)](output)) {
       // Grobid titles: 
       //   "Title 1", "Title 2: TNG", "Title 3: The Sequel"
       // crossref slugs: 
-- 
cgit v1.2.3


From 728e50a33cec921c9a624439f2e1c8561a6e12ce Mon Sep 17 00:00:00 2001
From: Ellen Spertus <ellen.spertus@gmail.com>
Date: Sat, 11 Aug 2018 21:03:53 -0700
Subject: It compiles.

---
 .../main/scala/sandcrawler/CrossrefScorable.scala  | 54 ++++++++++++++--------
 .../main/scala/sandcrawler/GrobidScorable.scala    | 21 ++++-----
 scalding/src/main/scala/sandcrawler/Scorable.scala | 40 +++++++++++-----
 .../scala/sandcrawler/CrossrefScorableTest.scala   | 26 ++++++-----
 .../scala/sandcrawler/GrobidScorableTest.scala     | 19 ++++----
 5 files changed, 96 insertions(+), 64 deletions(-)

(limited to 'scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala')

diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
index b2f6537..5113b0c 100644
--- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
@@ -18,6 +18,7 @@ import java.util.regex.Pattern
 
 import scala.math
 import scala.util.parsing.json.JSON
+import scala.util.parsing.json.JSONObject
 
 import cascading.tuple.Fields
 import com.twitter.scalding._
@@ -40,33 +41,48 @@ class CrossrefScorable extends Scorable with HBasePipeConversions {
   def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures] = {
     getSource(args).read
       .toTypedPipe[String](new Fields("line"))
-      .map{ json : String => 
-        CrossrefScorable.simplifyJson(json) match {
-          case None => new MapFeatures(Scorable.NoSlug, json)
-          case Some(map) => new MapFeatures(
-            Scorable.titleToSlug(map("title").asInstanceOf[String]), 
-            JSONObject(map).toString)
+      .map{ json : String =>
+        Scorable.jsonToMap(json) match {
+          case None => MapFeatures(Scorable.NoSlug, json)
+          case Some(map) => {
+            if ((map contains "title") && (map contains "DOI")) {
+              val titles = map("title").asInstanceOf[List[String]]
+              if (titles.isEmpty) {
+                new MapFeatures(Scorable.NoSlug, json)
+              } else {
+                val title = titles(0)
+                val map2 = Scorable.toScorableMap(title=titles(0), doi=map("DOI").asInstanceOf[String])
+                new MapFeatures(
+                  Scorable.mapToSlug(map2),
+                  JSONObject(map2).toString)
+              }
+            } else {
+              new MapFeatures(Scorable.NoSlug, json)
+            }
+          }
         }
       }
   }
+}
 
-  object CrossrefScorable {
-    def simplifyJson(json : String) : Option[Map[String, Any]] = {
-      Scorable.jsonToMap(json) match {
-        case None => None
-        case Some(map) => {
-          if (map contains "title") {
-            val titles = map("title").asInstanceOf[List[String]]
-            if (titles.isEmpty) {
-              None
-            } else {
-              Some(Map("title" -> titles(0)))
-            }
-          } else {
+/*
+object CrossrefScorable {
+  def simplifyJson(json : String) : Option[Map[String, Any]] = {
+    Scorable.jsonToMap(json) match {
+      case None => None
+      case Some(map) => {
+        if (map contains "title") {
+          val titles = map("title").asInstanceOf[List[String]]
+          if (titles.isEmpty) {
             None
+          } else {
+            Some(Map("title" -> titles(0)))
           }
+        } else {
+          None
         }
       }
     }
   }
 }
+ */
diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
index 61055f2..de9f51a 100644
--- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
@@ -1,5 +1,6 @@
 package sandcrawler
 
+import scala.util.parsing.json.JSONObject
 import cascading.flow.FlowDef
 import cascading.pipe.Pipe
 import cascading.tuple.Fields
@@ -21,13 +22,7 @@ class GrobidScorable extends Scorable with HBasePipeConversions {
       .read
       .fromBytesWritable(new Fields("key", "tei_json"))
       .toTypedPipe[(String, String)](new Fields("key", "tei_json"))
-      .map { entry =>
-        val (key : String, json : String) = (entry._1, entry._2)
-        GrobidScorable.grobidToSlug(json) match {
-          case Some(slug) => new MapFeatures(slug, json)
-          case None => new MapFeatures(Scorable.NoSlug, json)
-        }
-      }
+      .map { entry : (String, String) => GrobidScorable.jsonToMapFeatures(entry._1, entry._2) }
   }
 }
 
@@ -36,14 +31,18 @@ object GrobidScorable {
     HBaseBuilder.build(table, host, List("grobid0:tei_json"), SourceMode.SCAN_ALL)
   }
 
-  def grobidToSlug(json : String) : Option[String] = {
+  def jsonToMapFeatures(key : String, json : String) : MapFeatures = {
     Scorable.jsonToMap(json) match {
-      case None => None
+      case None => MapFeatures(Scorable.NoSlug, json)
       case Some(map) => {
         if (map contains "title") {
-          Some(Scorable.titleToSlug(map("title").asInstanceOf[String]))
+          val map2 = Scorable.toScorableMap(Scorable.getString(map, "title"),
+            sha1=key)
+          new MapFeatures(
+            Scorable.mapToSlug(map2),
+            JSONObject(map2).toString)
         } else {
-          None
+          MapFeatures(Scorable.NoSlug, json)
         }
       }
     }
diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala
index 0ec8e46..9c8da69 100644
--- a/scalding/src/main/scala/sandcrawler/Scorable.scala
+++ b/scalding/src/main/scala/sandcrawler/Scorable.scala
@@ -2,6 +2,7 @@ package sandcrawler
 
 import scala.math
 import scala.util.parsing.json.JSON
+import scala.util.parsing.json.JSONObject
 
 import cascading.flow.FlowDef
 import com.twitter.scalding._
@@ -36,6 +37,21 @@ object Scorable {
     slug != NoSlug
   }
 
+  // NOTE: I could go all out and make ScorableMap a type.
+  // TODO: Require year. Other features will get added here.
+  def toScorableMap(title : String, year : Int = 0, doi : String = "", sha1 : String = "") : Map[String, Any] = {
+   Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1)
+  }
+
+  def toScorableJson(title : String, year : Int, doi : String = "", sha1 : String = "") : String = {
+    JSONObject(toScorableMap(title=title, year=year, doi=doi, sha1=sha1)).toString
+  }
+
+  // TODO: Score on more fields than "title".
+  def isScorableMap(map : Map[String, Any]) : Boolean = {
+    map.contains("title")
+  }
+
   def jsonToMap(json : String) : Option[Map[String, Any]] = {
     // https://stackoverflow.com/a/32717262/631051
     val jsonObject = JSON.parseFull(json)
@@ -46,18 +62,17 @@ object Scorable {
     }
   }
 
-  def titleToSlug(title : String) : String = {
-    if (title == null || title.isEmpty) {
+  // Map should have been produced by toScorableMap.
+  // This guarantees it will have all of the fields needed to compute
+  // the ultimate score, which are a superset of those needed for a slug.
+  def mapToSlug(map : Map[String, Any]) : String = {
+    val unaccented = StringUtilities.removeAccents(getString(map, "title"))
+    // Remove punctuation after splitting on colon.
+    val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase()))
+    if (slug.isEmpty || slug == null) {
       NoSlug
     } else {
-      val unaccented = StringUtilities.removeAccents(title)
-      // Remove punctuation after splitting on colon.
-      val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase()))
-      if (slug.isEmpty || slug == null) {
-        NoSlug
-      } else {
-        slug
-      }
+      slug
     }
   }
 
@@ -68,8 +83,9 @@ object Scorable {
     }
   }
 
-  // Caller is responsible for ensuring that key is in map.
-  def getString(map : Map[String, String], key : String) : String = {
+  // Caller is responsible for ensuring that key is a String in map.
+  // TODO: Handle the ClassCastException thrown when the value is not a String.
+  def getString(map : Map[String, Any], key : String) : String = {
     assert(map contains key)
     map(key).asInstanceOf[String]
   }
diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
index 67a8bfe..1c35d66 100644
--- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
@@ -66,20 +66,24 @@ class CrossrefScorableTest extends FlatSpec with Matchers {
   val MalformedCrossrefString = CrossrefString.replace("}", "")
 
   // Unit tests
-/*
-  "crossrefToSlug()" should "get the right slug for a crossref json string" in {
-    val slug = CrossrefScorable.crossrefToSlug(CrossrefStringWithTitle)
-    slug should contain ("sometitle")
+  "simplifyJson()" should "return None for bad JSON" in {
+    CrossrefScorable.simplifyJson("") shouldBe None
+    CrossrefScorable.simplifyJson(MalformedCrossrefString) shouldBe None
   }
 
-  it should "return None if given json string without title" in {
-    val slug = CrossrefScorable.crossrefToSlug(CrossrefStringWithoutTitle)
-    slug shouldBe None
+  it should "return None for JSON lacking title" in {
+    CrossrefScorable.simplifyJson(CrossrefStringWithoutTitle) shouldBe None
   }
 
-  it should "return None if given a malformed json string" in {
-    val slug = CrossrefScorable.crossrefToSlug(MalformedCrossrefString)
-    slug shouldBe None
+  it should "return appropriate result for valid JSON" in {
+    CrossrefScorable.simplifyJson(CrossrefStringWithTitle) match {
+      case None => fail("None unexpectedly returned by simplifyJson")
+      case Some(map) => {
+        Scorable.isScorableMap(map) shouldBe true
+        map.size shouldBe 1
+        map.keys should contain ("title")
+        map("title") shouldBe "SomeTitle"
+      }
+    }
   }
- */
 }
diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
index 7777610..5bb955a 100644
--- a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
@@ -60,18 +60,15 @@ class GrobidScorableTest extends FlatSpec with Matchers {
 
   // Unit tests
 
-  "grobidToSlug()" should "get the right slug for a grobid json string" in {
-    val slug = GrobidScorable.grobidToSlug(GrobidStringWithTitle)
-    slug should contain ("dummy example file")
+  "GrobidScorable.jsonToMapFeatures()" should "handle invalid JSON" in {
+    val result = GrobidScorable.jsonToMapFeatures(MalformedGrobidString) shouldBe None
+    result.slug shouldBe Scorable.NoSlug
+    result.json shouldBe MalformedGrobidString
   }
 
-  it should "return None if given json string without title" in {
-    val slug = GrobidScorable.grobidToSlug(GrobidStringWithoutTitle)
-    slug shouldBe None
-  }
-
-  it should "return None if given a malformed json string" in {
-    val slug = GrobidScorable.grobidToSlug(MalformedGrobidString)
-    slug shouldBe None
+  "GrobidScorable.jsonToMapFeatures()" should "handle missing title" in {
+    val result = GrobidScorable.jsonToMapFeatures(GrobidStringWithoutTitle) shouldBe None
+    result.slug shouldBe Scorable.NoSlug
+    result.json shouldBe GrobidStringWithoutTitle
   }
 }
-- 
cgit v1.2.3


From 31354b1a6062c5c56a30610f68fa48c82a7e83f0 Mon Sep 17 00:00:00 2001
From: Ellen Spertus <ellen.spertus@gmail.com>
Date: Sun, 12 Aug 2018 18:08:51 -0700
Subject: Tests pass.

---
 scalding/src/main/scala/sandcrawler/Scorable.scala | 11 +--
 .../scala/sandcrawler/CrossrefScorableTest.scala   | 89 ----------------------
 .../scala/sandcrawler/GrobidScorableTest.scala     | 20 +++--
 .../src/test/scala/sandcrawler/ScorableTest.scala  | 28 ++++---
 4 files changed, 39 insertions(+), 109 deletions(-)
 delete mode 100644 scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala

(limited to 'scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala')

diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala
index 9c8da69..929461b 100644
--- a/scalding/src/main/scala/sandcrawler/Scorable.scala
+++ b/scalding/src/main/scala/sandcrawler/Scorable.scala
@@ -66,13 +66,14 @@ object Scorable {
   // This guarantees it will have all of the fields needed to compute
   // the ultimate score, which are a superset of those needed for a slug.
   def mapToSlug(map : Map[String, Any]) : String = {
-    val unaccented = StringUtilities.removeAccents(getString(map, "title"))
-    // Remove punctuation after splitting on colon.
-    val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase()))
-    if (slug.isEmpty || slug == null) {
+    val title = getString(map, "title")
+    if (title == null) {
       NoSlug
     } else {
-      slug
+      val unaccented = StringUtilities.removeAccents(title)
+      // Keep only the text before the first colon; takeWhile, unlike split(":")(0), cannot throw on ":".
+      val slug = StringUtilities.removePunctuation(unaccented.takeWhile(_ != ':').toLowerCase()).replaceAll("\\s", "")
+      if (slug.isEmpty) NoSlug else slug
     }
   }
 
diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
deleted file mode 100644
index 1c35d66..0000000
--- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
+++ /dev/null
@@ -1,89 +0,0 @@
-package sandcrawler
-
-import cascading.tuple.Fields
-import cascading.tuple.Tuple
-import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions}
-import org.apache.hadoop.hbase.io.ImmutableBytesWritable
-import org.apache.hadoop.hbase.util.Bytes
-import org.scalatest._
-import parallelai.spyglass.hbase.HBaseConstants.SourceMode
-
-class CrossrefScorableTest extends FlatSpec with Matchers {
-  val CrossrefString =
-"""
-{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" }, 
-  "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ], 
-    "date-time" : "2017-10-23T17:19:16Z", 
-    "timestamp" : { "$numberLong" : "1508779156477" } }, 
-  "reference-count" : 0, 
-  "publisher" : "Elsevier BV", 
-  "issue" : "3", 
-  "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/", 
-                    "start" : { "date-parts" : [ [ 1996, 1, 1 ] ], 
-                                "date-time" : "1996-01-01T00:00:00Z", 
-                                "timestamp" : { "$numberLong" : "820454400000" } }, 
-                                "delay-in-days" : 0, "content-version" : "tdm" }],
-  "content-domain" : { "domain" : [], "crossmark-restriction" : false }, 
-  "published-print" : { "date-parts" : [ [ 1996 ] ] }, 
-  "DOI" : "<<DOI>>",
-  "type" : "journal-article", 
-  "created" : { "date-parts" : [ [ 2002, 7, 25 ] ], 
-    "date-time" : "2002-07-25T15:09:41Z", 
-    "timestamp" : { "$numberLong" : "1027609781000" } }, 
-  "page" : "186-187", 
-  "source" : "Crossref", 
-  "is-referenced-by-count" : 0, 
-  "title" : [ "<<TITLE>>" ],
-  "prefix" : "10.1016", 
-  "volume" : "9", 
-  "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ], 
-  "member" : "78", 
-  "container-title" : [ "Journal de PÃ©diatrie et de PuÃ©riculture" ], 
-  "link" : [ { "URL" :  "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/xml",
-               "content-type" : "text/xml", 
-                 "content-version" : "vor",
-                 "intended-application" : "text-mining" }, 
-               { "URL" :
-  "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/plain",
-                 "content-type" : "text/plain", 
-                 "content-version" : "vor",
-                 "intended-application" : "text-mining" } ], 
-  "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ], 
-                  "date-time" : "2015-09-03T10:03:43Z", 
-                  "timestamp" : { "$numberLong" : "1441274623000" } }, 
-  "score" : 1, 
-  "issued" : { "date-parts" : [ [ 1996 ] ] }, 
-  "references-count" : 0, 
-  "alternative-id" : [ "0987-7983(96)87729-2" ], 
-  "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2", 
-  "ISSN" : [ "0987-7983" ], 
-  "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ], 
-  "subject" : [ "Pediatrics, Perinatology, and Child Health" ]
-}
-"""
-  val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "SomeTitle")
-  val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle")
-  val MalformedCrossrefString = CrossrefString.replace("}", "")
-
-  // Unit tests
-  "simplifyJson()" should "return None for bad JSON" in {
-    CrossrefScorable.simplifyJson("") shouldBe None
-    CrossrefScorable.simplifyJson(MalformedCrossrefString) shouldBe None
-  }
-
-  it should "return None for JSON lacking title" in {
-    CrossrefScorable.simplifyJson(CrossrefStringWithoutTitle) shouldBe None
-  }
-
-  it should "return appropriate result for valid JSON" in {
-    CrossrefScorable.simplifyJson(CrossrefStringWithTitle) match {
-      case None => fail("None unexpectedly returned by simplifyJson")
-      case Some(map) => {
-        Scorable.isScorableMap(map) shouldBe true
-        map.size shouldBe 1
-        map.keys should contain ("title")
-        map("title") shouldBe "SomeTitle"
-      }
-    }
-  }
-}
diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
index 5bb955a..3fcd856 100644
--- a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
@@ -57,18 +57,28 @@ class GrobidScorableTest extends FlatSpec with Matchers {
   val GrobidStringWithTitle = GrobidString.replace("<<TITLE>>", "Dummy Example File")
   val GrobidStringWithoutTitle = GrobidString.replace("title", "nottitle")
   val MalformedGrobidString = GrobidString.replace("}", "")
+  val Key = "Dummy Key"
 
   // Unit tests
 
   "GrobidScorable.jsonToMapFeatures()" should "handle invalid JSON" in {
-    val result = GrobidScorable.jsonToMapFeatures(MalformedGrobidString) shouldBe None
+    val result = GrobidScorable.jsonToMapFeatures(Key, MalformedGrobidString) 
     result.slug shouldBe Scorable.NoSlug
-    result.json shouldBe MalformedGrobidString
   }
 
-  "GrobidScorable.jsonToMapFeatures()" should "handle missing title" in {
-    val result = GrobidScorable.jsonToMapFeatures(GrobidStringWithoutTitle) shouldBe None
+  it should "handle missing title" in {
+    val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithoutTitle)
     result.slug shouldBe Scorable.NoSlug
-    result.json shouldBe GrobidStringWithoutTitle
+  }
+
+  it should "handle valid input" in {
+    val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithTitle)
+    result.slug shouldBe "dummyexamplefile"
+    Scorable.jsonToMap(result.json) match {
+      case None => fail()
+      case Some(map) => {
+        map("title").asInstanceOf[String] shouldBe "Dummy Example File"
+      }
+    }
   }
 }
diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
index 2f80492..95faacc 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
@@ -54,28 +54,36 @@ class ScorableTest extends FlatSpec with Matchers {
   "annex": null
 }
 """
+  private def titleToSlug(s : String) : String = {
+    Scorable.mapToSlug(Scorable.toScorableMap(title = s))
+  }
 
-  "titleToSlug()" should "extract the parts of titles before a colon" in {
-    Scorable.titleToSlug("HELLO:there") shouldBe "hello"
+  "mapToSlug()" should "extract the parts of titles before a colon" in {
+    titleToSlug("HELLO:there") shouldBe "hello"
   }
 
   it should "extract an entire colon-less string" in {
-    Scorable.titleToSlug("hello THERE") shouldBe "hello there"
+    titleToSlug("hello THERE") shouldBe "hellothere"
   }
 
   it should "return Scorable.NoSlug if given empty string" in {
-    Scorable.titleToSlug("") shouldBe Scorable.NoSlug
+    titleToSlug("") shouldBe Scorable.NoSlug
   }
 
   it should "return Scorable.NoSlug if given null" in {
-    Scorable.titleToSlug(null) shouldBe Scorable.NoSlug
+    titleToSlug(null) shouldBe Scorable.NoSlug
+  }
+
+  it should "strip punctuation" in {
+    titleToSlug("HELLO!:the:re") shouldBe "hello"
+    titleToSlug("a:b:c") shouldBe "a"
+    titleToSlug(
+      "If you're happy and you know it, clap your hands!") shouldBe "ifyourehappyandyouknowitclapyourhands"
   }
 
-  "titleToSlug()" should "strip punctuation" in {
-    Scorable.titleToSlug("HELLO!:the:re") shouldBe "hello"
-    Scorable.titleToSlug("a:b:c") shouldBe "a"
-    Scorable.titleToSlug(
-      "If you're happy and you know it, clap your hands!") shouldBe "if youre happy and you know it clap your hands"
+  it should "remove whitespace" in {
+    titleToSlug("foo bar : baz ::") shouldBe "foobar"
+    titleToSlug("\na\t:b:c") shouldBe "a"
   }
 
   "jsonToMap()" should "return a map, given a legal JSON string" in {
-- 
cgit v1.2.3


From 05c0213547f29842bbae6faaf77e983a364d4a2e Mon Sep 17 00:00:00 2001
From: Ellen Spertus <ellen.spertus@gmail.com>
Date: Sun, 12 Aug 2018 18:41:27 -0700
Subject: Added back file I shouldn't have deleted.

---
 .../main/scala/sandcrawler/CrossrefScorable.scala  | 22 ------
 .../scala/sandcrawler/CrossrefScorableTest.scala   | 89 ++++++++++++++++++++++
 2 files changed, 89 insertions(+), 22 deletions(-)
 create mode 100644 scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala

(limited to 'scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala')

diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
index 5113b0c..667a5cc 100644
--- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
@@ -64,25 +64,3 @@ class CrossrefScorable extends Scorable with HBasePipeConversions {
       }
   }
 }
-
-/*
-object CrossrefScorable {
-  def simplifyJson(json : String) : Option[Map[String, Any]] = {
-    Scorable.jsonToMap(json) match {
-      case None => None
-      case Some(map) => {
-        if (map contains "title") {
-          val titles = map("title").asInstanceOf[List[String]]
-          if (titles.isEmpty) {
-            None
-          } else {
-            Some(Map("title" -> titles(0)))
-          }
-        } else {
-          None
-        }
-      }
-    }
-  }
-}
- */
diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
new file mode 100644
index 0000000..1c35d66
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
@@ -0,0 +1,89 @@
+package sandcrawler
+
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions}
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import org.scalatest._
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+
+class CrossrefScorableTest extends FlatSpec with Matchers {
+  val CrossrefString =
+"""
+{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" }, 
+  "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ], 
+    "date-time" : "2017-10-23T17:19:16Z", 
+    "timestamp" : { "$numberLong" : "1508779156477" } }, 
+  "reference-count" : 0, 
+  "publisher" : "Elsevier BV", 
+  "issue" : "3", 
+  "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/", 
+                    "start" : { "date-parts" : [ [ 1996, 1, 1 ] ], 
+                                "date-time" : "1996-01-01T00:00:00Z", 
+                                "timestamp" : { "$numberLong" : "820454400000" } }, 
+                                "delay-in-days" : 0, "content-version" : "tdm" }],
+  "content-domain" : { "domain" : [], "crossmark-restriction" : false }, 
+  "published-print" : { "date-parts" : [ [ 1996 ] ] }, 
+  "DOI" : "<<DOI>>",
+  "type" : "journal-article", 
+  "created" : { "date-parts" : [ [ 2002, 7, 25 ] ], 
+    "date-time" : "2002-07-25T15:09:41Z", 
+    "timestamp" : { "$numberLong" : "1027609781000" } }, 
+  "page" : "186-187", 
+  "source" : "Crossref", 
+  "is-referenced-by-count" : 0, 
+  "title" : [ "<<TITLE>>" ],
+  "prefix" : "10.1016", 
+  "volume" : "9", 
+  "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ], 
+  "member" : "78", 
+  "container-title" : [ "Journal de PÃ©diatrie et de PuÃ©riculture" ], 
+  "link" : [ { "URL" :  "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/xml",
+               "content-type" : "text/xml", 
+                 "content-version" : "vor",
+                 "intended-application" : "text-mining" }, 
+               { "URL" :
+  "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/plain",
+                 "content-type" : "text/plain", 
+                 "content-version" : "vor",
+                 "intended-application" : "text-mining" } ], 
+  "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ], 
+                  "date-time" : "2015-09-03T10:03:43Z", 
+                  "timestamp" : { "$numberLong" : "1441274623000" } }, 
+  "score" : 1, 
+  "issued" : { "date-parts" : [ [ 1996 ] ] }, 
+  "references-count" : 0, 
+  "alternative-id" : [ "0987-7983(96)87729-2" ], 
+  "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2", 
+  "ISSN" : [ "0987-7983" ], 
+  "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ], 
+  "subject" : [ "Pediatrics, Perinatology, and Child Health" ]
+}
+"""
+  val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "SomeTitle")
+  val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle")
+  val MalformedCrossrefString = CrossrefString.replace("}", "")
+
+  // Unit tests
+  "simplifyJson()" should "return None for bad JSON" in {
+    CrossrefScorable.simplifyJson("") shouldBe None
+    CrossrefScorable.simplifyJson(MalformedCrossrefString) shouldBe None
+  }
+
+  it should "return None for JSON lacking title" in {
+    CrossrefScorable.simplifyJson(CrossrefStringWithoutTitle) shouldBe None
+  }
+
+  it should "return appropriate result for valid JSON" in {
+    CrossrefScorable.simplifyJson(CrossrefStringWithTitle) match {
+      case None => fail("None unexpectedly returned by simplifyJson")
+      case Some(map) => {
+        Scorable.isScorableMap(map) shouldBe true
+        map.size shouldBe 1
+        map.keys should contain ("title")
+        map("title") shouldBe "SomeTitle"
+      }
+    }
+  }
+}
-- 
cgit v1.2.3


From 5615428921a45ba6a2fb005b255a28dcbb83b13f Mon Sep 17 00:00:00 2001
From: Ellen Spertus <ellen.spertus@gmail.com>
Date: Sun, 12 Aug 2018 19:12:32 -0700
Subject: Snapshot before changing Scorable to find bug.

---
 .../main/scala/sandcrawler/CrossrefScorable.scala  | 41 ++++++++++++----------
 scalding/src/main/scala/sandcrawler/Scorable.scala |  1 -
 .../scala/sandcrawler/CrossrefScorableTest.scala   | 24 ++++++-------
 .../scala/sandcrawler/GrobidScorableTest.scala     |  1 +
 .../src/test/scala/sandcrawler/ScoreJobTest.scala  | 15 +++++---
 5 files changed, 46 insertions(+), 36 deletions(-)

(limited to 'scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala')

diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
index 667a5cc..e257152 100644
--- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
@@ -41,26 +41,31 @@ class CrossrefScorable extends Scorable with HBasePipeConversions {
   def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures] = {
     getSource(args).read
       .toTypedPipe[String](new Fields("line"))
-      .map{ json : String =>
-        Scorable.jsonToMap(json) match {
-          case None => MapFeatures(Scorable.NoSlug, json)
-          case Some(map) => {
-            if ((map contains "title") && (map contains "DOI")) {
-              val titles = map("title").asInstanceOf[List[String]]
-              if (titles.isEmpty) {
-                new MapFeatures(Scorable.NoSlug, json)
-              } else {
-                val title = titles(0)
-                val map2 = Scorable.toScorableMap(title=titles(0), doi=map("DOI").asInstanceOf[String])
-                new MapFeatures(
-                  Scorable.mapToSlug(map2),
-                  JSONObject(map2).toString)
-              }
-            } else {
-              new MapFeatures(Scorable.NoSlug, json)
-            }
+      .map { CrossrefScorable.jsonToMapFeatures(_) }
+  }
+}
+
+object CrossrefScorable {
+  def jsonToMapFeatures(json : String) : MapFeatures = {
+    Scorable.jsonToMap(json) match {
+      case None => MapFeatures(Scorable.NoSlug, json)
+      case Some(map) => {
+        if ((map contains "titles") && (map contains "DOI")) {
+          val titles = map("titles").asInstanceOf[List[String]]
+          val doi = Scorable.getString(map, "DOI")
+          if (titles.isEmpty || titles == null || doi.isEmpty || doi == null) {
+            new MapFeatures(Scorable.NoSlug, json)
+          } else {
+            val title = titles(0)
+            val map2 = Scorable.toScorableMap(title=title, doi=doi)
+            new MapFeatures(
+              Scorable.mapToSlug(map2),
+              JSONObject(map2).toString)
           }
+        } else {
+          new MapFeatures(Scorable.NoSlug, json)
         }
       }
+    }
   }
 }
diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala
index 929461b..a256fa4 100644
--- a/scalding/src/main/scala/sandcrawler/Scorable.scala
+++ b/scalding/src/main/scala/sandcrawler/Scorable.scala
@@ -7,7 +7,6 @@ import scala.util.parsing.json.JSONObject
 import cascading.flow.FlowDef
 import com.twitter.scalding._
 import com.twitter.scalding.typed.TDsl._
-//import TDsl._
 
 case class MapFeatures(slug : String, json : String)
 case class ReduceFeatures(json : String)
diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
index 1c35d66..dc6f347 100644
--- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
@@ -66,23 +66,23 @@ class CrossrefScorableTest extends FlatSpec with Matchers {
   val MalformedCrossrefString = CrossrefString.replace("}", "")
 
   // Unit tests
-  "simplifyJson()" should "return None for bad JSON" in {
-    CrossrefScorable.simplifyJson("") shouldBe None
-    CrossrefScorable.simplifyJson(MalformedCrossrefString) shouldBe None
+  "CrossrefScorable.jsonToMapFeatures()" should "handle invalid JSON" in {
+    val result = CrossrefScorable.jsonToMapFeatures(MalformedCrossrefString) 
+    result.slug shouldBe Scorable.NoSlug
   }
 
-  it should "return None for JSON lacking title" in {
-    CrossrefScorable.simplifyJson(CrossrefStringWithoutTitle) shouldBe None
+  it should "handle missing title" in {
+    val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithoutTitle)
+    result.slug shouldBe Scorable.NoSlug
   }
 
-  it should "return appropriate result for valid JSON" in {
-    CrossrefScorable.simplifyJson(CrossrefStringWithTitle) match {
-      case None => fail("None unexpectedly returned by simplifyJson")
+  it should "handle valid input" in {
+    val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithTitle)
+    result.slug shouldBe "dummyexamplefile"
+    Scorable.jsonToMap(result.json) match {
+      case None => fail()
       case Some(map) => {
-        Scorable.isScorableMap(map) shouldBe true
-        map.size shouldBe 1
-        map.keys should contain ("title")
-        map("title") shouldBe "SomeTitle"
+        map("title").asInstanceOf[String] shouldBe "Dummy Example File"
       }
     }
   }
diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
index 3fcd856..4b958b9 100644
--- a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
@@ -77,6 +77,7 @@ class GrobidScorableTest extends FlatSpec with Matchers {
     Scorable.jsonToMap(result.json) match {
       case None => fail()
       case Some(map) => {
+        map should contain key "title"
         map("title").asInstanceOf[String] shouldBe "Dummy Example File"
       }
     }
diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
index 8acb454..8436817 100644
--- a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
@@ -149,11 +149,16 @@ class ScoreJobTest extends FlatSpec with Matchers {
       2 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"),
       3 -> CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1")))
     .sink[(String, Int, String, String)](TypedTsv[(String, Int, String, String)](output)) {
-      // Grobid titles: 
-      //   "Title 1", "Title 2: TNG", "Title 3: The Sequel"
-      // crossref slugs: 
-      //   "Title 1: TNG", "Title 1: TNG 2", "Title 1: TNG 3", "Title 2 Rebooted"
-      // Join should have 3 "Title  1" slugs and 1 "Title 2" slug
+      // Grobid titles and slugs (in parentheses): 
+      //   Title 1                       (title1)
+      //   Title 2: TNG                  (title2)
+      //   Title 3: The Sequel           (title3)
+      // crossref titles and slugs (in parentheses):
+      //   Title 1: TNG                  (title1)
+      //   Title 1: TNG 2                (title1)
+      //   Title 1: TNG 3                (title1)
+      //   Title 2: Rebooted             (title2)
+      // Join should have 3 "title1" slugs and 1 "title2" slug
       outputBuffer => 
       "The pipeline" should "return a 4-element list" in {
         outputBuffer should have length 4
-- 
cgit v1.2.3


From 1c6e1234974d8b6e4480a13ff5c4ff861c6d1deb Mon Sep 17 00:00:00 2001
From: Ellen Spertus <ellen.spertus@gmail.com>
Date: Mon, 13 Aug 2018 09:58:27 -0700
Subject: Pipeline works, all tests pass, no scalastyle errors.

---
 .../main/scala/sandcrawler/CrossrefScorable.scala  |  28 +--
 .../main/scala/sandcrawler/GrobidScorable.scala    |   3 +-
 .../scala/sandcrawler/HBaseCrossrefScoreJob.scala  | 218 ---------------------
 scalding/src/main/scala/sandcrawler/Scorable.scala |   2 +-
 scalding/src/main/scala/sandcrawler/ScoreJob.scala |  51 +----
 .../scala/sandcrawler/CrossrefScorableTest.scala   |   6 +-
 .../src/test/scala/sandcrawler/ScoreJobTest.scala  |  80 +++++---
 7 files changed, 65 insertions(+), 323 deletions(-)
 delete mode 100644 scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala

(limited to 'scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala')

diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
index e257152..4558ee6 100644
--- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
@@ -1,36 +1,14 @@
 package sandcrawler
 
-import cascading.flow.FlowDef
-import cascading.pipe.Pipe
-import cascading.tuple.Fields
-import com.twitter.scalding._
-import com.twitter.scalding.typed.TDsl._
-import parallelai.spyglass.hbase.HBaseConstants.SourceMode
-import parallelai.spyglass.hbase.HBasePipeConversions
-import parallelai.spyglass.hbase.HBaseSource
-import TDsl._
-import scala.util.parsing.json.JSONObject
-
-import java.text.Normalizer
-import java.util.Arrays
-import java.util.Properties
-import java.util.regex.Pattern
-
 import scala.math
 import scala.util.parsing.json.JSON
 import scala.util.parsing.json.JSONObject
 
+import cascading.flow.FlowDef
 import cascading.tuple.Fields
 import com.twitter.scalding._
-import com.twitter.scalding.typed.CoGrouped
-import com.twitter.scalding.typed.Grouped
 import com.twitter.scalding.typed.TDsl._
-import org.apache.hadoop.hbase.io.ImmutableBytesWritable
-import org.apache.hadoop.hbase.util.Bytes
-import parallelai.spyglass.base.JobBase
-import parallelai.spyglass.hbase.HBaseConstants.SourceMode
 import parallelai.spyglass.hbase.HBasePipeConversions
-import parallelai.spyglass.hbase.HBaseSource
 
 class CrossrefScorable extends Scorable with HBasePipeConversions {
   // TODO: Generalize args so there can be multiple Crossref pipes in one job.
@@ -50,8 +28,8 @@ object CrossrefScorable {
     Scorable.jsonToMap(json) match {
       case None => MapFeatures(Scorable.NoSlug, json)
       case Some(map) => {
-        if ((map contains "titles") && (map contains "DOI")) {
-          val titles = map("titles").asInstanceOf[List[String]]
+        if ((map contains "title") && (map contains "DOI")) {
+          val titles = map("title").asInstanceOf[List[String]]
           val doi = Scorable.getString(map, "DOI")
           if (titles.isEmpty || titles == null || doi.isEmpty || doi == null) {
             new MapFeatures(Scorable.NoSlug, json)
diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
index de9f51a..94b3494 100644
--- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
@@ -1,15 +1,14 @@
 package sandcrawler
 
 import scala.util.parsing.json.JSONObject
+
 import cascading.flow.FlowDef
-import cascading.pipe.Pipe
 import cascading.tuple.Fields
 import com.twitter.scalding._
 import com.twitter.scalding.typed.TDsl._
 import parallelai.spyglass.hbase.HBaseConstants.SourceMode
 import parallelai.spyglass.hbase.HBasePipeConversions
 import parallelai.spyglass.hbase.HBaseSource
-//import TDsl._
 
 class GrobidScorable extends Scorable with HBasePipeConversions {
   def getSource(args : Args) : Source = {
diff --git a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
deleted file mode 100644
index 018a74b..0000000
--- a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
+++ /dev/null
@@ -1,218 +0,0 @@
-package sandcrawler
-
-import java.text.Normalizer
-import java.util.Arrays
-import java.util.Properties
-import java.util.regex.Pattern
-
-import scala.math
-import scala.util.parsing.json.JSON
-
-import cascading.tuple.Fields
-import com.twitter.scalding._
-import com.twitter.scalding.typed.CoGrouped
-import com.twitter.scalding.typed.Grouped
-import com.twitter.scalding.typed.TDsl._
-import org.apache.hadoop.hbase.io.ImmutableBytesWritable
-import org.apache.hadoop.hbase.util.Bytes
-import parallelai.spyglass.base.JobBase
-import parallelai.spyglass.hbase.HBaseConstants.SourceMode
-import parallelai.spyglass.hbase.HBasePipeConversions
-import parallelai.spyglass.hbase.HBaseSource
-import TDsl._
-
-class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with HBasePipeConversions {
-  val NoTitle = "NO TITLE" // Used for slug if title is empty or unparseable
-
-  // key is SHA1
-  val grobidSource = HBaseCrossrefScore.getHBaseSource(
-    args("hbase-table"),
-    args("zookeeper-hosts"))
-
-  val temp : cascading.pipe.Pipe = grobidSource
-    .read
-
-  // Here I CAN call Pipe.toTypedPipe()
-  val grobidPipe : TypedPipe[(String, String, String)] = temp
-    .fromBytesWritable(new Fields("key", "tei_json"))
-    .toTypedPipe[(String, String)]('key, 'tei_json)
-    .map { entry =>
-      val (key, json) = (entry._1, entry._2)
-      HBaseCrossrefScore.grobidToSlug(json) match {
-        case Some(slug) => (slug, key, json)
-        case None => (NoTitle, key, json)
-      }
-    }
-    .filter { entry =>
-      val (slug, _, _) = entry
-      slug != NoTitle
-    }
-
-  val grobidGroup = grobidPipe
-    .groupBy { case (slug, key, json) => slug }
-
-  val crossrefSource = TextLine(args("crossref-input"))
-  val temp2 : cascading.pipe.Pipe = crossrefSource.read
-  val crossrefPipe : TypedPipe[(String, String)] = temp2
-    //    .debug // Should be 4 tuples for mocked data
-    .toTypedPipe[String]('line)
-    .map{ json : String =>
-      HBaseCrossrefScore.crossrefToSlug(json) match {
-        case Some(slug) => (slug, json)
-        case None => (NoTitle, json)
-      }
-    }
-    .filter { entry =>
-      val (slug, json) = entry
-      slug != NoTitle
-    }
-
-  val crossrefGroup = crossrefPipe
-  .groupBy { case (slug, json) => slug }
-
-  val theJoin : CoGrouped[String, ((String, String, String), (String, String))] =
-    grobidGroup.join(crossrefGroup)
-
-  theJoin.map{ entry =>
-    val (slug : String,
-      ((slug0: String, sha1 : String, grobidJson : String),
-        (slug1 : String, crossrefJson : String))) = entry
-    HBaseCrossrefScore.computeOutput(sha1, grobidJson, crossrefJson)}
-    // Output: score, sha1, doi, grobid title, crossref title
-    .write(TypedTsv[(Int, String, String, String, String)](args("output")))
-
-}
-
-object HBaseCrossrefScore {
-  def getHBaseSource(hbaseTable: String, zookeeperHosts: String) : HBaseSource = HBaseBuilder.build(
-    hbaseTable,      // HBase Table Name
-    zookeeperHosts,  // HBase Zookeeper server (to get runtime config info; can be array?)
-    List("grobid0:tei_json"),
-    SourceMode.SCAN_ALL)
-
-  def jsonToMap(json : String) : Option[Map[String, Any]] = {
-    // https://stackoverflow.com/a/32717262/631051
-    val jsonObject = JSON.parseFull(json)
-    if (jsonObject == None) {
-      None
-    } else {
-      Some(jsonObject.get.asInstanceOf[Map[String, Any]])
-    }
-  }
-
-  def grobidToSlug(json : String) : Option[String] = {
-    jsonToMap(json) match {
-      case None => None
-      case Some(map) => {
-        if (map contains "title") {
-          titleToSlug(map("title").asInstanceOf[String])
-        } else {
-          None
-        }
-      }
-    }
-  }
-
-  def crossrefToSlug(json : String) : Option[String] = {
-    jsonToMap(json) match {
-      case None => None
-      case Some(map) => {
-        if (map contains "title") {
-          // TODO: Don't ignore titles after the first.
-          titleToSlug(map("title").asInstanceOf[List[String]](0))
-        } else {
-          None
-        }
-      }
-    }
-  }
-
-  def titleToSlug(title : String) : Option[String] = {
-    val slug = removeAccents(title).split(":")(0).toLowerCase()
-    if (slug.isEmpty) {
-      None
-    } else {
-      Some(slug)
-    }
-  }
-
-  val MaxScore = 1000
-
-  def computeOutput(sha1 : String, grobidJson : String, crossrefJson : String) :
-    // (score, sha1, doi, grobidTitle, crossrefTitle)
-      (Int, String, String, String, String) = {
-    jsonToMap(grobidJson) match {
-      case None => (0, "", "", "", "")  // This can't happen, because grobidJson already validated in earlier stage
-      case Some(grobid) => {
-        val grobidTitle = grobid("title").asInstanceOf[String].toLowerCase()
-
-        jsonToMap(crossrefJson) match {
-          case None => (0, "", "", "", "")  // This can't happen, because crossrefJson already validated in earlier stage
-          case Some(crossref) => {
-            val crossrefTitle = crossref("title").asInstanceOf[List[String]](0).toLowerCase()
-
-            (similarity(removeAccents(grobidTitle), removeAccents(crossrefTitle)),
-              sha1,
-              crossref("DOI").asInstanceOf[String],
-              "'" + grobidTitle + "'",
-              "'" + crossrefTitle + "'")
-          }
-        }
-      }
-    }
-  }
-
-  // Adapted from https://git-wip-us.apache.org/repos/asf?p=commons-lang.git;a=blob;f=src/main/java/org/apache/commons/lang3/StringUtils.java;h=1d7b9b99335865a88c509339f700ce71ce2c71f2;hb=HEAD#l934
-  def removeAccents(s : String) : String = {
-    val replacements = Map(
-      '\u0141' -> 'L',
-      '\u0142' -> 'l',  // Letter ell
-      '\u00d8' -> 'O',
-      '\u00f8' -> 'o'
-    )
-    val sb = new StringBuilder(Normalizer.normalize(s, Normalizer.Form.NFD))
-    for (i <- 0 to sb.length - 1) {
-      for (key <- replacements.keys) {
-        if (sb(i) == key) {
-          sb.deleteCharAt(i);
-          sb.insert(i, replacements(key))
-        }
-      }
-    }
-    val pattern = Pattern.compile("\\p{InCombiningDiacriticalMarks}+")
-    pattern.matcher(sb).replaceAll("")
-  }
-
-  // Adapted from: https://stackoverflow.com/a/16018452/631051
-  def similarity(s1 : String, s2 : String) : Int = {
-    val longer : String = if (s1.length > s2.length) s1 else s2
-    val shorter : String = if (s1.length > s2.length) s2 else s1
-    if (longer.length == 0) {
-      // Both strings are empty.
-      MaxScore
-    } else {
-      (longer.length - stringDistance(longer, shorter)) * MaxScore / longer.length
-    }
-  }
-
-  // Source: // https://oldfashionedsoftware.com/2009/11/19/string-distance-and-refactoring-in-scala/
-  def stringDistance(s1: String, s2: String): Int = {
-    val memo = scala.collection.mutable.Map[(List[Char],List[Char]),Int]()
-    def min(a:Int, b:Int, c:Int) = Math.min( Math.min( a, b ), c)
-    def sd(s1: List[Char], s2: List[Char]): Int = {
-      if (!memo.contains((s1, s2))) {
-        memo((s1,s2)) = (s1, s2) match {
-          case (_, Nil) => s1.length
-          case (Nil, _) => s2.length
-          case (c1::t1, c2::t2)  =>
-            min( sd(t1,s2) + 1, sd(s1,t2) + 1,
-              sd(t1,t2) + (if (c1==c2) 0 else 1) )
-        }
-      }
-      memo((s1,s2))
-    }
-
-    sd( s1.toList, s2.toList )
-  }
-}
-
diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala
index a256fa4..717b2d5 100644
--- a/scalding/src/main/scala/sandcrawler/Scorable.scala
+++ b/scalding/src/main/scala/sandcrawler/Scorable.scala
@@ -39,7 +39,7 @@ object Scorable {
   // NOTE: I could go all out and make ScorableMap a type.
   // TODO: Require year. Other features will get added here.
   def toScorableMap(title : String, year : Int = 0, doi : String = "", sha1 : String = "") : Map[String, Any] = {
-   Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1)
+    Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1)
   }
 
   def toScorableJson(title : String, year : Int, doi : String = "", sha1 : String = "") : String = {
diff --git a/scalding/src/main/scala/sandcrawler/ScoreJob.scala b/scalding/src/main/scala/sandcrawler/ScoreJob.scala
index 386b367..75d45e9 100644
--- a/scalding/src/main/scala/sandcrawler/ScoreJob.scala
+++ b/scalding/src/main/scala/sandcrawler/ScoreJob.scala
@@ -1,16 +1,12 @@
 package sandcrawler
 
-import cascading.flow.FlowDef
-import cascading.tuple.Fields
-import com.twitter.scalding.{Args,Source,TextLine,TypedPipe, TypedTsv}
-//import com.twitter.scalding.source.TypedText
-import parallelai.spyglass.base.JobBase
-import parallelai.spyglass.hbase.HBasePipeConversions
-import parallelai.spyglass.hbase.HBaseSource
-import com.twitter.scalding.{ Dsl, RichPipe, IterableSource, TupleSetter, TupleConverter }
 import cascading.pipe.Pipe
+import com.twitter.scalding.Args
+import com.twitter.scalding.TypedPipe
+import com.twitter.scalding.TypedTsv
+import parallelai.spyglass.base.JobBase
 
-class ScoreJob(args: Args) extends JobBase(args) { //with HBasePipeConversions {
+class ScoreJob(args: Args) extends JobBase(args) {
   // TODO: Instantiate any subclass of Scorable specified in args.
   val sc1 : Scorable = new GrobidScorable()
   val sc2 : Scorable = new CrossrefScorable()
@@ -27,10 +23,10 @@ class ScoreJob(args: Args) extends JobBase(args) { //with HBasePipeConversions {
   }
   //TypedTsv doesn't work over case classes.
     .map { entry => (entry.slug, entry.score, entry.json1, entry.json2) }
-
     .write(TypedTsv[(String, Int, String, String)](args("output")))
 }
 
+/*
 // Ugly hack to get non-String information into ScoreJob above.
 object ScoreJob {
   var scorable1 : Option[Scorable] = None
@@ -57,38 +53,5 @@ object ScoreJob {
       case None => null
     }
   }
-
-  /*
-  implicit def sourceToRichPipe(src: Source): RichPipe = new RichPipe(src.read)
-
-  // This converts an Iterable into a Pipe or RichPipe with index (int-based) fields
-  implicit def toPipe[T](iter: Iterable[T])(implicit set: TupleSetter[T], conv: TupleConverter[T]): Pipe =
-    IterableSource[T](iter)(set, conv).read
-
-  implicit def iterableToRichPipe[T](iter: Iterable[T])(implicit set: TupleSetter[T], conv: TupleConverter[T]): RichPipe =
-    RichPipe(toPipe(iter)(set, conv))
-
-  // Provide args as an implicit val for extensions such as the Checkpoint extension.
-//  implicit protected def _implicitJobArgs: Args = args
-
-  def getFeaturesPipe1(pipe : cascading.pipe.Pipe) : TypedPipe[String] = {
-    pipe
-    // The next line gives an error: value toTypedPipe is not a member of cascading.pipe.Pipe
-      .toTypedPipe[String](new Fields("line"))
-  }
-
-  def getFeaturesPipe(pipe : cascading.pipe.Pipe) : TypedPipe[MapFeatures] = {
-    pipe
-      .fromBytesWritable(new Fields("key", "tei_json"))
-    // I needed to change symbols to strings when I pulled this out of ScoreJob.
-      .toTypedPipe[(String, String)](new Fields("key", "tei_json"))
-      .map { entry =>
-        val (key : String, json : String) = (entry._1, entry._2)
-        GrobidScorable.grobidToSlug(json) match {
-          case Some(slug) => new MapFeatures(slug, json)
-          case None => new MapFeatures(Scorable.NoSlug, json)
-        }
-      }
-  }
- */
 }
+ */
diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
index dc6f347..75be03e 100644
--- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
@@ -61,7 +61,7 @@ class CrossrefScorableTest extends FlatSpec with Matchers {
   "subject" : [ "Pediatrics, Perinatology, and Child Health" ]
 }
 """
-  val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "SomeTitle")
+  val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "Some Title")
   val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle")
   val MalformedCrossrefString = CrossrefString.replace("}", "")
 
@@ -78,11 +78,11 @@ class CrossrefScorableTest extends FlatSpec with Matchers {
 
   it should "handle valid input" in {
     val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithTitle)
-    result.slug shouldBe "dummyexamplefile"
+    result.slug shouldBe "sometitle"
     Scorable.jsonToMap(result.json) match {
       case None => fail()
       case Some(map) => {
-        map("title").asInstanceOf[String] shouldBe "Dummy Example File"
+        map("title").asInstanceOf[String] shouldBe "Some Title"
       }
     }
   }
diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
index 8436817..f0b411f 100644
--- a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
@@ -113,25 +113,32 @@ class ScoreJobTest extends FlatSpec with Matchers {
   val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "SomeTitle")
   val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle")
   val MalformedCrossrefString = CrossrefString.replace("}", "")
+  val CrossrefStrings = List(
+    CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", "DOI-0"),
+    CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2A").replace("<<DOI>>", "DOI-0.5"),
+    CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"),
+    CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1"))
 
   //  Pipeline tests
   val output = "/tmp/testOutput"
   val input = "/tmp/testInput"
   val (testTable, testHost) = ("test-table", "dummy-host:2181")
 
-  val grobidSampleData = List(
-    List(Bytes.toBytes("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q"),
-      Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 1"))),
-    List(Bytes.toBytes("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"),
-      Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 2: TNG"))),
-    List(Bytes.toBytes("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT"),
-      Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 3: The Sequel"))),
-    List(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56"), 
-      Bytes.toBytes(MalformedGrobidString)))
+  val Sha1Strings = List(
+    "sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q",
+    "sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU",
+    "sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT",
+    "sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56")
 
-  // TODO: Make less yucky.
-  ScoreJob.setScorable1(new CrossrefScorable())
-  ScoreJob.setScorable2(new GrobidScorable())
+  val GrobidStrings = List(
+    GrobidString.replace("<<TITLE>>", "Title 1"),
+    GrobidString.replace("<<TITLE>>", "Title 2: TNG"),
+    GrobidString.replace("<<TITLE>>", "Title 3: The Sequel"),
+    MalformedGrobidString)
+
+  val GrobidSampleData = (Sha1Strings zip GrobidStrings)
+    .map{case(s, g) =>
+      List(Bytes.toBytes(s), Bytes.toBytes(g))}
 
   JobTest("sandcrawler.ScoreJob")
     .arg("test", "")
@@ -142,12 +149,12 @@ class ScoreJobTest extends FlatSpec with Matchers {
     .arg("crossref-input", input)
     .arg("debug", "true")
     .source[Tuple](GrobidScorable.getHBaseSource(testTable, testHost),
-      grobidSampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*)))
+      GrobidSampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*)))
     .source(TextLine(input), List(
-      0 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", "DOI-0"),
-      1 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2").replace("<<DOI>>", "DOI-0.5"),
-      2 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"),
-      3 -> CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1")))
+      0 -> CrossrefStrings(0),
+      1 -> CrossrefStrings(1),
+      2 -> CrossrefStrings(2),
+      3 -> CrossrefStrings(3)))
     .sink[(String, Int, String, String)](TypedTsv[(String, Int, String, String)](output)) {
       // Grobid titles and slugs (in parentheses): 
       //   Title 1                       (title1)
@@ -155,27 +162,40 @@ class ScoreJobTest extends FlatSpec with Matchers {
       //   Title 3: The Sequel           (title3)
       // crossref titles and slugs (in parentheses):
       //   Title 1: TNG                  (title1)
-      //   Title 1: TNG 2                (title1)
+      //   Title 1: TNG 2A               (title1)
       //   Title 1: TNG 3                (title1)
-      //   Title 2 Rebooted              (title2rebooted)
+      //   Title 2: Rebooted             (title2)
       // Join should have 3 "title1" slugs and 1 "title2" slug
       outputBuffer => 
       "The pipeline" should "return a 4-element list" in {
         outputBuffer should have length 4
       }
 
-              /*
-      it should "return the right first entry" in {
-        outputBuffer(0) shouldBe ReduceOutput("slug", 50, "",
-          "")
-        val (slug, slug0, slug1, sha1, grobidJson, crossrefJson) = outputBuffer(0)
-        slug shouldBe "title 1"
-        slug shouldBe slug0
-        slug shouldBe slug1
-        sha1 shouldBe new String(grobidSampleData(0)(0), "UTF-8")
-        grobidJson shouldBe new String(grobidSampleData(0)(1), "UTF-8")
+      it should "have right # of entries with each slug" in {
+        val slugs = outputBuffer.map(_._1) // first tuple field is the slug
+        val countMap : Map[String, Int] = slugs.groupBy(identity).mapValues(_.size)
+        countMap("title1") shouldBe 3
+        countMap("title2") shouldBe 1
+      }
+
+      def bundle(slug : String, grobidIndex : Int, crossrefIndex : Int) = {
+        val mf1 : MapFeatures = GrobidScorable.jsonToMapFeatures(
+          Sha1Strings(grobidIndex), 
+          GrobidStrings(grobidIndex))
+        val mf2 : MapFeatures = CrossrefScorable.jsonToMapFeatures(
+          CrossrefStrings(crossrefIndex))
+        val score = Scorable.computeSimilarity(
+          ReduceFeatures(mf1.json),
+          ReduceFeatures(mf2.json))
+        (slug, score, mf1.json, mf2.json)
+      }
+
+      it should "have right output values" in { // every expected row must appear in the sink
+        outputBuffer should contain (bundle("title1", 0, 0))
+        outputBuffer should contain (bundle("title1", 0, 2))
+        outputBuffer should contain (bundle("title1", 0, 1))
+        outputBuffer should contain (bundle("title2", 1, 3))
       }
-        */
     }
     .run
     .finish
-- 
cgit v1.2.3


From 3ff30c8f20d36f8e47ec5478c10c3348d2f45fa6 Mon Sep 17 00:00:00 2001
From: Ellen Spertus <ellen.spertus@gmail.com>
Date: Tue, 14 Aug 2018 20:38:29 -0700
Subject: Fixed style problems (or disabled warning when appropriate) for
 tests.

---
 scalding/build.sbt                                 |  7 ++
 .../scala/sandcrawler/CrossrefScorableTest.scala   | 87 ++++++++++---------
 .../scala/sandcrawler/GrobidScorableTest.scala     |  7 +-
 .../test/scala/sandcrawler/HBaseBuilderTest.scala  |  1 +
 .../scala/sandcrawler/HBaseMimeCountTest.scala     |  9 +-
 .../test/scala/sandcrawler/HBaseRowCountTest.scala | 11 +--
 .../scala/sandcrawler/HBaseStatusCountTest.scala   | 10 ++-
 .../scala/sandcrawler/ScorableFeaturesTest.scala   |  1 +
 .../src/test/scala/sandcrawler/ScorableTest.scala  |  5 +-
 .../src/test/scala/sandcrawler/ScoreJobTest.scala  | 97 ++++++++++++----------
 10 files changed, 135 insertions(+), 100 deletions(-)

(limited to 'scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala')

diff --git a/scalding/build.sbt b/scalding/build.sbt
index 2addd60..d477399 100644
--- a/scalding/build.sbt
+++ b/scalding/build.sbt
@@ -20,6 +20,13 @@ lazy val root = (project in file(".")).
       scalaSourceFiles.filterNot(_.getAbsolutePath.contains(dirNameToExclude))
     },
 
+    (scalastyleSources in Test) := {
+      // all .scala files in "src/test/scala"
+      val scalaSourceFiles = ((scalaSource in Test).value ** "*.scala").get    
+      val dirNameToExclude = "/example/"
+      scalaSourceFiles.filterNot(_.getAbsolutePath.contains(dirNameToExclude))
+    },
+
     name := "sandcrawler",
 
     resolvers += "conjars.org" at "http://conjars.org/repo",
diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
index 75be03e..e171dba 100644
--- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
@@ -2,72 +2,77 @@ package sandcrawler
 
 import cascading.tuple.Fields
 import cascading.tuple.Tuple
-import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions}
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.TextLine
+import com.twitter.scalding.TupleConversions
+import com.twitter.scalding.TypedTsv
 import org.apache.hadoop.hbase.io.ImmutableBytesWritable
 import org.apache.hadoop.hbase.util.Bytes
 import org.scalatest._
 import parallelai.spyglass.hbase.HBaseConstants.SourceMode
 
 class CrossrefScorableTest extends FlatSpec with Matchers {
+  // scalastyle:off
   val CrossrefString =
 """
-{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" }, 
-  "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ], 
-    "date-time" : "2017-10-23T17:19:16Z", 
-    "timestamp" : { "$numberLong" : "1508779156477" } }, 
-  "reference-count" : 0, 
-  "publisher" : "Elsevier BV", 
-  "issue" : "3", 
-  "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/", 
-                    "start" : { "date-parts" : [ [ 1996, 1, 1 ] ], 
-                                "date-time" : "1996-01-01T00:00:00Z", 
-                                "timestamp" : { "$numberLong" : "820454400000" } }, 
+{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" },
+  "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ],
+    "date-time" : "2017-10-23T17:19:16Z",
+    "timestamp" : { "$numberLong" : "1508779156477" } },
+  "reference-count" : 0,
+  "publisher" : "Elsevier BV",
+  "issue" : "3",
+  "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/",
+                    "start" : { "date-parts" : [ [ 1996, 1, 1 ] ],
+                                "date-time" : "1996-01-01T00:00:00Z",
+                                "timestamp" : { "$numberLong" : "820454400000" } },
                                 "delay-in-days" : 0, "content-version" : "tdm" }],
-  "content-domain" : { "domain" : [], "crossmark-restriction" : false }, 
-  "published-print" : { "date-parts" : [ [ 1996 ] ] }, 
+  "content-domain" : { "domain" : [], "crossmark-restriction" : false },
+  "published-print" : { "date-parts" : [ [ 1996 ] ] },
   "DOI" : "<<DOI>>",
-  "type" : "journal-article", 
-  "created" : { "date-parts" : [ [ 2002, 7, 25 ] ], 
-    "date-time" : "2002-07-25T15:09:41Z", 
-    "timestamp" : { "$numberLong" : "1027609781000" } }, 
-  "page" : "186-187", 
-  "source" : "Crossref", 
-  "is-referenced-by-count" : 0, 
+  "type" : "journal-article",
+  "created" : { "date-parts" : [ [ 2002, 7, 25 ] ],
+    "date-time" : "2002-07-25T15:09:41Z",
+    "timestamp" : { "$numberLong" : "1027609781000" } },
+  "page" : "186-187",
+  "source" : "Crossref",
+  "is-referenced-by-count" : 0,
   "title" : [ "<<TITLE>>" ],
-  "prefix" : "10.1016", 
-  "volume" : "9", 
-  "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ], 
-  "member" : "78", 
-  "container-title" : [ "Journal de Pédiatrie et de Puériculture" ], 
+  "prefix" : "10.1016",
+  "volume" : "9",
+  "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ],
+  "member" : "78",
+  "container-title" : [ "Journal de Pédiatrie et de Puériculture" ],
   "link" : [ { "URL" :  "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/xml",
-               "content-type" : "text/xml", 
+               "content-type" : "text/xml",
                  "content-version" : "vor",
-                 "intended-application" : "text-mining" }, 
+                 "intended-application" : "text-mining" },
                { "URL" :
   "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/plain",
-                 "content-type" : "text/plain", 
+                 "content-type" : "text/plain",
                  "content-version" : "vor",
-                 "intended-application" : "text-mining" } ], 
-  "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ], 
-                  "date-time" : "2015-09-03T10:03:43Z", 
-                  "timestamp" : { "$numberLong" : "1441274623000" } }, 
-  "score" : 1, 
-  "issued" : { "date-parts" : [ [ 1996 ] ] }, 
-  "references-count" : 0, 
-  "alternative-id" : [ "0987-7983(96)87729-2" ], 
-  "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2", 
-  "ISSN" : [ "0987-7983" ], 
-  "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ], 
+                 "intended-application" : "text-mining" } ],
+  "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ],
+                  "date-time" : "2015-09-03T10:03:43Z",
+                  "timestamp" : { "$numberLong" : "1441274623000" } },
+  "score" : 1,
+  "issued" : { "date-parts" : [ [ 1996 ] ] },
+  "references-count" : 0,
+  "alternative-id" : [ "0987-7983(96)87729-2" ],
+  "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2",
+  "ISSN" : [ "0987-7983" ],
+  "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ],
   "subject" : [ "Pediatrics, Perinatology, and Child Health" ]
 }
 """
+  // scalastyle:on
   val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "Some Title")
   val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle")
   val MalformedCrossrefString = CrossrefString.replace("}", "")
 
   // Unit tests
   "CrossrefScorable.jsonToMapFeatures()" should "handle invalid JSON" in {
-    val result = CrossrefScorable.jsonToMapFeatures(MalformedCrossrefString) 
+    val result = CrossrefScorable.jsonToMapFeatures(MalformedCrossrefString)
     result.slug shouldBe Scorable.NoSlug
   }
 
diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
index 4b958b9..661824b 100644
--- a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
@@ -2,7 +2,10 @@ package sandcrawler
 
 import cascading.tuple.Fields
 import cascading.tuple.Tuple
-import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions}
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.TextLine
+import com.twitter.scalding.TupleConversions
+import com.twitter.scalding.TypedTsv
 import org.apache.hadoop.hbase.io.ImmutableBytesWritable
 import org.apache.hadoop.hbase.util.Bytes
 import org.scalatest._
@@ -62,7 +65,7 @@ class GrobidScorableTest extends FlatSpec with Matchers {
   // Unit tests
 
   "GrobidScorable.jsonToMapFeatures()" should "handle invalid JSON" in {
-    val result = GrobidScorable.jsonToMapFeatures(Key, MalformedGrobidString) 
+    val result = GrobidScorable.jsonToMapFeatures(Key, MalformedGrobidString)
     result.slug shouldBe Scorable.NoSlug
   }
 
diff --git a/scalding/src/test/scala/sandcrawler/HBaseBuilderTest.scala b/scalding/src/test/scala/sandcrawler/HBaseBuilderTest.scala
index 603a4c7..c61cb22 100644
--- a/scalding/src/test/scala/sandcrawler/HBaseBuilderTest.scala
+++ b/scalding/src/test/scala/sandcrawler/HBaseBuilderTest.scala
@@ -22,6 +22,7 @@ class HBaseBuilderTest extends FlatSpec with Matchers {
     fields should have length 0
   }
 
+  //scalastyle:off no.whitespace.before.left.bracket
   it should "throw IllegalArgumentException on malformed input" in {
     a [IllegalArgumentException] should be thrownBy {
       HBaseBuilder.parseColSpecs(List("file_size"))
diff --git a/scalding/src/test/scala/sandcrawler/HBaseMimeCountTest.scala b/scalding/src/test/scala/sandcrawler/HBaseMimeCountTest.scala
index fde2290..d6d283f 100644
--- a/scalding/src/test/scala/sandcrawler/HBaseMimeCountTest.scala
+++ b/scalding/src/test/scala/sandcrawler/HBaseMimeCountTest.scala
@@ -1,15 +1,18 @@
 package sandcrawler
 
-import cascading.tuple.{Tuple, Fields}
-import com.twitter.scalding.{JobTest, Tsv, TupleConversions}
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.Tsv
+import com.twitter.scalding.TupleConversions
 import org.apache.hadoop.hbase.io.ImmutableBytesWritable
 import org.apache.hadoop.hbase.util.Bytes
 import org.junit.runner.RunWith
 import org.scalatest.FunSpec
 import org.scalatest.junit.JUnitRunner
 import org.slf4j.LoggerFactory
-import parallelai.spyglass.hbase.HBaseSource
 import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBaseSource
 import scala._
 
 @RunWith(classOf[JUnitRunner])
diff --git a/scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala b/scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala
index 3424a36..c4ca5aa 100644
--- a/scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala
+++ b/scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala
@@ -1,15 +1,18 @@
 package sandcrawler
 
-import cascading.tuple.{Tuple, Fields}
-import com.twitter.scalding.{JobTest, Tsv, TupleConversions}
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.Tsv
+import com.twitter.scalding.TupleConversions
 import org.apache.hadoop.hbase.io.ImmutableBytesWritable
 import org.apache.hadoop.hbase.util.Bytes
 import org.junit.runner.RunWith
 import org.scalatest.FunSpec
 import org.scalatest.junit.JUnitRunner
 import org.slf4j.LoggerFactory
-import parallelai.spyglass.hbase.HBaseSource
 import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBaseSource
 import scala._
 
 /**
@@ -47,12 +50,10 @@ class HBaseRowCountTest extends FunSpec with TupleConversions {
       outputBuffer =>
 
         it("should return the test data provided.") {
-          println("outputBuffer.size => " + outputBuffer.size)
           assert(outputBuffer.size === 1)
         }
 
         it("should return the correct count") {
-          println("raw output => " + outputBuffer)
           assert(outputBuffer(0).getObject(0) === 8)
         }
     }
diff --git a/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala b/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala
index 8a71f31..fe3ff21 100644
--- a/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala
+++ b/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala
@@ -1,15 +1,19 @@
 package sandcrawler
 
-import cascading.tuple.{Tuple, Fields}
-import com.twitter.scalding.{JobTest, Tsv, TypedTsv, TupleConversions}
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.Tsv
+import com.twitter.scalding.TupleConversions
+import com.twitter.scalding.TypedTsv
 import org.apache.hadoop.hbase.io.ImmutableBytesWritable
 import org.apache.hadoop.hbase.util.Bytes
 import org.junit.runner.RunWith
 import org.scalatest.FunSpec
 import org.scalatest.junit.JUnitRunner
 import org.slf4j.LoggerFactory
-import parallelai.spyglass.hbase.HBaseSource
 import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+import parallelai.spyglass.hbase.HBaseSource
 import scala._
 
 @RunWith(classOf[JUnitRunner])
diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
index 7ec0c4d..f9c30a2 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
@@ -2,6 +2,7 @@ package sandcrawler
 
 import org.scalatest._
 
+// scalastyle:off null
 class ScorableFeaturesTest extends FlatSpec with Matchers {
   private def titleToSlug(s : String) : String = {
     new ScorableFeatures(title = s).toSlug
diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
index fd44f57..f63bef8 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
@@ -2,7 +2,10 @@ package sandcrawler
 
 import cascading.tuple.Fields
 import cascading.tuple.Tuple
-import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions}
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.TextLine
+import com.twitter.scalding.TupleConversions
+import com.twitter.scalding.TypedTsv
 import org.apache.hadoop.hbase.io.ImmutableBytesWritable
 import org.apache.hadoop.hbase.util.Bytes
 import org.scalatest._
diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
index 1c6ae83..34081a5 100644
--- a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
@@ -2,13 +2,17 @@ package sandcrawler
 
 import cascading.tuple.Fields
 import cascading.tuple.Tuple
-import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions}
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.TextLine
+import com.twitter.scalding.TupleConversions
+import com.twitter.scalding.TypedTsv
 import org.apache.hadoop.hbase.io.ImmutableBytesWritable
 import org.apache.hadoop.hbase.util.Bytes
 import org.scalatest._
 import parallelai.spyglass.hbase.HBaseConstants.SourceMode
 
 class ScoreJobTest extends FlatSpec with Matchers {
+  //scalastyle:off
   val JsonString = """
 {
   "title": "<<TITLE>>",
@@ -54,62 +58,65 @@ class ScoreJobTest extends FlatSpec with Matchers {
   "annex": null
 }
 """
+  // scalastyle:on
   val JsonStringWithTitle = JsonString.replace("<<TITLE>>", "Dummy Example File")
   val JsonStringWithoutTitle = JsonString.replace("title", "nottitle")
   val MalformedJsonString = JsonString.replace("}", "")
 
+  // scalastyle:off
   val CrossrefString =
 """
-{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" }, 
-  "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ], 
-    "date-time" : "2017-10-23T17:19:16Z", 
-    "timestamp" : { "$numberLong" : "1508779156477" } }, 
-  "reference-count" : 0, 
-  "publisher" : "Elsevier BV", 
-  "issue" : "3", 
-  "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/", 
-                    "start" : { "date-parts" : [ [ 1996, 1, 1 ] ], 
-                                "date-time" : "1996-01-01T00:00:00Z", 
-                                "timestamp" : { "$numberLong" : "820454400000" } }, 
+{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" },
+  "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ],
+    "date-time" : "2017-10-23T17:19:16Z",
+    "timestamp" : { "$numberLong" : "1508779156477" } },
+  "reference-count" : 0,
+  "publisher" : "Elsevier BV",
+  "issue" : "3",
+  "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/",
+                    "start" : { "date-parts" : [ [ 1996, 1, 1 ] ],
+                                "date-time" : "1996-01-01T00:00:00Z",
+                                "timestamp" : { "$numberLong" : "820454400000" } },
                                 "delay-in-days" : 0, "content-version" : "tdm" }],
-  "content-domain" : { "domain" : [], "crossmark-restriction" : false }, 
-  "published-print" : { "date-parts" : [ [ 1996 ] ] }, 
+  "content-domain" : { "domain" : [], "crossmark-restriction" : false },
+  "published-print" : { "date-parts" : [ [ 1996 ] ] },
   "DOI" : "<<DOI>>",
-  "type" : "journal-article", 
-  "created" : { "date-parts" : [ [ 2002, 7, 25 ] ], 
-    "date-time" : "2002-07-25T15:09:41Z", 
-    "timestamp" : { "$numberLong" : "1027609781000" } }, 
-  "page" : "186-187", 
-  "source" : "Crossref", 
-  "is-referenced-by-count" : 0, 
+  "type" : "journal-article",
+  "created" : { "date-parts" : [ [ 2002, 7, 25 ] ],
+    "date-time" : "2002-07-25T15:09:41Z",
+    "timestamp" : { "$numberLong" : "1027609781000" } },
+  "page" : "186-187",
+  "source" : "Crossref",
+  "is-referenced-by-count" : 0,
   "title" : [ "<<TITLE>>" ],
-  "prefix" : "10.1016", 
-  "volume" : "9", 
-  "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ], 
-  "member" : "78", 
-  "container-title" : [ "Journal de Pédiatrie et de Puériculture" ], 
+  "prefix" : "10.1016",
+  "volume" : "9",
+  "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ],
+  "member" : "78",
+  "container-title" : [ "Journal de Pédiatrie et de Puériculture" ],
   "link" : [ { "URL" :  "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/xml",
-               "content-type" : "text/xml", 
-                 "content-version" : "vor",
-                 "intended-application" : "text-mining" }, 
+               "content-type" : "text/xml",
+               "content-version" : "vor",
+               "intended-application" : "text-mining" },
                { "URL" :
   "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/plain",
-                 "content-type" : "text/plain", 
+                 "content-type" : "text/plain",
                  "content-version" : "vor",
-                 "intended-application" : "text-mining" } ], 
-  "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ], 
-                  "date-time" : "2015-09-03T10:03:43Z", 
-                  "timestamp" : { "$numberLong" : "1441274623000" } }, 
-  "score" : 1, 
-  "issued" : { "date-parts" : [ [ 1996 ] ] }, 
-  "references-count" : 0, 
-  "alternative-id" : [ "0987-7983(96)87729-2" ], 
-  "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2", 
-  "ISSN" : [ "0987-7983" ], 
-  "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ], 
+                 "intended-application" : "text-mining" } ],
+  "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ],
+                  "date-time" : "2015-09-03T10:03:43Z",
+                  "timestamp" : { "$numberLong" : "1441274623000" } },
+  "score" : 1,
+  "issued" : { "date-parts" : [ [ 1996 ] ] },
+  "references-count" : 0,
+  "alternative-id" : [ "0987-7983(96)87729-2" ],
+  "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2",
+  "ISSN" : [ "0987-7983" ],
+  "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ],
   "subject" : [ "Pediatrics, Perinatology, and Child Health" ]
 }
 """
+  // scalastyle:on
   val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "SomeTitle")
   val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle")
   val MalformedCrossrefString = CrossrefString.replace("}", "")
@@ -168,7 +175,7 @@ class ScoreJobTest extends FlatSpec with Matchers {
       2 -> CrossrefStrings(2),
       3 -> CrossrefStrings(3)))
     .sink[(String, Int, String, String)](TypedTsv[(String, Int, String, String)](output)) {
-      // Grobid titles and slugs (in parentheses): 
+      // Grobid titles and slugs (in parentheses):
       //   Title 1                       (title1)
       //   Title 2: TNG                  (title2)
       //   Title 3: The Sequel           (title3)
@@ -178,7 +185,7 @@ class ScoreJobTest extends FlatSpec with Matchers {
       //   Title 1: TNG 3                (title1)
       //   Title 2: Rebooted             (title2)
       // Join should have 3 "title1" slugs and 1 "title2" slug
-      outputBuffer => 
+      outputBuffer =>
       "The pipeline" should "return a 4-element list" in {
         outputBuffer should have length 4
       }
@@ -190,9 +197,9 @@ class ScoreJobTest extends FlatSpec with Matchers {
         countMap("title2") shouldBe 1
       }
 
-      def bundle(slug : String, grobidIndex : Int, crossrefIndex : Int) = {
+      def bundle(slug : String, grobidIndex : Int, crossrefIndex : Int) : (String, Int, String, String) = {
         val mf1 : MapFeatures = GrobidScorable.jsonToMapFeatures(
-          Sha1Strings(grobidIndex), 
+          Sha1Strings(grobidIndex),
           JsonStrings(grobidIndex))
         val mf2 : MapFeatures = CrossrefScorable.jsonToMapFeatures(
           CrossrefStrings(crossrefIndex))
-- 
cgit v1.2.3


From a3bf1d47fac53b818a8118020adced6c54be7cba Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Wed, 15 Aug 2018 20:22:44 -0700
Subject: crossref: test for empty-string title

---
 scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala')

diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
index e171dba..1789d1a 100644
--- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
@@ -67,6 +67,7 @@ class CrossrefScorableTest extends FlatSpec with Matchers {
 """
   // scalastyle:on
   val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "Some Title")
+  val CrossrefStringWithEmptyTitle = CrossrefString.replace("<<TITLE>>", "")
   val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle")
   val MalformedCrossrefString = CrossrefString.replace("}", "")
 
@@ -81,6 +82,11 @@ class CrossrefScorableTest extends FlatSpec with Matchers {
     result.slug shouldBe Scorable.NoSlug
   }
 
+  it should "handle empty title" in {
+    val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithEmptyTitle)
+    result.slug shouldBe Scorable.NoSlug
+  }
+
   it should "handle valid input" in {
     val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithTitle)
     result.slug shouldBe "sometitle"
-- 
cgit v1.2.3