Added GrobidScorableTest, minor improvements.

author: Ellen Spertus <ellen.spertus@gmail.com> 2018-08-07 11:05:23 -0700
committer: Ellen Spertus <ellen.spertus@gmail.com> 2018-08-07 11:05:23 -0700
commit: 8dc3bf5c6f68d1fffa9f940ba1024ed95e76ed64 (patch)
tree: f515c25882aebeb5edb8d8a13e06e457e19a4fb4 /scalding/src
parent: 408123177b9e8afd145ea0f0fa1d6bb449f1bd20 (diff)
download: sandcrawler-8dc3bf5c6f68d1fffa9f940ba1024ed95e76ed64.tar.gz
sandcrawler-8dc3bf5c6f68d1fffa9f940ba1024ed95e76ed64.zip
4 files changed, 179 insertions, 52 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
index 0849aff..cf5849c 100644
--- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
@@ -15,10 +15,27 @@ class CrossrefScorable extends Scorable {
       .read
       .toTypedPipe[String](new Fields("line"))
       .map{ json : String =>
-        HBaseCrossrefScore.crossrefToSlug(json) match {
+        CrossrefScorable.crossrefToSlug(json) match {
           case Some(slug) => new MapFeatures(slug, json)
           case None => new MapFeatures(Scorable.NoSlug, json)
         }
       }
   }
 }
+
+object CrossrefScorable {
+  def crossrefToSlug(json : String) : Option[String] = {
+    Scorable.jsonToMap(json) match {
+      case None => None
+      case Some(map) => {
+        if (map contains "title") {
+          // TODO: Don't ignore titles after the first.
+          val title = map("title").asInstanceOf[List[String]](0)
+          Some(Scorable.titleToSlug(title))
+        } else {
+          None
+        }
+      }
+    }
+  }
+}
diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
index 8da7708..25e5985 100644
--- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
@@ -32,14 +32,20 @@ class GrobidScorable extends Scorable with HBasePipeConversions {
       }
     }
   }
-/*
-  def fromBytesWritableLocal(f: Fields): Pipe = {
-	asList(f)
-	  .foldLeft(pipe) { (p, fld) => {
-	    p.map(fld.toString -> fld.toString) { from: org.apache.hadoop.hbase.io.ImmutableBytesWritable =>
-            Option(from).map(x => Bytes.toString(x.get)).getOrElse(null)
-          }
-      }}
+}
+
+object GrobidScorable {
+  def grobidToSlug(json : String) : Option[String] = {
+    Scorable.jsonToMap(json) match {
+      case None => None
+      case Some(map) => {
+        if (map contains "title") {
+          Some(Scorable.titleToSlug(map("title").asInstanceOf[String]))
+        } else {
+          None
+        }
+      }
+    }
   }
- */
 }
+
diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
new file mode 100644
index 0000000..7777610
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
@@ -0,0 +1,77 @@
+package sandcrawler
+
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions}
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import org.scalatest._
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+
+class GrobidScorableTest extends FlatSpec with Matchers {
+  val GrobidString = """
+{
+  "title": "<<TITLE>>",
+  "authors": [
+    {"name": "Brewster Kahle"},
+    {"name": "J Doe"}
+  ],
+  "journal": {
+    "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
+    "eissn": null,
+    "issn": null,
+    "issue": null,
+    "publisher": null,
+    "volume": null
+  },
+  "date": "2000",
+  "doi": null,
+  "citations": [
+    { "authors": [{"name": "A Seaperson"}],
+      "date": "2001",
+      "id": "b0",
+      "index": 0,
+      "issue": null,
+      "journal": "Letters in the Alphabet",
+      "publisher": null,
+      "title": "Everything is Wonderful",
+      "url": null,
+      "volume": "20"},
+    { "authors": [],
+      "date": "2011-03-28",
+      "id": "b1",
+      "index": 1,
+      "issue": null,
+      "journal": "The Dictionary",
+      "publisher": null,
+      "title": "All about Facts",
+      "url": null,
+      "volume": "14"}
+  ],
+  "abstract": "Everything you ever wanted to know about nothing",
+  "body": "Introduction \nEverything starts somewhere, as somebody [1]  once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.",
+  "acknowledgement": null,
+  "annex": null
+}
+"""
+  val GrobidStringWithTitle = GrobidString.replace("<<TITLE>>", "Dummy Example File")
+  val GrobidStringWithoutTitle = GrobidString.replace("title", "nottitle")
+  val MalformedGrobidString = GrobidString.replace("}", "")
+
+  // Unit tests
+
+  "grobidToSlug()" should "get the right slug for a grobid json string" in {
+    val slug = GrobidScorable.grobidToSlug(GrobidStringWithTitle)
+    slug should contain ("dummy example file")
+  }
+
+  it should "return None if given json string without title" in {
+    val slug = GrobidScorable.grobidToSlug(GrobidStringWithoutTitle)
+    slug shouldBe None
+  }
+
+  it should "return None if given a malformed json string" in {
+    val slug = GrobidScorable.grobidToSlug(MalformedGrobidString)
+    slug shouldBe None
+  }
+}
diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
index 9437fe6..8445073 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
@@ -9,7 +9,7 @@ import org.scalatest._
 import parallelai.spyglass.hbase.HBaseConstants.SourceMode
 
 class ScorableTest extends FlatSpec with Matchers {
-  val JsonString = """
+      val JsonString = """
 {
   "title": "<<TITLE>>",
   "authors": [
@@ -54,59 +54,86 @@ class ScorableTest extends FlatSpec with Matchers {
   "annex": null
 }
 """
-  val MalformedJsonString = JsonString.replace("}", "")
 
-  "titleToSlug()" should "extract the parts of titles before a colon" in {
-    Scorable.titleToSlug("HELLO:there") shouldBe "hello"
-  }
+  performUnitTests()
+  performPipelineTests()
 
-  it should "extract an entire colon-less string" in {
-    Scorable.titleToSlug("hello THERE") shouldBe "hello there"
-  }
+  def performUnitTests() {
+    "titleToSlug()" should "extract the parts of titles before a colon" in {
+      Scorable.titleToSlug("HELLO:there") shouldBe "hello"
+    }
 
-  it should "return Scorable.NoSlug if given empty string" in {
-    Scorable.titleToSlug("") shouldBe Scorable.NoSlug
-  }
+    it should "extract an entire colon-less string" in {
+      Scorable.titleToSlug("hello THERE") shouldBe "hello there"
+    }
 
-  "jsonToMap()" should "return a map, given a legal JSON string" in {
-    Scorable.jsonToMap(JsonString) should not be (None)
-  }
+    it should "return Scorable.NoSlug if given empty string" in {
+      Scorable.titleToSlug("") shouldBe Scorable.NoSlug
+    }
 
-  it should "return None, given illegal JSON" in {
-    Scorable.jsonToMap("illegal{,json{{") should be (None)
-  }
+    "jsonToMap()" should "return a map, given a legal JSON string" in {
+      Scorable.jsonToMap(JsonString) should not be (None)
+    }
 
-  "computeOutput()" should "return Scorable.MaxScore if given identical ReduceFeatures" in { 
-    val score = Scorable.computeSimilarity(
-      new ReduceFeatures(JsonString), new ReduceFeatures(JsonString))
-    score shouldBe Scorable.MaxScore
-  }
+    it should "return None, given illegal JSON" in {
+      Scorable.jsonToMap("illegal{,json{{") should be (None)
+    }
 
-  /*
-  it should "return None if given a malformed json string" in {
-    val slug = Scorable.grobidToSlug(MalformedGrobidString)
-    slug shouldBe None
+    "computeOutput()" should "return Scorable.MaxScore if given identical ReduceFeatures" in {
+      val score = Scorable.computeSimilarity(
+        new ReduceFeatures(JsonString), new ReduceFeatures(JsonString))
+      score shouldBe Scorable.MaxScore
+    }
   }
 
-  it should "return None if given an empty json string" in {
-    val slug = Scorable.grobidToSlug("")
-    slug shouldBe None
-  }
+  def performPipelineTests() {
+      /*
 
-  "crossrefToSlug()" should "get the right slug for a crossref json string" in {
-    val slug = Scorable.crossrefToSlug(CrossrefStringWithTitle)
-    slug should contain ("sometitle")
-  }
+    val output = "/tmp/testOutput"
+    val input = "/tmp/testInput"
+    val (testTable, testHost) = ("test-table", "dummy-host:2181")
 
-  it should "return None if given json string without title" in {
-    val slug = Scorable.grobidToSlug(CrossrefStringWithoutTitle)
-    slug shouldBe None
-  }
+  val grobidSampleData = List(
+    List(Bytes.toBytes("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q"),
+      Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 1"))),
+    List(Bytes.toBytes("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"),
+      Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 2: TNG"))),
+    List(Bytes.toBytes("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT"),
+      Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 3: The Sequel"))),
+    List(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56"), 
+      Bytes.toBytes(MalformedGrobidString)))
 
-  it should "return None if given a malformed json string" in {
-    val slug = Scorable.grobidToSlug(MalformedCrossrefString)
-    slug shouldBe None
+  JobTest("sandcrawler.HBaseCrossrefScoreJob")
+    .arg("test", "")
+    .arg("app.conf.path", "app.conf")
+    .arg("output", output)
+    .arg("hbase-table", testTable)
+    .arg("zookeeper-hosts", testHost)
+    .arg("crossref-input", input)
+    .arg("debug", "true")
+    .source[Tuple](HBaseCrossrefScore.getHBaseSource(testTable, testHost),
+      grobidSampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*)))
+    .source(TextLine(input), List(
+      0 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", "DOI-0"),
+      1 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2").replace("<<DOI>>", "DOI-0.5"),
+      2 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"),
+      3 -> CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1")))
+    .sink[(Int, String, String, String, String)](TypedTsv[(Int,
+    String, String, String, String)](output)) {
+      // Grobid titles: 
+      //   "Title 1", "Title 2: TNG", "Title 3: The Sequel"
+      // Crossref titles:
+      //   "Title 1: TNG", "Title 1: TNG 2", "Title 1: TNG 3", "Title 2: Rebooted"
+      // Join should have 3 "Title 1" slugs and 1 "Title 2" slug
+      outputBuffer =>
+      "The pipeline" should "return a 4-element list" in {
+        outputBuffer should have length 4
+      }
+    }
+    .run
+    .finish
+}
+       */
   }
- */
 }
author	Ellen Spertus <ellen.spertus@gmail.com>	2018-08-07 11:05:23 -0700
committer	Ellen Spertus <ellen.spertus@gmail.com>	2018-08-07 11:05:23 -0700
commit	8dc3bf5c6f68d1fffa9f940ba1024ed95e76ed64 (patch)
tree	f515c25882aebeb5edb8d8a13e06e457e19a4fb4 /scalding/src
parent	408123177b9e8afd145ea0f0fa1d6bb449f1bd20 (diff)
download	sandcrawler-8dc3bf5c6f68d1fffa9f940ba1024ed95e76ed64.tar.gz sandcrawler-8dc3bf5c6f68d1fffa9f940ba1024ed95e76ed64.zip