Computes and outputs (score, sha1, doi, grobidTitle, crossrefTitle).

author: Ellen Spertus <ellen.spertus@gmail.com> 2018-07-26 04:36:43 -0700
committer: Ellen Spertus <ellen.spertus@gmail.com> 2018-07-26 04:36:43 -0700
commit: 6d2bb4787150682236f4c349f8e469026fe3d490 (patch)
tree: 9c86515c4280c87c2f382d92213a8ef3cd8e18eb /scalding
parent: 15ae7006cd8238bb9453f27be6aa5388a6002ce8 (diff)
download: sandcrawler-6d2bb4787150682236f4c349f8e469026fe3d490.tar.gz
sandcrawler-6d2bb4787150682236f4c349f8e469026fe3d490.zip
2 files changed, 71 insertions, 33 deletions
diff --git a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
index 7e10c43..714af36 100644
--- a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
+++ b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
@@ -3,6 +3,7 @@ package sandcrawler
 import java.util.Arrays
 import java.util.Properties
 
+import scala.math
 import scala.util.parsing.json.JSON
 
 import cascading.tuple.Fields
@@ -17,11 +18,9 @@ import parallelai.spyglass.hbase.HBaseConstants.SourceMode
 import parallelai.spyglass.hbase.HBasePipeConversions
 import parallelai.spyglass.hbase.HBaseSource
 
-class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with
-    HBasePipeConversions {
+class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with HBasePipeConversions {
   val NoTitle = "NO TITLE" // Used for slug if title is empty or unparseable
 
-  /*
   // key is SHA1
   val grobidSource = HBaseCrossrefScore.getHBaseSource(
     args("hbase-table"),
@@ -29,13 +28,14 @@ class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with
   val grobidPipe : TypedPipe[(String, String, String)] = grobidSource
     .read
     .fromBytesWritable(new Fields("key", "tei_json"))
-    .debug  // Should be 4 tuples for mocked data
+    //  .debug  // Should be 4 tuples for mocked data
     .toTypedPipe[(String, String)]('key, 'tei_json)
     .map { entry =>
       val (key, json) = (entry._1, entry._2)
+      // TODO: Consider passing forward only a subset of JSON.
       HBaseCrossrefScore.grobidToSlug(json) match {
-          case Some(slug) => (slug, key, json)
-          case None => (NoTitle, key, json)
+        case Some(slug) => (slug, key, json)
+        case None => (NoTitle, key, json)
       }
     }
     .filter { entry =>
@@ -46,15 +46,12 @@ class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with
 
   val grobidGroup = grobidPipe
     .groupBy { case (slug, key, json) => slug }
-   */
 
   val crossrefSource = TextLine(args("crossref-input"))
-  val crossrefPipe : TypedPipe[String] = crossrefSource
+  val crossrefPipe : TypedPipe[(String, String)] = crossrefSource
     .read
-    .debug // Should be 4 tuples for mocked data
+    //    .debug // Should be 4 tuples for mocked data
     .toTypedPipe[String]('line)
-  /*
-    .map{line : String => (line, "foo")}
     .map{ json : String =>
       HBaseCrossrefScore.crossrefToSlug(json) match {
         case Some(slug) => (slug, json)
@@ -65,26 +62,21 @@ class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with
       val (slug, json) = entry
       slug != NoTitle
     }
-   */
-    .write(TypedTsv[String](args("output")))
 
-
-  /*
   val crossrefGroup = crossrefPipe
   .groupBy { case (slug, json) => slug }
 
-  // TODO: Figure out which is smaller.
-  val theJoin : CoGrouped[String, ((String, String, String), (String, String))] = 
+  val theJoin : CoGrouped[String, ((String, String, String), (String, String))] =
     grobidGroup.join(crossrefGroup)
 
   theJoin.map{ entry =>
-        val (slug : String, 
-          ((slug0: String, sha1 : String, grobidJson : String), 
-            (slug1 : String, crossrefJson : String))) = entry
-        // TODO: For now, output it all.
-        (slug, slug0, slug1, sha1, grobidJson, crossrefJson)}
-      .write(TypedTsv[(String, String, String, String, String, String)](args("output")))
-   */
+    val (slug : String,
+      ((slug0: String, sha1 : String, grobidJson : String),
+        (slug1 : String, crossrefJson : String))) = entry
+    HBaseCrossrefScore.computeOutput(sha1, grobidJson, crossrefJson)}
+    .debug
+  // Output: score, sha1, doi, grobid title, crossref title
+    .write(TypedTsv[(Int, String, String, String, String)](args("output")))
 
 }
 
@@ -137,4 +129,37 @@ object HBaseCrossrefScore {
       Some(slug)
     }
   }
+
+  val FullTitleMatch = 100
+  val TitleLeftMatchBase = 50
+  val MaxTitleLeftMatch = 80
+  val SlugMatch = 25
+
+  def computeSimilarity(gTitle : String, cTitle : String) : Int = {
+    assert(titleToSlug(gTitle) == titleToSlug(cTitle))
+    if (gTitle == cTitle) {
+      FullTitleMatch
+    } else if (gTitle.startsWith(cTitle) || cTitle.startsWith(gTitle)) {
+      math.min(TitleLeftMatchBase + math.abs(gTitle.length - cTitle.length),
+        MaxTitleLeftMatch)
+    } else {
+      SlugMatch
+    }
+  }
+
+  def computeOutput(sha1 : String, grobidJson : String, crossrefJson : String) :
+    // (score, sha1, doi, grobidTitle, crossrefTitle)
+      (Int, String, String, String, String) = {
+    // JSON has already been validated in previous stages.
+    val grobid = jsonToMap(grobidJson)
+    val crossref = jsonToMap(crossrefJson)
+
+    val grobidTitle = grobid("title").asInstanceOf[String].toLowerCase()
+    val crossrefTitle = crossref("title").asInstanceOf[List[String]](0).toLowerCase()
+    (computeSimilarity(grobidTitle, crossrefTitle),
+      sha1,
+      crossref("DOI").asInstanceOf[String],
+      "'" + grobidTitle + "'",
+      "'" + crossrefTitle + "'")
+  }
 }
diff --git a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
index bd9dcd3..e6211a2 100644
--- a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
+++ b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
@@ -163,10 +163,14 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers {
   val (testTable, testHost) = ("test-table", "dummy-host:2181")
 
   val grobidSampleData = List(
-    List(Bytes.toBytes("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q"), Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title1"))),
-    List(Bytes.toBytes("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"), Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title2: TNG"))),
-    List(Bytes.toBytes("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT"), Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title3: The Sequel"))),
-    List(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56"), Bytes.toBytes(MalformedGrobidString)))
+    List(Bytes.toBytes("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q"),
+      Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 1"))),
+    List(Bytes.toBytes("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"),
+      Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 2: TNG"))),
+    List(Bytes.toBytes("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT"),
+      Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 3: The Sequel"))),
+    List(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56"), 
+      Bytes.toBytes(MalformedGrobidString)))
 
   JobTest("sandcrawler.HBaseCrossrefScoreJob")
     .arg("test", "")
@@ -180,18 +184,27 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers {
       grobidSampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*)))
     .source(TextLine(input), List(
       0 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", "DOI-0"),
-      1 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", "DOI-0.5"),
-      2 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", "DOI-0.75"),
+      1 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2").replace("<<DOI>>", "DOI-0.5"),
+      2 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"),
       3 -> CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1")))
-    .sink[String](TypedTsv[String](output)) {
+    .sink[(Int, String, String, String, String)](TypedTsv[(Int,
+    String, String, String, String)](output)) {
+      // Grobid titles: 
+      //   "Title 1", "Title 2: TNG", "Title 3: The Sequel"
+      // Crossref titles:
+      //   "Title 1: TNG", "Title 1: TNG 2", "Title 1: TNG 3", "Title 2: Rebooted"
+      // Join should yield 4 rows: 3 for the "Title 1" slug and 1 for the "Title 2" slug
       outputBuffer =>
       it should "return a 4-element list" in {
-        outputBuffer should have length 3
+        outputBuffer should have length 4
       }
+
       /*
       it should "return the right first entry" in {
         val (slug, slug0, slug1, sha1, grobidJson, crossrefJson) = outputBuffer(0)
-        slug shouldBe "title1"
+        slug shouldBe "title 1"
+        slug shouldBe slug0
+        slug shouldBe slug1
         sha1 shouldBe new String(grobidSampleData(0)(0), "UTF-8")
         grobidJson shouldBe new String(grobidSampleData(0)(1), "UTF-8")
       }
author	Ellen Spertus <ellen.spertus@gmail.com>	2018-07-26 04:36:43 -0700
committer	Ellen Spertus <ellen.spertus@gmail.com>	2018-07-26 04:36:43 -0700
commit	6d2bb4787150682236f4c349f8e469026fe3d490 (patch)
tree	9c86515c4280c87c2f382d92213a8ef3cd8e18eb /scalding
parent	15ae7006cd8238bb9453f27be6aa5388a6002ce8 (diff)
download	sandcrawler-6d2bb4787150682236f4c349f8e469026fe3d490.tar.gz sandcrawler-6d2bb4787150682236f4c349f8e469026fe3d490.zip