Restored my old tests. Commented out broken tests.

author: Ellen Spertus <ellen.spertus@gmail.com> 2018-07-25 20:05:28 -0700
committer: Ellen Spertus <ellen.spertus@gmail.com> 2018-07-25 20:05:28 -0700
commit: 148b724e65d56115c57bf456c92fa03ef028cd38 (patch)
tree: 1ce63b0597f77dcff4b444b3f46088eb5d3bc316 /scalding/src/main
parent: 980c4af4fbc9d0c62fc75396f2237e5c58863ebf (diff)
download: sandcrawler-148b724e65d56115c57bf456c92fa03ef028cd38.tar.gz sandcrawler-148b724e65d56115c57bf456c92fa03ef028cd38.zip
1 file changed, 44 insertions, 21 deletions
diff --git a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
index 7b7deec..ac633e4 100644
--- a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
+++ b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
@@ -7,6 +7,8 @@ import scala.util.parsing.json.JSON
 
 import cascading.tuple.Fields
 import com.twitter.scalding._
+import com.twitter.scalding.typed.CoGrouped
+import com.twitter.scalding.typed.Grouped
 import com.twitter.scalding.typed.TDsl._
 import org.apache.hadoop.hbase.io.ImmutableBytesWritable
 import org.apache.hadoop.hbase.util.Bytes
@@ -15,6 +17,7 @@ import parallelai.spyglass.hbase.HBaseConstants.SourceMode
 import parallelai.spyglass.hbase.HBasePipeConversions
 import parallelai.spyglass.hbase.HBaseSource
 
+
 class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with
     HBasePipeConversions {
   val NoTitle = "NO TITLE" // Used for slug if title is empty or unparseable
@@ -26,36 +29,56 @@ class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with
   val grobidPipe : TypedPipe[(String, String, String)] = grobidSource
     .read
     .fromBytesWritable(new Fields("key", "tei_json"))
-    .debug
     .toTypedPipe[(String, String)]('key, 'tei_json)
     .map { entry =>
       val (key, json) = (entry._1, entry._2)
       HBaseCrossrefScore.grobidToSlug(json) match {
-          case Some(slug) => (key, json, slug)
-          case None => (key, json, NoTitle)
+          case Some(slug) => (slug, key, json)
+          case None => (NoTitle, key, json)
       }
     }
     .filter { entry =>
-      val (_, _, slug) = entry
-      slug != NoTitle && slug.length > 0
+      val (slug, _, _) = entry
+      slug != NoTitle
     }
-    .write(TypedTsv[(String, String, String)](args("output")))
 
-/*
-    .map('key -> 'sha1) { sha1 : String => sha1 }
+  val grobidGroup = grobidPipe
+    .groupBy { case (slug, key, json) => slug }
+//    .debug
+
+
   val crossrefSource = TextLine(args("crossref-input"))
-  val crossrefPipe = crossrefSource
+  val crossrefPipe : TypedPipe[(String, String)] = crossrefSource
     .read
-    .map('line -> 'slug) {
-      json : String => HBaseCrossrefScore.crossrefToSlug(json)}
-    .debug
-
-  val innerJoinPipe = grobidPipe.joinWithSmaller('slug -> 'slug, crossrefPipe)
-  innerJoinPipe
-    .mapTo(('tei_json, 'line, 'sha1) -> ('sha1, 'doi, 'score)) {
-      x : (String, String, String) => HBaseCrossrefScore.performJoin(x._1, x._2, x._3)}
-    .write(TypedTsv[(String, String, String)](args("output")))
- */
+    .toTypedPipe[String]('line)
+    .map{ json : String =>
+//      val (offset, json) = entry
+      HBaseCrossrefScore.crossrefToSlug(json) match {
+        case Some(slug) => (slug, json)
+        case None => (NoTitle, json)
+      }
+    }
+  .debug
+    .filter { entry =>
+      val (slug, json) = entry
+      slug != NoTitle
+    }
+  val crossrefGroup = crossrefPipe
+  .groupBy { case (slug, json) => slug }
+
+  // TODO: Figure out which is smaller.
+  val theJoin : CoGrouped[String, ((String, String, String), (String, String))] = 
+    grobidGroup.join(crossrefGroup)
+
+  theJoin.map{ entry =>
+        val (slug : String, 
+          ((slug0: String, sha1 : String, grobidJson : String), 
+            (slug1 : String, crossrefJson : String))) = entry
+        // TODO: For now, output it all.
+        (slug, slug0, slug1, sha1, grobidJson, crossrefJson)}
+      .write(TypedTsv[(String, String, String, String, String, String)](args("output")))
+
+
 }
 
 object HBaseCrossrefScore {
@@ -74,7 +97,7 @@ object HBaseCrossrefScore {
     val jsonObject = JSON.parseFull(json)
     if (jsonObject == None) {
       // Empty map for malformed JSON
-      Map[String, Any]("foo" -> json)
+      Map[String, Any]("malformed json" -> json)
     } else {
       jsonObject.get.asInstanceOf[Map[String, Any]]
     }
@@ -95,7 +118,7 @@ object HBaseCrossrefScore {
       // TODO: Don't ignore titles after the first.
       titleToSlug(map("title").asInstanceOf[List[String]](0))
     } else {
-      None
+      Some(map.keys.mkString(","))
     }
   }
author	Ellen Spertus <ellen.spertus@gmail.com>	2018-07-25 20:05:28 -0700
committer	Ellen Spertus <ellen.spertus@gmail.com>	2018-07-25 20:05:28 -0700
commit	148b724e65d56115c57bf456c92fa03ef028cd38 (patch)
tree	1ce63b0597f77dcff4b444b3f46088eb5d3bc316 /scalding/src/main
parent	980c4af4fbc9d0c62fc75396f2237e5c58863ebf (diff)
download	sandcrawler-148b724e65d56115c57bf456c92fa03ef028cd38.tar.gz sandcrawler-148b724e65d56115c57bf456c92fa03ef028cd38.zip