2 files changed, 79 insertions, 0 deletions
diff --git a/scalding/src/main/scala/sandcrawler/BibjsonScorable.scala b/scalding/src/main/scala/sandcrawler/BibjsonScorable.scala
new file mode 100644
index 0000000..cdd598f
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/BibjsonScorable.scala
@@ -0,0 +1,50 @@
+package sandcrawler
+
+import scala.math
+import scala.util.parsing.json.JSON
+import scala.util.parsing.json.JSONObject
+
+import cascading.flow.FlowDef
+import cascading.tuple.Fields
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+// XXX: import parallelai.spyglass.hbase.HBasePipeConversions
+
+// XXX: class BibjsonScorable extends Scorable with HBasePipeConversions {
+
+/** Scorable backed by a text file of bibjson records, read one line at a time. */
+class BibjsonScorable extends Scorable {
+  /** Builds a TextLine source over the path given by the "bibjson-input" argument. */
+  def getSource(args : Args) : Source = TextLine(args("bibjson-input"))
+
+  /** Reads the "line" field of the source and converts every line to MapFeatures. */
+  def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures] = {
+    val lineField = new Fields("line")
+    val lines = getSource(args).read.toTypedPipe[String](lineField)
+    lines.map { line => BibjsonScorable.bibjsonToMapFeatures(line) }
+  }
+}
+
+object BibjsonScorable {
+  /**
+   * Converts one bibjson line into MapFeatures. Input for which Scorable.jsonToMap returns
+   * None, or that has no "title" key or a null/empty title, yields NoSlug with the raw line.
+   * NOTE(review): "doi" and "sha" are read unconditionally once "title" is present --
+   * confirm Scorable.getString tolerates missing keys.
+   */
+  def bibjsonToMapFeatures(json : String) : MapFeatures = {
+    def unscorable : MapFeatures = new MapFeatures(Scorable.NoSlug, json)
+    Scorable.jsonToMap(json) match {
+      case Some(map) if map.contains("title") =>
+        val title = Scorable.getString(map, "title")
+        val doi = Scorable.getString(map, "doi")
+        val sha1 = Scorable.getString(map, "sha")
+        // TODO: year, authors (if available)
+        if (title != null && !title.isEmpty) {
+          val sf : ScorableFeatures = ScorableFeatures.create(title=title, doi=doi, sha1=sha1)
+          new MapFeatures(sf.toSlug, sf.toString)
+        } else unscorable
+      case Some(_) | None => unscorable
+    }
+  }
+}
diff --git a/scalding/src/main/scala/sandcrawler/MatchBenchmarkJob.scala b/scalding/src/main/scala/sandcrawler/MatchBenchmarkJob.scala
new file mode 100644
index 0000000..1578258
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/MatchBenchmarkJob.scala
@@ -0,0 +1,29 @@
+package sandcrawler
+
+import cascading.pipe.Pipe
+import com.twitter.scalding.Args
+import com.twitter.scalding.TypedPipe
+import com.twitter.scalding.TypedTsv
+import parallelai.spyglass.base.JobBase
+
+/** Joins two bibjson inputs on slug and writes one scored TSV row per joined pair. */
+class MatchBenchmarkJob(args: Args) extends JobBase(args) {
+  // TODO: Instantiate any subclass of Scorable specified in args.
+  // Left input: the "left-bibjson" path is re-exposed under the "bibjson-input" key.
+  val sc1 : Scorable = new BibjsonScorable()
+  val leftArgs = args + ("bibjson-input" -> List(args("left-bibjson")))
+  val pipe1 : TypedPipe[(String, ReduceFeatures)] = sc1.getInputPipe(leftArgs)
+  // Right input: same treatment for the "right-bibjson" path.
+  val sc2 : Scorable = new BibjsonScorable()
+  val rightArgs = args + ("bibjson-input" -> List(args("right-bibjson")))
+  val pipe2 : TypedPipe[(String, ReduceFeatures)] = sc2.getInputPipe(rightArgs)
+
+  // Pair up records that share a slug and compute a similarity score for each pair.
+  pipe1.join(pipe2)
+    .map { case (slug : String, (left : ReduceFeatures, right : ReduceFeatures)) =>
+      new ReduceOutput(slug, Scorable.computeSimilarity(left, right), left.json, right.json)
+    }
+    // TypedTsv doesn't work over case classes, so flatten each result to a plain tuple.
+    .map { scored => (scored.slug, scored.score, scored.json1, scored.json2) }
+    .write(TypedTsv[(String, Int, String, String)](args("output")))
+}