aboutsummaryrefslogtreecommitdiffstats
path: root/scalding/src/main/scala/sandcrawler/Scorable.scala
diff options
context:
space:
mode:
Diffstat (limited to 'scalding/src/main/scala/sandcrawler/Scorable.scala')
-rw-r--r--scalding/src/main/scala/sandcrawler/Scorable.scala79
1 files changed, 79 insertions, 0 deletions
diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala
new file mode 100644
index 0000000..9b9c633
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/Scorable.scala
@@ -0,0 +1,79 @@
+package sandcrawler
+
+import scala.math
+import scala.util.parsing.json.JSON
+import scala.util.parsing.json.JSONObject
+
+import cascading.flow.FlowDef
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+
+case class MapFeatures(slug : String, json : String)
+case class ReduceFeatures(json : String)
+case class ReduceOutput(val slug : String, score : Int, json1 : String, json2 : String)
+
+abstract class Scorable {
+ def getInputPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[(String, ReduceFeatures)] =
+ {
+ getFeaturesPipe(args)
+ .filter { entry => Scorable.isValidSlug(entry.slug) }
+ .groupBy { case MapFeatures(slug, json) => slug }
+ .map { tuple =>
+ val (slug : String, features : MapFeatures) = tuple
+ (slug, ReduceFeatures(features.json))
+ }
+ }
+
+ // abstract methods
+ def getSource(args : Args) : Source
+ def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures]
+}
+
+object Scorable {
+ val NoSlug = "NO SLUG" // Used for slug if title is empty or unparseable
+
+ def isValidSlug(slug : String) : Boolean = {
+ slug != NoSlug
+ }
+
+ def jsonToMap(json : String) : Option[Map[String, Any]] = {
+ // https://stackoverflow.com/a/32717262/631051
+ val jsonObject = JSON.parseFull(json)
+ if (jsonObject == None) {
+ None
+ } else {
+ Some(jsonObject.get.asInstanceOf[Map[String, Any]])
+ }
+ }
+
+ def getStringOption(optionalMap : Option[Map[String, Any]], key : String) : Option[String] = {
+ optionalMap match {
+ case None => None
+ case Some(map) => if (map contains key) Some(map(key).asInstanceOf[String]) else None
+ }
+ }
+
+ // Caller is responsible for ensuring that key is a String in map.
+ // TODO: Add and handle ClassCastException
+ def getString(map : Map[String, Any], key : String) : String = {
+ assert(map contains key)
+ map(key).asInstanceOf[String]
+ }
+
+ val MaxScore = 1000
+
+ def computeSimilarity(features1 : ReduceFeatures, features2 : ReduceFeatures) : Int = {
+ val json1 = jsonToMap(features1.json)
+ val json2 = jsonToMap(features2.json)
+ getStringOption(json1, "title") match {
+ case None => 0
+ case Some(title1) => {
+ getStringOption(json2, "title") match {
+ case None => 0
+ case Some(title2) =>
+ (StringUtilities.similarity(title1, title2) * MaxScore).toInt
+ }
+ }
+ }
+ }
+}