aboutsummaryrefslogtreecommitdiffstats
path: root/scalding/src/main/scala/sandcrawler/Scorable.scala
diff options
context:
space:
mode:
Diffstat (limited to 'scalding/src/main/scala/sandcrawler/Scorable.scala')
-rw-r--r--scalding/src/main/scala/sandcrawler/Scorable.scala115
1 files changed, 115 insertions, 0 deletions
diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala
new file mode 100644
index 0000000..8e0c560
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/Scorable.scala
@@ -0,0 +1,115 @@
+import scala.math
+import scala.util.parsing.json.JSON
+
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+
+/**
+ * Features produced for one input record before grouping.
+ *
+ * @param key  identifier carried along with the record
+ *             (NOTE(review): its exact meaning is not visible in this file -- confirm with the producers)
+ * @param slug value the records are grouped on in Scorable.getInputPipe
+ * @param json JSON text that is handed on as the ReduceFeatures payload
+ */
+case class MapFeatures(key: String, slug: String, json: String)
+/**
+ * Payload of a single record after grouping by slug.
+ *
+ * @param json JSON text of the record
+ */
+case class ReduceFeatures(json: String)
+/**
+ * Result of comparing two records.
+ *
+ * @param score similarity score; Scorable.computeOutput produces values scaled by Scorable.MaxScore
+ * @param json1 JSON text of the first record (or a diagnostic message)
+ * @param json2 JSON text of the second record
+ */
+case class ReduceOutput(score: Int, json1: String, json2: String)
+
+/**
+ * Base class for a source of records that can be scored against another source.
+ *
+ * Subclasses provide the raw features via getFeaturesPipe; getInputPipe turns
+ * them into (slug, ReduceFeatures) pairs.
+ */
+abstract class Scorable {
+  /**
+   * Builds the pipe of (slug, features) pairs for this source.
+   *
+   * Entries whose slug is rejected by Scorable.isValidSlug are dropped, the rest
+   * are grouped by slug, and every entry is reduced to its JSON payload.
+   *
+   * @param args job arguments, forwarded unchanged to getFeaturesPipe
+   * @return pairs of slug and the ReduceFeatures built from the entry's JSON
+   */
+  def getInputPipe(args: Args): TypedPipe[(String, ReduceFeatures)] = {
+    val withValidSlug = getFeaturesPipe(args)
+      .filter(features => Scorable.isValidSlug(features.slug))
+
+    withValidSlug
+      .groupBy(_.slug)
+      .map { case (slug: String, features: MapFeatures) =>
+        (slug, ReduceFeatures(features.json))
+      }
+  }
+
+  /**
+   * Abstract: produces the features of every record of this source.
+   *
+   * @param args job arguments
+   */
+  def getFeaturesPipe(args: Args): TypedPipe[MapFeatures]
+}
+
+/** Helpers shared by Scorable implementations: slug handling, JSON access and scoring. */
+object Scorable {
+  // Used for slug if title is empty or unparseable
+  val NoSlug = "NO SLUG"
+
+  /** Returns true unless `slug` is the NoSlug placeholder. */
+  def isValidSlug(slug: String): Boolean = {
+    slug != NoSlug
+  }
+
+  /**
+   * Parses `json` and returns it as a map when its top level is a JSON object.
+   *
+   * @param json JSON text
+   * @return Some(map) for a JSON object; None when parsing fails or the top
+   *         level is not an object (for example an array)
+   */
+  def jsonToMap(json: String): Option[Map[String, Any]] = {
+    // https://stackoverflow.com/a/32717262/631051
+    JSON.parseFull(json) match {
+      // Only the outer type can be tested at runtime (erasure). Keys of a JSON
+      // object are always strings, so narrowing the type arguments is safe.
+      case Some(obj: Map[_, _]) => Some(obj.asInstanceOf[Map[String, Any]])
+      // Parse failure, or valid JSON that is not an object: a blind cast here
+      // would throw ClassCastException.
+      case _ => None
+    }
+  }
+
+  /*
+  NOTE(review): these disabled helpers predate the current signatures
+  (titleToSlug now returns String, getString takes Map[String, String]).
+
+  def grobidToSlug(json : String) : Option[String] = {
+    jsonToMap(json) match {
+      case None => None
+      case Some(map) => {
+        if (map contains "title") {
+          titleToSlug(getString(map, "title"))
+        } else {
+          None
+        }
+      }
+    }
+  }
+
+  def crossrefToSlug(json : String) : Option[String] = {
+    jsonToMap(json) match {
+      case None => None
+      case Some(map) => {
+        if (map contains "title") {
+          // TODO: Stop ignoring secondary titles
+          titleToSlug(map("title").asInstanceOf[List[String]](0))
+        } else {
+          None
+        }
+      }
+    }
+  }
+  */
+
+  /**
+   * Derives the slug of a title: accents removed, everything from the first
+   * ':' onwards dropped, lower-cased.
+   *
+   * @param title raw title
+   * @return the slug, or NoSlug when nothing is left
+   */
+  def titleToSlug(title: String): String = {
+    // String.split drops trailing empty strings, so a title made up only of
+    // colons yields an empty array; headOption avoids indexing into it.
+    val beforeColon = StringUtilities.removeAccents(title).split(":").headOption.getOrElse("")
+    val slug = beforeColon.toLowerCase()
+    if (slug.isEmpty) NoSlug else slug
+  }
+
+  /**
+   * Looks up a string value in an optional map.
+   *
+   * @param optionalMap map to search, if any
+   * @param key         key to look up
+   * @return Some(value) when the map is present, contains `key` and the value
+   *         is a String; None otherwise (including non-string and null values)
+   */
+  def getStringOption(optionalMap: Option[Map[String, Any]], key: String): Option[String] = {
+    optionalMap
+      .flatMap(_.get(key))
+      .collect { case value: String => value }
+  }
+
+  // Caller is responsible for ensuring that key is in map.
+  def getString(map: Map[String, String], key: String): String = {
+    assert(map contains key)
+    map(key)
+  }
+
+  // Score given to two identical titles; similarity is scaled by this value.
+  val MaxScore = 1000
+
+  /**
+   * Scores two records by the similarity of their "title" fields.
+   *
+   * @param feature1 first record
+   * @param feature2 second record
+   * @return the scaled similarity together with both JSON payloads; when one
+   *         record has no string title, score 0 with json1 set to "No title"
+   *         and json2 set to the JSON of the record lacking the title (the
+   *         first record is checked first)
+   */
+  def computeOutput(feature1: ReduceFeatures, feature2: ReduceFeatures): ReduceOutput = {
+    val title1 = getStringOption(jsonToMap(feature1.json), "title")
+    val title2 = getStringOption(jsonToMap(feature2.json), "title")
+    (title1, title2) match {
+      case (None, _) => ReduceOutput(0, "No title", feature1.json)
+      case (_, None) => ReduceOutput(0, "No title", feature2.json)
+      case (Some(first), Some(second)) =>
+        ReduceOutput(
+          (StringUtilities.similarity(first, second) * MaxScore).toInt,
+          feature1.json, feature2.json)
+    }
+  }
+}