diff options
author | bnewbold <bnewbold@archive.org> | 2018-08-16 21:09:54 +0000 |
---|---|---|
committer | bnewbold <bnewbold@archive.org> | 2018-08-16 21:09:54 +0000 |
commit | af0fa6edf3c21ac38a8ab4e0fb425e5471e6c3b6 (patch) | |
tree | d32953c30c5f7342672694708b385936e5a36dfe /scalding/src/main | |
parent | 71be2e685848a31888811e2e398e769f7e0486c2 (diff) | |
parent | 96ea0ddd06ee4a7c11c7d5def976749ab3675878 (diff) | |
download | sandcrawler-af0fa6edf3c21ac38a8ab4e0fb425e5471e6c3b6.tar.gz sandcrawler-af0fa6edf3c21ac38a8ab4e0fb425e5471e6c3b6.zip |
Merge branch 'bnewbold-scoring-patches' into 'master'
Patches on top of scoring-refactor branch (Crossref/GROBID matching work)
See merge request webgroup/sandcrawler!15
Diffstat (limited to 'scalding/src/main')
6 files changed, 359 insertions, 0 deletions
package sandcrawler

import scala.math
import scala.util.parsing.json.JSON
import scala.util.parsing.json.JSONObject

import cascading.flow.FlowDef
import cascading.tuple.Fields
import com.twitter.scalding._
import com.twitter.scalding.typed.TDsl._
import parallelai.spyglass.hbase.HBasePipeConversions

/**
 * Scorable implementation that reads Crossref metadata records (one JSON
 * object per line of a text file) and maps each record to MapFeatures for
 * title-slug based fuzzy matching.
 */
class CrossrefScorable extends Scorable with HBasePipeConversions {
  // TODO: Generalize args so there can be multiple Crossref pipes in one job.
  def getSource(args : Args) : Source = {
    TextLine(args("crossref-input"))
  }

  def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures] = {
    getSource(args).read
      .toTypedPipe[String](new Fields("line"))
      .map { CrossrefScorable.jsonToMapFeatures(_) }
  }
}

object CrossrefScorable {
  /**
   * Parse one Crossref JSON record into MapFeatures.
   *
   * Records that fail to parse, or that lack a usable "title" and "DOI",
   * get MapFeatures(Scorable.NoSlug, json) so downstream slug filtering
   * drops them instead of crashing the job.
   */
  def jsonToMapFeatures(json : String) : MapFeatures = {
    Scorable.jsonToMap(json) match {
      case None => MapFeatures(Scorable.NoSlug, json)
      case Some(map) => {
        if ((map contains "title") && (map contains "DOI")) {
          val titles = map("title").asInstanceOf[List[String]]
          val doi = Scorable.getString(map, "DOI")
          // Bug fix: test for null *before* dereferencing. The original
          // order (titles.isEmpty || titles == null) would NPE on a null
          // value before the null guard could ever fire.
          if (titles == null || titles.isEmpty || doi == null || doi.isEmpty) {
            new MapFeatures(Scorable.NoSlug, json)
          } else {
            // bnewbold: not checking that titles(0) is non-null/non-empty;
            // case would be, in JSON, "title": [ null ]
            val sf : ScorableFeatures = new ScorableFeatures(title=titles(0), doi=doi)
            new MapFeatures(sf.toSlug, sf.toString)
          }
        } else {
          new MapFeatures(Scorable.NoSlug, json)
        }
      }
    }
  }
}
package sandcrawler

import scala.util.parsing.json.JSONObject

import cascading.flow.FlowDef
import cascading.tuple.Fields
import com.twitter.scalding._
import com.twitter.scalding.typed.TDsl._
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes
import parallelai.spyglass.hbase.HBaseConstants.SourceMode
import parallelai.spyglass.hbase.HBasePipeConversions
import parallelai.spyglass.hbase.HBaseSource

/**
 * Scorable implementation reading GROBID TEI-JSON rows out of HBase and
 * emitting MapFeatures keyed by a title-derived slug. Only rows with a
 * successful GROBID status code are scored.
 */
class GrobidScorable extends Scorable with HBasePipeConversions {
  // HTTP-style success code stored in grobid0:status_code.
  val StatusOK = 200

  def getSource(args : Args) : Source = {
    // TODO: Generalize args so there can be multiple grobid pipes in one job.
    GrobidScorable.getHBaseSource(args("hbase-table"), args("zookeeper-hosts"))
  }

  def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures] = {
    getSource(args)
      .read
      // Can't just "fromBytesWritable" because we have multiple types?
      .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable)](new Fields("key", "tei_json", "status_code"))
      // Drop rows where either column is missing before decoding bytes.
      .filter { case (_, tei_json, status_code) => tei_json != null && status_code != null }
      .map { case (key, tei_json, status_code) =>
        (Bytes.toString(key.copyBytes()), Bytes.toString(tei_json.copyBytes()), Bytes.toLong(status_code.copyBytes()))
      }
      // TODO: Should I combine next two stages for efficiency?
      .collect { case (key, json, StatusOK) => (key, json) }
      .map { entry : (String, String) => GrobidScorable.jsonToMapFeatures(entry._1, entry._2) }
  }
}

object GrobidScorable {
  def getHBaseSource(table : String, host : String) : HBaseSource = {
    HBaseBuilder.build(table, host, List("grobid0:tei_json", "grobid0:status_code"), SourceMode.SCAN_ALL)
  }

  /**
   * Map one (sha1 key, TEI-JSON) row to MapFeatures. Rows without a
   * "title" field get NoSlug so downstream filtering removes them.
   */
  def jsonToMapFeatures(key : String, json : String) : MapFeatures = {
    Scorable.jsonToMap(json) match {
      case None => MapFeatures(Scorable.NoSlug, json)
      case Some(map) => {
        if (map contains "title") {
          new ScorableFeatures(Scorable.getString(map, "title"), sha1=key).toMapFeatures
        } else {
          MapFeatures(Scorable.NoSlug, json)
        }
      }
    }
  }
}

// ---- scalding/src/main/scala/sandcrawler/Scorable.scala ----

package sandcrawler

import scala.math
import scala.util.parsing.json.JSON
import scala.util.parsing.json.JSONObject

import cascading.flow.FlowDef
import com.twitter.scalding._
import com.twitter.scalding.typed.TDsl._

// Features emitted by the map stage: a join slug plus the original JSON.
case class MapFeatures(slug : String, json : String)
case class ReduceFeatures(json : String)
case class ReduceOutput(val slug : String, score : Int, json1 : String, json2 : String)

/**
 * Base class for one side of a slug-joined scoring job. Concrete
 * subclasses supply the source and the per-record feature extraction.
 */
abstract class Scorable {
  def getInputPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[(String, ReduceFeatures)] =
  {
    getFeaturesPipe(args)
      .filter { entry => Scorable.isValidSlug(entry.slug) }
      .groupBy { case MapFeatures(slug, json) => slug }
      .map { tuple =>
        val (slug : String, features : MapFeatures) = tuple
        (slug, ReduceFeatures(features.json))
      }
  }

  // abstract methods
  def getSource(args : Args) : Source
  def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures]
}

object Scorable {
  val NoSlug = "NO SLUG" // Used for slug if title is empty or unparseable

  def isValidSlug(slug : String) : Boolean = {
    slug != NoSlug
  }

  /**
   * Parse a JSON string into a Map, or None if it is not a JSON object.
   *
   * Bug fix: the original did jsonObject.get.asInstanceOf[Map[...]], which
   * throws ClassCastException when the top-level JSON value is an array or
   * scalar. Pattern-matching makes those records degrade to None, which is
   * what every caller already handles.
   */
  def jsonToMap(json : String) : Option[Map[String, Any]] = {
    // https://stackoverflow.com/a/32717262/631051
    JSON.parseFull(json) match {
      case Some(m : Map[_, _]) => Some(m.asInstanceOf[Map[String, Any]])
      case _ => None
    }
  }

  def getStringOption(optionalMap : Option[Map[String, Any]], key : String) : Option[String] = {
    optionalMap match {
      case None => None
      case Some(map) => if (map contains key) Some(map(key).asInstanceOf[String]) else None
    }
  }

  // Caller is responsible for ensuring that key is a String in map.
  // TODO: Add and handle ClassCastException
  def getString(map : Map[String, Any], key : String) : String = {
    assert(map contains key)
    map(key).asInstanceOf[String]
  }

  // Similarity is scaled to an integer in [0, MaxScore].
  val MaxScore = 1000

  /**
   * Score two records by string similarity of their "title" fields;
   * 0 when either side lacks a title.
   */
  def computeSimilarity(features1 : ReduceFeatures, features2 : ReduceFeatures) : Int = {
    val json1 = jsonToMap(features1.json)
    val json2 = jsonToMap(features2.json)
    getStringOption(json1, "title") match {
      case None => 0
      case Some(title1) => {
        getStringOption(json2, "title") match {
          case None => 0
          case Some(title2) =>
            (StringUtilities.similarity(title1, title2) * MaxScore).toInt
        }
      }
    }
  }
}
// Contains features needed to make slug and to score (in combination
// with a second ScorableFeatures).
class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: String = "") {

  // Slugs that are too generic to be useful join keys.
  val slugBlacklist = Set( "abbreviations", "abstract", "acknowledgements",
    "article", "authorreply", "authorsreply", "bookreview", "bookreviews",
    "casereport", "commentary", "commentaryon", "commenton", "commentto",
    "contents", "correspondence", "dedication", "editorialadvisoryboard",
    "focus", "hypothesis", "inbrief", "introduction", "introductiontotheissue",
    "lettertotheeditor", "listofabbreviations", "note", "overview", "preface",
    "references", "results", "review", "reviewarticle", "summary", "title",
    "name")

  // Null fields are normalized to "" so JSON serialization never emits null.
  def toMap() : Map[String, Any] = {
    Map("title" -> (if (title == null) "" else title),
      "year" -> year,
      "doi" -> (if (doi == null) "" else doi),
      "sha1" -> (if (sha1 == null) "" else sha1))
  }

  override def toString() : String = {
    JSONObject(toMap()).toString
  }

  /**
   * Derive the join slug: accent-stripped, lower-cased, punctuation- and
   * whitespace-free title. Returns Scorable.NoSlug for null/empty or
   * blacklisted titles.
   */
  def toSlug() : String = {
    if (title == null) {
      Scorable.NoSlug
    } else {
      val unaccented = StringUtilities.removeAccents(title)
      // Remove punctuation
      val slug = StringUtilities.removePunctuation((unaccented.toLowerCase())).replaceAll("\\s", "")
      // Bug fix: null test before isEmpty (original order would NPE on a
      // null slug before the null guard could fire).
      if (slug == null || slug.isEmpty || (slugBlacklist contains slug)) Scorable.NoSlug else slug
    }
  }

  def toMapFeatures = {
    MapFeatures(toSlug, toString)
  }
}

// ---- scalding/src/main/scala/sandcrawler/ScoreJob.scala ----

package sandcrawler

import cascading.pipe.Pipe
import com.twitter.scalding.Args
import com.twitter.scalding.TypedPipe
import com.twitter.scalding.TypedTsv
import parallelai.spyglass.base.JobBase

/**
 * Joins GROBID-extracted metadata against Crossref records on title slug
 * and writes (slug, score, json1, json2) rows as TSV.
 */
class ScoreJob(args: Args) extends JobBase(args) {
  // TODO: Instantiate any subclass of Scorable specified in args.
  val sc1 : Scorable = new GrobidScorable()
  val sc2 : Scorable = new CrossrefScorable()
  val pipe1 : TypedPipe[(String, ReduceFeatures)] = sc1.getInputPipe(args)
  val pipe2 : TypedPipe[(String, ReduceFeatures)] = sc2.getInputPipe(args)

  pipe1.join(pipe2).map { entry =>
    val (slug : String, (features1 : ReduceFeatures, features2 : ReduceFeatures)) = entry
    new ReduceOutput(
      slug,
      Scorable.computeSimilarity(features1, features2),
      features1.json,
      features2.json)
  }
    //TypedTsv doesn't work over case classes.
    .map { entry => (entry.slug, entry.score, entry.json1, entry.json2) }
    .write(TypedTsv[(String, Int, String, String)](args("output")))
}

/*
// Ugly hack to get non-String information into ScoreJob above.
object ScoreJob {
  var scorable1 : Option[Scorable] = None
  var scorable2 : Option[Scorable] = None

  def setScorable1(s : Scorable) {
    scorable1 = Some(s)
  }

  def getScorable1() : Scorable = {
    scorable1 match {
      case Some(s) => s
      case None => null
    }
  }

  def setScorable2(s: Scorable) {
    scorable2 = Some(s)
  }

  def getScorable2() : Scorable = {
    scorable2 match {
      case Some(s) => s
      case None => null
    }
  }
}
 */

// ---- scalding/src/main/scala/sandcrawler/StringUtilities.scala ----

package sandcrawler

import java.text.Normalizer
import java.util.regex.Pattern

object StringUtilities {
  // bnewbold: I propose that we:
  // 1. keep only \p{Ideographic}, \p{Alphabetic}, and \p{Digit}
  // 2. strip accents
  // 3. "lower-case" (unicode-aware)
  // 4. do any final custom/manual mappings
  //
  // We should check (test) that null bytes are handled, in addition to other
  // more obvious characters

  // Adapted from https://git-wip-us.apache.org/repos/asf?p=commons-lang.git;a=blob;f=src/main/java/org/apache/commons/lang3/StringUtils.java;h=1d7b9b99335865a88c509339f700ce71ce2c71f2;hb=HEAD#l934
  /**
   * NFD-normalize, substitute a few letters NFD can't decompose (stroked
   * L/O), then strip all combining diacritical marks.
   */
  def removeAccents(s : String) : String = {
    val replacements = Map(
      '\u0141' -> 'L',
      '\u0142' -> 'l', // Letter ell
      '\u00d8' -> 'O',
      '\u00f8' -> 'o'
    )
    val sb = new StringBuilder(Normalizer.normalize(s, Normalizer.Form.NFD))
    // Each substitution deletes then inserts exactly one char, so the
    // length (and loop bound) stays valid throughout.
    for (i <- 0 until sb.length) {
      for (key <- replacements.keys) {
        if (sb(i) == key) {
          sb.deleteCharAt(i);
          sb.insert(i, replacements(key))
        }
      }
    }
    val pattern = Pattern.compile("\\p{InCombiningDiacriticalMarks}+")
    pattern.matcher(sb).replaceAll("")
  }

  // Source: https://stackoverflow.com/a/30076541/631051
  def removePunctuation(s: String) : String = {
    s.replaceAll("""[\p{Punct}]""", "")
  }

  // Adapted from: https://stackoverflow.com/a/16018452/631051
  /**
   * Normalized similarity in [0.0, 1.0]: 1 - editDistance / longerLength,
   * computed on accent- and punctuation-stripped inputs.
   */
  def similarity(s1a : String, s2a : String) : Double = {
    val (s1, s2) = (removeAccents(removePunctuation(s1a)),
      removeAccents(removePunctuation(s2a)))
    val longer : String = if (s1.length > s2.length) s1 else s2
    val shorter : String = if (s1.length > s2.length) s2 else s1
    if (longer.length == 0) {
      // Both strings are empty.
      1.0
    } else {
      (longer.length - stringDistance(longer, shorter)) / longer.length.toDouble
    }
  }

  // Source: https://oldfashionedsoftware.com/2009/11/19/string-distance-and-refactoring-in-scala/
  // Levenshtein distance, memoized. NOTE(review): recursion is not
  // tail-recursive, so very long titles could deepen the stack — confirm
  // input lengths stay modest.
  def stringDistance(s1: String, s2: String): Int = {
    val memo = scala.collection.mutable.Map[(List[Char],List[Char]),Int]()
    def min(a:Int, b:Int, c:Int) = Math.min( Math.min( a, b ), c)
    def sd(s1: List[Char], s2: List[Char]): Int = {
      if (!memo.contains((s1, s2))) {
        memo((s1,s2)) = (s1, s2) match {
          case (_, Nil) => s1.length
          case (Nil, _) => s2.length
          case (c1::t1, c2::t2) =>
            min( sd(t1,s2) + 1, sd(s1,t2) + 1,
              sd(t1,t2) + (if (c1==c2) 0 else 1) )
        }
      }
      memo((s1,s2))
    }

    sd( s1.toList, s2.toList )
  }
}