diff options
Diffstat (limited to 'scalding')
4 files changed, 14 insertions, 18 deletions
| diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala index cf5849c..ee4cc54 100644 --- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala +++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala @@ -10,7 +10,7 @@ import parallelai.spyglass.hbase.HBasePipeConversions  import parallelai.spyglass.hbase.HBaseSource  class CrossrefScorable extends Scorable { -  def getFeaturesPipe(args : Args)(implicit flowDef : FlowDef, mode : Mode) = { +  def getFeaturesPipe(args : Args)(implicit flowDef : FlowDef, mode : Mode) : TypedPipe[MapFeatures] = {      TextLine(args("crossref-input"))        .read        .toTypedPipe[String](new Fields("line")) diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala index bf36855..95d6dae 100644 --- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala +++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala @@ -10,7 +10,7 @@ import parallelai.spyglass.hbase.HBasePipeConversions  import parallelai.spyglass.hbase.HBaseSource  class GrobidScorable extends Scorable with HBasePipeConversions { -  def getFeaturesPipe(args : Args)(implicit flowDef : FlowDef, mode : Mode) = { +  def getFeaturesPipe(args : Args)(implicit flowDef : FlowDef, mode : Mode) : TypedPipe[MapFeatures] = {      // TODO: Clean up code after debugging.      val grobidSource = HBaseBuilder.build(        args("hbase-table"), @@ -18,21 +18,18 @@ class GrobidScorable extends Scorable with HBasePipeConversions {        List("grobid0:tei_json"),        SourceMode.SCAN_ALL) -//    val pipe0 : Pipe = grobidSource.read -//    val grobidPipe : TypedPipe[MapFeatures] = pipe0      grobidSource.read -    .fromBytesWritable(new Fields("key", "tei_json")) -    //  .debug  // Should be 4 tuples for mocked data +      .fromBytesWritable(new Fields("key", "tei_json"))      // TODO: Figure out why this line (used in HBaseCrossrefScoreJob.scala)      // didn't work here: .toTypedPipe[(String, String)]('key, 'tei_json) -    .toTypedPipe[(String, String)](new Fields("key", "tei_json")) -    .map { entry => -      val (key : String, json : String) = (entry._1, entry._2) -      GrobidScorable.grobidToSlug(json) match { -        case Some(slug) => new MapFeatures(slug, json) -        case None => new MapFeatures(Scorable.NoSlug, json) +      .toTypedPipe[(String, String)](new Fields("key", "tei_json")) +      .map { entry => +        val (key : String, json : String) = (entry._1, entry._2) +        GrobidScorable.grobidToSlug(json) match { +          case Some(slug) => new MapFeatures(slug, json) +          case None => new MapFeatures(Scorable.NoSlug, json) +        }        } -    }    }  } diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala index ce4fdca..86336cb 100644 --- a/scalding/src/main/scala/sandcrawler/Scorable.scala +++ b/scalding/src/main/scala/sandcrawler/Scorable.scala @@ -30,7 +30,7 @@ abstract class Scorable {  object Scorable {    val NoSlug = "NO SLUG" // Used for slug if title is empty or unparseable -  def isValidSlug(slug : String) = { +  def isValidSlug(slug : String) : Boolean = {      slug != NoSlug    } @@ -59,8 +59,7 @@ object Scorable {      }    } -  def getStringOption(optionalMap : Option[Map[String, Any]], key : String)  -      : Option[String] = { +  def getStringOption(optionalMap : Option[Map[String, Any]], key : String) : Option[String] = {      optionalMap match {        case None => None        case Some(map) => if (map contains key) Some(map(key).asInstanceOf[String]) else None @@ -83,7 +82,7 @@ object Scorable {        case Some(title1) => {          getStringOption(json2, "title") match {            case None => 0 -          case Some(title2) =>  +          case Some(title2) =>              (StringUtilities.similarity(title1, title2) * MaxScore).toInt          }        } diff --git a/scalding/src/main/scala/sandcrawler/StringUtilities.scala b/scalding/src/main/scala/sandcrawler/StringUtilities.scala index 3058f15..b6e5554 100644 --- a/scalding/src/main/scala/sandcrawler/StringUtilities.scala +++ b/scalding/src/main/scala/sandcrawler/StringUtilities.scala @@ -32,7 +32,7 @@ object StringUtilities {    // Adapted from: https://stackoverflow.com/a/16018452/631051    def similarity(s1a : String, s2a : String) : Double = { -    val (s1, s2) = (removeAccents(removePunctuation(s1a)),  +    val (s1, s2) = (removeAccents(removePunctuation(s1a)),        removeAccents(removePunctuation(s2a)))      val longer : String = if (s1.length > s2.length) s1 else s2      val shorter : String = if (s1.length > s2.length) s2 else s1 | 
