diff options
Diffstat (limited to 'scalding/src')
4 files changed, 14 insertions, 18 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala index cf5849c..ee4cc54 100644 --- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala +++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala @@ -10,7 +10,7 @@ import parallelai.spyglass.hbase.HBasePipeConversions import parallelai.spyglass.hbase.HBaseSource class CrossrefScorable extends Scorable { - def getFeaturesPipe(args : Args)(implicit flowDef : FlowDef, mode : Mode) = { + def getFeaturesPipe(args : Args)(implicit flowDef : FlowDef, mode : Mode) : TypedPipe[MapFeatures] = { TextLine(args("crossref-input")) .read .toTypedPipe[String](new Fields("line")) diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala index bf36855..95d6dae 100644 --- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala +++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala @@ -10,7 +10,7 @@ import parallelai.spyglass.hbase.HBasePipeConversions import parallelai.spyglass.hbase.HBaseSource class GrobidScorable extends Scorable with HBasePipeConversions { - def getFeaturesPipe(args : Args)(implicit flowDef : FlowDef, mode : Mode) = { + def getFeaturesPipe(args : Args)(implicit flowDef : FlowDef, mode : Mode) : TypedPipe[MapFeatures] = { // TODO: Clean up code after debugging. val grobidSource = HBaseBuilder.build( args("hbase-table"), @@ -18,21 +18,18 @@ class GrobidScorable extends Scorable with HBasePipeConversions { List("grobid0:tei_json"), SourceMode.SCAN_ALL) -// val pipe0 : Pipe = grobidSource.read -// val grobidPipe : TypedPipe[MapFeatures] = pipe0 grobidSource.read - .fromBytesWritable(new Fields("key", "tei_json")) - // .debug // Should be 4 tuples for mocked data + .fromBytesWritable(new Fields("key", "tei_json")) // TODO: Figure out why this line (used in HBaseCrossrefScoreJob.scala) // didn't work here: .toTypedPipe[(String, String)]('key, 'tei_json) - .toTypedPipe[(String, String)](new Fields("key", "tei_json")) - .map { entry => - val (key : String, json : String) = (entry._1, entry._2) - GrobidScorable.grobidToSlug(json) match { - case Some(slug) => new MapFeatures(slug, json) - case None => new MapFeatures(Scorable.NoSlug, json) + .toTypedPipe[(String, String)](new Fields("key", "tei_json")) + .map { entry => + val (key : String, json : String) = (entry._1, entry._2) + GrobidScorable.grobidToSlug(json) match { + case Some(slug) => new MapFeatures(slug, json) + case None => new MapFeatures(Scorable.NoSlug, json) + } } - } } } diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala index ce4fdca..86336cb 100644 --- a/scalding/src/main/scala/sandcrawler/Scorable.scala +++ b/scalding/src/main/scala/sandcrawler/Scorable.scala @@ -30,7 +30,7 @@ abstract class Scorable { object Scorable { val NoSlug = "NO SLUG" // Used for slug if title is empty or unparseable - def isValidSlug(slug : String) = { + def isValidSlug(slug : String) : Boolean = { slug != NoSlug } @@ -59,8 +59,7 @@ object Scorable { } } - def getStringOption(optionalMap : Option[Map[String, Any]], key : String) - : Option[String] = { + def getStringOption(optionalMap : Option[Map[String, Any]], key : String) : Option[String] = { optionalMap match { case None => None case Some(map) => if (map contains key) Some(map(key).asInstanceOf[String]) else None @@ -83,7 +82,7 @@ object Scorable { case Some(title1) => { getStringOption(json2, "title") match { case None => 0 - case Some(title2) => + case Some(title2) => (StringUtilities.similarity(title1, title2) * MaxScore).toInt } } diff --git a/scalding/src/main/scala/sandcrawler/StringUtilities.scala b/scalding/src/main/scala/sandcrawler/StringUtilities.scala index 3058f15..b6e5554 100644 --- a/scalding/src/main/scala/sandcrawler/StringUtilities.scala +++ b/scalding/src/main/scala/sandcrawler/StringUtilities.scala @@ -32,7 +32,7 @@ object StringUtilities { // Adapted from: https://stackoverflow.com/a/16018452/631051 def similarity(s1a : String, s2a : String) : Double = { - val (s1, s2) = (removeAccents(removePunctuation(s1a)), + val (s1, s2) = (removeAccents(removePunctuation(s1a)), removeAccents(removePunctuation(s2a))) val longer : String = if (s1.length > s2.length) s1 else s2 val shorter : String = if (s1.length > s2.length) s2 else s1 |