about summary refs log tree commit diff stats
path: root/scalding/src/main
diff options
context:
space:
mode:
Diffstat (limited to 'scalding/src/main')
-rw-r--r-- scalding/src/main/scala/sandcrawler/CrossrefScorable.scala 2
-rw-r--r-- scalding/src/main/scala/sandcrawler/GrobidScorable.scala 21
-rw-r--r-- scalding/src/main/scala/sandcrawler/Scorable.scala 7
-rw-r--r-- scalding/src/main/scala/sandcrawler/StringUtilities.scala 2
4 files changed, 14 insertions, 18 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
index cf5849c..ee4cc54 100644
--- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
@@ -10,7 +10,7 @@ import parallelai.spyglass.hbase.HBasePipeConversions
import parallelai.spyglass.hbase.HBaseSource
class CrossrefScorable extends Scorable {
- def getFeaturesPipe(args : Args)(implicit flowDef : FlowDef, mode : Mode) = {
+ def getFeaturesPipe(args : Args)(implicit flowDef : FlowDef, mode : Mode) : TypedPipe[MapFeatures] = {
TextLine(args("crossref-input"))
.read
.toTypedPipe[String](new Fields("line"))
diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
index bf36855..95d6dae 100644
--- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
@@ -10,7 +10,7 @@ import parallelai.spyglass.hbase.HBasePipeConversions
import parallelai.spyglass.hbase.HBaseSource
class GrobidScorable extends Scorable with HBasePipeConversions {
- def getFeaturesPipe(args : Args)(implicit flowDef : FlowDef, mode : Mode) = {
+ def getFeaturesPipe(args : Args)(implicit flowDef : FlowDef, mode : Mode) : TypedPipe[MapFeatures] = {
// TODO: Clean up code after debugging.
val grobidSource = HBaseBuilder.build(
args("hbase-table"),
@@ -18,21 +18,18 @@ class GrobidScorable extends Scorable with HBasePipeConversions {
List("grobid0:tei_json"),
SourceMode.SCAN_ALL)
-// val pipe0 : Pipe = grobidSource.read
-// val grobidPipe : TypedPipe[MapFeatures] = pipe0
grobidSource.read
- .fromBytesWritable(new Fields("key", "tei_json"))
- // .debug // Should be 4 tuples for mocked data
+ .fromBytesWritable(new Fields("key", "tei_json"))
// TODO: Figure out why this line (used in HBaseCrossrefScoreJob.scala)
// didn't work here: .toTypedPipe[(String, String)]('key, 'tei_json)
- .toTypedPipe[(String, String)](new Fields("key", "tei_json"))
- .map { entry =>
- val (key : String, json : String) = (entry._1, entry._2)
- GrobidScorable.grobidToSlug(json) match {
- case Some(slug) => new MapFeatures(slug, json)
- case None => new MapFeatures(Scorable.NoSlug, json)
+ .toTypedPipe[(String, String)](new Fields("key", "tei_json"))
+ .map { entry =>
+ val (key : String, json : String) = (entry._1, entry._2)
+ GrobidScorable.grobidToSlug(json) match {
+ case Some(slug) => new MapFeatures(slug, json)
+ case None => new MapFeatures(Scorable.NoSlug, json)
+ }
}
- }
}
}
diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala
index ce4fdca..86336cb 100644
--- a/scalding/src/main/scala/sandcrawler/Scorable.scala
+++ b/scalding/src/main/scala/sandcrawler/Scorable.scala
@@ -30,7 +30,7 @@ abstract class Scorable {
object Scorable {
val NoSlug = "NO SLUG" // Used for slug if title is empty or unparseable
- def isValidSlug(slug : String) = {
+ def isValidSlug(slug : String) : Boolean = {
slug != NoSlug
}
@@ -59,8 +59,7 @@ object Scorable {
}
}
- def getStringOption(optionalMap : Option[Map[String, Any]], key : String)
- : Option[String] = {
+ def getStringOption(optionalMap : Option[Map[String, Any]], key : String) : Option[String] = {
optionalMap match {
case None => None
case Some(map) => if (map contains key) Some(map(key).asInstanceOf[String]) else None
@@ -83,7 +82,7 @@ object Scorable {
case Some(title1) => {
getStringOption(json2, "title") match {
case None => 0
- case Some(title2) =>
+ case Some(title2) =>
(StringUtilities.similarity(title1, title2) * MaxScore).toInt
}
}
diff --git a/scalding/src/main/scala/sandcrawler/StringUtilities.scala b/scalding/src/main/scala/sandcrawler/StringUtilities.scala
index 3058f15..b6e5554 100644
--- a/scalding/src/main/scala/sandcrawler/StringUtilities.scala
+++ b/scalding/src/main/scala/sandcrawler/StringUtilities.scala
@@ -32,7 +32,7 @@ object StringUtilities {
// Adapted from: https://stackoverflow.com/a/16018452/631051
def similarity(s1a : String, s2a : String) : Double = {
- val (s1, s2) = (removeAccents(removePunctuation(s1a)),
+ val (s1, s2) = (removeAccents(removePunctuation(s1a)),
removeAccents(removePunctuation(s2a)))
val longer : String = if (s1.length > s2.length) s1 else s2
val shorter : String = if (s1.length > s2.length) s2 else s1