about summary refs log tree commit diff stats
path: root/scalding
diff options
context:
space:
mode:
author Ellen Spertus <ellen.spertus@gmail.com> 2018-08-07 10:28:48 -0700
committer Ellen Spertus <ellen.spertus@gmail.com> 2018-08-07 10:28:48 -0700
commit 4981a98358aae098714d2266404f7b167993bf0c (patch)
tree e045e4ee2eca6517f02750bb8e5a2e5f60fe277e /scalding
parent dddb7ed410bdd542ca12756d3e97aca6beea5532 (diff)
download sandcrawler-4981a98358aae098714d2266404f7b167993bf0c.tar.gz
sandcrawler-4981a98358aae098714d2266404f7b167993bf0c.zip
Minor refactoring. Added test.
Diffstat (limited to 'scalding')
-rw-r--r-- scalding/src/main/scala/sandcrawler/Scorable.scala 15
-rw-r--r-- scalding/src/main/scala/sandcrawler/ScoreJob.scala 4
-rw-r--r-- scalding/src/test/scala/sandcrawler/ScorableTest.scala 5
3 files changed, 12 insertions, 12 deletions
diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala
index 948002b..77bb7ae 100644
--- a/scalding/src/main/scala/sandcrawler/Scorable.scala
+++ b/scalding/src/main/scala/sandcrawler/Scorable.scala
@@ -69,19 +69,16 @@ object Scorable {
val MaxScore = 1000
- def computeOutput(feature1 : ReduceFeatures, feature2 : ReduceFeatures) :
- ReduceOutput = {
- val json1 = jsonToMap(feature1.json)
- val json2 = jsonToMap(feature2.json)
+ def computeSimilarity(features1 : ReduceFeatures, features2 : ReduceFeatures) : Int = {
+ val json1 = jsonToMap(features1.json)
+ val json2 = jsonToMap(features2.json)
getStringOption(json1, "title") match {
- case None => ReduceOutput(0, "No title", feature1.json)
+ case None => 0
case Some(title1) => {
getStringOption(json2, "title") match {
- case None => ReduceOutput(0, "No title", feature2.json)
+ case None => 0
case Some(title2) =>
- ReduceOutput(
- (StringUtilities.similarity(title1, title2) * MaxScore).toInt,
- feature1.json, feature2.json)
+ (StringUtilities.similarity(title1, title2) * MaxScore).toInt
}
}
}
diff --git a/scalding/src/main/scala/sandcrawler/ScoreJob.scala b/scalding/src/main/scala/sandcrawler/ScoreJob.scala
index 22cc9e9..e6a5dc1 100644
--- a/scalding/src/main/scala/sandcrawler/ScoreJob.scala
+++ b/scalding/src/main/scala/sandcrawler/ScoreJob.scala
@@ -17,7 +17,9 @@ class ScoreJob(args: Args, sc1 : Scorable, sc2 : Scorable)(implicit flowDef : Fl
pipe1.join(pipe2).map { entry =>
val (slug : String, (features1 : ReduceFeatures, features2 : ReduceFeatures)) = entry
- Scorable.computeOutput(features1, features2)
+ new ReduceOutput(Scorable.computeSimilarity(features1, features2),
+ features1.json,
+ features2.json)
}
.write(TypedTsv[ReduceOutput](args("output")))
}
diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
index 535b8f6..9437fe6 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
@@ -77,8 +77,9 @@ class ScorableTest extends FlatSpec with Matchers {
}
"computeOutput()" should "return Scorable.MaxScore if given identical ReduceFeatures" in {
- val output = Scorable.computeOutput(new ReduceFeatures(JsonString), new ReduceFeatures(JsonString))
- output.score shouldBe Scorable.MaxScore
+ val score = Scorable.computeSimilarity(
+ new ReduceFeatures(JsonString), new ReduceFeatures(JsonString))
+ score shouldBe Scorable.MaxScore
}
/*