diff options
Diffstat (limited to 'scalding/src/main')
-rw-r--r-- | scalding/src/main/scala/sandcrawler/Scorable.scala | 15 | ||||
-rw-r--r-- | scalding/src/main/scala/sandcrawler/ScoreJob.scala | 4 |
2 files changed, 9 insertions, 10 deletions
diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala index 948002b..77bb7ae 100644 --- a/scalding/src/main/scala/sandcrawler/Scorable.scala +++ b/scalding/src/main/scala/sandcrawler/Scorable.scala @@ -69,19 +69,16 @@ object Scorable { val MaxScore = 1000 - def computeOutput(feature1 : ReduceFeatures, feature2 : ReduceFeatures) : - ReduceOutput = { - val json1 = jsonToMap(feature1.json) - val json2 = jsonToMap(feature2.json) + def computeSimilarity(features1 : ReduceFeatures, features2 : ReduceFeatures) : Int = { + val json1 = jsonToMap(features1.json) + val json2 = jsonToMap(features2.json) getStringOption(json1, "title") match { - case None => ReduceOutput(0, "No title", feature1.json) + case None => 0 case Some(title1) => { getStringOption(json2, "title") match { - case None => ReduceOutput(0, "No title", feature2.json) + case None => 0 case Some(title2) => - ReduceOutput( - (StringUtilities.similarity(title1, title2) * MaxScore).toInt, - feature1.json, feature2.json) + (StringUtilities.similarity(title1, title2) * MaxScore).toInt } } } diff --git a/scalding/src/main/scala/sandcrawler/ScoreJob.scala b/scalding/src/main/scala/sandcrawler/ScoreJob.scala index 22cc9e9..e6a5dc1 100644 --- a/scalding/src/main/scala/sandcrawler/ScoreJob.scala +++ b/scalding/src/main/scala/sandcrawler/ScoreJob.scala @@ -17,7 +17,9 @@ class ScoreJob(args: Args, sc1 : Scorable, sc2 : Scorable)(implicit flowDef : Fl pipe1.join(pipe2).map { entry => val (slug : String, (features1 : ReduceFeatures, features2 : ReduceFeatures)) = entry - Scorable.computeOutput(features1, features2) + new ReduceOutput(Scorable.computeSimilarity(features1, features2), + features1.json, + features2.json) } .write(TypedTsv[ReduceOutput](args("output"))) } |