diff options
Diffstat (limited to 'scalding/src/main/scala')
| -rw-r--r-- | scalding/src/main/scala/sandcrawler/Scorable.scala | 15 | ||||
| -rw-r--r-- | scalding/src/main/scala/sandcrawler/ScoreJob.scala | 4 | 
2 files changed, 9 insertions, 10 deletions
diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala
index 948002b..77bb7ae 100644
--- a/scalding/src/main/scala/sandcrawler/Scorable.scala
+++ b/scalding/src/main/scala/sandcrawler/Scorable.scala
@@ -69,19 +69,16 @@ object Scorable {
 
   val MaxScore = 1000
 
-  def computeOutput(feature1 : ReduceFeatures, feature2 : ReduceFeatures) :
-      ReduceOutput = {
-    val json1 = jsonToMap(feature1.json)
-    val json2 = jsonToMap(feature2.json)
+  def computeSimilarity(features1 : ReduceFeatures, features2 : ReduceFeatures) : Int = {
+    val json1 = jsonToMap(features1.json)
+    val json2 = jsonToMap(features2.json)
     getStringOption(json1, "title") match {
-      case None => ReduceOutput(0, "No title", feature1.json)
+      case None => 0
       case Some(title1) => {
         getStringOption(json2, "title") match {
-          case None => ReduceOutput(0, "No title", feature2.json)
+          case None => 0
           case Some(title2) => 
-            ReduceOutput(
-              (StringUtilities.similarity(title1, title2) * MaxScore).toInt,
-              feature1.json, feature2.json)
+            (StringUtilities.similarity(title1, title2) * MaxScore).toInt
         }
       }
     }
diff --git a/scalding/src/main/scala/sandcrawler/ScoreJob.scala b/scalding/src/main/scala/sandcrawler/ScoreJob.scala
index 22cc9e9..e6a5dc1 100644
--- a/scalding/src/main/scala/sandcrawler/ScoreJob.scala
+++ b/scalding/src/main/scala/sandcrawler/ScoreJob.scala
@@ -17,7 +17,9 @@ class ScoreJob(args: Args, sc1 : Scorable, sc2 : Scorable)(implicit flowDef : Fl
 
   pipe1.join(pipe2).map { entry =>
     val (slug : String, (features1 : ReduceFeatures, features2 : ReduceFeatures)) = entry
-    Scorable.computeOutput(features1, features2)
+    new ReduceOutput(Scorable.computeSimilarity(features1, features2),
+      features1.json,
+      features2.json)
   }
     .write(TypedTsv[ReduceOutput](args("output")))
 }
