aboutsummaryrefslogtreecommitdiffstats
path: root/scalding/src/main/scala/sandcrawler/Scorable.scala
diff options
context:
space:
mode:
Diffstat (limited to 'scalding/src/main/scala/sandcrawler/Scorable.scala')
-rw-r--r--scalding/src/main/scala/sandcrawler/Scorable.scala19
1 files changed, 19 insertions, 0 deletions
diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala
index 5d67044..d9c38e8 100644
--- a/scalding/src/main/scala/sandcrawler/Scorable.scala
+++ b/scalding/src/main/scala/sandcrawler/Scorable.scala
@@ -60,6 +60,25 @@ object Scorable {
val MaxScore = 1000
+ def selfMatchable(features1 : ReduceFeatures, features2 : ReduceFeatures) : Boolean = {
+ val json1 = jsonToMap(features1.json)
+ val json2 = jsonToMap(features2.json)
+
+ (
+ getStringOption(json1, "fatcat_release") != None &&
+ getStringOption(json2, "fatcat_release") != None &&
+ getStringOption(json1, "fatcat_release") != getStringOption(json2, "fatcat_release") &&
+ (getStringOption(json1, "fatcat_work") match {
+ case None => false
+ case Some(work1) => getStringOption(json2, "fatcat_work") match {
+ case None => false
+ // this last check ensures we don't double-match
+ case Some(work2) => work1 > work2
+ }
+ })
+ )
+ }
+
def computeSimilarity(features1 : ReduceFeatures, features2 : ReduceFeatures) : Int = {
val json1 = jsonToMap(features1.json)
val json2 = jsonToMap(features2.json)