aboutsummaryrefslogtreecommitdiffstats
path: root/scalding/src/main/scala/sandcrawler/Scorable.scala
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2019-08-02 17:11:31 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2019-08-10 19:50:21 -0700
commit: ca725ffd9efe847905afb918ff324b421a4d8859 (patch)
tree: 4e5e5a7a5df273e9be2e14ecde3f4b0c2d3b998b /scalding/src/main/scala/sandcrawler/Scorable.scala
parent: ff60cb2411082b2e5ea4e09875006824632b81a2 (diff)
download: sandcrawler-ca725ffd9efe847905afb918ff324b421a4d8859.tar.gz
download: sandcrawler-ca725ffd9efe847905afb918ff324b421a4d8859.zip
add fatcat ident fields in prep for self-scoring job
Diffstat (limited to 'scalding/src/main/scala/sandcrawler/Scorable.scala')
-rw-r--r-- scalding/src/main/scala/sandcrawler/Scorable.scala | 19
1 files changed, 19 insertions, 0 deletions
diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala
index 5d67044..d9c38e8 100644
--- a/scalding/src/main/scala/sandcrawler/Scorable.scala
+++ b/scalding/src/main/scala/sandcrawler/Scorable.scala
@@ -60,6 +60,25 @@ object Scorable {
val MaxScore = 1000
+ /** Decides whether two feature records are eligible for self-matching.
+  *
+  * Both records must carry a "fatcat_release" and a "fatcat_work" string, and
+  * the two releases must differ. Records whose works are equal never qualify.
+  * @return true only when all of the above hold and work1 sorts after work2
+  */
+ def selfMatchable(features1 : ReduceFeatures, features2 : ReduceFeatures) : Boolean = {
+   val json1 = jsonToMap(features1.json)
+   val json2 = jsonToMap(features2.json)
+   val verdict = for {
+     release1 <- getStringOption(json1, "fatcat_release")
+     release2 <- getStringOption(json2, "fatcat_release")
+     if release1 != release2
+     work1 <- getStringOption(json1, "fatcat_work")
+     work2 <- getStringOption(json2, "fatcat_work")
+   } yield work1 > work2 // strict ordering: a pair passes in one direction only (no double-match)
+   verdict.getOrElse(false)
+ }
+
def computeSimilarity(features1 : ReduceFeatures, features2 : ReduceFeatures) : Int = {
val json1 = jsonToMap(features1.json)
val json2 = jsonToMap(features2.json)