diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-08-02 17:11:31 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-08-10 19:50:21 -0700 |
commit | ca725ffd9efe847905afb918ff324b421a4d8859 (patch) | |
tree | 4e5e5a7a5df273e9be2e14ecde3f4b0c2d3b998b /scalding/src/main/scala/sandcrawler/Scorable.scala | |
parent | ff60cb2411082b2e5ea4e09875006824632b81a2 (diff) | |
download | sandcrawler-ca725ffd9efe847905afb918ff324b421a4d8859.tar.gz sandcrawler-ca725ffd9efe847905afb918ff324b421a4d8859.zip |
add fatcat ident fields in prep for self-scoring job
Diffstat (limited to 'scalding/src/main/scala/sandcrawler/Scorable.scala')
-rw-r--r-- | scalding/src/main/scala/sandcrawler/Scorable.scala | 19 |
1 files changed, 19 insertions, 0 deletions
diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala
index 5d67044..d9c38e8 100644
--- a/scalding/src/main/scala/sandcrawler/Scorable.scala
+++ b/scalding/src/main/scala/sandcrawler/Scorable.scala
@@ -60,6 +60,25 @@ object Scorable {
 
   val MaxScore = 1000
 
+  def selfMatchable(features1 : ReduceFeatures, features2 : ReduceFeatures) : Boolean = {
+    val json1 = jsonToMap(features1.json)
+    val json2 = jsonToMap(features2.json)
+
+    (
+      getStringOption(json1, "fatcat_release") != None &&
+      getStringOption(json2, "fatcat_release") != None &&
+      getStringOption(json1, "fatcat_release") != getStringOption(json2, "fatcat_release") &&
+      (getStringOption(json1, "fatcat_work") match {
+        case None => false
+        case Some(work1) => getStringOption(json2, "fatcat_work") match {
+          case None => false
+          // this last check ensures we don't double-match
+          case Some(work2) => work1 > work2
+        }
+      })
+    )
+  }
+
   def computeSimilarity(features1 : ReduceFeatures, features2 : ReduceFeatures) : Int = {
     val json1 = jsonToMap(features1.json)
     val json2 = jsonToMap(features2.json)