diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-08-02 17:11:31 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-08-10 19:50:21 -0700 |
commit | ca725ffd9efe847905afb918ff324b421a4d8859 (patch) | |
tree | 4e5e5a7a5df273e9be2e14ecde3f4b0c2d3b998b | |
parent | ff60cb2411082b2e5ea4e09875006824632b81a2 (diff) | |
download | sandcrawler-ca725ffd9efe847905afb918ff324b421a4d8859.tar.gz sandcrawler-ca725ffd9efe847905afb918ff324b421a4d8859.zip |
add fatcat ident fields in prep for self-scoring job
-rw-r--r-- | scalding/src/main/scala/sandcrawler/Scorable.scala | 19 | ||||
-rw-r--r-- | scalding/src/main/scala/sandcrawler/ScorableFeatures.scala | 8 |
2 files changed, 24 insertions, 3 deletions
diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala index 5d67044..d9c38e8 100644 --- a/scalding/src/main/scala/sandcrawler/Scorable.scala +++ b/scalding/src/main/scala/sandcrawler/Scorable.scala @@ -60,6 +60,25 @@ object Scorable { val MaxScore = 1000 + def selfMatchable(features1 : ReduceFeatures, features2 : ReduceFeatures) : Boolean = { + val json1 = jsonToMap(features1.json) + val json2 = jsonToMap(features2.json) + + ( + getStringOption(json1, "fatcat_release") != None && + getStringOption(json2, "fatcat_release") != None && + getStringOption(json1, "fatcat_release") != getStringOption(json2, "fatcat_release") && + (getStringOption(json1, "fatcat_work") match { + case None => false + case Some(work1) => getStringOption(json2, "fatcat_work") match { + case None => false + // this last check ensures we don't double-match + case Some(work2) => work1 > work2 + } + }) + ) + } + def computeSimilarity(features1 : ReduceFeatures, features2 : ReduceFeatures) : Int = { val json1 = jsonToMap(features1.json) val json2 = jsonToMap(features2.json) diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala index 95a39aa..93cd78d 100644 --- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala +++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala @@ -14,22 +14,24 @@ object ScorableFeatures { val MinSlugLength = 8 // Static factory method - def create(title : String, authors : List[Any] = List(), year : Int = 0, doi : String = "", sha1 : String = "") : ScorableFeatures = { + def create(title : String, authors : List[Any] = List(), year : Int = 0, doi : String = "", fatcat_release : String = "", fatcat_work : String = "", sha1 : String = "") : ScorableFeatures = { new ScorableFeatures( title=if (title == null) "" else title, authors=if (authors == null) List() else authors.map(a => if (a == null) "" else a), year=year, doi=if (doi == null) "" else doi, + fatcat_release=if (fatcat_release == null) "" else fatcat_release, + fatcat_work=if (fatcat_work == null) "" else fatcat_work, sha1=if (sha1 == null) "" else sha1) } } // Contains features needed to make slug and to score (in combination // with a second ScorableFeatures). Create with above static factory method. -class ScorableFeatures private(title : String, authors : List[Any] = List(), year: Int = 0, doi : String = "", sha1: String = "") { +class ScorableFeatures private(title : String, authors : List[Any] = List(), year: Int = 0, doi : String = "", fatcat_release : String = "", fatcat_work : String = "", sha1: String = "") { def toMap() : Map[String, Any] = - Map("title" -> title, "authors" -> JSONArray(authors), "year" -> year, "doi" -> doi, "sha1" -> sha1) + Map("title" -> title, "authors" -> JSONArray(authors), "year" -> year, "doi" -> doi, "fatcat_release" -> fatcat_release, "fatcat_work" -> fatcat_work, "sha1" -> sha1) override def toString() : String = { JSONObject(toMap).toString |