about summary refs log tree commit diff stats
path: root/scalding
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2019-08-02 17:11:31 -0700
committer Bryan Newbold <bnewbold@archive.org> 2019-08-10 19:50:21 -0700
commit ca725ffd9efe847905afb918ff324b421a4d8859 (patch)
tree 4e5e5a7a5df273e9be2e14ecde3f4b0c2d3b998b /scalding
parent ff60cb2411082b2e5ea4e09875006824632b81a2 (diff)
download sandcrawler-ca725ffd9efe847905afb918ff324b421a4d8859.tar.gz
sandcrawler-ca725ffd9efe847905afb918ff324b421a4d8859.zip
add fatcat ident fields in prep for self-scoring job
Diffstat (limited to 'scalding')
-rw-r--r-- scalding/src/main/scala/sandcrawler/Scorable.scala 19
-rw-r--r-- scalding/src/main/scala/sandcrawler/ScorableFeatures.scala 8
2 files changed, 24 insertions, 3 deletions
diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala
index 5d67044..d9c38e8 100644
--- a/scalding/src/main/scala/sandcrawler/Scorable.scala
+++ b/scalding/src/main/scala/sandcrawler/Scorable.scala
@@ -60,6 +60,25 @@ object Scorable {
val MaxScore = 1000
+ def selfMatchable(features1 : ReduceFeatures, features2 : ReduceFeatures) : Boolean = {
+ val json1 = jsonToMap(features1.json)
+ val json2 = jsonToMap(features2.json)
+
+ (
+ getStringOption(json1, "fatcat_release") != None &&
+ getStringOption(json2, "fatcat_release") != None &&
+ getStringOption(json1, "fatcat_release") != getStringOption(json2, "fatcat_release") &&
+ (getStringOption(json1, "fatcat_work") match {
+ case None => false
+ case Some(work1) => getStringOption(json2, "fatcat_work") match {
+ case None => false
+ // this last check ensures we don't double-match
+ case Some(work2) => work1 > work2
+ }
+ })
+ )
+ }
+
def computeSimilarity(features1 : ReduceFeatures, features2 : ReduceFeatures) : Int = {
val json1 = jsonToMap(features1.json)
val json2 = jsonToMap(features2.json)
diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
index 95a39aa..93cd78d 100644
--- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
+++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
@@ -14,22 +14,24 @@ object ScorableFeatures {
val MinSlugLength = 8
// Static factory method
- def create(title : String, authors : List[Any] = List(), year : Int = 0, doi : String = "", sha1 : String = "") : ScorableFeatures = {
+ def create(title : String, authors : List[Any] = List(), year : Int = 0, doi : String = "", fatcat_release : String = "", fatcat_work : String = "", sha1 : String = "") : ScorableFeatures = {
new ScorableFeatures(
title=if (title == null) "" else title,
authors=if (authors == null) List() else authors.map(a => if (a == null) "" else a),
year=year,
doi=if (doi == null) "" else doi,
+ fatcat_release=if (fatcat_release == null) "" else fatcat_release,
+ fatcat_work=if (fatcat_work == null) "" else fatcat_work,
sha1=if (sha1 == null) "" else sha1)
}
}
// Contains features needed to make slug and to score (in combination
// with a second ScorableFeatures). Create with above static factory method.
-class ScorableFeatures private(title : String, authors : List[Any] = List(), year: Int = 0, doi : String = "", sha1: String = "") {
+class ScorableFeatures private(title : String, authors : List[Any] = List(), year: Int = 0, doi : String = "", fatcat_release : String = "", fatcat_work : String = "", sha1: String = "") {
def toMap() : Map[String, Any] =
- Map("title" -> title, "authors" -> JSONArray(authors), "year" -> year, "doi" -> doi, "sha1" -> sha1)
+ Map("title" -> title, "authors" -> JSONArray(authors), "year" -> year, "doi" -> doi, "fatcat_release" -> fatcat_release, "fatcat_work" -> fatcat_work, "sha1" -> sha1)
override def toString() : String = {
JSONObject(toMap).toString