diff options
Diffstat (limited to 'scalding/src/main/scala')
| -rw-r--r-- | scalding/src/main/scala/sandcrawler/Scorable.scala | 19 | ||||
| -rw-r--r-- | scalding/src/main/scala/sandcrawler/ScorableFeatures.scala | 8 | 
2 files changed, 24 insertions, 3 deletions
diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala
index 5d67044..d9c38e8 100644
--- a/scalding/src/main/scala/sandcrawler/Scorable.scala
+++ b/scalding/src/main/scala/sandcrawler/Scorable.scala
@@ -60,6 +60,25 @@ object Scorable {
 
   val MaxScore = 1000
 
+  def selfMatchable(features1 : ReduceFeatures, features2 : ReduceFeatures) : Boolean = {
+    val json1 = jsonToMap(features1.json)
+    val json2 = jsonToMap(features2.json)
+
+    (
+      getStringOption(json1, "fatcat_release") != None &&
+      getStringOption(json2, "fatcat_release") != None &&
+      getStringOption(json1, "fatcat_release") != getStringOption(json2, "fatcat_release") &&
+      (getStringOption(json1, "fatcat_work") match {
+        case None => false
+        case Some(work1) => getStringOption(json2, "fatcat_work") match {
+          case None => false
+          // this last check ensures we don't double-match
+          case Some(work2) => work1 > work2
+        }
+      })
+    )
+  }
+
   def computeSimilarity(features1 : ReduceFeatures, features2 : ReduceFeatures) : Int = {
     val json1 = jsonToMap(features1.json)
     val json2 = jsonToMap(features2.json)
diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
index 95a39aa..93cd78d 100644
--- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
+++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
@@ -14,22 +14,24 @@ object ScorableFeatures {
   val MinSlugLength = 8
 
   // Static factory method
-  def create(title : String, authors : List[Any] = List(), year : Int = 0, doi : String = "", sha1 : String = "") : ScorableFeatures = {
+  def create(title : String, authors : List[Any] = List(), year : Int = 0, doi : String = "", fatcat_release : String = "", fatcat_work : String = "", sha1 : String = "") : ScorableFeatures = {
     new ScorableFeatures(
       title=if (title == null) "" else title,
       authors=if (authors == null) List() else authors.map(a => if (a == null) "" else a),
       year=year,
       doi=if (doi == null) "" else doi,
+      fatcat_release=if (fatcat_release == null) "" else fatcat_release,
+      fatcat_work=if (fatcat_work == null) "" else fatcat_work,
       sha1=if (sha1 == null) "" else sha1)
   }
 }
 
 // Contains features needed to make slug and to score (in combination
 // with a second ScorableFeatures). Create with above static factory method.
-class ScorableFeatures private(title : String, authors : List[Any] = List(), year: Int = 0, doi : String = "", sha1: String = "") {
+class ScorableFeatures private(title : String, authors : List[Any] = List(), year: Int = 0, doi : String = "", fatcat_release : String = "", fatcat_work : String = "", sha1: String = "") {
 
   def toMap() : Map[String, Any] =
-    Map("title" -> title, "authors" -> JSONArray(authors), "year" -> year, "doi" -> doi, "sha1" -> sha1)
+    Map("title" -> title, "authors" -> JSONArray(authors), "year" -> year, "doi" -> doi, "fatcat_release" -> fatcat_release, "fatcat_work" -> fatcat_work, "sha1" -> sha1)
 
   override def toString() : String = {
     JSONObject(toMap).toString
