diff options
Diffstat (limited to 'scalding/src/main/scala')
| -rw-r--r-- | scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala | 71 | 
1 files changed, 41 insertions, 30 deletions
| diff --git a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala index 714af36..c47ea3c 100644 --- a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala +++ b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala @@ -87,37 +87,40 @@ object HBaseCrossrefScore {      List("grobid0:tei_json"),      SourceMode.SCAN_ALL) -  def performJoin(grobidJson : String, crossRefJson : String, sha1 : String) : (String, String, String) = { -    (sha1, "1.2.3.4", "100") -  } - -  def jsonToMap(json : String) : Map[String, Any] = { +  def jsonToMap(json : String) : Option[Map[String, Any]] = {      // https://stackoverflow.com/a/32717262/631051      val jsonObject = JSON.parseFull(json)      if (jsonObject == None) { -      // Empty map for malformed JSON -      Map[String, Any]("malformed json" -> json) +      None      } else { -      jsonObject.get.asInstanceOf[Map[String, Any]] +      Some(jsonObject.get.asInstanceOf[Map[String, Any]])      }    }    def grobidToSlug(json : String) : Option[String] = { -    val map = jsonToMap(json) -    if (map contains "title") { -      titleToSlug(map("title").asInstanceOf[String]) -    } else { -      None +    jsonToMap(json) match { +      case None => None +      case Some(map) => { +        if (map contains "title") { +          titleToSlug(map("title").asInstanceOf[String]) +        } else { +          None +        } +      }      }    }    def crossrefToSlug(json : String) : Option[String] = { -    val map = jsonToMap(json) -    if (map contains "title") { -      // TODO: Don't ignore titles after the first. -      titleToSlug(map("title").asInstanceOf[List[String]](0)) -    } else { -      Some(map.keys.mkString(",")) +    jsonToMap(json) match { +      case None => None +      case Some(map) => { +        if (map contains "title") { +          // TODO: Don't ignore titles after the first. +          titleToSlug(map("title").asInstanceOf[List[String]](0)) +        } else { +          None +        } +      }      }    } @@ -150,16 +153,24 @@ object HBaseCrossrefScore {    def computeOutput(sha1 : String, grobidJson : String, crossrefJson : String) :      // (score, sha1, doi, grobidTitle, crossrefTitle)        (Int, String, String, String, String) = { -    // JSON has already been validated in previous stages. -    val grobid = jsonToMap(grobidJson) -    val crossref = jsonToMap(crossrefJson) - -    val grobidTitle = grobid("title").asInstanceOf[String].toLowerCase() -    val crossrefTitle = crossref("title").asInstanceOf[List[String]](0).toLowerCase() -    (computeSimilarity(grobidTitle, crossrefTitle), -      sha1, -      crossref("DOI").asInstanceOf[String], -      "'" + grobidTitle + "'", -      "'" + crossrefTitle + "'") +    jsonToMap(grobidJson) match { +      case None => (0, "", "", "", "")  // This can't happen, because grobidJson already validated in earlier stage +      case Some(grobid) => { +        val grobidTitle = grobid("title").asInstanceOf[String].toLowerCase() + +        jsonToMap(crossrefJson) match { +          case None => (0, "", "", "", "")  // This can't happen, because crossrefJson already validated in earlier stage +          case Some(crossref) => { +            val crossrefTitle = crossref("title").asInstanceOf[List[String]](0).toLowerCase() + +            (computeSimilarity(grobidTitle, crossrefTitle), +              sha1, +              crossref("DOI").asInstanceOf[String], +              "'" + grobidTitle + "'", +              "'" + crossrefTitle + "'") +          } +        } +      } +    }    }  } | 
