about summary refs log tree commit diff stats
path: root/scalding/src/main/scala
diff options
context:
space:
mode:
author Ellen Spertus <ellen.spertus@gmail.com> 2018-07-26 15:26:48 -0700
committer Ellen Spertus <ellen.spertus@gmail.com> 2018-07-26 15:26:48 -0700
commit 8c70cdb1f0387233d5f3eeef8a91ebdeaccac04f (patch)
tree 90b827342ff7bbf27732905d1cbf09353bc051f0 /scalding/src/main/scala
parent 6d2bb4787150682236f4c349f8e469026fe3d490 (diff)
download sandcrawler-8c70cdb1f0387233d5f3eeef8a91ebdeaccac04f.tar.gz
sandcrawler-8c70cdb1f0387233d5f3eeef8a91ebdeaccac04f.zip
Made changes suggested in MR.
Diffstat (limited to 'scalding/src/main/scala')
-rw-r--r-- scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala 71
1 files changed, 41 insertions, 30 deletions
diff --git a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
index 714af36..c47ea3c 100644
--- a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
+++ b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
@@ -87,37 +87,40 @@ object HBaseCrossrefScore {
List("grobid0:tei_json"),
SourceMode.SCAN_ALL)
- def performJoin(grobidJson : String, crossRefJson : String, sha1 : String) : (String, String, String) = {
- (sha1, "1.2.3.4", "100")
- }
-
- def jsonToMap(json : String) : Map[String, Any] = {
+ def jsonToMap(json : String) : Option[Map[String, Any]] = {
// https://stackoverflow.com/a/32717262/631051
val jsonObject = JSON.parseFull(json)
if (jsonObject == None) {
- // Empty map for malformed JSON
- Map[String, Any]("malformed json" -> json)
+ None
} else {
- jsonObject.get.asInstanceOf[Map[String, Any]]
+ Some(jsonObject.get.asInstanceOf[Map[String, Any]])
}
}
def grobidToSlug(json : String) : Option[String] = {
- val map = jsonToMap(json)
- if (map contains "title") {
- titleToSlug(map("title").asInstanceOf[String])
- } else {
- None
+ jsonToMap(json) match {
+ case None => None
+ case Some(map) => {
+ if (map contains "title") {
+ titleToSlug(map("title").asInstanceOf[String])
+ } else {
+ None
+ }
+ }
}
}
def crossrefToSlug(json : String) : Option[String] = {
- val map = jsonToMap(json)
- if (map contains "title") {
- // TODO: Don't ignore titles after the first.
- titleToSlug(map("title").asInstanceOf[List[String]](0))
- } else {
- Some(map.keys.mkString(","))
+ jsonToMap(json) match {
+ case None => None
+ case Some(map) => {
+ if (map contains "title") {
+ // TODO: Don't ignore titles after the first.
+ titleToSlug(map("title").asInstanceOf[List[String]](0))
+ } else {
+ None
+ }
+ }
}
}
@@ -150,16 +153,24 @@ object HBaseCrossrefScore {
def computeOutput(sha1 : String, grobidJson : String, crossrefJson : String) :
// (score, sha1, doi, grobidTitle, crossrefTitle)
(Int, String, String, String, String) = {
- // JSON has already been validated in previous stages.
- val grobid = jsonToMap(grobidJson)
- val crossref = jsonToMap(crossrefJson)
-
- val grobidTitle = grobid("title").asInstanceOf[String].toLowerCase()
- val crossrefTitle = crossref("title").asInstanceOf[List[String]](0).toLowerCase()
- (computeSimilarity(grobidTitle, crossrefTitle),
- sha1,
- crossref("DOI").asInstanceOf[String],
- "'" + grobidTitle + "'",
- "'" + crossrefTitle + "'")
+ jsonToMap(grobidJson) match {
+ case None => (0, "", "", "", "") // This can't happen, because grobidJson already validated in earlier stage
+ case Some(grobid) => {
+ val grobidTitle = grobid("title").asInstanceOf[String].toLowerCase()
+
+ jsonToMap(crossrefJson) match {
+ case None => (0, "", "", "", "") // This can't happen, because crossrefJson already validated in earlier stage
+ case Some(crossref) => {
+ val crossrefTitle = crossref("title").asInstanceOf[List[String]](0).toLowerCase()
+
+ (computeSimilarity(grobidTitle, crossrefTitle),
+ sha1,
+ crossref("DOI").asInstanceOf[String],
+ "'" + grobidTitle + "'",
+ "'" + crossrefTitle + "'")
+ }
+ }
+ }
+ }
}
}