diff options
author | Ellen Spertus <ellen.spertus@gmail.com> | 2018-07-24 14:27:33 -0700 |
---|---|---|
committer | Ellen Spertus <ellen.spertus@gmail.com> | 2018-07-24 14:27:33 -0700 |
commit | 07edf1ccad9c3268324926471dd0c8a7433f0c08 (patch) | |
tree | 559f773d04fe89ea31bb25d1fb6c02d963766962 /scalding | |
parent | 8a63e05c18bbf84dddccd5596f9e0aefbf469789 (diff) | |
download | sandcrawler-07edf1ccad9c3268324926471dd0c8a7433f0c08.tar.gz sandcrawler-07edf1ccad9c3268324926471dd0c8a7433f0c08.zip |
Clean-up
Diffstat (limited to 'scalding')
-rw-r--r-- | scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala | 42 | ||||
-rw-r--r-- | scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala | 5 |
2 files changed, 28 insertions, 19 deletions
diff --git a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala index 30f76a0..12660e8 100644 --- a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala +++ b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala @@ -41,34 +41,42 @@ class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with HBasePipeConv } object HBaseCrossrefScore { - def grobidToSlug(json : String) : Option[String] = { + def jsonToMap(json : String) : Map[String, Any] = { // https://stackoverflow.com/a/32717262/631051 val jsonObject = JSON.parseFull(json) if (jsonObject == None) { - None + // Empty map for malformed JSON + Map[String, Any]() } else { - val globalMap = jsonObject.get.asInstanceOf[Map[String, Any]] - globalMap.get("title") match { - case Some(title) => Some(titleToSlug(title.asInstanceOf[String])) - case None => None - } + jsonObject.get.asInstanceOf[Map[String, Any]] } } - def crossrefToSlug(json : String) : Option[String] = { - val jsonObject = JSON.parseFull(json) - if (jsonObject == None) { + + def grobidToSlug(json : String) : Option[String] = { + val map = jsonToMap(json) + if (map contains "title") { + titleToSlug(map("title").asInstanceOf[String]) + } else { None + } + } + + def crossrefToSlug(json : String) : Option[String] = { + val map = jsonToMap(json) + if (map contains "title") { + titleToSlug(map("title").asInstanceOf[List[String]](0)) } else { - val globalMap = jsonObject.get.asInstanceOf[Map[String, Any]] - globalMap.get("title") match { - case Some(title) => Some(titleToSlug(title.asInstanceOf[List[String]](0))) - case None => None - } + None } } - def titleToSlug(title : String) : String = { - title.split(":")(0).toLowerCase() + def titleToSlug(title : String) : Option[String] = { + val slug = title.split(":")(0).toLowerCase() + if (slug.isEmpty) { + None + } else { + Some(slug) + } } } diff --git a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala index 8bdc7a8..a59b278 100644 --- a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala +++ b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala @@ -110,11 +110,12 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers { "titleToSlug()" should "extract the parts of titles before a colon" in { val slug = HBaseCrossrefScore.titleToSlug("HELLO:there") - slug shouldBe "hello" + slug should contain ("hello") } + it should "extract an entire colon-less string" in { val slug = HBaseCrossrefScore.titleToSlug("hello THERE") - slug shouldBe "hello there" + slug should contain ("hello there") } "grobidToSlug()" should "get the right slug for a grobid json string" in { |