diff options
author | Ellen Spertus <ellen.spertus@gmail.com> | 2018-07-24 12:25:45 -0700 |
---|---|---|
committer | Ellen Spertus <ellen.spertus@gmail.com> | 2018-07-24 12:25:45 -0700 |
commit | dae965840db388c53b969d76849e5e8e9569ceee (patch) | |
tree | cf015ebe7d55ae6b9bc1a1272a57fd0d25e86014 /scalding/src | |
parent | 3e33d60aac9db78d0458876fbe987627db222bbb (diff) | |
download | sandcrawler-dae965840db388c53b969d76849e5e8e9569ceee.tar.gz sandcrawler-dae965840db388c53b969d76849e5e8e9569ceee.zip |
Changed return type of grobidToSlug() to Option[String].
Diffstat (limited to 'scalding/src')
-rw-r--r-- | scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala | 18 |
-rw-r--r-- | scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala | 12 |
2 files changed, 20 insertions, 10 deletions
```diff
diff --git a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
index a22af81..d3e78fe 100644
--- a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
+++ b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
@@ -22,7 +22,7 @@ class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with HBasePipeConv
     sourceMode = SourceMode.SCAN_ALL)
 
   val grobidPipe = grobidSource
-    .read
+    .read
     .map('tei_json -> 'slug) {
       json : String => HBaseCrossrefScore.grobidToSlug(json)}
 
@@ -42,17 +42,21 @@ class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with HBasePipeConv
 }
 
 object HBaseCrossrefScore {
-  def grobidToSlug(json : String) = {
+  def grobidToSlug(json : String) : Option[String] = {
     // https://stackoverflow.com/a/32717262/631051
     val jsonObject = JSON.parseFull(json)
-    val globalMap = jsonObject.get.asInstanceOf[Map[String, Any]]
-    globalMap.get("title") match {
-      case Some(title) => titleToSlug(title.asInstanceOf[String])
-      case None => ""
+    if (jsonObject == None) {
+      None
+    } else {
+      val globalMap = jsonObject.get.asInstanceOf[Map[String, Any]]
+      globalMap.get("title") match {
+        case Some(title) => Some(titleToSlug(title.asInstanceOf[String]))
+        case None => None
+      }
     }
   }
 
-  def titleToSlug(title : String) = {
+  def titleToSlug(title : String) : String = {
     title.split(":")(0)
   }
 }
diff --git a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
index 186bb70..ab6a798 100644
--- a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
+++ b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
@@ -51,6 +51,7 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers {
 }
 """
   val GrobidStringWithoutTitle = GrobidString.replace("title", "nottitle")
+  val MalformedGrobidString = GrobidString.replace("}", "")
 
   "titleToSlug()" should "extract the parts of titles before a colon" in {
     val slug = HBaseCrossrefScore.titleToSlug("hello:there")
@@ -63,11 +64,16 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers {
 
   "grobidToSlug()" should "get the right slug for a grobid json string" in {
     val slug = HBaseCrossrefScore.grobidToSlug(GrobidString)
-    slug shouldBe "Dummy Example File"
+    slug should contain ("Dummy Example File")
   }
 
-  "grobidToSlug()" should "return empty string for a grobid json string without a title" in {
+  "grobidToSlug()" should "return None if given json string without title" in {
     val slug = HBaseCrossrefScore.grobidToSlug(GrobidStringWithoutTitle)
-    slug shouldBe ""
+    slug shouldBe None
+  }
+
+  "grobidToSlug()" should "return None if given a malformed json string" in {
+    val slug = HBaseCrossrefScore.grobidToSlug(MalformedGrobidString)
+    slug shouldBe None
   }
 }
```