diff options
| -rw-r--r-- | scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala | 18 | ||||
| -rw-r--r-- | scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala | 12 | 
2 files changed, 20 insertions, 10 deletions
| diff --git a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala index a22af81..d3e78fe 100644 --- a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala +++ b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala @@ -22,7 +22,7 @@ class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with HBasePipeConv      sourceMode = SourceMode.SCAN_ALL)    val grobidPipe = grobidSource -   .read +    .read      .map('tei_json -> 'slug) {        json : String => HBaseCrossrefScore.grobidToSlug(json)} @@ -42,17 +42,21 @@ class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with HBasePipeConv  }  object HBaseCrossrefScore { -  def grobidToSlug(json : String) = { +  def grobidToSlug(json : String) : Option[String] = {      // https://stackoverflow.com/a/32717262/631051      val jsonObject = JSON.parseFull(json) -    val globalMap = jsonObject.get.asInstanceOf[Map[String, Any]] -    globalMap.get("title") match { -      case Some(title) => titleToSlug(title.asInstanceOf[String]) -      case None => "" +    if (jsonObject == None) { +      None +    } else { +      val globalMap = jsonObject.get.asInstanceOf[Map[String, Any]] +      globalMap.get("title") match { +        case Some(title) => Some(titleToSlug(title.asInstanceOf[String])) +        case None => None +      }      }    } -  def titleToSlug(title : String) = { +  def titleToSlug(title : String) : String = {      title.split(":")(0)    }  } diff --git a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala index 186bb70..ab6a798 100644 --- a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala +++ b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala @@ -51,6 +51,7 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers {  }  """    val GrobidStringWithoutTitle = GrobidString.replace("title", "nottitle") +  val MalformedGrobidString = GrobidString.replace("}", "")    "titleToSlug()" should "extract the parts of titles before a colon" in {      val slug = HBaseCrossrefScore.titleToSlug("hello:there") @@ -63,11 +64,16 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers {    "grobidToSlug()" should "get the right slug for a grobid json string" in {      val slug = HBaseCrossrefScore.grobidToSlug(GrobidString) -    slug shouldBe "Dummy Example File" +    slug should contain ("Dummy Example File")    } -  "grobidToSlug()" should "return empty string for a grobid json string without a title" in { +  "grobidToSlug()" should "return None if given json string without title" in {      val slug = HBaseCrossrefScore.grobidToSlug(GrobidStringWithoutTitle) -    slug shouldBe "" +    slug shouldBe None +  } + +  "grobidToSlug()" should "return None if given a malformed json string" in { +    val slug = HBaseCrossrefScore.grobidToSlug(MalformedGrobidString) +    slug shouldBe None    }  } | 
