diff options
| author | Ellen Spertus <ellen.spertus@gmail.com> | 2018-07-24 14:27:33 -0700 | 
|---|---|---|
| committer | Ellen Spertus <ellen.spertus@gmail.com> | 2018-07-24 14:27:33 -0700 | 
| commit | 07edf1ccad9c3268324926471dd0c8a7433f0c08 (patch) | |
| tree | 559f773d04fe89ea31bb25d1fb6c02d963766962 /scalding | |
| parent | 8a63e05c18bbf84dddccd5596f9e0aefbf469789 (diff) | |
| download | sandcrawler-07edf1ccad9c3268324926471dd0c8a7433f0c08.tar.gz sandcrawler-07edf1ccad9c3268324926471dd0c8a7433f0c08.zip | |
Clean-up
Diffstat (limited to 'scalding')
| -rw-r--r-- | scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala | 42 | ||||
| -rw-r--r-- | scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala | 5 | 
2 files changed, 28 insertions, 19 deletions
```diff
diff --git a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
index 30f76a0..12660e8 100644
--- a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
+++ b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
@@ -41,34 +41,42 @@ class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with HBasePipeConv
 }
 
 object HBaseCrossrefScore {
-  def grobidToSlug(json : String) : Option[String] = {
+  def jsonToMap(json : String) : Map[String, Any] = {
     // https://stackoverflow.com/a/32717262/631051
     val jsonObject = JSON.parseFull(json)
     if (jsonObject == None) {
-      None
+      // Empty map for malformed JSON
+      Map[String, Any]()
     } else {
-      val globalMap = jsonObject.get.asInstanceOf[Map[String, Any]]
-      globalMap.get("title") match {
-        case Some(title) => Some(titleToSlug(title.asInstanceOf[String]))
-        case None => None
-      }
+      jsonObject.get.asInstanceOf[Map[String, Any]]
     }
   }
 
-  def crossrefToSlug(json : String) : Option[String] = {
-    val jsonObject = JSON.parseFull(json)
-    if (jsonObject == None) {
+
+  def grobidToSlug(json : String) : Option[String] = {
+    val map = jsonToMap(json)
+    if (map contains "title") {
+      titleToSlug(map("title").asInstanceOf[String])
+    } else {
       None
+    }
+  }
+
+  def crossrefToSlug(json : String) : Option[String] = {
+    val map = jsonToMap(json)
+    if (map contains "title") {
+      titleToSlug(map("title").asInstanceOf[List[String]](0))
     } else {
-      val globalMap = jsonObject.get.asInstanceOf[Map[String, Any]]
-      globalMap.get("title") match {
-        case Some(title) => Some(titleToSlug(title.asInstanceOf[List[String]](0)))
-        case None => None
-      }
+      None
     }
   }
 
-  def titleToSlug(title : String) : String = {
-    title.split(":")(0).toLowerCase()
+  def titleToSlug(title : String) : Option[String] = {
+    val slug = title.split(":")(0).toLowerCase()
+    if (slug.isEmpty) {
+      None
+    } else {
+      Some(slug)
+    }
   }
 }
diff --git a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
index 8bdc7a8..a59b278 100644
--- a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
+++ b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
@@ -110,11 +110,12 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers {
 
   "titleToSlug()" should "extract the parts of titles before a colon" in {
     val slug = HBaseCrossrefScore.titleToSlug("HELLO:there")
-    slug shouldBe "hello"
+    slug should contain ("hello")
   }
+
   it should "extract an entire colon-less string" in {
     val slug = HBaseCrossrefScore.titleToSlug("hello THERE")
-    slug shouldBe "hello there"
+    slug should contain ("hello there")
   }
 
   "grobidToSlug()" should "get the right slug for a grobid json string" in {
```
