about | summary | refs | log | tree | commit | diff | stats
path: root/scalding/src
diff options
context:
space:
mode:
Diffstat (limited to 'scalding/src')
-rw-r--r-- scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala | 18
-rw-r--r-- scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala | 12
2 files changed, 20 insertions, 10 deletions
diff --git a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
index a22af81..d3e78fe 100644
--- a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
+++ b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
@@ -22,7 +22,7 @@ class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with HBasePipeConv
sourceMode = SourceMode.SCAN_ALL)
val grobidPipe = grobidSource
- .read
+ .read
.map('tei_json -> 'slug) {
json : String => HBaseCrossrefScore.grobidToSlug(json)}
@@ -42,17 +42,21 @@ class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with HBasePipeConv
}
object HBaseCrossrefScore {
- def grobidToSlug(json : String) = {
+ def grobidToSlug(json : String) : Option[String] = {
// https://stackoverflow.com/a/32717262/631051
val jsonObject = JSON.parseFull(json)
- val globalMap = jsonObject.get.asInstanceOf[Map[String, Any]]
- globalMap.get("title") match {
- case Some(title) => titleToSlug(title.asInstanceOf[String])
- case None => ""
+ if (jsonObject == None) {
+ None
+ } else {
+ val globalMap = jsonObject.get.asInstanceOf[Map[String, Any]]
+ globalMap.get("title") match {
+ case Some(title) => Some(titleToSlug(title.asInstanceOf[String]))
+ case None => None
+ }
}
}
- def titleToSlug(title : String) = {
+ def titleToSlug(title : String) : String = {
title.split(":")(0)
}
}
diff --git a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
index 186bb70..ab6a798 100644
--- a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
+++ b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
@@ -51,6 +51,7 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers {
}
"""
val GrobidStringWithoutTitle = GrobidString.replace("title", "nottitle")
+ val MalformedGrobidString = GrobidString.replace("}", "")
"titleToSlug()" should "extract the parts of titles before a colon" in {
val slug = HBaseCrossrefScore.titleToSlug("hello:there")
@@ -63,11 +64,16 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers {
"grobidToSlug()" should "get the right slug for a grobid json string" in {
val slug = HBaseCrossrefScore.grobidToSlug(GrobidString)
- slug shouldBe "Dummy Example File"
+ slug should contain ("Dummy Example File")
}
- "grobidToSlug()" should "return empty string for a grobid json string without a title" in {
+ "grobidToSlug()" should "return None if given json string without title" in {
val slug = HBaseCrossrefScore.grobidToSlug(GrobidStringWithoutTitle)
- slug shouldBe ""
+ slug shouldBe None
+ }
+
+ "grobidToSlug()" should "return None if given a malformed json string" in {
+ val slug = HBaseCrossrefScore.grobidToSlug(MalformedGrobidString)
+ slug shouldBe None
}
}