aboutsummaryrefslogtreecommitdiffstats
path: root/scalding
diff options
context:
space:
mode:
author Ellen Spertus <ellen.spertus@gmail.com> 2018-07-24 14:27:33 -0700
committer Ellen Spertus <ellen.spertus@gmail.com> 2018-07-24 14:27:33 -0700
commit 07edf1ccad9c3268324926471dd0c8a7433f0c08 (patch)
tree 559f773d04fe89ea31bb25d1fb6c02d963766962 /scalding
parent 8a63e05c18bbf84dddccd5596f9e0aefbf469789 (diff)
download sandcrawler-07edf1ccad9c3268324926471dd0c8a7433f0c08.tar.gz
sandcrawler-07edf1ccad9c3268324926471dd0c8a7433f0c08.zip
Clean-up
Diffstat (limited to 'scalding')
-rw-r--r-- scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala 42
-rw-r--r-- scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala 5
2 files changed, 28 insertions, 19 deletions
diff --git a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
index 30f76a0..12660e8 100644
--- a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
+++ b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
@@ -41,34 +41,42 @@ class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with HBasePipeConv
}
object HBaseCrossrefScore {
- def grobidToSlug(json : String) : Option[String] = {
+ def jsonToMap(json : String) : Map[String, Any] = {
// https://stackoverflow.com/a/32717262/631051
val jsonObject = JSON.parseFull(json)
if (jsonObject == None) {
- None
+ // Empty map for malformed JSON
+ Map[String, Any]()
} else {
- val globalMap = jsonObject.get.asInstanceOf[Map[String, Any]]
- globalMap.get("title") match {
- case Some(title) => Some(titleToSlug(title.asInstanceOf[String]))
- case None => None
- }
+ jsonObject.get.asInstanceOf[Map[String, Any]]
}
}
- def crossrefToSlug(json : String) : Option[String] = {
- val jsonObject = JSON.parseFull(json)
- if (jsonObject == None) {
+
+ def grobidToSlug(json : String) : Option[String] = {
+ val map = jsonToMap(json)
+ if (map contains "title") {
+ titleToSlug(map("title").asInstanceOf[String])
+ } else {
None
+ }
+ }
+
+ def crossrefToSlug(json : String) : Option[String] = {
+ val map = jsonToMap(json)
+ if (map contains "title") {
+ titleToSlug(map("title").asInstanceOf[List[String]](0))
} else {
- val globalMap = jsonObject.get.asInstanceOf[Map[String, Any]]
- globalMap.get("title") match {
- case Some(title) => Some(titleToSlug(title.asInstanceOf[List[String]](0)))
- case None => None
- }
+ None
}
}
- def titleToSlug(title : String) : String = {
- title.split(":")(0).toLowerCase()
+ def titleToSlug(title : String) : Option[String] = {
+ val slug = title.split(":")(0).toLowerCase()
+ if (slug.isEmpty) {
+ None
+ } else {
+ Some(slug)
+ }
}
}
diff --git a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
index 8bdc7a8..a59b278 100644
--- a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
+++ b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
@@ -110,11 +110,12 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers {
"titleToSlug()" should "extract the parts of titles before a colon" in {
val slug = HBaseCrossrefScore.titleToSlug("HELLO:there")
- slug shouldBe "hello"
+ slug should contain ("hello")
}
+
it should "extract an entire colon-less string" in {
val slug = HBaseCrossrefScore.titleToSlug("hello THERE")
- slug shouldBe "hello there"
+ slug should contain ("hello there")
}
"grobidToSlug()" should "get the right slug for a grobid json string" in {