about | summary | refs | log | tree | commit | diff | stats
path: root/scalding/src
diff options
context:
space:
mode:
Diffstat (limited to 'scalding/src')
-rw-r--r-- scalding/src/main/scala/sandcrawler/CrossrefScorable.scala | 41
-rw-r--r-- scalding/src/main/scala/sandcrawler/Scorable.scala | 1
-rw-r--r-- scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala | 24
-rw-r--r-- scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala | 1
-rw-r--r-- scalding/src/test/scala/sandcrawler/ScoreJobTest.scala | 15
5 files changed, 46 insertions, 36 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
index 667a5cc..e257152 100644
--- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
@@ -41,26 +41,31 @@ class CrossrefScorable extends Scorable with HBasePipeConversions {
def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures] = {
getSource(args).read
.toTypedPipe[String](new Fields("line"))
- .map{ json : String =>
- Scorable.jsonToMap(json) match {
- case None => MapFeatures(Scorable.NoSlug, json)
- case Some(map) => {
- if ((map contains "title") && (map contains "DOI")) {
- val titles = map("title").asInstanceOf[List[String]]
- if (titles.isEmpty) {
- new MapFeatures(Scorable.NoSlug, json)
- } else {
- val title = titles(0)
- val map2 = Scorable.toScorableMap(title=titles(0), doi=map("DOI").asInstanceOf[String])
- new MapFeatures(
- Scorable.mapToSlug(map2),
- JSONObject(map2).toString)
- }
- } else {
- new MapFeatures(Scorable.NoSlug, json)
- }
+ .map { CrossrefScorable.jsonToMapFeatures(_) }
+ }
+}
+
+object CrossrefScorable {
+ def jsonToMapFeatures(json : String) : MapFeatures = {
+ Scorable.jsonToMap(json) match {
+ case None => MapFeatures(Scorable.NoSlug, json)
+ case Some(map) => {
+ if ((map contains "titles") && (map contains "DOI")) {
+ val titles = map("titles").asInstanceOf[List[String]]
+ val doi = Scorable.getString(map, "DOI")
+ if (titles.isEmpty || titles == null || doi.isEmpty || doi == null) {
+ new MapFeatures(Scorable.NoSlug, json)
+ } else {
+ val title = titles(0)
+ val map2 = Scorable.toScorableMap(title=title, doi=doi)
+ new MapFeatures(
+ Scorable.mapToSlug(map2),
+ JSONObject(map2).toString)
}
+ } else {
+ new MapFeatures(Scorable.NoSlug, json)
}
}
+ }
}
}
diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala
index 929461b..a256fa4 100644
--- a/scalding/src/main/scala/sandcrawler/Scorable.scala
+++ b/scalding/src/main/scala/sandcrawler/Scorable.scala
@@ -7,7 +7,6 @@ import scala.util.parsing.json.JSONObject
import cascading.flow.FlowDef
import com.twitter.scalding._
import com.twitter.scalding.typed.TDsl._
-//import TDsl._
case class MapFeatures(slug : String, json : String)
case class ReduceFeatures(json : String)
diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
index 1c35d66..dc6f347 100644
--- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
@@ -66,23 +66,23 @@ class CrossrefScorableTest extends FlatSpec with Matchers {
val MalformedCrossrefString = CrossrefString.replace("}", "")
// Unit tests
- "simplifyJson()" should "return None for bad JSON" in {
- CrossrefScorable.simplifyJson("") shouldBe None
- CrossrefScorable.simplifyJson(MalformedCrossrefString) shouldBe None
+ "CrossrefScorable.jsonToMapFeatures()" should "handle invalid JSON" in {
+ val result = CrossrefScorable.jsonToMapFeatures(MalformedCrossrefString)
+ result.slug shouldBe Scorable.NoSlug
}
- it should "return None for JSON lacking title" in {
- CrossrefScorable.simplifyJson(CrossrefStringWithoutTitle) shouldBe None
+ it should "handle missing title" in {
+ val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithoutTitle)
+ result.slug shouldBe Scorable.NoSlug
}
- it should "return appropriate result for valid JSON" in {
- CrossrefScorable.simplifyJson(CrossrefStringWithTitle) match {
- case None => fail("None unexpectedly returned by simplifyJson")
+ it should "handle valid input" in {
+ val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithTitle)
+ result.slug shouldBe "dummyexamplefile"
+ Scorable.jsonToMap(result.json) match {
+ case None => fail()
case Some(map) => {
- Scorable.isScorableMap(map) shouldBe true
- map.size shouldBe 1
- map.keys should contain ("title")
- map("title") shouldBe "SomeTitle"
+ map("title").asInstanceOf[String] shouldBe "Dummy Example File"
}
}
}
diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
index 3fcd856..4b958b9 100644
--- a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
@@ -77,6 +77,7 @@ class GrobidScorableTest extends FlatSpec with Matchers {
Scorable.jsonToMap(result.json) match {
case None => fail()
case Some(map) => {
+ map should contain key "title"
map("title").asInstanceOf[String] shouldBe "Dummy Example File"
}
}
diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
index 8acb454..8436817 100644
--- a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
@@ -149,11 +149,16 @@ class ScoreJobTest extends FlatSpec with Matchers {
2 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"),
3 -> CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1")))
.sink[(String, Int, String, String)](TypedTsv[(String, Int, String, String)](output)) {
- // Grobid titles:
- // "Title 1", "Title 2: TNG", "Title 3: The Sequel"
- // crossref slugs:
- // "Title 1: TNG", "Title 1: TNG 2", "Title 1: TNG 3", "Title 2 Rebooted"
- // Join should have 3 "Title 1" slugs and 1 "Title 2" slug
+ // Grobid titles and slugs (in parentheses):
+ // Title 1 (title1)
+ // Title 2: TNG (title2)
+ // Title 3: The Sequel (title3)
+ // crossref titles and slugs (in parentheses):
+ // Title 1: TNG (title1)
+ // Title 1: TNG 2 (title1)
+ // Title 1: TNG 3 (title1)
+ // Title 2: Rebooted (title2)
+ // Join should have 3 "title1" slugs and 1 "title2" slug
outputBuffer =>
"The pipeline" should "return a 4-element list" in {
outputBuffer should have length 4