about | summary | refs | log | tree | commit | diff | stats
path: root/scalding
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-08-15 20:22:04 -0700
committer Bryan Newbold <bnewbold@archive.org> 2018-08-15 20:22:04 -0700
commit 419ca3dc053682d688653e9a64eaaf46018fd330 (patch)
tree d65a9fbb6d97957269fdcf53ea51e422d1cb6a08 /scalding
parent df341a68459829380f1f01015768acee5642f15b (diff)
download sandcrawler-419ca3dc053682d688653e9a64eaaf46018fd330.tar.gz
sandcrawler-419ca3dc053682d688653e9a64eaaf46018fd330.zip
scorable: test for null strings
Diffstat (limited to 'scalding')
-rw-r--r-- scalding/src/main/scala/sandcrawler/CrossrefScorable.scala 1
-rw-r--r-- scalding/src/main/scala/sandcrawler/ScorableFeatures.scala 5
-rw-r--r-- scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala 5
3 files changed, 10 insertions, 1 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
index 4897b1c..ff8201a 100644
--- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
@@ -34,6 +34,7 @@ object CrossrefScorable {
if (titles.isEmpty || titles == null || doi.isEmpty || doi == null) {
new MapFeatures(Scorable.NoSlug, json)
} else {
+ // bnewbold: not checking that titles(0) is non-null/non-empty; case would be, in JSON, "title": [ null ]
val sf : ScorableFeatures = new ScorableFeatures(title=titles(0), doi=doi)
new MapFeatures(sf.toSlug, sf.toString)
}
diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
index 5d6dea0..966fb93 100644
--- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
+++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
@@ -6,7 +6,10 @@ import scala.util.parsing.json.JSONObject
// with a second ScorableFeatures).
class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: String = "") {
def toMap() : Map[String, Any] = {
- Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1)
+ Map("title" -> (if (title == null) "" else title),
+ "year" -> year,
+ "doi" -> (if (doi == null) "" else doi),
+ "sha1" -> (if (sha1 == null) "" else sha1))
}
override def toString() : String = {
diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
index f9c30a2..5ffc305 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
@@ -8,6 +8,11 @@ class ScorableFeaturesTest extends FlatSpec with Matchers {
new ScorableFeatures(title = s).toSlug
}
+ "toMapFeatures()" should "work with gnarly inputs" in {
+ new ScorableFeatures(title = null).toMapFeatures
+ new ScorableFeatures(title = "something", doi = null, sha1 = null, year = 123).toMapFeatures
+ }
+
"mapToSlug()" should "extract the parts of titles before a colon" in {
titleToSlug("HELLO:there") shouldBe "hello"
}