diff options
Diffstat (limited to 'scalding/src')
3 files changed, 10 insertions, 1 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala index 4897b1c..ff8201a 100644 --- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala +++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala @@ -34,6 +34,7 @@ object CrossrefScorable { if (titles.isEmpty || titles == null || doi.isEmpty || doi == null) { new MapFeatures(Scorable.NoSlug, json) } else { + // bnewbold: not checking that titles(0) is non-null/non-empty; case would be, in JSON, "title": [ null ] val sf : ScorableFeatures = new ScorableFeatures(title=titles(0), doi=doi) new MapFeatures(sf.toSlug, sf.toString) } diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala index 5d6dea0..966fb93 100644 --- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala +++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala @@ -6,7 +6,10 @@ import scala.util.parsing.json.JSONObject // with a second ScorableFeatures). class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: String = "") { def toMap() : Map[String, Any] = { - Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1) + Map("title" -> (if (title == null) "" else title), + "year" -> year, + "doi" -> (if (doi == null) "" else doi), + "sha1" -> (if (sha1 == null) "" else sha1)) } override def toString() : String = { diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala index f9c30a2..5ffc305 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala @@ -8,6 +8,11 @@ class ScorableFeaturesTest extends FlatSpec with Matchers { new ScorableFeatures(title = s).toSlug } + "toMapFeatures()" should "work with gnarly inputs" in { + new ScorableFeatures(title = null).toMapFeatures + new ScorableFeatures(title = "something", doi = null, sha1 = null, year = 123).toMapFeatures + } + "mapToSlug()" should "extract the parts of titles before a colon" in { titleToSlug("HELLO:there") shouldBe "hello" } |