aboutsummaryrefslogtreecommitdiffstats
path: root/scalding
diff options
context:
space:
mode:
author: Ellen Spertus <ellen.spertus@gmail.com> 2018-08-22 13:17:49 -0700
committer: Ellen Spertus <ellen.spertus@gmail.com> 2018-08-22 13:17:49 -0700
commit: 9fb9a35b15ed9b553ad4f938dc4e636e5d91ac33 (patch)
tree: 16680eb1bed750fac2980bda70ced41df28d98c0 /scalding
parent: 9cc24a40509f62b789ff1fa97913bef32589a288 (diff)
download: sandcrawler-9fb9a35b15ed9b553ad4f938dc4e636e5d91ac33.tar.gz
sandcrawler-9fb9a35b15ed9b553ad4f938dc4e636e5d91ac33.zip
Added title-length filtering to CrossrefScorable.
Diffstat (limited to 'scalding')
-rw-r--r-- scalding/src/main/scala/sandcrawler/CrossrefScorable.scala 50
-rw-r--r-- scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala 36
2 files changed, 71 insertions, 15 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
index 5d1eaf5..0431c63 100644
--- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
@@ -19,29 +19,53 @@ class CrossrefScorable extends Scorable with HBasePipeConversions {
def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures] = {
getSource(args).read
.toTypedPipe[String](new Fields("line"))
+ .filter { CrossrefScorable.keepRecord(_) } // drop lines keepRecord rejects (unparseable JSON or no usable title) before mapping
.map { CrossrefScorable.jsonToMapFeatures(_) }
}
}
object CrossrefScorable {
+ def keepRecord(json : String) : Boolean = { // true only when json parses and mapToTitle yields a title within the length limit
+ Scorable.jsonToMap(json) match {
+ case None => false // jsonToMap produced no map for this line
+ case Some(map) => {
+ mapToTitle(map) match {
+ case None => false // title missing, null, empty, or too long
+ case Some(title) => title.length <= Scorable.MaxTitleLength // repeats the bound mapToTitle already enforces
+ }
+ }
+ }
+ }
+
+ // Returns the first title, or None if "title" is absent, null, an empty list, or its first entry is null, empty, or longer than Scorable.MaxTitleLength.
+ def mapToTitle(map : Map[String, Any]) : Option[String] = {
+ if (map contains "title") {
+ val titles = map("title").asInstanceOf[List[String]]
+ if (titles == null || titles.isEmpty) { // null test first: calling isEmpty on a null list would throw
+ None
+ } else {
+ val title = titles(0)
+ if (title == null || title.isEmpty || title.length > Scorable.MaxTitleLength) None else Some(title)
+ }
+ } else None
+ }
+
def jsonToMapFeatures(json : String) : MapFeatures = {
Scorable.jsonToMap(json) match {
case None => MapFeatures(Scorable.NoSlug, json)
- case Some(map) => {
- if ((map contains "title") && (map contains "DOI")) {
- val titles = map("title").asInstanceOf[List[String]]
- val doi = Scorable.getString(map, "DOI")
- if (titles.isEmpty || titles == null || doi.isEmpty || doi == null) {
- new MapFeatures(Scorable.NoSlug, json)
- } else {
- // bnewbold: not checking that titles(0) is non-null/non-empty; case would be, in JSON, "title": [ null ]
- val sf : ScorableFeatures = ScorableFeatures.create(title=titles(0), doi=doi)
- new MapFeatures(sf.toSlug, sf.toString)
+ case Some(map) =>
+ mapToTitle(map) match {
+ case None => MapFeatures(Scorable.NoSlug, json) // no usable title: emit the NoSlug placeholder with the raw line
+ case Some(title) => {
+ val doi = if (map contains "DOI") Scorable.getString(map, "DOI") else null // keeps the "DOI" presence guard the previous version had
+ if (doi == null || doi.isEmpty) { // null test first: calling isEmpty on a null string would throw
+ MapFeatures(Scorable.NoSlug, json)
+ } else {
+ val sf : ScorableFeatures = ScorableFeatures.create(title=title, doi=doi)
+ MapFeatures(sf.toSlug, sf.toString)
+ }
}
- } else {
- new MapFeatures(Scorable.NoSlug, json)
}
- }
}
}
}
diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
index 1789d1a..3d18a21 100644
--- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
@@ -66,7 +66,10 @@ class CrossrefScorableTest extends FlatSpec with Matchers {
}
"""
// scalastyle:on
- val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "Some Title")
+ val CrossrefStringWithGoodTitle = CrossrefString.replace("<<TITLE>>", "Some Title")
+ val CrossrefStringWithMaximumTitle = CrossrefString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength) // title of exactly MaxTitleLength characters
+ val CrossrefStringWithExcessiveTitle = CrossrefString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength + "0") // MaxTitleLength + 1 characters: * binds tighter than +
+ val CrossrefStringWithNullTitle = CrossrefString.replace("\"<<TITLE>>\"", "null") // replaces the quoted placeholder with a JSON null
val CrossrefStringWithEmptyTitle = CrossrefString.replace("<<TITLE>>", "")
val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle")
val MalformedCrossrefString = CrossrefString.replace("}", "")
@@ -82,13 +85,18 @@ class CrossrefScorableTest extends FlatSpec with Matchers {
result.slug shouldBe Scorable.NoSlug
}
+ it should "handle null title" in {
+ val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithNullTitle)
+ result.slug shouldBe Scorable.NoSlug
+ }
+
it should "handle empty title" in {
val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithEmptyTitle)
result.slug shouldBe Scorable.NoSlug
}
it should "handle valid input" in {
- val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithTitle)
+ val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithGoodTitle)
result.slug shouldBe "sometitle"
Scorable.jsonToMap(result.json) match {
case None => fail()
@@ -97,4 +105,28 @@ class CrossrefScorableTest extends FlatSpec with Matchers {
}
}
}
+
+ "CrossrefScorable.keepRecord()" should "return true for valid JSON with title" in {
+ CrossrefScorable.keepRecord(CrossrefStringWithGoodTitle) shouldBe true
+ }
+
+ it should "return true for valid JSON with a title of maximum permitted length" in {
+ CrossrefScorable.keepRecord(CrossrefStringWithMaximumTitle) shouldBe true // boundary: exactly MaxTitleLength is still kept
+ }
+
+ it should "return false for valid JSON with excessively long title" in {
+ CrossrefScorable.keepRecord(CrossrefStringWithExcessiveTitle) shouldBe false // one character over the limit is dropped
+ }
+
+ it should "return false for valid JSON with null title" in {
+ CrossrefScorable.keepRecord(CrossrefStringWithNullTitle) shouldBe false
+ }
+
+ it should "return false for valid JSON with no title" in {
+ CrossrefScorable.keepRecord(CrossrefStringWithoutTitle) shouldBe false
+ }
+
+ it should "return false for invalid JSON" in {
+ CrossrefScorable.keepRecord(MalformedCrossrefString) shouldBe false // use the malformed fixture; the no-title fixture is already covered by the previous test
+ }
}