From 9fb9a35b15ed9b553ad4f938dc4e636e5d91ac33 Mon Sep 17 00:00:00 2001
From: Ellen Spertus
Date: Wed, 22 Aug 2018 13:17:49 -0700
Subject: Added title-length filtering to CrossrefScorable.

---
 .../main/scala/sandcrawler/CrossrefScorable.scala  | 50 ++++++++++++++++------
 .../scala/sandcrawler/CrossrefScorableTest.scala   | 36 +++++++++++++++-
 2 files changed, 71 insertions(+), 15 deletions(-)

(limited to 'scalding')

diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
index 5d1eaf5..0431c63 100644
--- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
@@ -19,29 +19,53 @@ class CrossrefScorable extends Scorable with HBasePipeConversions {
   def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures] = {
     getSource(args).read
       .toTypedPipe[String](new Fields("line"))
+      .filter { CrossrefScorable.keepRecord(_) }
       .map { CrossrefScorable.jsonToMapFeatures(_) }
   }
 }
 
 object CrossrefScorable {
+  def keepRecord(json : String) : Boolean = {
+    Scorable.jsonToMap(json) match {
+      case None => false
+      case Some(map) => {
+        mapToTitle(map) match {
+          case None => false
+          case Some(title) => title.length <= Scorable.MaxTitleLength
+        }
+      }
+    }
+  }
+
+  // Returns the first "title" entry; None if the key is absent, the list is null/empty, or that entry is null, empty, or too long.
+  def mapToTitle(map : Map[String, Any]) : Option[String] = {
+    if (map contains "title") {
+      val titles = map("title").asInstanceOf[List[String]]
+      if (titles == null || titles.isEmpty) { // null test first: isEmpty on null would throw
+        None
+      } else {
+        val title = titles(0)
+        if (title == null || title.isEmpty || title.length > Scorable.MaxTitleLength) None else Some(title)
+      }
+    } else None
+  }
+
   def jsonToMapFeatures(json : String) : MapFeatures = {
     Scorable.jsonToMap(json) match {
       case None => MapFeatures(Scorable.NoSlug, json)
-      case Some(map) => {
-        if ((map contains "title") && (map contains "DOI")) {
-          val titles = map("title").asInstanceOf[List[String]]
-          val doi = Scorable.getString(map, "DOI")
-          if (titles.isEmpty || titles == null || doi.isEmpty || doi == null) {
-            new MapFeatures(Scorable.NoSlug, json)
-          } else {
-            // bnewbold: not checking that titles(0) is non-null/non-empty; case would be, in JSON, "title": [ null ]
-            val sf : ScorableFeatures = ScorableFeatures.create(title=titles(0), doi=doi)
-            new MapFeatures(sf.toSlug, sf.toString)
+      case Some(map) =>
+        mapToTitle(map) match {
+          case None => MapFeatures(Scorable.NoSlug, json)
+          case Some(title) => {
+            val doi = if (map contains "DOI") Scorable.getString(map, "DOI") else "" // keep the old "DOI" presence guard
+            if (doi == null || doi.isEmpty) { // null test first: isEmpty on null would throw
+              MapFeatures(Scorable.NoSlug, json)
+            } else {
+              val sf : ScorableFeatures = ScorableFeatures.create(title=title, doi=doi)
+              MapFeatures(sf.toSlug, sf.toString)
+            }
           }
-        } else {
-          new MapFeatures(Scorable.NoSlug, json)
         }
-      }
     }
   }
 }
diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
index 1789d1a..3d18a21 100644
--- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
@@ -66,7 +66,10 @@ class CrossrefScorableTest extends FlatSpec with Matchers {
 }
 """
   // scalastyle:on
-  val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "Some Title")
+  val CrossrefStringWithGoodTitle = CrossrefString.replace("<<TITLE>>", "Some Title")
+  val CrossrefStringWithMaximumTitle = CrossrefString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength)
+  val CrossrefStringWithExcessiveTitle = CrossrefString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength + "0")
+  val CrossrefStringWithNullTitle = CrossrefString.replace("\"<<TITLE>>\"", "null")
   val CrossrefStringWithEmptyTitle = CrossrefString.replace("<<TITLE>>", "")
   val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle")
   val MalformedCrossrefString = CrossrefString.replace("}", "")
@@ -82,13 +85,18 @@ class CrossrefScorableTest extends FlatSpec with Matchers {
     result.slug shouldBe Scorable.NoSlug
   }
 
+  it should "handle null title" in {
+    val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithNullTitle)
+    result.slug shouldBe Scorable.NoSlug
+  }
+
   it should "handle empty title" in {
     val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithEmptyTitle)
     result.slug shouldBe Scorable.NoSlug
   }
 
   it should "handle valid input" in {
-    val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithTitle)
+    val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithGoodTitle)
     result.slug shouldBe "sometitle"
     Scorable.jsonToMap(result.json) match {
       case None => fail()
@@ -97,4 +105,28 @@ class CrossrefScorableTest extends FlatSpec with Matchers {
       }
     }
   }
+
+  "CrossrefScorable.keepRecord()" should "return true for valid JSON with title" in {
+    CrossrefScorable.keepRecord(CrossrefStringWithGoodTitle) shouldBe true
+  }
+
+  it should "return true for valid JSON with a title of maximum permitted length" in {
+    CrossrefScorable.keepRecord(CrossrefStringWithMaximumTitle) shouldBe true
+  }
+
+  it should "return false for valid JSON with excessively long title" in {
+    CrossrefScorable.keepRecord(CrossrefStringWithExcessiveTitle) shouldBe false
+  }
+
+  it should "return false for valid JSON with null title" in {
+    CrossrefScorable.keepRecord(CrossrefStringWithNullTitle) shouldBe false
+  }
+
+  it should "return false for valid JSON with no title" in {
+    CrossrefScorable.keepRecord(CrossrefStringWithoutTitle) shouldBe false
+  }
+
+  it should "return false for invalid JSON" in {
+    CrossrefScorable.keepRecord(MalformedCrossrefString) shouldBe false
+  }
 }
-- 
cgit v1.2.3