diff options
Diffstat (limited to 'scalding/src')
| -rw-r--r-- | scalding/src/main/scala/sandcrawler/CrossrefScorable.scala | 50 | ||||
| -rw-r--r-- | scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala | 36 | 
2 files changed, 71 insertions, 15 deletions
| diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala index 5d1eaf5..0431c63 100644 --- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala +++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala @@ -19,29 +19,53 @@ class CrossrefScorable extends Scorable with HBasePipeConversions {    def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures] = {      getSource(args).read        .toTypedPipe[String](new Fields("line")) +      .filter { CrossrefScorable.keepRecord(_) }        .map { CrossrefScorable.jsonToMapFeatures(_) }    }  }  object CrossrefScorable { +  def keepRecord(json : String) : Boolean = { +    Scorable.jsonToMap(json) match { +      case None => false +      case Some(map) => { +        mapToTitle(map) match { +          case None => false +          case Some(title) => title.length <= Scorable.MaxTitleLength +        } +      } +    } +  } + +  // Returns None if title is null, empty, or too long. +  def mapToTitle(map : Map[String, Any]) : Option[String] = { +    if (map contains "title") { +      val titles = map("title").asInstanceOf[List[String]] +      if (titles.isEmpty || titles == null) { +        None +      } else { +        val title = titles(0) +        if (title == null || title.isEmpty || title.length > Scorable.MaxTitleLength) None else Some(title) +      } +    } else None +  } +    def jsonToMapFeatures(json : String) : MapFeatures = {      Scorable.jsonToMap(json) match {        case None => MapFeatures(Scorable.NoSlug, json) -      case Some(map) => { -        if ((map contains "title") && (map contains "DOI")) { -          val titles = map("title").asInstanceOf[List[String]] -          val doi = Scorable.getString(map, "DOI") -          if (titles.isEmpty || titles == null || doi.isEmpty || doi == null) { -            new MapFeatures(Scorable.NoSlug, json) -          } else { -            // bnewbold: not checking that titles(0) is non-null/non-empty; case would be, in JSON, "title": [ null ] -            val sf : ScorableFeatures = ScorableFeatures.create(title=titles(0), doi=doi) -            new MapFeatures(sf.toSlug, sf.toString) +      case Some(map) =>  +        mapToTitle(map) match { +          case None => MapFeatures(Scorable.NoSlug, json) +          case Some(title) => { +            val doi = Scorable.getString(map, "DOI") +            if (doi.isEmpty || doi == null) { +              MapFeatures(Scorable.NoSlug, json) +            } else { +              val sf : ScorableFeatures = ScorableFeatures.create(title=title, doi=doi) +              MapFeatures(sf.toSlug, sf.toString) +            }            } -        } else { -          new MapFeatures(Scorable.NoSlug, json)          } -      }      }    }  } diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala index 1789d1a..3d18a21 100644 --- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala @@ -66,7 +66,10 @@ class CrossrefScorableTest extends FlatSpec with Matchers {  }  """    // scalastyle:on -  val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "Some Title") +  val CrossrefStringWithGoodTitle = CrossrefString.replace("<<TITLE>>", "Some Title") +  val CrossrefStringWithMaximumTitle = CrossrefString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength) +  val CrossrefStringWithExcessiveTitle = CrossrefString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength + "0") +  val CrossrefStringWithNullTitle = CrossrefString.replace("\"<<TITLE>>\"", "null")    val CrossrefStringWithEmptyTitle = CrossrefString.replace("<<TITLE>>", "")    val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle")    val MalformedCrossrefString = CrossrefString.replace("}", "") @@ -82,13 +85,18 @@ class CrossrefScorableTest extends FlatSpec with Matchers {      result.slug shouldBe Scorable.NoSlug    } +  it should "handle null title" in { +    val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithNullTitle) +    result.slug shouldBe Scorable.NoSlug +  } +    it should "handle empty title" in {      val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithEmptyTitle)      result.slug shouldBe Scorable.NoSlug    }    it should "handle valid input" in { -    val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithTitle) +    val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithGoodTitle)      result.slug shouldBe "sometitle"      Scorable.jsonToMap(result.json) match {        case None => fail() @@ -97,4 +105,28 @@ class CrossrefScorableTest extends FlatSpec with Matchers {        }      }    } + +  "CrossrefScorable.keepRecord()" should "return true for valid JSON with title" in { +    CrossrefScorable.keepRecord(CrossrefStringWithGoodTitle) shouldBe true +  } + +  it should "return true for valid JSON with a title of maximum permitted length" in { +    CrossrefScorable.keepRecord(CrossrefStringWithMaximumTitle) shouldBe true +  } + +  it should "return false for valid JSON with excessively long title" in { +    CrossrefScorable.keepRecord(CrossrefStringWithExcessiveTitle) shouldBe false +  } + +  it should "return false for valid JSON with null title" in { +    CrossrefScorable.keepRecord(CrossrefStringWithNullTitle) shouldBe false +  } + +  it should "return false for valid JSON with no title" in { +    CrossrefScorable.keepRecord(CrossrefStringWithoutTitle) shouldBe false +  } + +  it should "return false for invalid JSON" in { +    CrossrefScorable.keepRecord(CrossrefStringWithoutTitle) shouldBe false +  }  } | 
