diff options
Diffstat (limited to 'scalding')
6 files changed, 70 insertions, 72 deletions
| diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala index 4558ee6..4897b1c 100644 --- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala +++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala @@ -34,11 +34,8 @@ object CrossrefScorable {            if (titles.isEmpty || titles == null || doi.isEmpty || doi == null) {              new MapFeatures(Scorable.NoSlug, json)            } else { -            val title = titles(0) -            val map2 = Scorable.toScorableMap(title=title, doi=doi) -            new MapFeatures( -              Scorable.mapToSlug(map2), -              JSONObject(map2).toString) +            val sf : ScorableFeatures = new ScorableFeatures(title=titles(0), doi=doi) +            new MapFeatures(sf.toSlug, sf.toString)            }          } else {            new MapFeatures(Scorable.NoSlug, json) diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala index 94b3494..5ba7d58 100644 --- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala +++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala @@ -35,11 +35,7 @@ object GrobidScorable {        case None => MapFeatures(Scorable.NoSlug, json)        case Some(map) => {          if (map contains "title") { -          val map2 = Scorable.toScorableMap(Scorable.getString(map, "title"), -            sha1=key) -          new MapFeatures( -            Scorable.mapToSlug(map2), -            JSONObject(map2).toString) +          new ScorableFeatures(Scorable.getString(map, "title"), sha1=key).toMapFeatures          } else {            MapFeatures(Scorable.NoSlug, json)          } diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala index 717b2d5..9b9c633 100644 --- a/scalding/src/main/scala/sandcrawler/Scorable.scala +++ b/scalding/src/main/scala/sandcrawler/Scorable.scala @@ -36,21 +36,6 @@ object Scorable {      slug != NoSlug    } -  // NOTE: I could go all out and make ScorableMap a type. -  // TODO: Require year. Other features will get added here. -  def toScorableMap(title : String, year : Int = 0, doi : String = "", sha1 : String = "") : Map[String, Any] = { -    Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1) -  } - -  def toScorableJson(title : String, year : Int, doi : String = "", sha1 : String = "") : String = { -    JSONObject(toScorableMap(title=title, year=year, doi=doi, sha1=sha1)).toString -  } - -  // TODO: Score on more fields than "title". -  def isScorableMap(map : Map[String, Any]) : Boolean = { -    map.contains("title") -  } -    def jsonToMap(json : String) : Option[Map[String, Any]] = {      // https://stackoverflow.com/a/32717262/631051      val jsonObject = JSON.parseFull(json) @@ -61,21 +46,6 @@ object Scorable {      }    } -  // Map should have been produced by toScorableMap. -  // This guarantees it will have all of the fields needed to compute -  // the ultimate score, which are a superset of those needed for a slug. -  def mapToSlug(map : Map[String, Any]) : String = { -    val title = getString(map, "title") -    if (title == null) { -      NoSlug -    } else { -      val unaccented = StringUtilities.removeAccents(title) -      // Remove punctuation after splitting on colon. -      val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase())).replaceAll("\\s", "") -      if (slug.isEmpty || slug == null) NoSlug else slug -    } -  } -    def getStringOption(optionalMap : Option[Map[String, Any]], key : String) : Option[String] = {      optionalMap match {        case None => None diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala new file mode 100644 index 0000000..5d6dea0 --- /dev/null +++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala @@ -0,0 +1,30 @@ +package sandcrawler + +import scala.util.parsing.json.JSONObject + +// Contains features needed to make slug and to score (in combination +// with a second ScorableFeatures). +class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: String = "") { +  def toMap() : Map[String, Any] = { +    Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1) +  } + +  override def toString() : String = { +    JSONObject(toMap()).toString +  } + +  def toSlug() : String = { +    if (title == null) { +      Scorable.NoSlug +    } else { +      val unaccented = StringUtilities.removeAccents(title) +      // Remove punctuation after splitting on colon. +      val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase())).replaceAll("\\s", "") +      if (slug.isEmpty || slug == null) Scorable.NoSlug else slug +    } +  } + +  def toMapFeatures = { +    MapFeatures(toSlug, toString) +  } +} diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala new file mode 100644 index 0000000..7ec0c4d --- /dev/null +++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala @@ -0,0 +1,37 @@ +package sandcrawler + +import org.scalatest._ + +class ScorableFeaturesTest extends FlatSpec with Matchers { +  private def titleToSlug(s : String) : String = { +    new ScorableFeatures(title = s).toSlug +  } + +  "mapToSlug()" should "extract the parts of titles before a colon" in { +    titleToSlug("HELLO:there") shouldBe "hello" +  } + +  it should "extract an entire colon-less string" in { +    titleToSlug("hello THERE") shouldBe "hellothere" +  } + +  it should "return Scorable.NoSlug if given empty string" in { +    titleToSlug("") shouldBe Scorable.NoSlug +  } + +  it should "return Scorable.NoSlug if given null" in { +    titleToSlug(null) shouldBe Scorable.NoSlug +  } + +  it should "strip punctuation" in { +    titleToSlug("HELLO!:the:re") shouldBe "hello" +    titleToSlug("a:b:c") shouldBe "a" +    titleToSlug( +      "If you're happy and you know it, clap your hands!") shouldBe "ifyourehappyandyouknowitclapyourhands" +  } + +  it should "remove whitespace" in { +    titleToSlug("foo bar : baz ::") shouldBe "foobar" +    titleToSlug("\na\t:b:c") shouldBe "a" +  } +} diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala index 95faacc..fd44f57 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala @@ -54,38 +54,6 @@ class ScorableTest extends FlatSpec with Matchers {    "annex": null  }  """ -  private def titleToSlug(s : String) : String = { -    Scorable.mapToSlug(Scorable.toScorableMap(title = s)) -  } - -  "mapToSlug()" should "extract the parts of titles before a colon" in { -    titleToSlug("HELLO:there") shouldBe "hello" -  } - -  it should "extract an entire colon-less string" in { -    titleToSlug("hello THERE") shouldBe "hellothere" -  } - -  it should "return Scorable.NoSlug if given empty string" in { -    titleToSlug("") shouldBe Scorable.NoSlug -  } - -  it should "return Scorable.NoSlug if given null" in { -    titleToSlug(null) shouldBe Scorable.NoSlug -  } - -  it should "strip punctuation" in { -    titleToSlug("HELLO!:the:re") shouldBe "hello" -    titleToSlug("a:b:c") shouldBe "a" -    titleToSlug( -      "If you're happy and you know it, clap your hands!") shouldBe "ifyourehappyandyouknowitclapyourhands" -  } - -  it should "remove whitespace" in { -    titleToSlug("foo bar : baz ::") shouldBe "foobar" -    titleToSlug("\na\t:b:c") shouldBe "a" -  } -    "jsonToMap()" should "return a map, given a legal JSON string" in {      Scorable.jsonToMap(JsonString) should not be (None)    } | 
