diff options
author | Ellen Spertus <ellen.spertus@gmail.com> | 2018-08-13 10:27:48 -0700 |
---|---|---|
committer | Ellen Spertus <ellen.spertus@gmail.com> | 2018-08-13 10:27:48 -0700 |
commit | b4f1acce5eccbb56291f82906d9c01534c7f1506 (patch) | |
tree | 96ff33ed95a4eb9304280b1d5f1ccb269c0d0424 /scalding | |
parent | 1c6e1234974d8b6e4480a13ff5c4ff861c6d1deb (diff) | |
download | sandcrawler-b4f1acce5eccbb56291f82906d9c01534c7f1506.tar.gz sandcrawler-b4f1acce5eccbb56291f82906d9c01534c7f1506.zip |
Factored out ScorableFeatures.
Diffstat (limited to 'scalding')
6 files changed, 70 insertions, 72 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala index 4558ee6..4897b1c 100644 --- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala +++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala @@ -34,11 +34,8 @@ object CrossrefScorable { if (titles.isEmpty || titles == null || doi.isEmpty || doi == null) { new MapFeatures(Scorable.NoSlug, json) } else { - val title = titles(0) - val map2 = Scorable.toScorableMap(title=title, doi=doi) - new MapFeatures( - Scorable.mapToSlug(map2), - JSONObject(map2).toString) + val sf : ScorableFeatures = new ScorableFeatures(title=titles(0), doi=doi) + new MapFeatures(sf.toSlug, sf.toString) } } else { new MapFeatures(Scorable.NoSlug, json) diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala index 94b3494..5ba7d58 100644 --- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala +++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala @@ -35,11 +35,7 @@ object GrobidScorable { case None => MapFeatures(Scorable.NoSlug, json) case Some(map) => { if (map contains "title") { - val map2 = Scorable.toScorableMap(Scorable.getString(map, "title"), - sha1=key) - new MapFeatures( - Scorable.mapToSlug(map2), - JSONObject(map2).toString) + new ScorableFeatures(Scorable.getString(map, "title"), sha1=key).toMapFeatures } else { MapFeatures(Scorable.NoSlug, json) } diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala index 717b2d5..9b9c633 100644 --- a/scalding/src/main/scala/sandcrawler/Scorable.scala +++ b/scalding/src/main/scala/sandcrawler/Scorable.scala @@ -36,21 +36,6 @@ object Scorable { slug != NoSlug } - // NOTE: I could go all out and make ScorableMap a type. - // TODO: Require year. Other features will get added here. - def toScorableMap(title : String, year : Int = 0, doi : String = "", sha1 : String = "") : Map[String, Any] = { - Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1) - } - - def toScorableJson(title : String, year : Int, doi : String = "", sha1 : String = "") : String = { - JSONObject(toScorableMap(title=title, year=year, doi=doi, sha1=sha1)).toString - } - - // TODO: Score on more fields than "title". - def isScorableMap(map : Map[String, Any]) : Boolean = { - map.contains("title") - } - def jsonToMap(json : String) : Option[Map[String, Any]] = { // https://stackoverflow.com/a/32717262/631051 val jsonObject = JSON.parseFull(json) @@ -61,21 +46,6 @@ object Scorable { } } - // Map should have been produced by toScorableMap. - // This guarantees it will have all of the fields needed to compute - // the ultimate score, which are a superset of those needed for a slug. - def mapToSlug(map : Map[String, Any]) : String = { - val title = getString(map, "title") - if (title == null) { - NoSlug - } else { - val unaccented = StringUtilities.removeAccents(title) - // Remove punctuation after splitting on colon. - val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase())).replaceAll("\\s", "") - if (slug.isEmpty || slug == null) NoSlug else slug - } - } - def getStringOption(optionalMap : Option[Map[String, Any]], key : String) : Option[String] = { optionalMap match { case None => None diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala new file mode 100644 index 0000000..5d6dea0 --- /dev/null +++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala @@ -0,0 +1,30 @@ +package sandcrawler + +import scala.util.parsing.json.JSONObject + +// Contains features needed to make slug and to score (in combination +// with a second ScorableFeatures). +class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: String = "") { + def toMap() : Map[String, Any] = { + Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1) + } + + override def toString() : String = { + JSONObject(toMap()).toString + } + + def toSlug() : String = { + if (title == null) { + Scorable.NoSlug + } else { + val unaccented = StringUtilities.removeAccents(title) + // Remove punctuation after splitting on colon. + val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase())).replaceAll("\\s", "") + if (slug.isEmpty || slug == null) Scorable.NoSlug else slug + } + } + + def toMapFeatures = { + MapFeatures(toSlug, toString) + } +} diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala new file mode 100644 index 0000000..7ec0c4d --- /dev/null +++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala @@ -0,0 +1,37 @@ +package sandcrawler + +import org.scalatest._ + +class ScorableFeaturesTest extends FlatSpec with Matchers { + private def titleToSlug(s : String) : String = { + new ScorableFeatures(title = s).toSlug + } + + "mapToSlug()" should "extract the parts of titles before a colon" in { + titleToSlug("HELLO:there") shouldBe "hello" + } + + it should "extract an entire colon-less string" in { + titleToSlug("hello THERE") shouldBe "hellothere" + } + + it should "return Scorable.NoSlug if given empty string" in { + titleToSlug("") shouldBe Scorable.NoSlug + } + + it should "return Scorable.NoSlug if given null" in { + titleToSlug(null) shouldBe Scorable.NoSlug + } + + it should "strip punctuation" in { + titleToSlug("HELLO!:the:re") shouldBe "hello" + titleToSlug("a:b:c") shouldBe "a" + titleToSlug( + "If you're happy and you know it, clap your hands!") shouldBe "ifyourehappyandyouknowitclapyourhands" + } + + it should "remove whitespace" in { + titleToSlug("foo bar : baz ::") shouldBe "foobar" + titleToSlug("\na\t:b:c") shouldBe "a" + } +} diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala index 95faacc..fd44f57 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala @@ -54,38 +54,6 @@ class ScorableTest extends FlatSpec with Matchers { "annex": null } """ - private def titleToSlug(s : String) : String = { - Scorable.mapToSlug(Scorable.toScorableMap(title = s)) - } - - "mapToSlug()" should "extract the parts of titles before a colon" in { - titleToSlug("HELLO:there") shouldBe "hello" - } - - it should "extract an entire colon-less string" in { - titleToSlug("hello THERE") shouldBe "hellothere" - } - - it should "return Scorable.NoSlug if given empty string" in { - titleToSlug("") shouldBe Scorable.NoSlug - } - - it should "return Scorable.NoSlug if given null" in { - titleToSlug(null) shouldBe Scorable.NoSlug - } - - it should "strip punctuation" in { - titleToSlug("HELLO!:the:re") shouldBe "hello" - titleToSlug("a:b:c") shouldBe "a" - titleToSlug( - "If you're happy and you know it, clap your hands!") shouldBe "ifyourehappyandyouknowitclapyourhands" - } - - it should "remove whitespace" in { - titleToSlug("foo bar : baz ::") shouldBe "foobar" - titleToSlug("\na\t:b:c") shouldBe "a" - } - "jsonToMap()" should "return a map, given a legal JSON string" in { Scorable.jsonToMap(JsonString) should not be (None) } |