aboutsummaryrefslogtreecommitdiffstats
path: root/scalding
diff options
context:
space:
mode:
author Ellen Spertus <ellen.spertus@gmail.com> 2018-08-13 10:27:48 -0700
committer Ellen Spertus <ellen.spertus@gmail.com> 2018-08-13 10:27:48 -0700
commit b4f1acce5eccbb56291f82906d9c01534c7f1506 (patch)
tree 96ff33ed95a4eb9304280b1d5f1ccb269c0d0424 /scalding
parent 1c6e1234974d8b6e4480a13ff5c4ff861c6d1deb (diff)
download sandcrawler-b4f1acce5eccbb56291f82906d9c01534c7f1506.tar.gz
sandcrawler-b4f1acce5eccbb56291f82906d9c01534c7f1506.zip
Factored out ScorableFeatures.
Diffstat (limited to 'scalding')
-rw-r--r-- scalding/src/main/scala/sandcrawler/CrossrefScorable.scala 7
-rw-r--r-- scalding/src/main/scala/sandcrawler/GrobidScorable.scala 6
-rw-r--r-- scalding/src/main/scala/sandcrawler/Scorable.scala 30
-rw-r--r-- scalding/src/main/scala/sandcrawler/ScorableFeatures.scala 30
-rw-r--r-- scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala 37
-rw-r--r-- scalding/src/test/scala/sandcrawler/ScorableTest.scala 32
6 files changed, 70 insertions, 72 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
index 4558ee6..4897b1c 100644
--- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
@@ -34,11 +34,8 @@ object CrossrefScorable {
if (titles.isEmpty || titles == null || doi.isEmpty || doi == null) {
new MapFeatures(Scorable.NoSlug, json)
} else {
- val title = titles(0)
- val map2 = Scorable.toScorableMap(title=title, doi=doi)
- new MapFeatures(
- Scorable.mapToSlug(map2),
- JSONObject(map2).toString)
+ val sf : ScorableFeatures = new ScorableFeatures(title=titles(0), doi=doi)
+ new MapFeatures(sf.toSlug, sf.toString)
}
} else {
new MapFeatures(Scorable.NoSlug, json)
diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
index 94b3494..5ba7d58 100644
--- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
@@ -35,11 +35,7 @@ object GrobidScorable {
case None => MapFeatures(Scorable.NoSlug, json)
case Some(map) => {
if (map contains "title") {
- val map2 = Scorable.toScorableMap(Scorable.getString(map, "title"),
- sha1=key)
- new MapFeatures(
- Scorable.mapToSlug(map2),
- JSONObject(map2).toString)
+ new ScorableFeatures(Scorable.getString(map, "title"), sha1=key).toMapFeatures
} else {
MapFeatures(Scorable.NoSlug, json)
}
diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala
index 717b2d5..9b9c633 100644
--- a/scalding/src/main/scala/sandcrawler/Scorable.scala
+++ b/scalding/src/main/scala/sandcrawler/Scorable.scala
@@ -36,21 +36,6 @@ object Scorable {
slug != NoSlug
}
- // NOTE: I could go all out and make ScorableMap a type.
- // TODO: Require year. Other features will get added here.
- def toScorableMap(title : String, year : Int = 0, doi : String = "", sha1 : String = "") : Map[String, Any] = {
- Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1)
- }
-
- def toScorableJson(title : String, year : Int, doi : String = "", sha1 : String = "") : String = {
- JSONObject(toScorableMap(title=title, year=year, doi=doi, sha1=sha1)).toString
- }
-
- // TODO: Score on more fields than "title".
- def isScorableMap(map : Map[String, Any]) : Boolean = {
- map.contains("title")
- }
-
def jsonToMap(json : String) : Option[Map[String, Any]] = {
// https://stackoverflow.com/a/32717262/631051
val jsonObject = JSON.parseFull(json)
@@ -61,21 +46,6 @@ object Scorable {
}
}
- // Map should have been produced by toScorableMap.
- // This guarantees it will have all of the fields needed to compute
- // the ultimate score, which are a superset of those needed for a slug.
- def mapToSlug(map : Map[String, Any]) : String = {
- val title = getString(map, "title")
- if (title == null) {
- NoSlug
- } else {
- val unaccented = StringUtilities.removeAccents(title)
- // Remove punctuation after splitting on colon.
- val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase())).replaceAll("\\s", "")
- if (slug.isEmpty || slug == null) NoSlug else slug
- }
- }
-
def getStringOption(optionalMap : Option[Map[String, Any]], key : String) : Option[String] = {
optionalMap match {
case None => None
diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
new file mode 100644
index 0000000..5d6dea0
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
@@ -0,0 +1,30 @@
+package sandcrawler
+
+import scala.util.parsing.json.JSONObject
+
+/** Features needed to build a slug and, together with a second ScorableFeatures, to score. */
+class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: String = "") {
+  /** Returns the features as a map keyed by "title", "year", "doi" and "sha1". */
+  def toMap() : Map[String, Any] =
+    Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1)
+
+  /** Returns the feature map rendered through JSONObject. */
+  override def toString() : String = JSONObject(toMap()).toString
+
+  /** Returns the title slug, or Scorable.NoSlug when the title is null or reduces to "". */
+  def toSlug() : String = {
+    if (title == null) {
+      Scorable.NoSlug
+    } else {
+      val unaccented = StringUtilities.removeAccents(title)
+      // Keep the text before the first colon. split(":")(0) would throw on titles like ":",
+      // because split drops trailing empty strings and returns an empty array.
+      val beforeColon = unaccented.takeWhile(_ != ':')
+      val slug = StringUtilities.removePunctuation(beforeColon.toLowerCase()).replaceAll("\\s", "")
+      if (slug.isEmpty) Scorable.NoSlug else slug
+    }
+  }
+
+  /** Bundles the slug and the JSON string into a MapFeatures. */
+  def toMapFeatures = MapFeatures(toSlug, toString)
+}
diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
new file mode 100644
index 0000000..7ec0c4d
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
@@ -0,0 +1,37 @@
+package sandcrawler
+
+import org.scalatest._
+
+/** Unit tests for slug generation in ScorableFeatures.toSlug. */
+class ScorableFeaturesTest extends FlatSpec with Matchers {
+  // Builds a ScorableFeatures holding only a title and returns its slug.
+  private def titleToSlug(s : String) : String = new ScorableFeatures(title = s).toSlug
+
+  "toSlug()" should "extract the parts of titles before a colon" in {
+    titleToSlug("HELLO:there") shouldBe "hello"
+  }
+
+  it should "extract an entire colon-less string" in {
+    titleToSlug("hello THERE") shouldBe "hellothere"
+  }
+
+  it should "return Scorable.NoSlug if given empty string" in {
+    titleToSlug("") shouldBe Scorable.NoSlug
+  }
+
+  it should "return Scorable.NoSlug if given null" in {
+    titleToSlug(null) shouldBe Scorable.NoSlug
+  }
+
+  it should "strip punctuation" in {
+    titleToSlug("HELLO!:the:re") shouldBe "hello"
+    titleToSlug("a:b:c") shouldBe "a"
+    titleToSlug(
+      "If you're happy and you know it, clap your hands!") shouldBe "ifyourehappyandyouknowitclapyourhands"
+  }
+
+  it should "remove whitespace" in {
+    titleToSlug("foo bar : baz ::") shouldBe "foobar"
+    titleToSlug("\na\t:b:c") shouldBe "a"
+  }
+}
diff --git a/scalding/src/test/scala/sandcrawler/ScorableTest.scala b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
index 95faacc..fd44f57 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableTest.scala
@@ -54,38 +54,6 @@ class ScorableTest extends FlatSpec with Matchers {
"annex": null
}
"""
- private def titleToSlug(s : String) : String = {
- Scorable.mapToSlug(Scorable.toScorableMap(title = s))
- }
-
- "mapToSlug()" should "extract the parts of titles before a colon" in {
- titleToSlug("HELLO:there") shouldBe "hello"
- }
-
- it should "extract an entire colon-less string" in {
- titleToSlug("hello THERE") shouldBe "hellothere"
- }
-
- it should "return Scorable.NoSlug if given empty string" in {
- titleToSlug("") shouldBe Scorable.NoSlug
- }
-
- it should "return Scorable.NoSlug if given null" in {
- titleToSlug(null) shouldBe Scorable.NoSlug
- }
-
- it should "strip punctuation" in {
- titleToSlug("HELLO!:the:re") shouldBe "hello"
- titleToSlug("a:b:c") shouldBe "a"
- titleToSlug(
- "If you're happy and you know it, clap your hands!") shouldBe "ifyourehappyandyouknowitclapyourhands"
- }
-
- it should "remove whitespace" in {
- titleToSlug("foo bar : baz ::") shouldBe "foobar"
- titleToSlug("\na\t:b:c") shouldBe "a"
- }
-
"jsonToMap()" should "return a map, given a legal JSON string" in {
Scorable.jsonToMap(JsonString) should not be (None)
}