about | summary | refs | log | tree | commit | diff | stats
path: root/scalding
diff options
context:
space:
mode:
author Ellen Spertus <ellen.spertus@gmail.com> 2018-08-11 21:03:53 -0700
committer Ellen Spertus <ellen.spertus@gmail.com> 2018-08-11 21:03:53 -0700
commit 728e50a33cec921c9a624439f2e1c8561a6e12ce (patch)
tree 671548fe0e4bd38badb76453c0a1a90dea5e0ce7 /scalding
parent 768e7ef0d127cf55119543be6e656751704ca5b2 (diff)
download sandcrawler-728e50a33cec921c9a624439f2e1c8561a6e12ce.tar.gz
sandcrawler-728e50a33cec921c9a624439f2e1c8561a6e12ce.zip
It compiles.
Diffstat (limited to 'scalding')
-rw-r--r-- scalding/src/main/scala/sandcrawler/CrossrefScorable.scala 54
-rw-r--r-- scalding/src/main/scala/sandcrawler/GrobidScorable.scala 21
-rw-r--r-- scalding/src/main/scala/sandcrawler/Scorable.scala 40
-rw-r--r-- scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala 26
-rw-r--r-- scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala 19
5 files changed, 96 insertions, 64 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
index b2f6537..5113b0c 100644
--- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
@@ -18,6 +18,7 @@ import java.util.regex.Pattern
import scala.math
import scala.util.parsing.json.JSON
+import scala.util.parsing.json.JSONObject
import cascading.tuple.Fields
import com.twitter.scalding._
@@ -40,33 +41,48 @@ class CrossrefScorable extends Scorable with HBasePipeConversions {
def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures] = {
getSource(args).read
.toTypedPipe[String](new Fields("line"))
- .map{ json : String =>
- CrossrefScorable.simplifyJson(json) match {
- case None => new MapFeatures(Scorable.NoSlug, json)
- case Some(map) => new MapFeatures(
- Scorable.titleToSlug(map("title").asInstanceOf[String]),
- JSONObject(map).toString)
+ .map{ json : String =>
+ Scorable.jsonToMap(json) match {
+ case None => MapFeatures(Scorable.NoSlug, json)
+ case Some(map) => {
+ if ((map contains "title") && (map contains "DOI")) {
+ val titles = map("title").asInstanceOf[List[String]]
+ if (titles.isEmpty) {
+ new MapFeatures(Scorable.NoSlug, json)
+ } else {
+ val title = titles(0)
+ val map2 = Scorable.toScorableMap(title=titles(0), doi=map("DOI").asInstanceOf[String])
+ new MapFeatures(
+ Scorable.mapToSlug(map2),
+ JSONObject(map2).toString)
+ }
+ } else {
+ new MapFeatures(Scorable.NoSlug, json)
+ }
+ }
}
}
}
+}
- object CrossrefScorable {
- def simplifyJson(json : String) : Option[Map[String, Any]] = {
- Scorable.jsonToMap(json) match {
- case None => None
- case Some(map) => {
- if (map contains "title") {
- val titles = map("title").asInstanceOf[List[String]]
- if (titles.isEmpty) {
- None
- } else {
- Some(Map("title" -> titles(0)))
- }
- } else {
+/*
+object CrossrefScorable {
+ def simplifyJson(json : String) : Option[Map[String, Any]] = {
+ Scorable.jsonToMap(json) match {
+ case None => None
+ case Some(map) => {
+ if (map contains "title") {
+ val titles = map("title").asInstanceOf[List[String]]
+ if (titles.isEmpty) {
None
+ } else {
+ Some(Map("title" -> titles(0)))
}
+ } else {
+ None
}
}
}
}
}
+ */
diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
index 61055f2..de9f51a 100644
--- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
@@ -1,5 +1,6 @@
package sandcrawler
+import scala.util.parsing.json.JSONObject
import cascading.flow.FlowDef
import cascading.pipe.Pipe
import cascading.tuple.Fields
@@ -21,13 +22,7 @@ class GrobidScorable extends Scorable with HBasePipeConversions {
.read
.fromBytesWritable(new Fields("key", "tei_json"))
.toTypedPipe[(String, String)](new Fields("key", "tei_json"))
- .map { entry =>
- val (key : String, json : String) = (entry._1, entry._2)
- GrobidScorable.grobidToSlug(json) match {
- case Some(slug) => new MapFeatures(slug, json)
- case None => new MapFeatures(Scorable.NoSlug, json)
- }
- }
+ .map { entry : (String, String) => GrobidScorable.jsonToMapFeatures(entry._1, entry._2) }
}
}
@@ -36,14 +31,18 @@ object GrobidScorable {
HBaseBuilder.build(table, host, List("grobid0:tei_json"), SourceMode.SCAN_ALL)
}
- def grobidToSlug(json : String) : Option[String] = {
+ def jsonToMapFeatures(key : String, json : String) : MapFeatures = {
Scorable.jsonToMap(json) match {
- case None => None
+ case None => MapFeatures(Scorable.NoSlug, json)
case Some(map) => {
if (map contains "title") {
- Some(Scorable.titleToSlug(map("title").asInstanceOf[String]))
+ val map2 = Scorable.toScorableMap(Scorable.getString(map, "title"),
+ sha1=key)
+ new MapFeatures(
+ Scorable.mapToSlug(map2),
+ JSONObject(map2).toString)
} else {
- None
+ MapFeatures(Scorable.NoSlug, json)
}
}
}
diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala
index 0ec8e46..9c8da69 100644
--- a/scalding/src/main/scala/sandcrawler/Scorable.scala
+++ b/scalding/src/main/scala/sandcrawler/Scorable.scala
@@ -2,6 +2,7 @@ package sandcrawler
import scala.math
import scala.util.parsing.json.JSON
+import scala.util.parsing.json.JSONObject
import cascading.flow.FlowDef
import com.twitter.scalding._
@@ -36,6 +37,21 @@ object Scorable {
slug != NoSlug
}
+ // NOTE: I could go all out and make ScorableMap a type.
+ // TODO: Require year. Other features will get added here.
+ def toScorableMap(title : String, year : Int = 0, doi : String = "", sha1 : String = "") : Map[String, Any] = {
+ Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1)
+ }
+
+ def toScorableJson(title : String, year : Int, doi : String = "", sha1 : String = "") : String = {
+ JSONObject(toScorableMap(title=title, year=year, doi=doi, sha1=sha1)).toString
+ }
+
+ // TODO: Score on more fields than "title".
+ def isScorableMap(map : Map[String, Any]) : Boolean = {
+ map.contains("title")
+ }
+
def jsonToMap(json : String) : Option[Map[String, Any]] = {
// https://stackoverflow.com/a/32717262/631051
val jsonObject = JSON.parseFull(json)
@@ -46,18 +62,17 @@ object Scorable {
}
}
- def titleToSlug(title : String) : String = {
- if (title == null || title.isEmpty) {
+ // Map should have been produced by toScorableMap.
+ // This guarantees it will have all of the fields needed to compute
+ // the ultimate score, which are a superset of those needed for a slug.
+ def mapToSlug(map : Map[String, Any]) : String = {
+ val unaccented = StringUtilities.removeAccents(getString(map, "title"))
+ // Remove punctuation after splitting on colon.
+ val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase()))
+ if (slug.isEmpty || slug == null) {
NoSlug
} else {
- val unaccented = StringUtilities.removeAccents(title)
- // Remove punctuation after splitting on colon.
- val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase()))
- if (slug.isEmpty || slug == null) {
- NoSlug
- } else {
- slug
- }
+ slug
}
}
@@ -68,8 +83,9 @@ object Scorable {
}
}
- // Caller is responsible for ensuring that key is in map.
- def getString(map : Map[String, String], key : String) : String = {
+ // Caller is responsible for ensuring that key is a String in map.
+ // TODO: Add and handle ClassCastException
+ def getString(map : Map[String, Any], key : String) : String = {
assert(map contains key)
map(key).asInstanceOf[String]
}
diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
index 67a8bfe..1c35d66 100644
--- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
@@ -66,20 +66,24 @@ class CrossrefScorableTest extends FlatSpec with Matchers {
val MalformedCrossrefString = CrossrefString.replace("}", "")
// Unit tests
-/*
- "crossrefToSlug()" should "get the right slug for a crossref json string" in {
- val slug = CrossrefScorable.crossrefToSlug(CrossrefStringWithTitle)
- slug should contain ("sometitle")
+ "simplifyJson()" should "return None for bad JSON" in {
+ CrossrefScorable.simplifyJson("") shouldBe None
+ CrossrefScorable.simplifyJson(MalformedCrossrefString) shouldBe None
}
- it should "return None if given json string without title" in {
- val slug = CrossrefScorable.crossrefToSlug(CrossrefStringWithoutTitle)
- slug shouldBe None
+ it should "return None for JSON lacking title" in {
+ CrossrefScorable.simplifyJson(CrossrefStringWithoutTitle) shouldBe None
}
- it should "return None if given a malformed json string" in {
- val slug = CrossrefScorable.crossrefToSlug(MalformedCrossrefString)
- slug shouldBe None
+ it should "return appropriate result for valid JSON" in {
+ CrossrefScorable.simplifyJson(CrossrefStringWithTitle) match {
+ case None => fail("None unexpectedly returned by simplifyJson")
+ case Some(map) => {
+ Scorable.isScorableMap(map) shouldBe true
+ map.size shouldBe 1
+ map.keys should contain ("title")
+ map("title") shouldBe "SomeTitle"
+ }
+ }
}
- */
}
diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
index 7777610..5bb955a 100644
--- a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
@@ -60,18 +60,15 @@ class GrobidScorableTest extends FlatSpec with Matchers {
// Unit tests
- "grobidToSlug()" should "get the right slug for a grobid json string" in {
- val slug = GrobidScorable.grobidToSlug(GrobidStringWithTitle)
- slug should contain ("dummy example file")
+ "GrobidScorable.jsonToMapFeatures()" should "handle invalid JSON" in {
+ val result = GrobidScorable.jsonToMapFeatures(MalformedGrobidString) shouldBe None
+ result.slug shouldBe Scorable.NoSlug
+ result.json shouldBe MalformedGrobidString
}
- it should "return None if given json string without title" in {
- val slug = GrobidScorable.grobidToSlug(GrobidStringWithoutTitle)
- slug shouldBe None
- }
-
- it should "return None if given a malformed json string" in {
- val slug = GrobidScorable.grobidToSlug(MalformedGrobidString)
- slug shouldBe None
+ "GrobidScorable.jsonToMapFeatures()" should "handle missing title" in {
+ val result = GrobidScorable.jsonToMapFeatures(GrobidStringWithoutTitle) shouldBe None
+ result.slug shouldBe Scorable.NoSlug
+ result.json shouldBe GrobidStringWithoutTitle
}
}