about | summary | refs | log | tree | commit | diff | stats
path: root/scalding/src
diff options
context:
space:
mode:
Diffstat (limited to 'scalding/src')
-rw-r--r-- scalding/src/main/scala/sandcrawler/CrossrefScorable.scala 2
-rw-r--r-- scalding/src/main/scala/sandcrawler/GrobidScorable.scala 2
-rw-r--r-- scalding/src/main/scala/sandcrawler/ScorableFeatures.scala 50
-rw-r--r-- scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala 22
-rw-r--r-- scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala 6
-rw-r--r-- scalding/src/test/scala/sandcrawler/ScoreJobTest.scala 2
6 files changed, 43 insertions, 41 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
index ff8201a..5d1eaf5 100644
--- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
@@ -35,7 +35,7 @@ object CrossrefScorable {
new MapFeatures(Scorable.NoSlug, json)
} else {
// bnewbold: not checking that titles(0) is non-null/non-empty; case would be, in JSON, "title": [ null ]
- val sf : ScorableFeatures = new ScorableFeatures(title=titles(0), doi=doi)
+ val sf : ScorableFeatures = ScorableFeatures.create(title=titles(0), doi=doi)
new MapFeatures(sf.toSlug, sf.toString)
}
} else {
diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
index 9a09e05..d7a1eea 100644
--- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
@@ -45,7 +45,7 @@ object GrobidScorable {
case None => MapFeatures(Scorable.NoSlug, json)
case Some(map) => {
if (map contains "title") {
- new ScorableFeatures(Scorable.getString(map, "title"), sha1=key).toMapFeatures
+ ScorableFeatures.create(title=Scorable.getString(map, "title"), sha1=key).toMapFeatures
} else {
MapFeatures(Scorable.NoSlug, json)
}
diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
index 610f1a4..0b9868a 100644
--- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
+++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
@@ -5,19 +5,31 @@ import java.io.InputStream
import scala.io.Source
import scala.util.parsing.json.JSONObject
-// Contains features needed to make slug and to score (in combination
-// with a second ScorableFeatures).
-class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: String = "") {
- def toMap() : Map[String, Any] = { Map(
- "title" -> (if (title == null) "" else title),
- "year" -> year,
- "doi" -> (if (doi == null) "" else doi),
- "sha1" -> (if (sha1 == null) "" else sha1))
- }
+object ScorableFeatures {
+ // TODO: Add exception handling.
+ val fileStream : InputStream = getClass.getResourceAsStream("/slug-blacklist.txt")
+ val SlugBlacklist : Set[String] = Source.fromInputStream(fileStream).getLines.toSet
+ fileStream.close
- override def toString() : String = {
- JSONObject(toMap()).toString
+ // Static factory method
+ def create(title : String, year : Int = 0, doi : String = "", sha1 : String = "") : ScorableFeatures = {
+ new ScorableFeatures(
+ title=if (title == null) "" else title,
+ year=year,
+ doi=if (doi == null) "" else doi,
+ sha1=if (sha1 == null) "" else sha1)
}
+}
+
+// Contains features needed to make slug and to score (in combination
+// with a second ScorableFeatures). Create with above static factory method.
+class ScorableFeatures private(title : String, year: Int = 0, doi : String = "", sha1: String = "") {
+
+ def toMap() : Map[String, Any] =
+ Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1)
+
+ override def toString() : String =
+ JSONObject(toMap).toString
def toSlug() : String = {
if (title == null) {
@@ -26,22 +38,10 @@ class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: S
val unaccented = StringUtilities.removeAccents(title)
// Remove punctuation
val slug = StringUtilities.removePunctuation((unaccented.toLowerCase())).replaceAll("\\s", "")
- // scalastyle:off if.brace
- if (slug.isEmpty || slug == null || (ScorableFeatures.SlugBlacklist contains slug))
- Scorable.NoSlug
- else
- slug
+ if (slug.isEmpty || slug == null || (ScorableFeatures.SlugBlacklist contains slug)) Scorable.NoSlug else slug
}
}
- def toMapFeatures : MapFeatures = {
+ def toMapFeatures : MapFeatures =
MapFeatures(toSlug, toString)
- }
-}
-
-object ScorableFeatures {
- // TODO: Add exception handling.
- val fileStream : InputStream = getClass.getResourceAsStream("/slug-blacklist.txt")
- val SlugBlacklist : Set[String] = Source.fromInputStream(fileStream).getLines.toSet
- fileStream.close
}
diff --git a/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala b/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala
index 0da0b9c..3291670 100644
--- a/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala
+++ b/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala
@@ -29,17 +29,17 @@ class HBaseStatusCountTest extends FunSpec with TupleConversions {
val statusType1Bytes = Bytes.toBytes(statusType1)
val statusType2Bytes = Bytes.toBytes(statusType2)
- val sampleData : List[List[Array[Byte]]] = List(
- // TODO(bnewbold): now to express a null (empty value) in this list?
- List(Bytes.toBytes("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q"), statusType1Bytes),
- List(Bytes.toBytes("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"), statusType1Bytes),
- List(Bytes.toBytes("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT"), statusType2Bytes),
- List(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56"), statusType2Bytes),
- List(Bytes.toBytes("sha1:885C3YNNEGH5WAG5ZAAXWA8BNXJWT6CZ"), statusType2Bytes),
- List(Bytes.toBytes("sha1:00904C3YNNEGH5WAG5ZA9XWAEBNXJWT6"), statusType2Bytes),
- List(Bytes.toBytes("sha1:249C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ"), statusType1Bytes),
- List(Bytes.toBytes("sha1:095893C3YNNEGH5WAG5ZAAXWAEBNXJWT"), statusType2Bytes)
- )
+ // TODO(bnewbold): how to express a null (empty value) in this list?
+ val sampleData : List[List[Array[Byte]]] = List(
+ ("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q", statusType1Bytes),
+ ("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU", statusType1Bytes),
+ ("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT", statusType2Bytes),
+ ("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56", statusType2Bytes),
+ ("sha1:885C3YNNEGH5WAG5ZAAXWA8BNXJWT6CZ", statusType2Bytes),
+ ("sha1:00904C3YNNEGH5WAG5ZA9XWAEBNXJWT6", statusType2Bytes),
+ ("sha1:249C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ", statusType1Bytes),
+ ("sha1:095893C3YNNEGH5WAG5ZAAXWAEBNXJWT", statusType2Bytes))
+ .map(pair => List(Bytes.toBytes(pair._1), pair._2))
val statusType1Count = sampleData.count(lst => lst(1) == statusType1Bytes)
val statusType2Count = sampleData.count(lst => lst(1) == statusType2Bytes)
diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
index a9a90ec..5a22ef8 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
@@ -26,12 +26,12 @@ class ScorableFeaturesTest extends FlatSpec with Matchers {
}
private def titleToSlug(s : String) : String = {
- new ScorableFeatures(title = s).toSlug
+ ScorableFeatures.create(title = s).toSlug
}
"toMapFeatures()" should "work with gnarly inputs" in {
- new ScorableFeatures(title = null).toMapFeatures
- new ScorableFeatures(title = "something", doi = null, sha1 = null, year = 123).toMapFeatures
+ ScorableFeatures.create(title = null).toMapFeatures
+ ScorableFeatures.create(title = "something", doi = null, sha1 = null, year = 123).toMapFeatures
}
"mapToSlug()" should "extract the parts of titles before a colon" in {
diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
index 5516869..00e4659 100644
--- a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
@@ -161,10 +161,12 @@ class ScoreJobTest extends FlatSpec with Matchers {
.map { case (sha, json, status) => List(Bytes.toBytes(sha), Bytes.toBytes(json), Bytes.toBytes(status)) }
.map { l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*) }
+ // scalastyle:off null
// Add example of lines without GROBID data
// scalastyle:off null
val SampleData = SampleDataHead :+ new Tuple(
new ImmutableBytesWritable(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAA88888888888")), null, null)
+ // scalastyle:on null
JobTest("sandcrawler.ScoreJob")
.arg("test", "")