aboutsummaryrefslogtreecommitdiffstats
path: root/scalding/src
diff options
context:
space:
mode:
Diffstat (limited to 'scalding/src')
-rw-r--r--scalding/src/main/scala/sandcrawler/BibjsonScorable.scala15
-rw-r--r--scalding/src/main/scala/sandcrawler/CrossrefScorable.scala105
-rw-r--r--scalding/src/main/scala/sandcrawler/GrobidScorable.scala8
-rw-r--r--scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala8
-rw-r--r--scalding/src/main/scala/sandcrawler/MatchBenchmarkJob.scala3
-rw-r--r--scalding/src/main/scala/sandcrawler/Scorable.scala17
-rw-r--r--scalding/src/main/scala/sandcrawler/ScorableFeatures.scala21
-rw-r--r--scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala67
-rw-r--r--scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala27
-rw-r--r--scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala45
-rw-r--r--scalding/src/test/scala/sandcrawler/ScoreJobTest.scala17
11 files changed, 179 insertions, 154 deletions
diff --git a/scalding/src/main/scala/sandcrawler/BibjsonScorable.scala b/scalding/src/main/scala/sandcrawler/BibjsonScorable.scala
index 0d26d75..abf9220 100644
--- a/scalding/src/main/scala/sandcrawler/BibjsonScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/BibjsonScorable.scala
@@ -15,7 +15,7 @@ class BibjsonScorable extends Scorable {
TextLine(args("bibjson-input"))
}
- def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures] = {
+ def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[Option[MapFeatures]] = {
getSource(args).read
.toTypedPipe[String](new Fields("line"))
.map { BibjsonScorable.bibjsonToMapFeatures(_) }
@@ -23,9 +23,9 @@ class BibjsonScorable extends Scorable {
}
object BibjsonScorable {
- def bibjsonToMapFeatures(json : String) : MapFeatures = {
+ def bibjsonToMapFeatures(json : String) : Option[MapFeatures] = {
Scorable.jsonToMap(json) match {
- case None => MapFeatures(Scorable.NoSlug, json)
+ case None => None
case Some(map) => {
if (map contains "title") {
val title = Scorable.getString(map, "title")
@@ -33,13 +33,16 @@ object BibjsonScorable {
val sha1 = Scorable.getString(map, "sha")
// TODO: year, authors (if available)
if (title == null || title.isEmpty) {
- new MapFeatures(Scorable.NoSlug, json)
+ None
} else {
val sf : ScorableFeatures = ScorableFeatures.create(title=title, doi=doi, sha1=sha1)
- new MapFeatures(sf.toSlug, sf.toString)
+ sf.toSlug match {
+ case None => None
+ case Some(slug) => Some(MapFeatures(slug, sf.toString))
+ }
}
} else {
- new MapFeatures(Scorable.NoSlug, json)
+ None
}
}
}
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
index f51c210..bb6413f 100644
--- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
@@ -17,7 +17,7 @@ class CrossrefScorable extends Scorable with HBasePipeConversions {
TextLine(args("crossref-input"))
}
- def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures] = {
+ def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[Option[MapFeatures]] = {
getSource(args).read
.toTypedPipe[String](new Fields("line"))
.filter { CrossrefScorable.keepRecord(_) }
@@ -55,37 +55,45 @@ object CrossrefScorable {
// Returns None if title is null, empty, or too long.
def mapToTitle(map : Map[String, Any]) : Option[String] = {
- if (map contains "title") {
- val titles = map("title").asInstanceOf[List[String]]
- if (titles.isEmpty || titles == null) {
- None
+ def getTitle : Option[String] = {
+ if (map contains "title") {
+ val titles = map("title").asInstanceOf[List[String]]
+ if (titles.isEmpty || titles == null) None else Some(titles(0))
} else {
- val baseTitle: String = titles(0)
- // TODO(bnewbold): this code block is horrible
- val baseSubtitle: String = if (map contains "subtitle") {
- val subtitles = map("subtitle").asInstanceOf[List[String]]
- if (!subtitles.isEmpty && subtitles != null) {
- val sub = subtitles(0)
- if (sub != null && !sub.isEmpty && baseTitle != null) {
- sub
- } else {
- ""
- }
+ None
+ }
+ }
+
+ def getSubtitle : Option[String] = {
+ if (map contains "subtitle") {
+ val subtitles = map("subtitle").asInstanceOf[List[String]]
+ if (subtitles.isEmpty || subtitles == null) {
+ None
+ } else {
+ val sub = subtitles(0)
+ if (sub == null || sub.isEmpty) {
+ None
} else {
- ""
+ Some(sub)
}
- } else {
- ""
}
- val title = if (baseSubtitle.isEmpty) {
- baseTitle
+ } else {
+ None
+ }
+ }
+
+ getTitle match {
+ case None => None
+ case Some(baseTitle) => {
+ if (baseTitle == null) {
+ None
} else {
- baseTitle.concat(": ".concat(baseSubtitle))
+ getSubtitle match {
+ case None => Some(baseTitle)
+ case Some(baseSubtitle) => Some(baseTitle.concat(":".concat(baseSubtitle)))
+ }
}
- if (title == null || title.isEmpty || title.length > Scorable.MaxTitleLength) None else Some(title)
}
- } else {
- None
}
}
@@ -106,34 +114,39 @@ object CrossrefScorable {
case None => None
case Some(created) => {
Some(created.asInstanceOf[Map[String,Any]]
- .get("date-parts")
- .get
- .asInstanceOf[List[Any]](0)
- .asInstanceOf[List[Any]](0)
- .asInstanceOf[Double]
- .toInt)
+ .get("date-parts")
+ .get
+ .asInstanceOf[List[Any]](0)
+ .asInstanceOf[List[Any]](0)
+ .asInstanceOf[Double]
+ .toInt)
}
}
}
- def jsonToMapFeatures(json : String) : MapFeatures = {
+ def jsonToMapFeatures(json : String) : Option[MapFeatures] = {
+ def makeMapFeatures(title : String, doi : String, authors : List[String], year : Int, contentType : String) : Option[MapFeatures] = {
+ if (doi.isEmpty || doi == null || authors.length == 0 || !(ContentTypeWhitelist contains contentType)) {
+ None
+ } else {
+ val sf : ScorableFeatures = ScorableFeatures.create(title=title, authors=authors, doi=doi.toLowerCase(), year=year)
+ sf.toSlug match {
+ case None => None
+ case Some(slug) => Some(MapFeatures(slug, sf.toString))
+ }
+ }
+ }
Scorable.jsonToMap(json) match {
- case None => MapFeatures(Scorable.NoSlug, json)
+ case None => None
case Some(map) =>
mapToTitle(map) match {
- case None => MapFeatures(Scorable.NoSlug, json)
- case Some(title) => {
- val doi = Scorable.getString(map, "DOI")
- val authors: List[String] = mapToAuthorList(map)
- val year: Int = mapToYear(map).getOrElse(0)
- val contentType: String = map.get("type").map(e => e.asInstanceOf[String]).getOrElse("MISSING-CONTENT-TYPE")
- if (doi.isEmpty || doi == null || authors.length == 0 || !(ContentTypeWhitelist contains contentType)) {
- MapFeatures(Scorable.NoSlug, json)
- } else {
- val sf : ScorableFeatures = ScorableFeatures.create(title=title, authors=authors, doi=doi.toLowerCase(), year=year)
- MapFeatures(sf.toSlug, sf.toString)
- }
- }
+ case None => None
+ case Some(title) => makeMapFeatures(
+ title=title,
+ doi=Scorable.getString(map, "DOI"),
+ authors=mapToAuthorList(map),
+ year=mapToYear(map).getOrElse(0),
+ contentType=map.get("type").map(e => e.asInstanceOf[String]).getOrElse("MISSING-CONTENT-TYPE"))
}
}
}
diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
index 899ce66..f4ed129 100644
--- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
@@ -20,7 +20,7 @@ class GrobidScorable extends Scorable with HBasePipeConversions {
GrobidScorable.getHBaseSource(args("hbase-table"), args("zookeeper-hosts"))
}
- def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures] = {
+ def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[Option[MapFeatures]] = {
getSource(args)
.read
// Can't just "fromBytesWritable" because we have multiple types
@@ -65,16 +65,16 @@ object GrobidScorable {
HBaseBuilder.build(table, host, List("grobid0:metadata", "grobid0:status_code"), SourceMode.SCAN_ALL)
}
- def jsonToMapFeatures(key : String, json : String) : MapFeatures = {
+ def jsonToMapFeatures(key : String, json : String) : Option[MapFeatures] = {
Scorable.jsonToMap(json) match {
- case None => MapFeatures(Scorable.NoSlug, json)
+ case None => None
case Some(map) => {
if (map contains "title") {
val authors: List[String] = mapToAuthorList(map)
val title = Scorable.getString(map, "title")
ScorableFeatures.create(title=title, authors=authors, sha1=key).toMapFeatures
} else {
- MapFeatures(Scorable.NoSlug, json)
+ None
}
}
}
diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala b/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala
index 19b257f..3146a6c 100644
--- a/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala
+++ b/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala
@@ -40,11 +40,11 @@ class GrobidScorableDumpJob(args: Args) extends JobBase(args) {
parsedGrobidRows.inc
GrobidScorable.jsonToMapFeatures(entry._1, entry._2)
}
- .filter { entry => Scorable.isValidSlug(entry.slug) }
- .map { entry =>
+ .filterNot { entry => entry.isEmpty }
+ .map { entry => {
validGrobidRows.inc
- entry
- }
+ entry.get
+ }}
.groupBy { case MapFeatures(slug, json) => slug }
.map { tuple =>
val (slug : String, features : MapFeatures) = tuple
diff --git a/scalding/src/main/scala/sandcrawler/MatchBenchmarkJob.scala b/scalding/src/main/scala/sandcrawler/MatchBenchmarkJob.scala
index 1578258..292de75 100644
--- a/scalding/src/main/scala/sandcrawler/MatchBenchmarkJob.scala
+++ b/scalding/src/main/scala/sandcrawler/MatchBenchmarkJob.scala
@@ -15,7 +15,8 @@ class MatchBenchmarkJob(args: Args) extends JobBase(args) {
val pipe1 : TypedPipe[(String, ReduceFeatures)] = sc1.getInputPipe(leftArgs)
val pipe2 : TypedPipe[(String, ReduceFeatures)] = sc2.getInputPipe(rightArgs)
- pipe1.join(pipe2).map { entry =>
+ pipe1.join(pipe2)
+ .map { entry =>
val (slug : String, (features1 : ReduceFeatures, features2 : ReduceFeatures)) = entry
new ReduceOutput(
slug,
diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala
index f7eb95d..5d67044 100644
--- a/scalding/src/main/scala/sandcrawler/Scorable.scala
+++ b/scalding/src/main/scala/sandcrawler/Scorable.scala
@@ -13,10 +13,12 @@ case class ReduceFeatures(json : String)
case class ReduceOutput(val slug : String, score : Int, json1 : String, json2 : String)
abstract class Scorable {
- def getInputPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[(String, ReduceFeatures)] =
- {
- getFeaturesPipe(args)
- .filter { entry => Scorable.isValidSlug(entry.slug) }
+ def getInputPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[(String, ReduceFeatures)] = {
+ val validFeatures : TypedPipe[MapFeatures] = getFeaturesPipe(args)
+ .filterNot { entry => entry.isEmpty }
+ .map { entry => entry.get }
+
+ validFeatures
.groupBy { case MapFeatures(slug, json) => slug }
.map { tuple =>
val (slug : String, features : MapFeatures) = tuple
@@ -26,16 +28,11 @@ abstract class Scorable {
// abstract methods
def getSource(args : Args) : Source
- def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures]
+ def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[Option[MapFeatures]]
}
object Scorable {
val MaxTitleLength = 1023
- val NoSlug = "NO SLUG" // Used for slug if title is empty or unparseable
-
- def isValidSlug(slug : String) : Boolean = {
- slug != NoSlug
- }
def jsonToMap(json : String) : Option[Map[String, Any]] = {
// https://stackoverflow.com/a/32717262/631051
diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
index 241db79..be2b495 100644
--- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
+++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
@@ -35,20 +35,27 @@ class ScorableFeatures private(title : String, authors : List[Any] = List(), yea
JSONObject(toMap).toString
}
- def toSlug() : String = {
+ def toSlug() : Option[String] = {
if (title == null) {
- Scorable.NoSlug
+ None
} else {
val unaccented = StringUtilities.removeAccents(title)
// Remove punctuation
val slug = StringUtilities.removePunctuation((unaccented.toLowerCase())).replaceAll("\\s", "")
if (slug.isEmpty
- || slug == null
- || (ScorableFeatures.SlugBlacklist contains slug)
- || (slug.length < ScorableFeatures.MinSlugLength)) Scorable.NoSlug else slug
+ || slug == null
+ || (ScorableFeatures.SlugBlacklist contains slug)
+ || (slug.length < ScorableFeatures.MinSlugLength)) {
+ None
+ } else {
+ Some(slug)
+ }
}
}
- def toMapFeatures : MapFeatures =
- MapFeatures(toSlug, toString)
+ def toMapFeatures : Option[MapFeatures] =
+ toSlug match {
+ case None => None
+ case Some(slug) => Some(MapFeatures(slug, toString))
+ }
}
diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
index c0d1cb0..8302b8f 100644
--- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
@@ -79,59 +79,64 @@ class CrossrefScorableTest extends FlatSpec with Matchers {
// Unit tests
"CrossrefScorable.jsonToMapFeatures()" should "handle invalid JSON" in {
- val result = CrossrefScorable.jsonToMapFeatures(MalformedCrossrefString)
- result.slug shouldBe Scorable.NoSlug
+ CrossrefScorable.jsonToMapFeatures(MalformedCrossrefString) should be (None)
}
it should "handle missing title" in {
- val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithoutTitle)
- result.slug shouldBe Scorable.NoSlug
+ CrossrefScorable.jsonToMapFeatures(CrossrefStringWithoutTitle) should be (None)
}
it should "handle null title" in {
- val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithNullTitle)
- result.slug shouldBe Scorable.NoSlug
+ CrossrefScorable.jsonToMapFeatures(CrossrefStringWithNullTitle) should be (None)
}
it should "handle empty title" in {
- val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithEmptyTitle)
- result.slug shouldBe Scorable.NoSlug
+ CrossrefScorable.jsonToMapFeatures(CrossrefStringWithEmptyTitle) should be (None)
}
it should "handle subtitle" in {
- val result = CrossrefScorable.jsonToMapFeatures(
- """{"title": ["short but not too short"], "subtitle": ["just right!"], "DOI": "10.123/asdf", "type":"journal-article", "author":[{ "given" : "W", "family" : "Gaier"}]}""")
- result.slug shouldBe "shortbutnottooshortjustright"
+ CrossrefScorable.jsonToMapFeatures(
+ """{"title": ["short but not too short"], "subtitle": ["just right!"], "DOI": "10.123/asdf", "type":"journal-article","author":[{ "given" : "W", "family" : "Gaier"}]}""") match {
+ case None => fail()
+ case Some(result) => result.slug shouldBe "shortbutnottooshortjustright"
+ }
}
it should "handle empty subtitle" in {
- val result = CrossrefScorable.jsonToMapFeatures(
- """{"title": ["short but not too short"], "subtitle": [""], "DOI": "10.123/asdf", "type":"journal-article", "author":[{ "given" : "W", "family" : "Gaier"}]}""")
- result.slug shouldBe "shortbutnottooshort"
+ CrossrefScorable.jsonToMapFeatures(
+ """{"title": ["short but not too short"], "subtitle": [""], "DOI": "10.123/asdf", "type":"journal-article", "author":[{ "given" : "W", "family" : "Gaier"}]}""") match {
+ case None => fail()
+ case Some(result) => result.slug shouldBe "shortbutnottooshort"
+ }
}
it should "handle null subtitle" in {
- val result = CrossrefScorable.jsonToMapFeatures(
- """{"title": ["short but not too short"], "subtitle": [null], "DOI": "10.123/asdf", "type":"journal-article", "author":[{ "given" : "W", "family" : "Gaier"}]}""")
- result.slug shouldBe "shortbutnottooshort"
+ CrossrefScorable.jsonToMapFeatures(
+ """{"title": ["short but not too short"], "subtitle": [null], "DOI": "10.123/asdf", "type":"journal-article", "author":[{ "given" : "W", "family" : "Gaier"}]}""") match {
+ case None => fail()
+ case Some(result) => result.slug shouldBe "shortbutnottooshort"
+ }
}
it should "handle missing authors" in {
- val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithNoAuthors)
- result.slug shouldBe Scorable.NoSlug
+ CrossrefScorable.jsonToMapFeatures(CrossrefStringWithNoAuthors) should be (None)
}
it should "handle valid input" in {
- val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithGoodTitle)
- result.slug shouldBe "sometitle"
- Scorable.jsonToMap(result.json) match {
+ CrossrefScorable.jsonToMapFeatures(CrossrefStringWithGoodTitle) match {
case None => fail()
- case Some(map) => {
- map("title").asInstanceOf[String] shouldBe "Some Title"
- map("doi").asInstanceOf[String] shouldBe "10.123/abc"
- // TODO: full name? not just a string?
- map("authors").asInstanceOf[List[String]] shouldBe List("Gaier")
- map("year").asInstanceOf[Double].toInt shouldBe 2002
+ case Some(result) => {
+ result.slug shouldBe "sometitle"
+ Scorable.jsonToMap(result.json) match {
+ case None => fail()
+ case Some(map) => {
+ map("title").asInstanceOf[String] shouldBe "Some Title"
+ map("doi").asInstanceOf[String] shouldBe "10.123/abc"
+ // TODO: full name? not just a string?
+ map("authors").asInstanceOf[List[String]] shouldBe List("Gaier")
+ map("year").asInstanceOf[Double].toInt shouldBe 2002
+ }
+ }
}
}
}
@@ -161,9 +166,7 @@ class CrossrefScorableTest extends FlatSpec with Matchers {
}
it should "handle content types" in {
- val resultWrong = CrossrefScorable.jsonToMapFeatures(CrossrefStringWrongType)
- resultWrong.slug shouldBe Scorable.NoSlug
- val resultMissing = CrossrefScorable.jsonToMapFeatures(CrossrefStringNoType)
- resultMissing.slug shouldBe Scorable.NoSlug
+ CrossrefScorable.jsonToMapFeatures(CrossrefStringWrongType) should be (None)
+ CrossrefScorable.jsonToMapFeatures(CrossrefStringNoType) should be (None)
}
}
diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
index 119cf90..b395a64 100644
--- a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
@@ -68,29 +68,30 @@ class GrobidScorableTest extends FlatSpec with Matchers {
// Unit tests
"GrobidScorable.jsonToMapFeatures()" should "handle invalid JSON" in {
- val result = GrobidScorable.jsonToMapFeatures(Key, MalformedGrobidString)
- result.slug shouldBe Scorable.NoSlug
+ GrobidScorable.jsonToMapFeatures(Key, MalformedGrobidString) should be (None)
}
it should "handle null title" in {
- val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithNullTitle)
- result.slug shouldBe Scorable.NoSlug
+ GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithNullTitle) should be (None)
}
it should "handle missing title" in {
- val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithoutTitle)
- result.slug shouldBe Scorable.NoSlug
+ GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithoutTitle) should be (None)
}
it should "handle valid input" in {
- val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithGoodTitle)
- result.slug shouldBe "dummyexamplefile"
- Scorable.jsonToMap(result.json) match {
+ GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithGoodTitle) match {
case None => fail()
- case Some(map) => {
- map should contain key "title"
- map("title").asInstanceOf[String] shouldBe "Dummy Example File"
- map("authors").asInstanceOf[List[String]] shouldBe List("Brewster Kahle", "J Doe")
+ case Some(result) => {
+ result.slug shouldBe "dummyexamplefile"
+ Scorable.jsonToMap(result.json) match {
+ case None => fail()
+ case Some(map) => {
+ map should contain key "title"
+ map("title").asInstanceOf[String] shouldBe "Dummy Example File"
+ map("authors").asInstanceOf[List[String]] shouldBe List("Brewster Kahle", "J Doe")
+ }
+ }
}
}
}
diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
index 450c169..112a5e5 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
@@ -8,60 +8,57 @@ import org.scalatest._
// scalastyle:off null
class ScorableFeaturesTest extends FlatSpec with Matchers {
-
- private def titleToSlug(s : String) : String = {
- ScorableFeatures.create(title = s).toSlug
- }
-
"toMapFeatures()" should "work with gnarly inputs" in {
ScorableFeatures.create(title = null).toMapFeatures
ScorableFeatures.create(title = "something", doi = null, sha1 = null, year = 123).toMapFeatures
}
+ private def titleToSlug(s : String) : Option[String] = ScorableFeatures.create(title = s).toSlug
+
"mapToSlug()" should "extract the parts of titles before a colon" in {
- titleToSlug("HELLO:there") shouldBe "hellothere"
+ titleToSlug("HELLO:there") shouldBe Some("hellothere")
}
it should "extract an entire colon-less string" in {
- titleToSlug("hello THERE") shouldBe "hellothere"
+ titleToSlug("hello THERE") shouldBe Some("hellothere")
}
it should "return Scorable.NoSlug if given empty string" in {
- titleToSlug("") shouldBe Scorable.NoSlug
+ titleToSlug("") shouldBe (None)
}
it should "return Scorable.NoSlug if given null" in {
- titleToSlug(null) shouldBe Scorable.NoSlug
+ titleToSlug(null) shouldBe (None)
}
it should "strip punctuation" in {
- titleToSlug("HELLO!:the:re") shouldBe "hellothere"
- titleToSlug("a:b:cdefgh") shouldBe "abcdefgh"
+ titleToSlug("HELLO!:the:re") shouldBe Some("hellothere")
+ titleToSlug("a:b:cdefgh") shouldBe Some("abcdefgh")
titleToSlug(
- "If you're happy and you know it, clap your hands!") shouldBe "ifyourehappyandyouknowitclapyourhands"
- titleToSlug(":;\"\'") shouldBe Scorable.NoSlug
+ "If you're happy and you know it, clap your hands!") shouldBe Some("ifyourehappyandyouknowitclapyourhands")
+ titleToSlug(":;\"\'") shouldBe (None)
}
it should "filter stub titles" in {
- titleToSlug("abstract") shouldBe Scorable.NoSlug
- titleToSlug("title!") shouldBe Scorable.NoSlug
- titleToSlug("a real title which is not on blacklist") shouldBe "arealtitlewhichisnotonblacklist"
+ titleToSlug("abstract") shouldBe (None)
+ titleToSlug("title!") shouldBe (None)
+ titleToSlug("a real title which is not on blacklist") shouldBe Some("arealtitlewhichisnotonblacklist")
}
it should "strip special characters" in {
- titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_’·“”‘’“”«»「」¿–±§ʿ") shouldBe Scorable.NoSlug
- // TODO: titleToSlug("©™₨№…") shouldBe Scorable.NoSlug
- // TODO: titleToSlug("πµΣσ") shouldBe Scorable.NoSlug
+ titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_’·“”‘’“”«»「」¿–±§ʿ") shouldBe (None)
+ // TODO: titleToSlug("©™₨№…") shouldBe (None)
+ // TODO: titleToSlug("πµΣσ") shouldBe (None)
}
it should "remove whitespace" in {
- titleToSlug("foo bar : baz ::") shouldBe "foobarbaz"
- titleToSlug("\na\t:b:cdefghi") shouldBe "abcdefghi"
- titleToSlug("\n \t \r ") shouldBe Scorable.NoSlug
+ titleToSlug("foo bar : baz ::") shouldBe Some("foobarbaz")
+ titleToSlug("\na\t:b:cdefghi") shouldBe Some("abcdefghi")
+ titleToSlug("\n \t \r ") shouldBe (None)
}
it should "skip very short slugs" in {
- titleToSlug("short") shouldBe Scorable.NoSlug
- titleToSlug("a longer, more in depth title") shouldBe "alongermoreindepthtitle"
+ titleToSlug("short") shouldBe (None)
+ titleToSlug("a longer, more in depth title") shouldBe Some("alongermoreindepthtitle")
}
}
diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
index 32fb16c..fbc0ee5 100644
--- a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
@@ -222,15 +222,18 @@ class ScoreJobTest extends FlatSpec with Matchers {
}
def bundle(slug : String, grobidIndex : Int, crossrefIndex : Int) : (String, Int, String, String) = {
- val mf1 : MapFeatures = GrobidScorable.jsonToMapFeatures(
+ val mfg : Option[MapFeatures] = GrobidScorable.jsonToMapFeatures(
Sha1Strings(grobidIndex),
JsonStrings(grobidIndex))
- val mf2 : MapFeatures = CrossrefScorable.jsonToMapFeatures(
- CrossrefStrings(crossrefIndex))
- val score = Scorable.computeSimilarity(
- ReduceFeatures(mf1.json),
- ReduceFeatures(mf2.json))
- (slug, score, mf1.json, mf2.json)
+ val mfc : Option[MapFeatures] = CrossrefScorable.jsonToMapFeatures(CrossrefStrings(crossrefIndex))
+ if (mfg.isEmpty || mfc.isEmpty) {
+ fail()
+ } else {
+ val score = Scorable.computeSimilarity(
+ ReduceFeatures(mfg.get.json),
+ ReduceFeatures(mfc.get.json))
+ (slug, score, mfg.get.json, mfc.get.json)
+ }
}
it should "have right output values" in {