about summary refs log tree commit diff stats
path: root/scalding/src
diff options
context:
space:
mode:
Diffstat (limited to 'scalding/src')
-rw-r--r-- scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala 67
-rw-r--r-- scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala 27
-rw-r--r-- scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala 45
-rw-r--r-- scalding/src/test/scala/sandcrawler/ScoreJobTest.scala 23
4 files changed, 85 insertions, 77 deletions
diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
index c0d1cb0..8302b8f 100644
--- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
@@ -79,59 +79,64 @@ class CrossrefScorableTest extends FlatSpec with Matchers {
// Unit tests
"CrossrefScorable.jsonToMapFeatures()" should "handle invalid JSON" in {
- val result = CrossrefScorable.jsonToMapFeatures(MalformedCrossrefString)
- result.slug shouldBe Scorable.NoSlug
+ CrossrefScorable.jsonToMapFeatures(MalformedCrossrefString) should be (None)
}
it should "handle missing title" in {
- val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithoutTitle)
- result.slug shouldBe Scorable.NoSlug
+ CrossrefScorable.jsonToMapFeatures(CrossrefStringWithoutTitle) should be (None)
}
it should "handle null title" in {
- val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithNullTitle)
- result.slug shouldBe Scorable.NoSlug
+ CrossrefScorable.jsonToMapFeatures(CrossrefStringWithNullTitle) should be (None)
}
it should "handle empty title" in {
- val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithEmptyTitle)
- result.slug shouldBe Scorable.NoSlug
+ CrossrefScorable.jsonToMapFeatures(CrossrefStringWithEmptyTitle) should be (None)
}
it should "handle subtitle" in {
- val result = CrossrefScorable.jsonToMapFeatures(
- """{"title": ["short but not too short"], "subtitle": ["just right!"], "DOI": "10.123/asdf", "type":"journal-article", "author":[{ "given" : "W", "family" : "Gaier"}]}""")
- result.slug shouldBe "shortbutnottooshortjustright"
+ CrossrefScorable.jsonToMapFeatures(
+ """{"title": ["short but not too short"], "subtitle": ["just right!"], "DOI": "10.123/asdf", "type":"journal-article","author":[{ "given" : "W", "family" : "Gaier"}]}""") match {
+ case None => fail()
+ case Some(result) => result.slug shouldBe "shortbutnottooshortjustright"
+ }
}
it should "handle empty subtitle" in {
- val result = CrossrefScorable.jsonToMapFeatures(
- """{"title": ["short but not too short"], "subtitle": [""], "DOI": "10.123/asdf", "type":"journal-article", "author":[{ "given" : "W", "family" : "Gaier"}]}""")
- result.slug shouldBe "shortbutnottooshort"
+ CrossrefScorable.jsonToMapFeatures(
+ """{"title": ["short but not too short"], "subtitle": [""], "DOI": "10.123/asdf", "type":"journal-article", "author":[{ "given" : "W", "family" : "Gaier"}]}""") match {
+ case None => fail()
+ case Some(result) => result.slug shouldBe "shortbutnottooshort"
+ }
}
it should "handle null subtitle" in {
- val result = CrossrefScorable.jsonToMapFeatures(
- """{"title": ["short but not too short"], "subtitle": [null], "DOI": "10.123/asdf", "type":"journal-article", "author":[{ "given" : "W", "family" : "Gaier"}]}""")
- result.slug shouldBe "shortbutnottooshort"
+ CrossrefScorable.jsonToMapFeatures(
+ """{"title": ["short but not too short"], "subtitle": [null], "DOI": "10.123/asdf", "type":"journal-article", "author":[{ "given" : "W", "family" : "Gaier"}]}""") match {
+ case None => fail()
+ case Some(result) => result.slug shouldBe "shortbutnottooshort"
+ }
}
it should "handle missing authors" in {
- val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithNoAuthors)
- result.slug shouldBe Scorable.NoSlug
+ CrossrefScorable.jsonToMapFeatures(CrossrefStringWithNoAuthors) should be (None)
}
it should "handle valid input" in {
- val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithGoodTitle)
- result.slug shouldBe "sometitle"
- Scorable.jsonToMap(result.json) match {
+ CrossrefScorable.jsonToMapFeatures(CrossrefStringWithGoodTitle) match {
case None => fail()
- case Some(map) => {
- map("title").asInstanceOf[String] shouldBe "Some Title"
- map("doi").asInstanceOf[String] shouldBe "10.123/abc"
- // TODO: full name? not just a string?
- map("authors").asInstanceOf[List[String]] shouldBe List("Gaier")
- map("year").asInstanceOf[Double].toInt shouldBe 2002
+ case Some(result) => {
+ result.slug shouldBe "sometitle"
+ Scorable.jsonToMap(result.json) match {
+ case None => fail()
+ case Some(map) => {
+ map("title").asInstanceOf[String] shouldBe "Some Title"
+ map("doi").asInstanceOf[String] shouldBe "10.123/abc"
+ // TODO: full name? not just a string?
+ map("authors").asInstanceOf[List[String]] shouldBe List("Gaier")
+ map("year").asInstanceOf[Double].toInt shouldBe 2002
+ }
+ }
}
}
}
@@ -161,9 +166,7 @@ class CrossrefScorableTest extends FlatSpec with Matchers {
}
it should "handle content types" in {
- val resultWrong = CrossrefScorable.jsonToMapFeatures(CrossrefStringWrongType)
- resultWrong.slug shouldBe Scorable.NoSlug
- val resultMissing = CrossrefScorable.jsonToMapFeatures(CrossrefStringNoType)
- resultMissing.slug shouldBe Scorable.NoSlug
+ CrossrefScorable.jsonToMapFeatures(CrossrefStringWrongType) should be (None)
+ CrossrefScorable.jsonToMapFeatures(CrossrefStringNoType) should be (None)
}
}
diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
index 119cf90..b395a64 100644
--- a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
@@ -68,29 +68,30 @@ class GrobidScorableTest extends FlatSpec with Matchers {
// Unit tests
"GrobidScorable.jsonToMapFeatures()" should "handle invalid JSON" in {
- val result = GrobidScorable.jsonToMapFeatures(Key, MalformedGrobidString)
- result.slug shouldBe Scorable.NoSlug
+ GrobidScorable.jsonToMapFeatures(Key, MalformedGrobidString) should be (None)
}
it should "handle null title" in {
- val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithNullTitle)
- result.slug shouldBe Scorable.NoSlug
+ GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithNullTitle) should be (None)
}
it should "handle missing title" in {
- val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithoutTitle)
- result.slug shouldBe Scorable.NoSlug
+ GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithoutTitle) should be (None)
}
it should "handle valid input" in {
- val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithGoodTitle)
- result.slug shouldBe "dummyexamplefile"
- Scorable.jsonToMap(result.json) match {
+ GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithGoodTitle) match {
case None => fail()
- case Some(map) => {
- map should contain key "title"
- map("title").asInstanceOf[String] shouldBe "Dummy Example File"
- map("authors").asInstanceOf[List[String]] shouldBe List("Brewster Kahle", "J Doe")
+ case Some(result) => {
+ result.slug shouldBe "dummyexamplefile"
+ Scorable.jsonToMap(result.json) match {
+ case None => fail()
+ case Some(map) => {
+ map should contain key "title"
+ map("title").asInstanceOf[String] shouldBe "Dummy Example File"
+ map("authors").asInstanceOf[List[String]] shouldBe List("Brewster Kahle", "J Doe")
+ }
+ }
}
}
}
diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
index 450c169..3f6b87c 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
@@ -8,60 +8,57 @@ import org.scalatest._
// scalastyle:off null
class ScorableFeaturesTest extends FlatSpec with Matchers {
-
- private def titleToSlug(s : String) : String = {
- ScorableFeatures.create(title = s).toSlug
- }
-
"toMapFeatures()" should "work with gnarly inputs" in {
ScorableFeatures.create(title = null).toMapFeatures
ScorableFeatures.create(title = "something", doi = null, sha1 = null, year = 123).toMapFeatures
}
+ private def titleToSlug(s : String) : Option[String] = ScorableFeatures.create(title = s).toSlug
+
"mapToSlug()" should "extract the parts of titles before a colon" in {
- titleToSlug("HELLO:there") shouldBe "hellothere"
+ titleToSlug("HELLO:there") shouldBe (Some("hellothere"))
}
it should "extract an entire colon-less string" in {
- titleToSlug("hello THERE") shouldBe "hellothere"
+ titleToSlug("hello THERE") shouldBe (Some("hellothere"))
}
it should "return Scorable.NoSlug if given empty string" in {
- titleToSlug("") shouldBe Scorable.NoSlug
+ titleToSlug("") shouldBe (None)
}
it should "return Scorable.NoSlug if given null" in {
- titleToSlug(null) shouldBe Scorable.NoSlug
+ titleToSlug(null) shouldBe (None)
}
it should "strip punctuation" in {
- titleToSlug("HELLO!:the:re") shouldBe "hellothere"
- titleToSlug("a:b:cdefgh") shouldBe "abcdefgh"
+ titleToSlug("HELLO!:the:re") shouldBe Some("hellothere")
+ titleToSlug("a:b:cdefgh") shouldBe Some("abcdefgh")
titleToSlug(
- "If you're happy and you know it, clap your hands!") shouldBe "ifyourehappyandyouknowitclapyourhands"
- titleToSlug(":;\"\'") shouldBe Scorable.NoSlug
+ "If you're happy and you know it, clap your hands!") shouldBe Some("ifyourehappyandyouknowitclapyourhands")
+ titleToSlug(":;\"\'") shouldBe (None)
}
it should "filter stub titles" in {
- titleToSlug("abstract") shouldBe Scorable.NoSlug
- titleToSlug("title!") shouldBe Scorable.NoSlug
- titleToSlug("a real title which is not on blacklist") shouldBe "arealtitlewhichisnotonblacklist"
+ titleToSlug("abstract") shouldBe (None)
+ titleToSlug("title!") shouldBe (None)
+ titleToSlug("a real title which is not on blacklist") shouldBe Some("arealtitlewhichisnotonblacklist")
}
it should "strip special characters" in {
- titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_’·“”‘’“”«»「」¿–±§ʿ") shouldBe Scorable.NoSlug
- // TODO: titleToSlug("©™₨№…") shouldBe Scorable.NoSlug
- // TODO: titleToSlug("πµΣσ") shouldBe Scorable.NoSlug
+ titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_’·“”‘’“”«»「」¿–±§ʿ") shouldBe (None)
+ // TODO: titleToSlug("©™₨№…") shouldBe (None)
+ // TODO: titleToSlug("πµΣσ") shouldBe (None)
}
it should "remove whitespace" in {
- titleToSlug("foo bar : baz ::") shouldBe "foobarbaz"
- titleToSlug("\na\t:b:cdefghi") shouldBe "abcdefghi"
- titleToSlug("\n \t \r ") shouldBe Scorable.NoSlug
+ titleToSlug("foo bar : baz ::") shouldBe Some("foobarbaz")
+ titleToSlug("\na\t:b:cdefghi") shouldBe Some("abcdefghi")
+ titleToSlug("\n \t \r ") shouldBe (None)
}
it should "skip very short slugs" in {
- titleToSlug("short") shouldBe Scorable.NoSlug
- titleToSlug("a longer, more in depth title") shouldBe "alongermoreindepthtitle"
+ titleToSlug("short") shouldBe (None)
+ titleToSlug("a longer, more in depth title") shouldBe Some("alongermoreindepthtitle")
}
}
diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
index 32fb16c..c3e4ff9 100644
--- a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
@@ -222,15 +222,22 @@ class ScoreJobTest extends FlatSpec with Matchers {
}
def bundle(slug : String, grobidIndex : Int, crossrefIndex : Int) : (String, Int, String, String) = {
- val mf1 : MapFeatures = GrobidScorable.jsonToMapFeatures(
+ GrobidScorable.jsonToMapFeatures(
Sha1Strings(grobidIndex),
- JsonStrings(grobidIndex))
- val mf2 : MapFeatures = CrossrefScorable.jsonToMapFeatures(
- CrossrefStrings(crossrefIndex))
- val score = Scorable.computeSimilarity(
- ReduceFeatures(mf1.json),
- ReduceFeatures(mf2.json))
- (slug, score, mf1.json, mf2.json)
+ JsonStrings(grobidIndex)) match {
+ case None => fail()
+ case Some(mf1) => {
+ CrossrefScorable.jsonToMapFeatures(CrossrefStrings(crossrefIndex)) match {
+ case None => fail()
+ case Some(mf2) => {
+ val score = Scorable.computeSimilarity(
+ ReduceFeatures(mf1.json),
+ ReduceFeatures(mf2.json))
+ (slug, score, mf1.json, mf2.json)
+ }
+ }
+ }
+ }
}
it should "have right output values" in {