diff options
4 files changed, 85 insertions, 77 deletions
diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala index c0d1cb0..8302b8f 100644 --- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala @@ -79,59 +79,64 @@ class CrossrefScorableTest extends FlatSpec with Matchers { // Unit tests "CrossrefScorable.jsonToMapFeatures()" should "handle invalid JSON" in { - val result = CrossrefScorable.jsonToMapFeatures(MalformedCrossrefString) - result.slug shouldBe Scorable.NoSlug + CrossrefScorable.jsonToMapFeatures(MalformedCrossrefString) should be (None) } it should "handle missing title" in { - val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithoutTitle) - result.slug shouldBe Scorable.NoSlug + CrossrefScorable.jsonToMapFeatures(CrossrefStringWithoutTitle) should be (None) } it should "handle null title" in { - val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithNullTitle) - result.slug shouldBe Scorable.NoSlug + CrossrefScorable.jsonToMapFeatures(CrossrefStringWithNullTitle) should be (None) } it should "handle empty title" in { - val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithEmptyTitle) - result.slug shouldBe Scorable.NoSlug + CrossrefScorable.jsonToMapFeatures(CrossrefStringWithEmptyTitle) should be (None) } it should "handle subtitle" in { - val result = CrossrefScorable.jsonToMapFeatures( - """{"title": ["short but not too short"], "subtitle": ["just right!"], "DOI": "10.123/asdf", "type":"journal-article", "author":[{ "given" : "W", "family" : "Gaier"}]}""") - result.slug shouldBe "shortbutnottooshortjustright" + CrossrefScorable.jsonToMapFeatures( + """{"title": ["short but not too short"], "subtitle": ["just right!"], "DOI": "10.123/asdf", "type":"journal-article","author":[{ "given" : "W", "family" : "Gaier"}]}""") match { + case None => fail() + case Some(result) => result.slug shouldBe "shortbutnottooshortjustright" + } } it should "handle empty subtitle" in { - val result = CrossrefScorable.jsonToMapFeatures( - """{"title": ["short but not too short"], "subtitle": [""], "DOI": "10.123/asdf", "type":"journal-article", "author":[{ "given" : "W", "family" : "Gaier"}]}""") - result.slug shouldBe "shortbutnottooshort" + CrossrefScorable.jsonToMapFeatures( + """{"title": ["short but not too short"], "subtitle": [""], "DOI": "10.123/asdf", "type":"journal-article", "author":[{ "given" : "W", "family" : "Gaier"}]}""") match { + case None => fail() + case Some(result) => result.slug shouldBe "shortbutnottooshort" + } } it should "handle null subtitle" in { - val result = CrossrefScorable.jsonToMapFeatures( - """{"title": ["short but not too short"], "subtitle": [null], "DOI": "10.123/asdf", "type":"journal-article", "author":[{ "given" : "W", "family" : "Gaier"}]}""") - result.slug shouldBe "shortbutnottooshort" + CrossrefScorable.jsonToMapFeatures( + """{"title": ["short but not too short"], "subtitle": [null], "DOI": "10.123/asdf", "type":"journal-article", "author":[{ "given" : "W", "family" : "Gaier"}]}""") match { + case None => fail() + case Some(result) => result.slug shouldBe "shortbutnottooshort" + } } it should "handle missing authors" in { - val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithNoAuthors) - result.slug shouldBe Scorable.NoSlug + CrossrefScorable.jsonToMapFeatures(CrossrefStringWithNoAuthors) should be (None) } it should "handle valid input" in { - val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithGoodTitle) - result.slug shouldBe "sometitle" - Scorable.jsonToMap(result.json) match { + CrossrefScorable.jsonToMapFeatures(CrossrefStringWithGoodTitle) match { case None => fail() - case Some(map) => { - map("title").asInstanceOf[String] shouldBe "Some Title" - map("doi").asInstanceOf[String] shouldBe "10.123/abc" - // TODO: full name? not just a string? - map("authors").asInstanceOf[List[String]] shouldBe List("Gaier") - map("year").asInstanceOf[Double].toInt shouldBe 2002 + case Some(result) => { + result.slug shouldBe "sometitle" + Scorable.jsonToMap(result.json) match { + case None => fail() + case Some(map) => { + map("title").asInstanceOf[String] shouldBe "Some Title" + map("doi").asInstanceOf[String] shouldBe "10.123/abc" + // TODO: full name? not just a string? + map("authors").asInstanceOf[List[String]] shouldBe List("Gaier") + map("year").asInstanceOf[Double].toInt shouldBe 2002 + } + } } } } @@ -161,9 +166,7 @@ class CrossrefScorableTest extends FlatSpec with Matchers { } it should "handle content types" in { - val resultWrong = CrossrefScorable.jsonToMapFeatures(CrossrefStringWrongType) - resultWrong.slug shouldBe Scorable.NoSlug - val resultMissing = CrossrefScorable.jsonToMapFeatures(CrossrefStringNoType) - resultMissing.slug shouldBe Scorable.NoSlug + CrossrefScorable.jsonToMapFeatures(CrossrefStringWrongType) should be (None) + CrossrefScorable.jsonToMapFeatures(CrossrefStringNoType) should be (None) } } diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala index 119cf90..b395a64 100644 --- a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala @@ -68,29 +68,30 @@ class GrobidScorableTest extends FlatSpec with Matchers { // Unit tests "GrobidScorable.jsonToMapFeatures()" should "handle invalid JSON" in { - val result = GrobidScorable.jsonToMapFeatures(Key, MalformedGrobidString) - result.slug shouldBe Scorable.NoSlug + GrobidScorable.jsonToMapFeatures(Key, MalformedGrobidString) should be (None) } it should "handle null title" in { - val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithNullTitle) - result.slug shouldBe Scorable.NoSlug + GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithNullTitle) should be (None) } it should "handle missing title" in { - val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithoutTitle) - result.slug shouldBe Scorable.NoSlug + GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithoutTitle) should be (None) } it should "handle valid input" in { - val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithGoodTitle) - result.slug shouldBe "dummyexamplefile" - Scorable.jsonToMap(result.json) match { + GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithGoodTitle) match { case None => fail() - case Some(map) => { - map should contain key "title" - map("title").asInstanceOf[String] shouldBe "Dummy Example File" - map("authors").asInstanceOf[List[String]] shouldBe List("Brewster Kahle", "J Doe") + case Some(result) => { + result.slug shouldBe "dummyexamplefile" + Scorable.jsonToMap(result.json) match { + case None => fail() + case Some(map) => { + map should contain key "title" + map("title").asInstanceOf[String] shouldBe "Dummy Example File" + map("authors").asInstanceOf[List[String]] shouldBe List("Brewster Kahle", "J Doe") + } + } } } } diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala index 450c169..3f6b87c 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala @@ -8,60 +8,57 @@ import org.scalatest._ // scalastyle:off null class ScorableFeaturesTest extends FlatSpec with Matchers { - - private def titleToSlug(s : String) : String = { - ScorableFeatures.create(title = s).toSlug - } - "toMapFeatures()" should "work with gnarly inputs" in { ScorableFeatures.create(title = null).toMapFeatures ScorableFeatures.create(title = "something", doi = null, sha1 = null, year = 123).toMapFeatures } + private def titleToSlug(s : String) : Option[String] = ScorableFeatures.create(title = s).toSlug + "mapToSlug()" should "extract the parts of titles before a colon" in { - titleToSlug("HELLO:there") shouldBe "hellothere" + titleToSlug("HELLO:there") shouldBe (Some("hellothere")) } it should "extract an entire colon-less string" in { - titleToSlug("hello THERE") shouldBe "hellothere" + titleToSlug("hello THERE") shouldBe (Some("hellothere")) } it should "return Scorable.NoSlug if given empty string" in { - titleToSlug("") shouldBe Scorable.NoSlug + titleToSlug("") shouldBe (None) } it should "return Scorable.NoSlug if given null" in { - titleToSlug(null) shouldBe Scorable.NoSlug + titleToSlug(null) shouldBe (None) } it should "strip punctuation" in { - titleToSlug("HELLO!:the:re") shouldBe "hellothere" - titleToSlug("a:b:cdefgh") shouldBe "abcdefgh" + titleToSlug("HELLO!:the:re") shouldBe Some("hellothere") + titleToSlug("a:b:cdefgh") shouldBe Some("abcdefgh") titleToSlug( - "If you're happy and you know it, clap your hands!") shouldBe "ifyourehappyandyouknowitclapyourhands" - titleToSlug(":;\"\'") shouldBe Scorable.NoSlug + "If you're happy and you know it, clap your hands!") shouldBe Some("ifyourehappyandyouknowitclapyourhands") + titleToSlug(":;\"\'") shouldBe (None) } it should "filter stub titles" in { - titleToSlug("abstract") shouldBe Scorable.NoSlug - titleToSlug("title!") shouldBe Scorable.NoSlug - titleToSlug("a real title which is not on blacklist") shouldBe "arealtitlewhichisnotonblacklist" + titleToSlug("abstract") shouldBe (None) + titleToSlug("title!") shouldBe (None) + titleToSlug("a real title which is not on blacklist") shouldBe Some("arealtitlewhichisnotonblacklist") } it should "strip special characters" in { - titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_’·“”‘’“”«»「」¿–±§ʿ") shouldBe Scorable.NoSlug - // TODO: titleToSlug("©™₨№…") shouldBe Scorable.NoSlug - // TODO: titleToSlug("πµΣσ") shouldBe Scorable.NoSlug + titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_’·“”‘’“”«»「」¿–±§ʿ") shouldBe (None) + // TODO: titleToSlug("©™₨№…") shouldBe (None) + // TODO: titleToSlug("πµΣσ") shouldBe (None) } it should "remove whitespace" in { - titleToSlug("foo bar : baz ::") shouldBe "foobarbaz" - titleToSlug("\na\t:b:cdefghi") shouldBe "abcdefghi" - titleToSlug("\n \t \r ") shouldBe Scorable.NoSlug + titleToSlug("foo bar : baz ::") shouldBe Some("foobarbaz") + titleToSlug("\na\t:b:cdefghi") shouldBe Some("abcdefghi") + titleToSlug("\n \t \r ") shouldBe (None) } it should "skip very short slugs" in { - titleToSlug("short") shouldBe Scorable.NoSlug - titleToSlug("a longer, more in depth title") shouldBe "alongermoreindepthtitle" + titleToSlug("short") shouldBe (None) + titleToSlug("a longer, more in depth title") shouldBe Some("alongermoreindepthtitle") } } diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala index 32fb16c..c3e4ff9 100644 --- a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala @@ -222,15 +222,22 @@ class ScoreJobTest extends FlatSpec with Matchers { } def bundle(slug : String, grobidIndex : Int, crossrefIndex : Int) : (String, Int, String, String) = { - val mf1 : MapFeatures = GrobidScorable.jsonToMapFeatures( + GrobidScorable.jsonToMapFeatures( Sha1Strings(grobidIndex), - JsonStrings(grobidIndex)) - val mf2 : MapFeatures = CrossrefScorable.jsonToMapFeatures( - CrossrefStrings(crossrefIndex)) - val score = Scorable.computeSimilarity( - ReduceFeatures(mf1.json), - ReduceFeatures(mf2.json)) - (slug, score, mf1.json, mf2.json) + JsonStrings(grobidIndex)) match { + case None => fail() + case Some(mf1) => { + CrossrefScorable.jsonToMapFeatures(CrossrefStrings(crossrefIndex)) match { + case None => fail() + case Some(mf2) => { + val score = Scorable.computeSimilarity( + ReduceFeatures(mf1.json), + ReduceFeatures(mf2.json)) + (slug, score, mf1.json, mf2.json) + } + } + } + } } it should "have right output values" in { |