diff options
Diffstat (limited to 'scalding/src/test')
5 files changed, 123 insertions, 36 deletions
diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala index 1789d1a..f598cae 100644 --- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala @@ -64,12 +64,18 @@ class CrossrefScorableTest extends FlatSpec with Matchers { "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ], "subject" : [ "Pediatrics, Perinatology, and Child Health" ] } -""" +""".replace("<<DOI>>", "10.123/aBc") // scalastyle:on - val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "Some Title") + val CrossrefStringWithGoodTitle = CrossrefString.replace("<<TITLE>>", "Some Title") + val CrossrefStringWithMaximumTitle = CrossrefString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength) + val CrossrefStringWithExcessiveTitle = CrossrefString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength + "0") + val CrossrefStringWithNullTitle = CrossrefString.replace("\"<<TITLE>>\"", "null") val CrossrefStringWithEmptyTitle = CrossrefString.replace("<<TITLE>>", "") val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle") val MalformedCrossrefString = CrossrefString.replace("}", "") + val CrossrefStringWithNoAuthors = CrossrefString.replace("<<TITLE>>", "Some Valid Title").replace("author", "no-author") + val CrossrefStringWrongType = CrossrefString.replace("<<TITLE>>", "Some Valid Title").replace("journal-article", "other") + val CrossrefStringNoType = CrossrefString.replace("<<TITLE>>", "Some Valid Title").replace("type", "not-type") // Unit tests "CrossrefScorable.jsonToMapFeatures()" should "handle invalid JSON" in { @@ -82,19 +88,64 @@ class CrossrefScorableTest extends FlatSpec with Matchers { result.slug shouldBe Scorable.NoSlug } + it should "handle null title" in { + val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithNullTitle) + result.slug shouldBe Scorable.NoSlug + } + it should "handle empty title" in { val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithEmptyTitle) result.slug shouldBe Scorable.NoSlug } + it should "handle missing authors" in { + val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithNoAuthors) + result.slug shouldBe Scorable.NoSlug + } + it should "handle valid input" in { - val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithTitle) + val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithGoodTitle) result.slug shouldBe "sometitle" Scorable.jsonToMap(result.json) match { case None => fail() case Some(map) => { map("title").asInstanceOf[String] shouldBe "Some Title" + map("doi").asInstanceOf[String] shouldBe "10.123/abc" + // TODO: full name? not just a string? + map("authors").asInstanceOf[List[String]] shouldBe List("Gaier") + map("year").asInstanceOf[Double].toInt shouldBe 2002 } } } + + "CrossrefScorable.keepRecord()" should "return true for valid JSON with title" in { + CrossrefScorable.keepRecord(CrossrefStringWithGoodTitle) shouldBe true + } + + it should "return true for valid JSON with a title of maximum permitted length" in { + CrossrefScorable.keepRecord(CrossrefStringWithMaximumTitle) shouldBe true + } + + it should "return false for valid JSON with excessively long title" in { + CrossrefScorable.keepRecord(CrossrefStringWithExcessiveTitle) shouldBe false + } + + it should "return false for valid JSON with null title" in { + CrossrefScorable.keepRecord(CrossrefStringWithNullTitle) shouldBe false + } + + it should "return false for valid JSON with no title" in { + CrossrefScorable.keepRecord(CrossrefStringWithoutTitle) shouldBe false + } + + it should "return false for invalid JSON" in { + CrossrefScorable.keepRecord(CrossrefStringWithoutTitle) shouldBe false + } + + it should "handle content types" in { + val resultWrong = CrossrefScorable.jsonToMapFeatures(CrossrefStringWrongType) + resultWrong.slug shouldBe Scorable.NoSlug + val resultMissing = CrossrefScorable.jsonToMapFeatures(CrossrefStringNoType) + resultMissing.slug shouldBe Scorable.NoSlug + } } diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableDumpJobTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableDumpJobTest.scala index 12e13dc..bf9343b 100644 --- a/scalding/src/test/scala/sandcrawler/GrobidScorableDumpJobTest.scala +++ b/scalding/src/test/scala/sandcrawler/GrobidScorableDumpJobTest.scala @@ -78,14 +78,14 @@ class GrobidScorableDumpJobTest extends FlatSpec with Matchers { "sha1:024937534094897039547e9824382943") // bad status val JsonStrings : List[String] = List( - JsonString.replace("<<TITLE>>", "Title 1"), + JsonString.replace("<<TITLE>>", "Title 1: The Classic"), JsonString.replace("<<TITLE>>", "Title 2: TNG"), JsonString.replace("<<TITLE>>", "Title 3: The Sequel"), // This will have bad status. - JsonString.replace("<<TITLE>>", "Title 1"), + JsonString.replace("<<TITLE>>", "Title 1: The Classic"), MalformedJsonString, // This will have bad status. - JsonString.replace("<<TITLE>>", "Title 2") + JsonString.replace("<<TITLE>>", "Title 2: Not TNG") ) // bnewbold: status codes aren't strings, they are uint64 diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala index 661824b..119cf90 100644 --- a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala @@ -57,7 +57,10 @@ class GrobidScorableTest extends FlatSpec with Matchers { "annex": null } """ - val GrobidStringWithTitle = GrobidString.replace("<<TITLE>>", "Dummy Example File") + val GrobidStringWithGoodTitle = GrobidString.replace("<<TITLE>>", "Dummy Example File") + val GrobidStringWithMaximumTitle = GrobidString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength) + val GrobidStringWithExcessiveTitle = GrobidString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength + "0") + val GrobidStringWithNullTitle = GrobidString.replace("\"<<TITLE>>\"", "null") val GrobidStringWithoutTitle = GrobidString.replace("title", "nottitle") val MalformedGrobidString = GrobidString.replace("}", "") val Key = "Dummy Key" @@ -69,20 +72,50 @@ class GrobidScorableTest extends FlatSpec with Matchers { result.slug shouldBe Scorable.NoSlug } + it should "handle null title" in { + val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithNullTitle) + result.slug shouldBe Scorable.NoSlug + } + it should "handle missing title" in { val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithoutTitle) result.slug shouldBe Scorable.NoSlug } it should "handle valid input" in { - val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithTitle) + val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithGoodTitle) result.slug shouldBe "dummyexamplefile" Scorable.jsonToMap(result.json) match { case None => fail() case Some(map) => { map should contain key "title" map("title").asInstanceOf[String] shouldBe "Dummy Example File" + map("authors").asInstanceOf[List[String]] shouldBe List("Brewster Kahle", "J Doe") } } } + + "GrobidScorable.keepRecord()" should "return true for valid JSON with title" in { + GrobidScorable.keepRecord(GrobidStringWithGoodTitle) shouldBe true + } + + it should "return true for valid JSON with a title of maximum permitted length" in { + GrobidScorable.keepRecord(GrobidStringWithMaximumTitle) shouldBe true + } + + it should "return false for valid JSON with excessively long title" in { + GrobidScorable.keepRecord(GrobidStringWithExcessiveTitle) shouldBe false + } + + it should "return false for valid JSON with null title" in { + GrobidScorable.keepRecord(GrobidStringWithNullTitle) shouldBe false + } + + it should "return false for valid JSON with no title" in { + GrobidScorable.keepRecord(GrobidStringWithoutTitle) shouldBe false + } + + it should "return false for invalid JSON" in { + GrobidScorable.keepRecord(GrobidStringWithoutTitle) shouldBe false + } } diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala index 5a22ef8..474f69a 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala @@ -9,22 +9,6 @@ import org.scalatest._ // scalastyle:off null class ScorableFeaturesTest extends FlatSpec with Matchers { - // TODO: Remove this when we're convinced that our file-reading code - // works. (I'm already convinced. --Ellen) - "read slugs" should "work" in { - val SlugBlacklist = Set( "abbreviations", "abstract", "acknowledgements", - "article", "authorreply", "authorsreply", "bookreview", "bookreviews", - "casereport", "commentary", "commentaryon", "commenton", "commentto", - "contents", "correspondence", "dedication", "editorialadvisoryboard", - "focus", "hypothesis", "inbrief", "introduction", "introductiontotheissue", - "lettertotheeditor", "listofabbreviations", "note", "overview", "preface", - "references", "results", "review", "reviewarticle", "summary", "title", - "name") - - ScorableFeatures.SlugBlacklist.size shouldBe SlugBlacklist.size - for (s <- ScorableFeatures.SlugBlacklist) SlugBlacklist should contain (s) - } - private def titleToSlug(s : String) : String = { ScorableFeatures.create(title = s).toSlug } @@ -52,7 +36,7 @@ class ScorableFeaturesTest extends FlatSpec with Matchers { it should "strip punctuation" in { titleToSlug("HELLO!:the:re") shouldBe "hellothere" - titleToSlug("a:b:c") shouldBe "abc" + titleToSlug("a:b:cdefgh") shouldBe "abcdefgh" titleToSlug( "If you're happy and you know it, clap your hands!") shouldBe "ifyourehappyandyouknowitclapyourhands" titleToSlug(":;\"\'") shouldBe Scorable.NoSlug @@ -65,14 +49,19 @@ class ScorableFeaturesTest extends FlatSpec with Matchers { } it should "strip special characters" in { - titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_") shouldBe Scorable.NoSlug + titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_’·“”‘’“”«»「」") shouldBe Scorable.NoSlug // TODO: titleToSlug("©™₨№…") shouldBe Scorable.NoSlug // TODO: titleToSlug("πµΣσ") shouldBe Scorable.NoSlug } it should "remove whitespace" in { titleToSlug("foo bar : baz ::") shouldBe "foobarbaz" - titleToSlug("\na\t:b:c") shouldBe "abc" + titleToSlug("\na\t:b:cdefghi") shouldBe "abcdefghi" titleToSlug("\n \t \r ") shouldBe Scorable.NoSlug } + + it should "skip very short slugs" in { + titleToSlug("short") shouldBe Scorable.NoSlug + titleToSlug("a longer, more in depth title") shouldBe "alongermoreindepthtitle" + } } diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala index 35c31e5..32fb16c 100644 --- a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala @@ -117,6 +117,8 @@ class ScoreJobTest extends FlatSpec with Matchers { } """ // scalastyle:on + val TooLongOfTitle = "X" * Scorable.MaxTitleLength + "Y" // arbitrary long string + val TooShortOfTitle = "X" * (ScorableFeatures.MinSlugLength - 1) val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "SomeTitle") val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle") val MalformedCrossrefString = CrossrefString.replace("}", "") @@ -124,7 +126,9 @@ class ScoreJobTest extends FlatSpec with Matchers { CrossrefString.replace("<<TITLE>>", "Title 2: TNG").replace("<<DOI>>", "DOI-0"), CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2A").replace("<<DOI>>", "DOI-0.5"), CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"), - CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1")) + CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1"), + CrossrefString.replace("<<TITLE>>", TooLongOfTitle).replace("<<DOI>>", "DOI-1"), + CrossrefString.replace("<<TITLE>>", TooShortOfTitle).replace("<<DOI>>", "DOI-1")) // Pipeline tests val output = "/tmp/testOutput" @@ -137,23 +141,28 @@ class ScoreJobTest extends FlatSpec with Matchers { "sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT", "sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56", "sha1:93187A85273589347598473894839443", - "sha1:024937534094897039547e9824382943") + "sha1:024937534094897039547e9824382943", + "sha1:93229759932857982837892347893892", + "sha1:83229759932857982837892347893892") val JsonStrings : List[String] = List( - JsonString.replace("<<TITLE>>", "Title 1"), + JsonString.replace("<<TITLE>>", "Title 1: The Original"), JsonString.replace("<<TITLE>>", "Title 2: TNG"), JsonString.replace("<<TITLE>>", "Title 3: The Sequel"), // This will have bad status. - JsonString.replace("<<TITLE>>", "Title 1"), + JsonString.replace("<<TITLE>>", "Title 1: The Original"), MalformedJsonString, // This will have bad status. - JsonString.replace("<<TITLE>>", "Title 2") + JsonString.replace("<<TITLE>>", "Title 2: Not TNG"), + // These are in both sources but have bad titles + JsonString.replace("<<TITLE>>", TooLongOfTitle), + JsonString.replace("<<TITLE>>", TooShortOfTitle) ) // bnewbold: status codes aren't strings, they are uint64 val Ok : Long = 200 val Bad : Long = 400 - val StatusCodes = List(Ok, Ok, Ok, Bad, Ok, Bad) + val StatusCodes = List(Ok, Ok, Ok, Bad, Ok, Bad, Ok, Ok) val SampleDataHead : List[Tuple] = (Sha1Strings, JsonStrings, StatusCodes) .zipped @@ -181,19 +190,24 @@ class ScoreJobTest extends FlatSpec with Matchers { 0 -> CrossrefStrings(0), 1 -> CrossrefStrings(1), 2 -> CrossrefStrings(2), - 3 -> CrossrefStrings(3))) - .sink[(String, ReduceFeatures)](TypedTsv[(String, ReduceFeatures)](output + ".trapped")) { - _ => () } + 3 -> CrossrefStrings(3), + 4 -> CrossrefStrings(4), + 4 -> CrossrefStrings(5))) + .sink[(String, ReduceFeatures)](TypedTsv[(String, ReduceFeatures)](output + ".trapped")) { _ => () } .sink[(String, Int, String, String)](TypedTsv[(String, Int, String, String)](output)) { // Grobid titles and slugs (in parentheses): // Title 1 (title1) // Title 2: TNG (title2tng) // Title 3: The Sequel (title3thesequel) + // <too long of a title> + // <too short of a title> // crossref titles and slugs (in parentheses): // Title 2: TNG (title2tng) // Title 1: TNG 2A (title1tng2a) // Title 1: TNG 3 (title1tng3) // Title 2: Rebooted (title2rebooted) + // <too long of a title> + // <too short of a title> // XXX: Join should have 3 "title1" slugs and 1 "title2tng" slug outputBuffer => "The pipeline" should "return a 1-element list" in { |