diff options
author | Ellen Spertus <ellen.spertus@gmail.com> | 2018-08-22 15:08:48 -0700 |
---|---|---|
committer | Ellen Spertus <ellen.spertus@gmail.com> | 2018-08-22 15:08:48 -0700 |
commit | 7a806a7841c911871aeb13fcb1eac41a108d1f6d (patch) | |
tree | e415d4fb88c4ed983ee810c70af7325fd1c7eb7f /scalding/src/test/scala | |
parent | 31a71c166c8452ce16d443697c33545577fa35f3 (diff) | |
download | sandcrawler-7a806a7841c911871aeb13fcb1eac41a108d1f6d.tar.gz sandcrawler-7a806a7841c911871aeb13fcb1eac41a108d1f6d.zip |
Added ScoreJob test for title-length filtering.
Diffstat (limited to 'scalding/src/test/scala')
-rw-r--r-- | scalding/src/test/scala/sandcrawler/ScoreJobTest.scala | 18 |
1 files changed, 13 insertions, 5 deletions
diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala index 35c31e5..0f3c09e 100644 --- a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala @@ -117,6 +117,7 @@ class ScoreJobTest extends FlatSpec with Matchers { } """ // scalastyle:on + val TooLongOfTitle = "X" * Scorable.MaxTitleLength + "Y" // arbitrary long string val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "SomeTitle") val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle") val MalformedCrossrefString = CrossrefString.replace("}", "") @@ -124,7 +125,8 @@ class ScoreJobTest extends FlatSpec with Matchers { CrossrefString.replace("<<TITLE>>", "Title 2: TNG").replace("<<DOI>>", "DOI-0"), CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2A").replace("<<DOI>>", "DOI-0.5"), CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"), - CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1")) + CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1"), + CrossrefString.replace("<<TITLE>>", TooLongOfTitle).replace("<<DOI>>", "DOI-1")) // Pipeline tests val output = "/tmp/testOutput" @@ -137,7 +139,8 @@ class ScoreJobTest extends FlatSpec with Matchers { "sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT", "sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56", "sha1:93187A85273589347598473894839443", - "sha1:024937534094897039547e9824382943") + "sha1:024937534094897039547e9824382943", + "sha1:93229759932857982837892347893892") val JsonStrings : List[String] = List( JsonString.replace("<<TITLE>>", "Title 1"), @@ -147,13 +150,15 @@ class ScoreJobTest extends FlatSpec with Matchers { JsonString.replace("<<TITLE>>", "Title 1"), MalformedJsonString, // This will have bad status. - JsonString.replace("<<TITLE>>", "Title 2") + JsonString.replace("<<TITLE>>", "Title 2"), + // This is in both sources but too long. + JsonString.replace("<<TITLE>>", TooLongOfTitle) ) // bnewbold: status codes aren't strings, they are uint64 val Ok : Long = 200 val Bad : Long = 400 - val StatusCodes = List(Ok, Ok, Ok, Bad, Ok, Bad) + val StatusCodes = List(Ok, Ok, Ok, Bad, Ok, Bad, Ok) val SampleDataHead : List[Tuple] = (Sha1Strings, JsonStrings, StatusCodes) .zipped @@ -181,7 +186,8 @@ class ScoreJobTest extends FlatSpec with Matchers { 0 -> CrossrefStrings(0), 1 -> CrossrefStrings(1), 2 -> CrossrefStrings(2), - 3 -> CrossrefStrings(3))) + 3 -> CrossrefStrings(3), + 4 -> CrossrefStrings(4))) .sink[(String, ReduceFeatures)](TypedTsv[(String, ReduceFeatures)](output + ".trapped")) { _ => () } .sink[(String, Int, String, String)](TypedTsv[(String, Int, String, String)](output)) { @@ -189,11 +195,13 @@ class ScoreJobTest extends FlatSpec with Matchers { // Title 1 (title1) // Title 2: TNG (title2tng) // Title 3: The Sequel (title3thesequel) + // <too long of a title> // crossref titles and slugs (in parentheses): // Title 2: TNG (title2tng) // Title 1: TNG 2A (title1tng2a) // Title 1: TNG 3 (title1tng3) // Title 2: Rebooted (title2rebooted) + // <too long of a title> // XXX: Join should have 3 "title1" slugs and 1 "title2tng" slug outputBuffer => "The pipeline" should "return a 1-element list" in { |