aboutsummaryrefslogtreecommitdiffstats
path: root/scalding/src/test
diff options
context:
space:
mode:
Diffstat (limited to 'scalding/src/test')
-rw-r--r--scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala57
-rw-r--r--scalding/src/test/scala/sandcrawler/GrobidScorableDumpJobTest.scala6
-rw-r--r--scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala37
-rw-r--r--scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala27
-rw-r--r--scalding/src/test/scala/sandcrawler/ScoreJobTest.scala32
5 files changed, 123 insertions, 36 deletions
diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
index 1789d1a..f598cae 100644
--- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
@@ -64,12 +64,18 @@ class CrossrefScorableTest extends FlatSpec with Matchers {
"issn-type" : [ { "value" : "0987-7983", "type" : "print" } ],
"subject" : [ "Pediatrics, Perinatology, and Child Health" ]
}
-"""
+""".replace("<<DOI>>", "10.123/aBc")
// scalastyle:on
- val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "Some Title")
+ val CrossrefStringWithGoodTitle = CrossrefString.replace("<<TITLE>>", "Some Title")
+ val CrossrefStringWithMaximumTitle = CrossrefString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength)
+ val CrossrefStringWithExcessiveTitle = CrossrefString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength + "0")
+ val CrossrefStringWithNullTitle = CrossrefString.replace("\"<<TITLE>>\"", "null")
val CrossrefStringWithEmptyTitle = CrossrefString.replace("<<TITLE>>", "")
val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle")
val MalformedCrossrefString = CrossrefString.replace("}", "")
+ val CrossrefStringWithNoAuthors = CrossrefString.replace("<<TITLE>>", "Some Valid Title").replace("author", "no-author")
+ val CrossrefStringWrongType = CrossrefString.replace("<<TITLE>>", "Some Valid Title").replace("journal-article", "other")
+ val CrossrefStringNoType = CrossrefString.replace("<<TITLE>>", "Some Valid Title").replace("type", "not-type")
// Unit tests
"CrossrefScorable.jsonToMapFeatures()" should "handle invalid JSON" in {
@@ -82,19 +88,64 @@ class CrossrefScorableTest extends FlatSpec with Matchers {
result.slug shouldBe Scorable.NoSlug
}
+ it should "handle null title" in {
+ val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithNullTitle)
+ result.slug shouldBe Scorable.NoSlug
+ }
+
it should "handle empty title" in {
val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithEmptyTitle)
result.slug shouldBe Scorable.NoSlug
}
+ it should "handle missing authors" in {
+ val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithNoAuthors)
+ result.slug shouldBe Scorable.NoSlug
+ }
+
it should "handle valid input" in {
- val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithTitle)
+ val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithGoodTitle)
result.slug shouldBe "sometitle"
Scorable.jsonToMap(result.json) match {
case None => fail()
case Some(map) => {
map("title").asInstanceOf[String] shouldBe "Some Title"
+ map("doi").asInstanceOf[String] shouldBe "10.123/abc"
+ // TODO: full name? not just a string?
+ map("authors").asInstanceOf[List[String]] shouldBe List("Gaier")
+ map("year").asInstanceOf[Double].toInt shouldBe 2002
}
}
}
+
+ "CrossrefScorable.keepRecord()" should "return true for valid JSON with title" in {
+ CrossrefScorable.keepRecord(CrossrefStringWithGoodTitle) shouldBe true
+ }
+
+ it should "return true for valid JSON with a title of maximum permitted length" in {
+ CrossrefScorable.keepRecord(CrossrefStringWithMaximumTitle) shouldBe true
+ }
+
+ it should "return false for valid JSON with excessively long title" in {
+ CrossrefScorable.keepRecord(CrossrefStringWithExcessiveTitle) shouldBe false
+ }
+
+ it should "return false for valid JSON with null title" in {
+ CrossrefScorable.keepRecord(CrossrefStringWithNullTitle) shouldBe false
+ }
+
+ it should "return false for valid JSON with no title" in {
+ CrossrefScorable.keepRecord(CrossrefStringWithoutTitle) shouldBe false
+ }
+
+ it should "return false for invalid JSON" in {
+ CrossrefScorable.keepRecord(MalformedCrossrefString) shouldBe false // malformed JSON; the title-less (but valid) case is covered by the preceding test
+ }
+
+ it should "handle content types" in {
+ val resultWrong = CrossrefScorable.jsonToMapFeatures(CrossrefStringWrongType)
+ resultWrong.slug shouldBe Scorable.NoSlug
+ val resultMissing = CrossrefScorable.jsonToMapFeatures(CrossrefStringNoType)
+ resultMissing.slug shouldBe Scorable.NoSlug
+ }
}
diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableDumpJobTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableDumpJobTest.scala
index 12e13dc..bf9343b 100644
--- a/scalding/src/test/scala/sandcrawler/GrobidScorableDumpJobTest.scala
+++ b/scalding/src/test/scala/sandcrawler/GrobidScorableDumpJobTest.scala
@@ -78,14 +78,14 @@ class GrobidScorableDumpJobTest extends FlatSpec with Matchers {
"sha1:024937534094897039547e9824382943") // bad status
val JsonStrings : List[String] = List(
- JsonString.replace("<<TITLE>>", "Title 1"),
+ JsonString.replace("<<TITLE>>", "Title 1: The Classic"),
JsonString.replace("<<TITLE>>", "Title 2: TNG"),
JsonString.replace("<<TITLE>>", "Title 3: The Sequel"),
// This will have bad status.
- JsonString.replace("<<TITLE>>", "Title 1"),
+ JsonString.replace("<<TITLE>>", "Title 1: The Classic"),
MalformedJsonString,
// This will have bad status.
- JsonString.replace("<<TITLE>>", "Title 2")
+ JsonString.replace("<<TITLE>>", "Title 2: Not TNG")
)
// bnewbold: status codes aren't strings, they are uint64
diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
index 661824b..119cf90 100644
--- a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
@@ -57,7 +57,10 @@ class GrobidScorableTest extends FlatSpec with Matchers {
"annex": null
}
"""
- val GrobidStringWithTitle = GrobidString.replace("<<TITLE>>", "Dummy Example File")
+ val GrobidStringWithGoodTitle = GrobidString.replace("<<TITLE>>", "Dummy Example File")
+ val GrobidStringWithMaximumTitle = GrobidString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength)
+ val GrobidStringWithExcessiveTitle = GrobidString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength + "0")
+ val GrobidStringWithNullTitle = GrobidString.replace("\"<<TITLE>>\"", "null")
val GrobidStringWithoutTitle = GrobidString.replace("title", "nottitle")
val MalformedGrobidString = GrobidString.replace("}", "")
val Key = "Dummy Key"
@@ -69,20 +72,50 @@ class GrobidScorableTest extends FlatSpec with Matchers {
result.slug shouldBe Scorable.NoSlug
}
+ it should "handle null title" in {
+ val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithNullTitle)
+ result.slug shouldBe Scorable.NoSlug
+ }
+
it should "handle missing title" in {
val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithoutTitle)
result.slug shouldBe Scorable.NoSlug
}
it should "handle valid input" in {
- val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithTitle)
+ val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithGoodTitle)
result.slug shouldBe "dummyexamplefile"
Scorable.jsonToMap(result.json) match {
case None => fail()
case Some(map) => {
map should contain key "title"
map("title").asInstanceOf[String] shouldBe "Dummy Example File"
+ map("authors").asInstanceOf[List[String]] shouldBe List("Brewster Kahle", "J Doe")
}
}
}
+
+ "GrobidScorable.keepRecord()" should "return true for valid JSON with title" in {
+ GrobidScorable.keepRecord(GrobidStringWithGoodTitle) shouldBe true
+ }
+
+ it should "return true for valid JSON with a title of maximum permitted length" in {
+ GrobidScorable.keepRecord(GrobidStringWithMaximumTitle) shouldBe true
+ }
+
+ it should "return false for valid JSON with excessively long title" in {
+ GrobidScorable.keepRecord(GrobidStringWithExcessiveTitle) shouldBe false
+ }
+
+ it should "return false for valid JSON with null title" in {
+ GrobidScorable.keepRecord(GrobidStringWithNullTitle) shouldBe false
+ }
+
+ it should "return false for valid JSON with no title" in {
+ GrobidScorable.keepRecord(GrobidStringWithoutTitle) shouldBe false
+ }
+
+ it should "return false for invalid JSON" in {
+ GrobidScorable.keepRecord(MalformedGrobidString) shouldBe false // malformed JSON; the title-less (but valid) case is covered by the preceding test
+ }
}
diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
index 5a22ef8..474f69a 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
@@ -9,22 +9,6 @@ import org.scalatest._
// scalastyle:off null
class ScorableFeaturesTest extends FlatSpec with Matchers {
- // TODO: Remove this when we're convinced that our file-reading code
- // works. (I'm already convinced. --Ellen)
- "read slugs" should "work" in {
- val SlugBlacklist = Set( "abbreviations", "abstract", "acknowledgements",
- "article", "authorreply", "authorsreply", "bookreview", "bookreviews",
- "casereport", "commentary", "commentaryon", "commenton", "commentto",
- "contents", "correspondence", "dedication", "editorialadvisoryboard",
- "focus", "hypothesis", "inbrief", "introduction", "introductiontotheissue",
- "lettertotheeditor", "listofabbreviations", "note", "overview", "preface",
- "references", "results", "review", "reviewarticle", "summary", "title",
- "name")
-
- ScorableFeatures.SlugBlacklist.size shouldBe SlugBlacklist.size
- for (s <- ScorableFeatures.SlugBlacklist) SlugBlacklist should contain (s)
- }
-
private def titleToSlug(s : String) : String = {
ScorableFeatures.create(title = s).toSlug
}
@@ -52,7 +36,7 @@ class ScorableFeaturesTest extends FlatSpec with Matchers {
it should "strip punctuation" in {
titleToSlug("HELLO!:the:re") shouldBe "hellothere"
- titleToSlug("a:b:c") shouldBe "abc"
+ titleToSlug("a:b:cdefgh") shouldBe "abcdefgh"
titleToSlug(
"If you're happy and you know it, clap your hands!") shouldBe "ifyourehappyandyouknowitclapyourhands"
titleToSlug(":;\"\'") shouldBe Scorable.NoSlug
@@ -65,14 +49,19 @@ class ScorableFeaturesTest extends FlatSpec with Matchers {
}
it should "strip special characters" in {
- titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_") shouldBe Scorable.NoSlug
+ titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_’·“”‘’“”«»「」") shouldBe Scorable.NoSlug
// TODO: titleToSlug("©™₨№…") shouldBe Scorable.NoSlug
// TODO: titleToSlug("πµΣσ") shouldBe Scorable.NoSlug
}
it should "remove whitespace" in {
titleToSlug("foo bar : baz ::") shouldBe "foobarbaz"
- titleToSlug("\na\t:b:c") shouldBe "abc"
+ titleToSlug("\na\t:b:cdefghi") shouldBe "abcdefghi"
titleToSlug("\n \t \r ") shouldBe Scorable.NoSlug
}
+
+ it should "skip very short slugs" in {
+ titleToSlug("short") shouldBe Scorable.NoSlug
+ titleToSlug("a longer, more in depth title") shouldBe "alongermoreindepthtitle"
+ }
}
diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
index 35c31e5..32fb16c 100644
--- a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
@@ -117,6 +117,8 @@ class ScoreJobTest extends FlatSpec with Matchers {
}
"""
// scalastyle:on
+ val TooLongOfTitle = "X" * Scorable.MaxTitleLength + "Y" // exactly one character longer than Scorable.MaxTitleLength
+ val TooShortOfTitle = "X" * (ScorableFeatures.MinSlugLength - 1)
val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "SomeTitle")
val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle")
val MalformedCrossrefString = CrossrefString.replace("}", "")
@@ -124,7 +126,9 @@ class ScoreJobTest extends FlatSpec with Matchers {
CrossrefString.replace("<<TITLE>>", "Title 2: TNG").replace("<<DOI>>", "DOI-0"),
CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2A").replace("<<DOI>>", "DOI-0.5"),
CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"),
- CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1"))
+ CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1"),
+ CrossrefString.replace("<<TITLE>>", TooLongOfTitle).replace("<<DOI>>", "DOI-1"),
+ CrossrefString.replace("<<TITLE>>", TooShortOfTitle).replace("<<DOI>>", "DOI-1"))
// Pipeline tests
val output = "/tmp/testOutput"
@@ -137,23 +141,28 @@ class ScoreJobTest extends FlatSpec with Matchers {
"sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT",
"sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56",
"sha1:93187A85273589347598473894839443",
- "sha1:024937534094897039547e9824382943")
+ "sha1:024937534094897039547e9824382943",
+ "sha1:93229759932857982837892347893892",
+ "sha1:83229759932857982837892347893892")
val JsonStrings : List[String] = List(
- JsonString.replace("<<TITLE>>", "Title 1"),
+ JsonString.replace("<<TITLE>>", "Title 1: The Original"),
JsonString.replace("<<TITLE>>", "Title 2: TNG"),
JsonString.replace("<<TITLE>>", "Title 3: The Sequel"),
// This will have bad status.
- JsonString.replace("<<TITLE>>", "Title 1"),
+ JsonString.replace("<<TITLE>>", "Title 1: The Original"),
MalformedJsonString,
// This will have bad status.
- JsonString.replace("<<TITLE>>", "Title 2")
+ JsonString.replace("<<TITLE>>", "Title 2: Not TNG"),
+ // These are in both sources but have bad titles
+ JsonString.replace("<<TITLE>>", TooLongOfTitle),
+ JsonString.replace("<<TITLE>>", TooShortOfTitle)
)
// bnewbold: status codes aren't strings, they are uint64
val Ok : Long = 200
val Bad : Long = 400
- val StatusCodes = List(Ok, Ok, Ok, Bad, Ok, Bad)
+ val StatusCodes = List(Ok, Ok, Ok, Bad, Ok, Bad, Ok, Ok)
val SampleDataHead : List[Tuple] = (Sha1Strings, JsonStrings, StatusCodes)
.zipped
@@ -181,19 +190,24 @@ class ScoreJobTest extends FlatSpec with Matchers {
0 -> CrossrefStrings(0),
1 -> CrossrefStrings(1),
2 -> CrossrefStrings(2),
- 3 -> CrossrefStrings(3)))
- .sink[(String, ReduceFeatures)](TypedTsv[(String, ReduceFeatures)](output + ".trapped")) {
- _ => () }
+ 3 -> CrossrefStrings(3),
+ 4 -> CrossrefStrings(4),
+ 4 -> CrossrefStrings(5)))
+ .sink[(String, ReduceFeatures)](TypedTsv[(String, ReduceFeatures)](output + ".trapped")) { _ => () }
.sink[(String, Int, String, String)](TypedTsv[(String, Int, String, String)](output)) {
// Grobid titles and slugs (in parentheses):
// Title 1 (title1)
// Title 2: TNG (title2tng)
// Title 3: The Sequel (title3thesequel)
+ // <too long of a title>
+ // <too short of a title>
// crossref titles and slugs (in parentheses):
// Title 2: TNG (title2tng)
// Title 1: TNG 2A (title1tng2a)
// Title 1: TNG 3 (title1tng3)
// Title 2: Rebooted (title2rebooted)
+ // <too long of a title>
+ // <too short of a title>
// XXX: Join should have 3 "title1" slugs and 1 "title2tng" slug
outputBuffer =>
"The pipeline" should "return a 1-element list" in {