diff options
author | Ellen Spertus <ellen.spertus@gmail.com> | 2018-08-22 12:52:03 -0700 |
---|---|---|
committer | Ellen Spertus <ellen.spertus@gmail.com> | 2018-08-22 12:52:03 -0700 |
commit | f54e47ace6dd041e78e10ee25573c6ad3de808eb (patch) | |
tree | 21931c4efd9997d69227623aba0cff224907dd44 /scalding/src/test/scala | |
parent | b628b7026ab8e7abf4beeaaad99d831b49578483 (diff) | |
download | sandcrawler-f54e47ace6dd041e78e10ee25573c6ad3de808eb.tar.gz sandcrawler-f54e47ace6dd041e78e10ee25573c6ad3de808eb.zip |
Added title length filtering to GrobidScorable
Diffstat (limited to 'scalding/src/test/scala')
-rw-r--r-- | scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala | 31 |
1 files changed, 29 insertions, 2 deletions
diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
index 661824b..620998e 100644
--- a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
@@ -57,7 +57,9 @@ class GrobidScorableTest extends FlatSpec with Matchers {
   "annex": null
 }
 """
-  val GrobidStringWithTitle = GrobidString.replace("<<TITLE>>", "Dummy Example File")
+  val GrobidStringWithGoodTitle = GrobidString.replace("<<TITLE>>", "Dummy Example File")
+  val GrobidStringWithExcessiveTitle = GrobidString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength + "0")
+  val GrobidStringWithNullTitle = GrobidString.replace("\"<<TITLE>>\"", "null")
   val GrobidStringWithoutTitle = GrobidString.replace("title", "nottitle")
   val MalformedGrobidString = GrobidString.replace("}", "")
   val Key = "Dummy Key"
@@ -69,13 +71,18 @@ class GrobidScorableTest extends FlatSpec with Matchers {
     result.slug shouldBe Scorable.NoSlug
   }
 
+  it should "handle null title" in {
+    val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithNullTitle)
+    result.slug shouldBe Scorable.NoSlug
+  }
+
   it should "handle missing title" in {
     val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithoutTitle)
     result.slug shouldBe Scorable.NoSlug
   }
 
   it should "handle valid input" in {
-    val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithTitle)
+    val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithGoodTitle)
     result.slug shouldBe "dummyexamplefile"
     Scorable.jsonToMap(result.json) match {
       case None => fail()
@@ -85,4 +92,24 @@ class GrobidScorableTest extends FlatSpec with Matchers {
       }
     }
   }
+
+  "GrobidScorable.keepRecord()" should "return true for valid JSON with title" in {
+    GrobidScorable.keepRecord(GrobidStringWithGoodTitle) shouldBe true
+  }
+
+  it should "return false for valid JSON with excessively long title" in {
+    GrobidScorable.keepRecord(GrobidStringWithExcessiveTitle) shouldBe false
+  }
+
+  it should "return false for valid JSON with null title" in {
+    GrobidScorable.keepRecord(GrobidStringWithNullTitle) shouldBe false
+  }
+
+  it should "return false for valid JSON with no title" in {
+    GrobidScorable.keepRecord(GrobidStringWithoutTitle) shouldBe false
+  }
+
+  it should "return false for invalid JSON" in {
+    GrobidScorable.keepRecord(MalformedGrobidString) shouldBe false
+  }
 }