diff options
Diffstat (limited to 'scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala')
-rw-r--r-- | scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala | 57 |
1 files changed, 57 insertions, 0 deletions
diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala new file mode 100644 index 0000000..80d92aa --- /dev/null +++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala @@ -0,0 +1,57 @@ +package sandcrawler + +import org.scalatest._ + +// scalastyle:off null +class ScorableFeaturesTest extends FlatSpec with Matchers { + private def titleToSlug(s : String) : String = { + new ScorableFeatures(title = s).toSlug + } + + "toMapFeatures()" should "work with gnarly inputs" in { + new ScorableFeatures(title = null).toMapFeatures + new ScorableFeatures(title = "something", doi = null, sha1 = null, year = 123).toMapFeatures + } + + "mapToSlug()" should "extract the parts of titles before a colon" in { + titleToSlug("HELLO:there") shouldBe "hellothere" + } + + it should "extract an entire colon-less string" in { + titleToSlug("hello THERE") shouldBe "hellothere" + } + + it should "return Scorable.NoSlug if given empty string" in { + titleToSlug("") shouldBe Scorable.NoSlug + } + + it should "return Scorable.NoSlug if given null" in { + titleToSlug(null) shouldBe Scorable.NoSlug + } + + it should "strip punctuation" in { + titleToSlug("HELLO!:the:re") shouldBe "hellothere" + titleToSlug("a:b:c") shouldBe "abc" + titleToSlug( + "If you're happy and you know it, clap your hands!") shouldBe "ifyourehappyandyouknowitclapyourhands" + titleToSlug(":;\"\'") shouldBe Scorable.NoSlug + } + + it should "filter stub titles" in { + titleToSlug("abstract") shouldBe Scorable.NoSlug + titleToSlug("title!") shouldBe Scorable.NoSlug + titleToSlug("a real title which is not on blacklist") shouldBe "arealtitlewhichisnotonblacklist" + } + + it should "strip special characters" in { + titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_") shouldBe Scorable.NoSlug + // TODO: titleToSlug("©™₨№…") shouldBe Scorable.NoSlug + // TODO: titleToSlug("πµΣσ") shouldBe Scorable.NoSlug + } + + it should "remove whitespace" in { + titleToSlug("foo bar : baz ::") shouldBe "foobarbaz" + titleToSlug("\na\t:b:c") shouldBe "abc" + titleToSlug("\n \t \r ") shouldBe Scorable.NoSlug + } +} |