about | summary | refs | log | tree | commit | diff | stats
path: root/scalding/src/test
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-08-15 22:33:09 -0700
committer Bryan Newbold <bnewbold@archive.org> 2018-08-15 22:33:09 -0700
commit 3f668933d71b82555e89a3bfefe83039ff7ddbfb (patch)
tree 619ec9976ccde6594cd461e0acee3c34f173ed6b /scalding/src/test
parent 70350899dda973cdf7a5cfdd941ae80319254587 (diff)
download sandcrawler-3f668933d71b82555e89a3bfefe83039ff7ddbfb.tar.gz
sandcrawler-3f668933d71b82555e89a3bfefe83039ff7ddbfb.zip
add a stub title blacklist
Diffstat (limited to 'scalding/src/test')
-rw-r--r-- scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala 6
1 file changed, 6 insertions, 0 deletions
diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
index fd01c91..0acf0b8 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
@@ -37,6 +37,12 @@ class ScorableFeaturesTest extends FlatSpec with Matchers {
titleToSlug(":;\"\'") shouldBe Scorable.NoSlug
}
+ it should "filter stub titles" in {
+ titleToSlug("abstract") shouldBe Scorable.NoSlug
+ titleToSlug("title!") shouldBe Scorable.NoSlug
+ titleToSlug("a real title which is not on blacklist") shouldBe "arealtitlewhichisnotonblacklist"
+ }
+
it should "strip special characters" in {
titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_…") shouldBe Scorable.NoSlug
// TODO: titleToSlug("©™₨№") shouldBe Scorable.NoSlug