From 3f668933d71b82555e89a3bfefe83039ff7ddbfb Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 15 Aug 2018 22:33:09 -0700 Subject: add a stub title blacklist --- scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'scalding/src/test') diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala index fd01c91..0acf0b8 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala @@ -37,6 +37,12 @@ class ScorableFeaturesTest extends FlatSpec with Matchers { titleToSlug(":;\"\'") shouldBe Scorable.NoSlug } + it should "filter stub titles" in { + titleToSlug("abstract") shouldBe Scorable.NoSlug + titleToSlug("title!") shouldBe Scorable.NoSlug + titleToSlug("a real title which is not on blacklist") shouldBe "arealtitlewhichisnotonblacklist" + } + it should "strip special characters" in { titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_…") shouldBe Scorable.NoSlug // TODO: titleToSlug("©™₨№") shouldBe Scorable.NoSlug -- cgit v1.2.3