diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-08-15 22:33:09 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-08-15 22:33:09 -0700 |
commit | 3f668933d71b82555e89a3bfefe83039ff7ddbfb (patch) | |
tree | 619ec9976ccde6594cd461e0acee3c34f173ed6b /scalding/src/test/scala | |
parent | 70350899dda973cdf7a5cfdd941ae80319254587 (diff) | |
download | sandcrawler-3f668933d71b82555e89a3bfefe83039ff7ddbfb.tar.gz sandcrawler-3f668933d71b82555e89a3bfefe83039ff7ddbfb.zip |
add a stub title blacklist
Diffstat (limited to 'scalding/src/test/scala')
-rw-r--r-- | scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala | 6 |
1 files changed, 6 insertions, 0 deletions
diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
index fd01c91..0acf0b8 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
@@ -37,6 +37,12 @@ class ScorableFeaturesTest extends FlatSpec with Matchers {
     titleToSlug(":;\"\'") shouldBe Scorable.NoSlug
   }
 
+  it should "filter stub titles" in {
+    titleToSlug("abstract") shouldBe Scorable.NoSlug
+    titleToSlug("title!") shouldBe Scorable.NoSlug
+    titleToSlug("a real title which is not on blacklist") shouldBe "arealtitlewhichisnotonblacklist"
+  }
+
   it should "strip special characters" in {
     titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_…") shouldBe Scorable.NoSlug
     // TODO: titleToSlug("©™₨№") shouldBe Scorable.NoSlug