diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-08-27 15:04:04 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-08-27 15:04:55 -0700 |
commit | f8a0c99b270ebcd6e239c6e26190cf7200389ced (patch) | |
tree | 300e311cf645c8c78035aa122e7118a7603ee841 /scalding/src | |
parent | 432dfd186c2d626c805dc6f7ae7d4e1993638430 (diff) | |
download | sandcrawler-f8a0c99b270ebcd6e239c6e26190cf7200389ced.tar.gz sandcrawler-f8a0c99b270ebcd6e239c6e26190cf7200389ced.zip |
more special characters to strip
Diffstat (limited to 'scalding/src')
-rw-r--r-- | scalding/src/main/scala/sandcrawler/StringUtilities.scala | 2 | ||||
-rw-r--r-- | scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala | 2 |
2 files changed, 2 insertions, 2 deletions
```diff
diff --git a/scalding/src/main/scala/sandcrawler/StringUtilities.scala b/scalding/src/main/scala/sandcrawler/StringUtilities.scala
index e03b60d..9150ced 100644
--- a/scalding/src/main/scala/sandcrawler/StringUtilities.scala
+++ b/scalding/src/main/scala/sandcrawler/StringUtilities.scala
@@ -36,7 +36,7 @@ object StringUtilities {
 
   // Source: https://stackoverflow.com/a/30076541/631051
   def removePunctuation(s: String) : String = {
-    s.replaceAll("""[\p{Punct}’·“”‘’“”«»「」]""", "")
+    s.replaceAll("""[\p{Punct}’·“”‘’“”«»「」¿–±§ʿ]""", "")
   }
 
   // Adapted from: https://stackoverflow.com/a/16018452/631051
diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
index 474f69a..450c169 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
@@ -49,7 +49,7 @@ class ScorableFeaturesTest extends FlatSpec with Matchers {
   }
 
   it should "strip special characters" in {
-    titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_’·“”‘’“”«»「」") shouldBe Scorable.NoSlug
+    titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_’·“”‘’“”«»「」¿–±§ʿ") shouldBe Scorable.NoSlug
     // TODO: titleToSlug("©™₨№…") shouldBe Scorable.NoSlug
     // TODO: titleToSlug("πµΣσ") shouldBe Scorable.NoSlug
   }
```