about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2018-08-27 15:04:04 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2018-08-27 15:04:55 -0700
commit: f8a0c99b270ebcd6e239c6e26190cf7200389ced (patch)
tree: 300e311cf645c8c78035aa122e7118a7603ee841
parent: 432dfd186c2d626c805dc6f7ae7d4e1993638430 (diff)
download: sandcrawler-f8a0c99b270ebcd6e239c6e26190cf7200389ced.tar.gz
sandcrawler-f8a0c99b270ebcd6e239c6e26190cf7200389ced.zip
more special characters to strip
-rw-r--r-- scalding/src/main/scala/sandcrawler/StringUtilities.scala 2
-rw-r--r-- scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala 2
2 files changed, 2 insertions, 2 deletions
diff --git a/scalding/src/main/scala/sandcrawler/StringUtilities.scala b/scalding/src/main/scala/sandcrawler/StringUtilities.scala
index e03b60d..9150ced 100644
--- a/scalding/src/main/scala/sandcrawler/StringUtilities.scala
+++ b/scalding/src/main/scala/sandcrawler/StringUtilities.scala
@@ -36,7 +36,7 @@ object StringUtilities {
// Source: https://stackoverflow.com/a/30076541/631051
def removePunctuation(s: String) : String = {
- s.replaceAll("""[\p{Punct}’·“”‘’“”«»「」]""", "")
+ s.replaceAll("""[\p{Punct}’·“”‘’“”«»「」¿–±§ʿ]""", "")
}
// Adapted from: https://stackoverflow.com/a/16018452/631051
diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
index 474f69a..450c169 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
@@ -49,7 +49,7 @@ class ScorableFeaturesTest extends FlatSpec with Matchers {
}
it should "strip special characters" in {
- titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_’·“”‘’“”«»「」") shouldBe Scorable.NoSlug
+ titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_’·“”‘’“”«»「」¿–±§ʿ") shouldBe Scorable.NoSlug
// TODO: titleToSlug("©™₨№…") shouldBe Scorable.NoSlug
// TODO: titleToSlug("πµΣσ") shouldBe Scorable.NoSlug
}