aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- CONTRIBUTORS | 4
-rw-r--r-- scalding/src/main/scala/sandcrawler/StringUtilities.scala | 2
-rw-r--r-- scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala | 2
3 files changed, 6 insertions, 2 deletions
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
new file mode 100644
index 0000000..f6dea1c
--- /dev/null
+++ b/CONTRIBUTORS
@@ -0,0 +1,4 @@
+Bryan Newbold
+
+Ellen Spertus transfers copyright of all of her contributions to the
+repository in exchange for one Internet Archive Sticker, received.
\ No newline at end of file
diff --git a/scalding/src/main/scala/sandcrawler/StringUtilities.scala b/scalding/src/main/scala/sandcrawler/StringUtilities.scala
index 2745875..e03b60d 100644
--- a/scalding/src/main/scala/sandcrawler/StringUtilities.scala
+++ b/scalding/src/main/scala/sandcrawler/StringUtilities.scala
@@ -36,7 +36,7 @@ object StringUtilities {
// Source: https://stackoverflow.com/a/30076541/631051
def removePunctuation(s: String) : String = {
- s.replaceAll("""[\p{Punct}]""", "")
+ s.replaceAll("""[\p{Punct}’·“”‘’“”«»「」]""", "")
}
// Adapted from: https://stackoverflow.com/a/16018452/631051
diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
index 9459749..d742384 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
@@ -49,7 +49,7 @@ class ScorableFeaturesTest extends FlatSpec with Matchers {
}
it should "strip special characters" in {
- titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_") shouldBe Scorable.NoSlug
+ titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_’·“”‘’“”«»「」") shouldBe Scorable.NoSlug
// TODO: titleToSlug("©™₨№…") shouldBe Scorable.NoSlug
// TODO: titleToSlug("πµΣσ") shouldBe Scorable.NoSlug
}