diff options
-rw-r--r-- | CONTRIBUTORS | 4 | ||||
-rw-r--r-- | scalding/src/main/scala/sandcrawler/StringUtilities.scala | 2 | ||||
-rw-r--r-- | scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala | 2 |
3 files changed, 6 insertions, 2 deletions
diff --git a/CONTRIBUTORS b/CONTRIBUTORS new file mode 100644 index 0000000..f6dea1c --- /dev/null +++ b/CONTRIBUTORS @@ -0,0 +1,4 @@ +Bryan Newbold + +Ellen Spertus transfers copyright of all of her contributions to the +repository in exchange for one Internet Archive Sticker, received.
\ No newline at end of file diff --git a/scalding/src/main/scala/sandcrawler/StringUtilities.scala b/scalding/src/main/scala/sandcrawler/StringUtilities.scala index 2745875..e03b60d 100644 --- a/scalding/src/main/scala/sandcrawler/StringUtilities.scala +++ b/scalding/src/main/scala/sandcrawler/StringUtilities.scala @@ -36,7 +36,7 @@ object StringUtilities { // Source: https://stackoverflow.com/a/30076541/631051 def removePunctuation(s: String) : String = { - s.replaceAll("""[\p{Punct}]""", "") + s.replaceAll("""[\p{Punct}’·“”‘’“”«»「」]""", "") } // Adapted from: https://stackoverflow.com/a/16018452/631051 diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala index 9459749..d742384 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala @@ -49,7 +49,7 @@ class ScorableFeaturesTest extends FlatSpec with Matchers { } it should "strip special characters" in { - titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_") shouldBe Scorable.NoSlug + titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_’·“”‘’“”«»「」") shouldBe Scorable.NoSlug // TODO: titleToSlug("©™₨№…") shouldBe Scorable.NoSlug // TODO: titleToSlug("πµΣσ") shouldBe Scorable.NoSlug } |