about | summary | refs | log | tree | commit | diff | stats
path: root/scalding/src/main
diff options
context:
space:
mode:
author: Ellen Spertus <ellen.spertus@gmail.com> 2018-08-22 14:50:05 -0700
committer: Ellen Spertus <ellen.spertus@gmail.com> 2018-08-22 14:50:05 -0700
commit: 31a71c166c8452ce16d443697c33545577fa35f3 (patch)
tree: 8ea767a93d17a90ec2f2ca2964a43e56ff977215 /scalding/src/main
parent: 9fb9a35b15ed9b553ad4f938dc4e636e5d91ac33 (diff)
parent: 03968da99d24d81e0224712056d1dea38cb8c70e (diff)
download: sandcrawler-31a71c166c8452ce16d443697c33545577fa35f3.tar.gz
download: sandcrawler-31a71c166c8452ce16d443697c33545577fa35f3.zip
Merge branch 'master' into ellen-length-filtering
Diffstat (limited to 'scalding/src/main')
-rw-r--r-- scalding/src/main/scala/sandcrawler/StringUtilities.scala | 2
1 files changed, 1 insertions, 1 deletions
diff --git a/scalding/src/main/scala/sandcrawler/StringUtilities.scala b/scalding/src/main/scala/sandcrawler/StringUtilities.scala
index 2745875..e03b60d 100644
--- a/scalding/src/main/scala/sandcrawler/StringUtilities.scala
+++ b/scalding/src/main/scala/sandcrawler/StringUtilities.scala
@@ -36,7 +36,7 @@ object StringUtilities {
// Source: https://stackoverflow.com/a/30076541/631051
def removePunctuation(s: String) : String = {
- s.replaceAll("""[\p{Punct}]""", "")
+ s.replaceAll("""[\p{Punct}’·“”‘’“”«»「」]""", "")
}
// Adapted from: https://stackoverflow.com/a/16018452/631051