diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2018-08-22 12:59:17 -0700 |
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2018-08-22 12:59:17 -0700 |
| commit | 7087760028cb55b8290783fc8c28958108f00f36 (patch) | |
| tree | c51a9f2a00f5d2da80ca64c05677e96a9c901436 /scalding/src/main/scala | |
| parent | b628b7026ab8e7abf4beeaaad99d831b49578483 (diff) | |
| download | sandcrawler-7087760028cb55b8290783fc8c28958108f00f36.tar.gz sandcrawler-7087760028cb55b8290783fc8c28958108f00f36.zip | |
add more punctuation characters to slug filter
Diffstat (limited to 'scalding/src/main/scala')
| -rw-r--r-- | scalding/src/main/scala/sandcrawler/StringUtilities.scala | 2 |
1 files changed, 1 insertions, 1 deletions
```diff
diff --git a/scalding/src/main/scala/sandcrawler/StringUtilities.scala b/scalding/src/main/scala/sandcrawler/StringUtilities.scala
index 2745875..e03b60d 100644
--- a/scalding/src/main/scala/sandcrawler/StringUtilities.scala
+++ b/scalding/src/main/scala/sandcrawler/StringUtilities.scala
@@ -36,7 +36,7 @@ object StringUtilities {
 
   // Source: https://stackoverflow.com/a/30076541/631051
   def removePunctuation(s: String) : String = {
-    s.replaceAll("""[\p{Punct}]""", "")
+    s.replaceAll("""[\p{Punct}’·“”‘’“”«»「」]""", "")
   }
 
   // Adapted from: https://stackoverflow.com/a/16018452/631051
```
