aboutsummaryrefslogtreecommitdiffstats
path: root/scalding/src/main/scala/sandcrawler
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-08-15 22:43:33 -0700
committer Bryan Newbold <bnewbold@archive.org> 2018-08-15 22:43:33 -0700
commit 96ea0ddd06ee4a7c11c7d5def976749ab3675878 (patch)
tree 279382cc39355475c8a93f5ca3efcfb05b26fa57 /scalding/src/main/scala/sandcrawler
parent 2277c2f793a007fa3a347af23fca35f4a3eafeef (diff)
download sandcrawler-96ea0ddd06ee4a7c11c7d5def976749ab3675878.tar.gz
sandcrawler-96ea0ddd06ee4a7c11c7d5def976749ab3675878.zip
change slugification behavior to not split on colon
Diffstat (limited to 'scalding/src/main/scala/sandcrawler')
-rw-r--r-- scalding/src/main/scala/sandcrawler/ScorableFeatures.scala 4
1 files changed, 2 insertions, 2 deletions
diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
index 696b2ef..8ed3369 100644
--- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
+++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
@@ -32,8 +32,8 @@ class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: S
Scorable.NoSlug
} else {
val unaccented = StringUtilities.removeAccents(title)
- // Remove punctuation after splitting on colon.
- val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase())).replaceAll("\\s", "")
+ // Remove punctuation
+ val slug = StringUtilities.removePunctuation((unaccented.toLowerCase())).replaceAll("\\s", "")
if (slug.isEmpty || slug == null || (slugBlacklist contains slug)) Scorable.NoSlug else slug
}
}