diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-08-15 22:43:33 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-08-15 22:43:33 -0700 |
commit | 96ea0ddd06ee4a7c11c7d5def976749ab3675878 (patch) | |
tree | 279382cc39355475c8a93f5ca3efcfb05b26fa57 /scalding/src/main/scala | |
parent | 2277c2f793a007fa3a347af23fca35f4a3eafeef (diff) | |
download | sandcrawler-96ea0ddd06ee4a7c11c7d5def976749ab3675878.tar.gz sandcrawler-96ea0ddd06ee4a7c11c7d5def976749ab3675878.zip |
change slugification behavior to not split on colon
Diffstat (limited to 'scalding/src/main/scala')
-rw-r--r-- | scalding/src/main/scala/sandcrawler/ScorableFeatures.scala | 4 |
1 file changed, 2 insertions, 2 deletions
```diff
diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
index 696b2ef..8ed3369 100644
--- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
+++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
@@ -32,8 +32,8 @@ class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: S
       Scorable.NoSlug
     } else {
       val unaccented = StringUtilities.removeAccents(title)
-      // Remove punctuation after splitting on colon.
-      val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase())).replaceAll("\\s", "")
+      // Remove punctuation
+      val slug = StringUtilities.removePunctuation((unaccented.toLowerCase())).replaceAll("\\s", "")
       if (slug.isEmpty || slug == null || (slugBlacklist contains slug)) Scorable.NoSlug else slug
     }
   }
```