From 96ea0ddd06ee4a7c11c7d5def976749ab3675878 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 15 Aug 2018 22:43:33 -0700 Subject: change slugification behavior to not split on colon --- scalding/src/main/scala/sandcrawler/ScorableFeatures.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'scalding/src/main/scala') diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala index 696b2ef..8ed3369 100644 --- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala +++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala @@ -32,8 +32,8 @@ class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: S Scorable.NoSlug } else { val unaccented = StringUtilities.removeAccents(title) - // Remove punctuation after splitting on colon. - val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase())).replaceAll("\\s", "") + // Remove punctuation + val slug = StringUtilities.removePunctuation((unaccented.toLowerCase())).replaceAll("\\s", "") if (slug.isEmpty || slug == null || (slugBlacklist contains slug)) Scorable.NoSlug else slug } } -- cgit v1.2.3