aboutsummaryrefslogtreecommitdiffstats
path: root/scalding/src/main/scala/sandcrawler/Scorable.scala
diff options
context:
space:
mode:
author Ellen Spertus <ellen.spertus@gmail.com> 2018-08-12 18:08:51 -0700
committer Ellen Spertus <ellen.spertus@gmail.com> 2018-08-12 18:08:51 -0700
commit 31354b1a6062c5c56a30610f68fa48c82a7e83f0 (patch)
tree a730150c3f29ea76579ee6928a7c2db9e5b22eac /scalding/src/main/scala/sandcrawler/Scorable.scala
parent 728e50a33cec921c9a624439f2e1c8561a6e12ce (diff)
download sandcrawler-31354b1a6062c5c56a30610f68fa48c82a7e83f0.tar.gz
sandcrawler-31354b1a6062c5c56a30610f68fa48c82a7e83f0.zip
Tests pass.
Diffstat (limited to 'scalding/src/main/scala/sandcrawler/Scorable.scala')
-rw-r--r-- scalding/src/main/scala/sandcrawler/Scorable.scala 11
1 files changed, 6 insertions, 5 deletions
diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala
index 9c8da69..929461b 100644
--- a/scalding/src/main/scala/sandcrawler/Scorable.scala
+++ b/scalding/src/main/scala/sandcrawler/Scorable.scala
@@ -66,13 +66,14 @@ object Scorable {
// This guarantees it will have all of the fields needed to compute
// the ultimate score, which are a superset of those needed for a slug.
def mapToSlug(map : Map[String, Any]) : String = {
- val unaccented = StringUtilities.removeAccents(getString(map, "title"))
- // Remove punctuation after splitting on colon.
- val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase()))
- if (slug.isEmpty || slug == null) {
+ val title = getString(map, "title")
+ if (title == null) {
NoSlug
} else {
- slug
+ val unaccented = StringUtilities.removeAccents(title)
+ // Remove punctuation after splitting on colon.
+ val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase())).replaceAll("\\s", "")
+ if (slug.isEmpty || slug == null) NoSlug else slug
}
}