From 2656af2686aa73d0061a581bef3b9ca9d4ad8451 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 23 Aug 2018 17:50:23 -0700 Subject: set a minimum slug size (8 chars) --- scalding/src/main/scala/sandcrawler/ScorableFeatures.scala | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'scalding/src/main') diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala index 0b9868a..9eb03f7 100644 --- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala +++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala @@ -10,6 +10,7 @@ object ScorableFeatures { val fileStream : InputStream = getClass.getResourceAsStream("/slug-blacklist.txt") val SlugBlacklist : Set[String] = Source.fromInputStream(fileStream).getLines.toSet fileStream.close + val MinSlugLength = 8 // Static factory method def create(title : String, year : Int = 0, doi : String = "", sha1 : String = "") : ScorableFeatures = { @@ -38,7 +39,10 @@ class ScorableFeatures private(title : String, year: Int = 0, doi : String = "", val unaccented = StringUtilities.removeAccents(title) // Remove punctuation val slug = StringUtilities.removePunctuation((unaccented.toLowerCase())).replaceAll("\\s", "") - if (slug.isEmpty || slug == null || (ScorableFeatures.SlugBlacklist contains slug)) Scorable.NoSlug else slug + if (slug.isEmpty + || slug == null + || (ScorableFeatures.SlugBlacklist contains slug) + || (slug.length < ScorableFeatures.MinSlugLength)) Scorable.NoSlug else slug } } -- cgit v1.2.3