aboutsummaryrefslogtreecommitdiffstats
path: root/scalding/src/main
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-08-23 17:50:23 -0700
committer Bryan Newbold <bnewbold@archive.org> 2018-08-23 17:50:23 -0700
commit 2656af2686aa73d0061a581bef3b9ca9d4ad8451 (patch)
tree 3fd9332695067458368581aca6254a305ae1e080 /scalding/src/main
parent 2ab704a09db06ab776bd4cf59974e5f65f5e7c38 (diff)
download sandcrawler-2656af2686aa73d0061a581bef3b9ca9d4ad8451.tar.gz
sandcrawler-2656af2686aa73d0061a581bef3b9ca9d4ad8451.zip
set a minimum slug size (8 chars)
Diffstat (limited to 'scalding/src/main')
-rw-r--r-- scalding/src/main/scala/sandcrawler/ScorableFeatures.scala 6
1 files changed, 5 insertions, 1 deletions
diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
index 0b9868a..9eb03f7 100644
--- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
+++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
@@ -10,6 +10,7 @@ object ScorableFeatures {
val fileStream : InputStream = getClass.getResourceAsStream("/slug-blacklist.txt")
val SlugBlacklist : Set[String] = Source.fromInputStream(fileStream).getLines.toSet
fileStream.close
+ val MinSlugLength = 8
// Static factory method
def create(title : String, year : Int = 0, doi : String = "", sha1 : String = "") : ScorableFeatures = {
@@ -38,7 +39,10 @@ class ScorableFeatures private(title : String, year: Int = 0, doi : String = "",
val unaccented = StringUtilities.removeAccents(title)
// Remove punctuation
val slug = StringUtilities.removePunctuation((unaccented.toLowerCase())).replaceAll("\\s", "")
- if (slug.isEmpty || slug == null || (ScorableFeatures.SlugBlacklist contains slug)) Scorable.NoSlug else slug
+ if (slug.isEmpty
+ || slug == null
+ || (ScorableFeatures.SlugBlacklist contains slug)
+ || (slug.length < ScorableFeatures.MinSlugLength)) Scorable.NoSlug else slug
}
}