diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-08-23 17:50:23 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-08-23 17:50:23 -0700 |
commit | 2656af2686aa73d0061a581bef3b9ca9d4ad8451 (patch) | |
tree | 3fd9332695067458368581aca6254a305ae1e080 /scalding/src/main/scala | |
parent | 2ab704a09db06ab776bd4cf59974e5f65f5e7c38 (diff) | |
download | sandcrawler-2656af2686aa73d0061a581bef3b9ca9d4ad8451.tar.gz sandcrawler-2656af2686aa73d0061a581bef3b9ca9d4ad8451.zip |
set a minimum slug size (8 chars)
Diffstat (limited to 'scalding/src/main/scala')
-rw-r--r-- | scalding/src/main/scala/sandcrawler/ScorableFeatures.scala | 6 |
1 file changed, 5 insertions, 1 deletion
```diff
diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
index 0b9868a..9eb03f7 100644
--- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
+++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
@@ -10,6 +10,7 @@ object ScorableFeatures {
   val fileStream : InputStream = getClass.getResourceAsStream("/slug-blacklist.txt")
   val SlugBlacklist : Set[String] = Source.fromInputStream(fileStream).getLines.toSet
   fileStream.close
+  val MinSlugLength = 8
 
   // Static factory method
   def create(title : String, year : Int = 0, doi : String = "", sha1 : String = "") : ScorableFeatures = {
@@ -38,7 +39,10 @@ class ScorableFeatures private(title : String, year: Int = 0, doi : String = "",
     val unaccented = StringUtilities.removeAccents(title)
     // Remove punctuation
     val slug = StringUtilities.removePunctuation((unaccented.toLowerCase())).replaceAll("\\s", "")
-    if (slug.isEmpty || slug == null || (ScorableFeatures.SlugBlacklist contains slug)) Scorable.NoSlug else slug
+    if (slug.isEmpty
+      || slug == null
+      || (ScorableFeatures.SlugBlacklist contains slug)
+      || (slug.length < ScorableFeatures.MinSlugLength)) Scorable.NoSlug else slug
   }
 }
 
```