diff options
Diffstat (limited to 'scalding/src/main/scala/sandcrawler/ScorableFeatures.scala')
-rw-r--r-- | scalding/src/main/scala/sandcrawler/ScorableFeatures.scala | 17 |
1 files changed, 12 insertions, 5 deletions
diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala index 0b9868a..241db79 100644 --- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala +++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala @@ -3,6 +3,7 @@ package sandcrawler import java.io.InputStream import scala.io.Source +import scala.util.parsing.json.JSONArray import scala.util.parsing.json.JSONObject object ScorableFeatures { @@ -10,11 +11,13 @@ object ScorableFeatures { val fileStream : InputStream = getClass.getResourceAsStream("/slug-blacklist.txt") val SlugBlacklist : Set[String] = Source.fromInputStream(fileStream).getLines.toSet fileStream.close + val MinSlugLength = 8 // Static factory method - def create(title : String, year : Int = 0, doi : String = "", sha1 : String = "") : ScorableFeatures = { + def create(title : String, authors : List[Any] = List(), year : Int = 0, doi : String = "", sha1 : String = "") : ScorableFeatures = { new ScorableFeatures( title=if (title == null) "" else title, + authors=if (authors == null) List() else authors.map(a => if (a == null) "" else a), year=year, doi=if (doi == null) "" else doi, sha1=if (sha1 == null) "" else sha1) @@ -23,13 +26,14 @@ object ScorableFeatures { // Contains features needed to make slug and to score (in combination // with a second ScorableFeatures). Create with above static factory method. -class ScorableFeatures private(title : String, year: Int = 0, doi : String = "", sha1: String = "") { +class ScorableFeatures private(title : String, authors : List[Any] = List(), year: Int = 0, doi : String = "", sha1: String = "") { def toMap() : Map[String, Any] = - Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1) + Map("title" -> title, "authors" -> JSONArray(authors), "year" -> year, "doi" -> doi, "sha1" -> sha1) - override def toString() : String = + override def toString() : String = { JSONObject(toMap).toString + } def toSlug() : String = { if (title == null) { @@ -38,7 +42,10 @@ class ScorableFeatures private(title : String, year: Int = 0, doi : String = "", val unaccented = StringUtilities.removeAccents(title) // Remove punctuation val slug = StringUtilities.removePunctuation((unaccented.toLowerCase())).replaceAll("\\s", "") - if (slug.isEmpty || slug == null || (ScorableFeatures.SlugBlacklist contains slug)) Scorable.NoSlug else slug + if (slug.isEmpty + || slug == null + || (ScorableFeatures.SlugBlacklist contains slug) + || (slug.length < ScorableFeatures.MinSlugLength)) Scorable.NoSlug else slug } } |