aboutsummaryrefslogtreecommitdiffstats
path: root/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
diff options
context:
space:
mode:
Diffstat (limited to 'scalding/src/main/scala/sandcrawler/ScorableFeatures.scala')
-rw-r--r--scalding/src/main/scala/sandcrawler/ScorableFeatures.scala17
1 files changed, 12 insertions, 5 deletions
diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
index 0b9868a..241db79 100644
--- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
+++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
@@ -3,6 +3,7 @@ package sandcrawler
import java.io.InputStream
import scala.io.Source
+import scala.util.parsing.json.JSONArray
import scala.util.parsing.json.JSONObject
object ScorableFeatures {
@@ -10,11 +11,13 @@ object ScorableFeatures {
val fileStream : InputStream = getClass.getResourceAsStream("/slug-blacklist.txt")
val SlugBlacklist : Set[String] = Source.fromInputStream(fileStream).getLines.toSet
fileStream.close
+ val MinSlugLength = 8
// Static factory method
- def create(title : String, year : Int = 0, doi : String = "", sha1 : String = "") : ScorableFeatures = {
+ def create(title : String, authors : List[Any] = List(), year : Int = 0, doi : String = "", sha1 : String = "") : ScorableFeatures = {
new ScorableFeatures(
title=if (title == null) "" else title,
+ authors=if (authors == null) List() else authors.map(a => if (a == null) "" else a),
year=year,
doi=if (doi == null) "" else doi,
sha1=if (sha1 == null) "" else sha1)
@@ -23,13 +26,14 @@ object ScorableFeatures {
// Contains features needed to make slug and to score (in combination
// with a second ScorableFeatures). Create with above static factory method.
-class ScorableFeatures private(title : String, year: Int = 0, doi : String = "", sha1: String = "") {
+class ScorableFeatures private(title : String, authors : List[Any] = List(), year: Int = 0, doi : String = "", sha1: String = "") {
def toMap() : Map[String, Any] =
- Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1)
+ Map("title" -> title, "authors" -> JSONArray(authors), "year" -> year, "doi" -> doi, "sha1" -> sha1)
- override def toString() : String =
+ override def toString() : String = {
JSONObject(toMap).toString
+ }
def toSlug() : String = {
if (title == null) {
@@ -38,7 +42,10 @@ class ScorableFeatures private(title : String, year: Int = 0, doi : String = "",
val unaccented = StringUtilities.removeAccents(title)
// Remove punctuation
val slug = StringUtilities.removePunctuation((unaccented.toLowerCase())).replaceAll("\\s", "")
- if (slug.isEmpty || slug == null || (ScorableFeatures.SlugBlacklist contains slug)) Scorable.NoSlug else slug
+ if (slug.isEmpty
+ || slug == null
+ || (ScorableFeatures.SlugBlacklist contains slug)
+ || (slug.length < ScorableFeatures.MinSlugLength)) Scorable.NoSlug else slug
}
}