diff options
Diffstat (limited to 'scalding/src/main/scala/sandcrawler/ScorableFeatures.scala')
-rw-r--r-- | scalding/src/main/scala/sandcrawler/ScorableFeatures.scala | 30 |
1 files changed, 30 insertions, 0 deletions
diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala new file mode 100644 index 0000000..5d6dea0 --- /dev/null +++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala @@ -0,0 +1,30 @@ +package sandcrawler + +import scala.util.parsing.json.JSONObject + +// Contains features needed to make slug and to score (in combination +// with a second ScorableFeatures). +class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: String = "") { + def toMap() : Map[String, Any] = { + Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1) + } + + override def toString() : String = { + JSONObject(toMap()).toString + } + + def toSlug() : String = { + if (title == null) { + Scorable.NoSlug + } else { + val unaccented = StringUtilities.removeAccents(title) + // Remove punctuation after splitting on colon. + val slug = StringUtilities.removePunctuation((unaccented.split(":")(0).toLowerCase())).replaceAll("\\s", "") + if (slug.isEmpty || slug == null) Scorable.NoSlug else slug + } + } + + def toMapFeatures = { + MapFeatures(toSlug, toString) + } +} |