aboutsummaryrefslogtreecommitdiffstats
path: root/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
diff options
context:
space:
mode:
Diffstat (limited to 'scalding/src/main/scala/sandcrawler/ScorableFeatures.scala')
-rw-r--r--scalding/src/main/scala/sandcrawler/ScorableFeatures.scala44
1 files changed, 44 insertions, 0 deletions
diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
new file mode 100644
index 0000000..8ed3369
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
@@ -0,0 +1,44 @@
+package sandcrawler
+
+import scala.util.parsing.json.JSONObject
+
+
+// Contains features needed to make slug and to score (in combination
+// with a second ScorableFeatures).
+class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: String = "") {
+
+ val slugBlacklist = Set( "abbreviations", "abstract", "acknowledgements",
+ "article", "authorreply", "authorsreply", "bookreview", "bookreviews",
+ "casereport", "commentary", "commentaryon", "commenton", "commentto",
+ "contents", "correspondence", "dedication", "editorialadvisoryboard",
+ "focus", "hypothesis", "inbrief", "introduction", "introductiontotheissue",
+ "lettertotheeditor", "listofabbreviations", "note", "overview", "preface",
+ "references", "results", "review", "reviewarticle", "summary", "title",
+ "name")
+
+ def toMap() : Map[String, Any] = {
+ Map("title" -> (if (title == null) "" else title),
+ "year" -> year,
+ "doi" -> (if (doi == null) "" else doi),
+ "sha1" -> (if (sha1 == null) "" else sha1))
+ }
+
+ override def toString() : String = {
+ JSONObject(toMap()).toString
+ }
+
+ def toSlug() : String = {
+ if (title == null) {
+ Scorable.NoSlug
+ } else {
+ val unaccented = StringUtilities.removeAccents(title)
+ // Remove punctuation
+ val slug = StringUtilities.removePunctuation((unaccented.toLowerCase())).replaceAll("\\s", "")
+ if (slug.isEmpty || slug == null || (slugBlacklist contains slug)) Scorable.NoSlug else slug
+ }
+ }
+
+ def toMapFeatures = {
+ MapFeatures(toSlug, toString)
+ }
+}