aboutsummaryrefslogtreecommitdiffstats
path: root/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
blob: 8ed3369d9ea6847b423ddad475a23a0af75e2a9f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
package sandcrawler

import scala.util.parsing.json.JSONObject


/** Contains features needed to make a slug and to score (in combination
  * with a second ScorableFeatures).
  *
  * Any of the string parameters may be null; null is treated as "missing"
  * (serialized as the empty string, and a null title yields no slug).
  *
  * @param title title text the slug is derived from
  * @param year  year value, serialized as-is (defaults to 0)
  * @param doi   DOI string (defaults to "")
  * @param sha1  SHA-1 string (defaults to "")
  */
class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: String = "") {

  /** Slugs that are rejected by [[toSlug]]. Kept as an instance member for
    * backward compatibility; the set itself is shared by all instances.
    */
  val slugBlacklist: Set[String] = ScorableFeatures.SlugBlacklist

  /** Returns the features as a map with keys "title", "year", "doi" and
    * "sha1". Null strings are replaced by "".
    */
  def toMap() : Map[String, Any] = {
    Map("title" -> Option(title).getOrElse(""),
        "year" -> year,
        "doi" -> Option(doi).getOrElse(""),
        "sha1" -> Option(sha1).getOrElse(""))
  }

  /** Returns the JSON object rendering of [[toMap]]. */
  override def toString() : String = {
    JSONObject(toMap()).toString
  }

  /** Derives the slug from the title: the title is passed through
    * StringUtilities.removeAccents, lowercased, passed through
    * StringUtilities.removePunctuation, and stripped of all whitespace.
    *
    * @return the slug, or Scorable.NoSlug when the title is null, the
    *         resulting slug is empty, or the slug is blacklisted
    */
  def toSlug() : String = {
    Option(title)
      .map { rawTitle =>
        val unaccented = StringUtilities.removeAccents(rawTitle)
        // Locale.ROOT keeps the slug identical on every machine; the
        // default-locale toLowerCase() maps e.g. "I" differently under a
        // Turkish locale, which would produce non-matching slugs.
        val lowered = unaccented.toLowerCase(java.util.Locale.ROOT)
        StringUtilities.removePunctuation(lowered).replaceAll("\\s", "")
      }
      .filterNot(slug => slug.isEmpty || slugBlacklist.contains(slug))
      .getOrElse(Scorable.NoSlug)
  }

  /** Bundles the slug and the JSON rendering of these features. */
  def toMapFeatures: MapFeatures = {
    MapFeatures(toSlug, toString)
  }
}

object ScorableFeatures {
  // Built once and shared by every instance instead of being re-allocated
  // per ScorableFeatures. Entries are already in slug form (lowercase, no
  // whitespace or punctuation) — presumably titles too generic to identify
  // a single work.
  private val SlugBlacklist: Set[String] = Set(
    "abbreviations", "abstract", "acknowledgements",
    "article", "authorreply", "authorsreply", "bookreview", "bookreviews",
    "casereport", "commentary", "commentaryon", "commenton", "commentto",
    "contents", "correspondence", "dedication", "editorialadvisoryboard",
    "focus", "hypothesis", "inbrief", "introduction", "introductiontotheissue",
    "lettertotheeditor", "listofabbreviations", "note", "overview", "preface",
    "references", "results", "review", "reviewarticle", "summary", "title",
    "name")
}