aboutsummaryrefslogtreecommitdiffstats
path: root/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
blob: 93cd78dc888ad101aa4373e287ffe372b2c071b0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
package sandcrawler

import java.io.InputStream

import scala.io.Source
import scala.util.parsing.json.JSONArray
import scala.util.parsing.json.JSONObject

/**
 * Companion of the ScorableFeatures class: holds the slug denylist, the
 * minimum slug length, and the factory used to build instances.
 */
object ScorableFeatures {
  // Classpath location of the denylist; it is read line by line below.
  private val SlugDenylistResource = "/slug-denylist.txt"

  // Kept as a public member for backward compatibility. The stream is fully
  // consumed and closed during object initialization, so it must not be read
  // from afterwards.
  val fileStream : InputStream = getClass.getResourceAsStream(SlugDenylistResource)
  // getResourceAsStream signals a missing resource by returning null; fail
  // with an explicit message instead of an opaque NullPointerException.
  if (fileStream == null) {
    throw new IllegalStateException(
      "Slug denylist resource not found on classpath: " + SlugDenylistResource)
  }

  /** Slugs that must never be emitted; one entry per line of the resource. */
  val SlugDenylist : Set[String] =
    try {
      Source.fromInputStream(fileStream).getLines.toSet
    } finally {
      // Always release the stream, even if reading the denylist fails.
      fileStream.close()
    }

  /** Slugs shorter than this many characters are rejected by toSlug. */
  val MinSlugLength = 8

  // Replaces a null string with the empty string.
  private def orEmpty(s : String) : String = if (s == null) "" else s

  /**
   * Static factory method. Null arguments are normalized so that the created
   * instance never holds null: null strings become "", a null author list
   * becomes an empty list, and null entries inside the author list become "".
   *
   * @param title          title of the work
   * @param authors        author entries (passed through otherwise unchanged)
   * @param year           year, 0 by default
   * @param doi            DOI, "" by default
   * @param fatcat_release fatcat release value, "" by default
   * @param fatcat_work    fatcat work value, "" by default
   * @param sha1           SHA-1 value, "" by default
   * @return a new ScorableFeatures holding the normalized values
   */
  def create(title : String, authors : List[Any] = List(), year : Int = 0, doi : String = "", fatcat_release : String = "", fatcat_work : String = "", sha1 : String = "") : ScorableFeatures = {
    val safeAuthors : List[Any] =
      if (authors == null) List() else authors.map(a => if (a == null) "" else a)
    new ScorableFeatures(
      title=orEmpty(title),
      authors=safeAuthors,
      year=year,
      doi=orEmpty(doi),
      fatcat_release=orEmpty(fatcat_release),
      fatcat_work=orEmpty(fatcat_work),
      sha1=orEmpty(sha1))
  }
}

/**
 * Contains features needed to make a slug and to score (in combination
 * with a second ScorableFeatures). The constructor is private; create
 * instances with the static factory method `ScorableFeatures.create`.
 */
class ScorableFeatures private(title : String, authors : List[Any] = List(), year: Int = 0, doi : String = "", fatcat_release : String = "", fatcat_work : String = "", sha1: String = "") {

  /** Returns all features as a map; the author list is wrapped in a JSONArray. */
  def toMap() : Map[String, Any] =
    Map(
      "title" -> title,
      "authors" -> JSONArray(authors),
      "year" -> year,
      "doi" -> doi,
      "fatcat_release" -> fatcat_release,
      "fatcat_work" -> fatcat_work,
      "sha1" -> sha1)

  /** Renders the map produced by toMap as a JSON object string. */
  override def toString() : String =
    JSONObject(toMap()).toString

  /**
   * Derives the slug from the title: accents removed, lower-cased,
   * punctuation removed, and all whitespace stripped.
   *
   * @return None when the title is null, or when the resulting slug is empty,
   *         is listed in ScorableFeatures.SlugDenylist, or is shorter than
   *         ScorableFeatures.MinSlugLength; otherwise Some(slug)
   */
  def toSlug() : Option[String] =
    Option(title).flatMap { rawTitle =>
      val unaccented = StringUtilities.removeAccents(rawTitle)
      // Lower-case with a fixed locale so the slug does not depend on the
      // JVM's default locale (e.g. Turkish dotless-i rules).
      val lowered = unaccented.toLowerCase(java.util.Locale.ROOT)
      // Remove punctuation, then drop every whitespace character.
      val slug = StringUtilities.removePunctuation(lowered).replaceAll("\\s", "")
      val rejected =
        slug.isEmpty ||
          ScorableFeatures.SlugDenylist.contains(slug) ||
          slug.length < ScorableFeatures.MinSlugLength
      if (rejected) None else Some(slug)
    }

  /**
   * Pairs the slug with this object's JSON string in a MapFeatures.
   *
   * @return None when toSlug yields no slug
   */
  def toMapFeatures : Option[MapFeatures] =
    toSlug().map(slug => MapFeatures(slug, toString()))
}