// path: root/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
// blob: 039fa858ff0424cc0a90cc62fa5628f7e4f98432 (plain)
package sandcrawler

import scala.math
import scala.util.parsing.json.JSON
import scala.util.parsing.json.JSONArray
import scala.util.parsing.json.JSONObject

import cascading.flow.FlowDef
import cascading.tuple.Fields
import com.twitter.scalding._
import com.twitter.scalding.typed.TDsl._
import parallelai.spyglass.hbase.HBasePipeConversions

/**
 * Scorable whose features come from a text input of Crossref records.
 *
 * Every input line is handed to the helpers in the companion object, which
 * parse it as JSON and derive the features used for scoring.
 */
class CrossrefScorable extends Scorable with HBasePipeConversions {
  // TODO: Generalize args so there can be multiple Crossref pipes in one job.
  /** Builds the line-oriented source located by the "crossref-input" argument. */
  def getSource(args : Args) : Source = TextLine(args("crossref-input"))

  /**
   * Reads the source line by line, drops the lines rejected by
   * CrossrefScorable.keepRecord, and converts the remaining ones with
   * CrossrefScorable.jsonToMapFeatures.
   */
  def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures] = {
    val rawLines = getSource(args).read
    val jsonLines : TypedPipe[String] = rawLines.toTypedPipe[String](new Fields("line"))
    val keptLines = jsonLines.filter(line => CrossrefScorable.keepRecord(line))
    keptLines.map(line => CrossrefScorable.jsonToMapFeatures(line))
  }
}

/**
 * Helpers that turn one line of Crossref JSON into scoring features.
 *
 * The parsed record is an untyped Map[String, Any], so every field is
 * inspected with pattern matching rather than cast: a field that is missing,
 * null, or of an unexpected shape is treated as absent instead of throwing.
 */
object CrossrefScorable {

  /** Values of the "type" field that are eligible for a slug; records of any other type get Scorable.NoSlug. */
  val ContentTypeWhitelist: Set[String] = Set(
    "book",
    "book-chapter",
    "dataset",
    "dissertation",
    "journal-article",
    "letter",
    "monograph",
    "posted-content",
    "pre-print",
    "proceedings-article",
    "report",
    "working-paper")

  /**
   * Decides whether a raw input line should be kept.
   *
   * @param json one input line
   * @return true only when Scorable.jsonToMap yields a map and mapToTitle
   *         finds a usable title in it
   */
  def keepRecord(json : String) : Boolean = {
    // mapToTitle already rejects over-long titles, so no second length check is needed.
    Scorable.jsonToMap(json).flatMap(map => mapToTitle(map)).isDefined
  }

  /**
   * Extracts the first entry of the "title" list.
   *
   * @param map parsed record
   * @return None if the title is missing, null, not a list, an empty list,
   *         or if its first entry is null, not a string, empty, or longer
   *         than Scorable.MaxTitleLength
   */
  def mapToTitle(map : Map[String, Any]) : Option[String] = {
    map.get("title") match {
      // A type pattern never matches null, so a null "title" falls through to None.
      case Some(titles : List[_]) =>
        titles.headOption.collect {
          case title : String if title.nonEmpty && title.length <= Scorable.MaxTitleLength => title
        }
      case _ => None
    }
  }

  /**
   * Collects the "family" name of every entry in the "author" list.
   *
   * @param map parsed record
   * @return the family names in input order; entries that are not objects or
   *         that have no string "family" value are skipped, and a missing,
   *         null or non-list "author" field yields an empty list
   */
  def mapToAuthorList(map : Map[String, Any]) : List[String] = {
    map.get("author") match {
      case Some(authors : List[_]) =>
        // TODO(bnewbold): combine given and family names?
        authors
          // The cast only refines the erased type parameters of a value already known to be a Map.
          .collect { case author : Map[_, _] => author.asInstanceOf[Map[String, Any]].get("family") }
          .collect { case Some(family : String) => family }
      case _ => List()
    }
  }

  /**
   * Reads the year from created -> "date-parts" -> first date -> first element.
   *
   * @param map parsed record
   * @return the year truncated to Int, or None when any step of that path is
   *         missing, empty, or not of the expected type (the year itself must
   *         be a Double)
   */
  def mapToYear(map : Map[String, Any]) : Option[Int] = {
    for {
      created <- map.get("created").collect { case obj : Map[_, _] => obj.asInstanceOf[Map[String, Any]] }
      dateParts <- created.get("date-parts").collect { case parts : List[_] => parts }
      firstDate <- dateParts.headOption.collect { case date : List[_] => date }
      year <- firstDate.headOption.collect { case value : Double => value.toInt }
    } yield year
  }

  /**
   * Converts one input line into MapFeatures.
   *
   * A record is scorable only when it parses, has a usable title, a non-empty
   * DOI, at least one author family name, and a "type" listed in
   * ContentTypeWhitelist. Scorable records carry the slug and string form of
   * their ScorableFeatures; every other line is returned unchanged under
   * Scorable.NoSlug. A missing year is passed on as 0.
   *
   * @param json one input line
   * @return the features for the line, never null
   */
  def jsonToMapFeatures(json : String) : MapFeatures = {
    val scorable : Option[ScorableFeatures] = for {
      map <- Scorable.jsonToMap(json)
      title <- mapToTitle(map)
      // NOTE(review): Scorable.getString is not visible here; the key is checked
      // first in case it does not tolerate a missing entry -- confirm.
      if map contains "DOI"
      // Option(...) turns a null DOI into None before the emptiness test.
      doi <- Option(Scorable.getString(map, "DOI")).filter(_.nonEmpty)
      authors = mapToAuthorList(map)
      if authors.nonEmpty
      contentType <- map.get("type").collect { case value : String => value }
      if ContentTypeWhitelist contains contentType
    } yield {
      val year : Int = mapToYear(map).getOrElse(0)
      // Locale.ROOT keeps the lower-casing independent of the JVM's default locale.
      ScorableFeatures.create(title=title, authors=authors, doi=doi.toLowerCase(java.util.Locale.ROOT), year=year)
    }

    scorable match {
      case Some(sf) => MapFeatures(sf.toSlug, sf.toString)
      case None => MapFeatures(Scorable.NoSlug, json)
    }
  }
}