aboutsummaryrefslogtreecommitdiffstats
path: root/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
blob: f51c210e3eb43b2c777b23ff2f7e29497222d5d5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
package sandcrawler

import scala.math
import scala.util.parsing.json.JSON
import scala.util.parsing.json.JSONArray
import scala.util.parsing.json.JSONObject

import cascading.flow.FlowDef
import cascading.tuple.Fields
import com.twitter.scalding._
import com.twitter.scalding.typed.TDsl._
import parallelai.spyglass.hbase.HBasePipeConversions

// Scorable implementation backed by a text file of Crossref records, one JSON
// document per line.
class CrossrefScorable extends Scorable with HBasePipeConversions {
  // TODO: Generalize args so there can be multiple Crossref pipes in one job.
  // The input location is taken from the "crossref-input" job argument.
  def getSource(args : Args) : Source = TextLine(args("crossref-input"))

  // Reads each input line as a String, drops the lines rejected by
  // CrossrefScorable.keepRecord, and converts the remaining ones to MapFeatures.
  def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures] = {
    val jsonLines = getSource(args)
      .read
      .toTypedPipe[String](new Fields("line"))

    jsonLines
      .filter(line => CrossrefScorable.keepRecord(line))
      .map(line => CrossrefScorable.jsonToMapFeatures(line))
  }
}

// Helpers that turn one Crossref JSON record into the features used for scoring.
//
// Every accessor below tolerates malformed input: a key that is absent, present
// with a null value, or holding a value of an unexpected type is treated as
// "no data" instead of raising an exception.
object CrossrefScorable {

  // Crossref "type" values that are eligible for scoring; records of any other
  // type are emitted with Scorable.NoSlug by jsonToMapFeatures.
  val ContentTypeWhitelist: Set[String] = Set(
    "book",
    "book-chapter",
    "dataset",
    "dissertation",
    "journal-article",
    "letter",
    "monograph",
    "posted-content",
    "pre-print",
    "proceedings-article",
    "report",
    "working-paper")

  // True when the line parses as JSON and yields a usable title.
  // mapToTitle already rejects empty and over-long titles, so no further
  // length check is needed here.
  def keepRecord(json : String) : Boolean = {
    Scorable.jsonToMap(json).flatMap(map => mapToTitle(map)).isDefined
  }

  // Returns the first element of the list stored under `key` when that element
  // is a String. Returns None when the key is missing, the value is null or not
  // a list, the list is empty, or the first element is null / not a String.
  private def firstString(map : Map[String, Any], key : String) : Option[String] = {
    map.get(key) match {
      // A type pattern never matches null, so null values fall through to None.
      case Some(values: List[_]) => values.headOption.collect { case s: String => s }
      case _ => None
    }
  }

  // Returns None if title is null, empty, or too long.
  // The title is the first entry of "title"; when the first entry of "subtitle"
  // is a non-empty string it is appended as "<title>: <subtitle>".
  def mapToTitle(map : Map[String, Any]) : Option[String] = {
    firstString(map, "title").flatMap { baseTitle =>
      val title = firstString(map, "subtitle").filter(_.nonEmpty) match {
        case Some(subtitle) => s"$baseTitle: $subtitle"
        case None => baseTitle
      }
      if (title.isEmpty || title.length > Scorable.MaxTitleLength) None else Some(title)
    }
  }

  // Returns the "family" name of every entry in "author" that has one, in input
  // order. Entries that are not objects, or whose "family" is missing, null or
  // not a string, are skipped. Returns an empty list when there is no usable
  // "author" array.
  def mapToAuthorList(map : Map[String, Any]) : List[String] = {
    map.get("author") match {
      case Some(authors: List[_]) =>
        // TODO(bnewbold): combine given and family names?
        authors
          // Key/value types are erased at runtime, so the cast only restores the
          // static type needed for the lookup; the value is checked right after.
          .collect { case author: Map[_, _] => author.asInstanceOf[Map[String, Any]].get("family") }
          .collect { case Some(family: String) => family }
      case _ => List()
    }
  }

  // Extracts the year from created."date-parts"[0][0].
  // Returns None when any step of that path is missing or has an unexpected
  // type; the year itself is expected to be a Double and is truncated to Int.
  def mapToYear(map : Map[String, Any]) : Option[Int] = {
    for {
      created <- map.get("created").collect { case m: Map[_, _] => m.asInstanceOf[Map[String, Any]] }
      dateParts <- created.get("date-parts").collect { case parts: List[_] => parts }
      firstDate <- dateParts.headOption.collect { case date: List[_] => date }
      year <- firstDate.headOption.collect { case y: Double => y.toInt }
    } yield year
  }

  // Converts one JSON line into MapFeatures.
  // The result carries Scorable.NoSlug and the raw JSON when the line cannot be
  // parsed, has no usable title, has a null/empty DOI, has no authors, or has a
  // content type outside ContentTypeWhitelist. Otherwise it carries the slug and
  // string form of the ScorableFeatures built from title, authors, lower-cased
  // DOI and year (0 when the year is unavailable).
  def jsonToMapFeatures(json : String) : MapFeatures = {
    // Built lazily so that it is only constructed on the rejection paths.
    def unscorable : MapFeatures = MapFeatures(Scorable.NoSlug, json)

    Scorable.jsonToMap(json) match {
      case None => unscorable
      case Some(map) =>
        mapToTitle(map) match {
          case None => unscorable
          case Some(title) => {
            val doi = Scorable.getString(map, "DOI")
            val authors: List[String] = mapToAuthorList(map)
            val year: Int = mapToYear(map).getOrElse(0)
            val contentType: String = map.get("type") match {
              case Some(t: String) => t
              case _ => "MISSING-CONTENT-TYPE"
            }
            // The null test must come first: calling isEmpty on a null DOI would throw.
            if (doi == null || doi.isEmpty || authors.isEmpty || !ContentTypeWhitelist.contains(contentType)) {
              unscorable
            } else {
              val sf : ScorableFeatures = ScorableFeatures.create(title=title, authors=authors, doi=doi.toLowerCase(), year=year)
              MapFeatures(sf.toSlug, sf.toString)
            }
          }
        }
    }
  }
}