about | summary | refs | log | tree | commit | diff | stats
path: root/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
diff options
context:
space:
mode:
Diffstat (limited to 'scalding/src/main/scala/sandcrawler/CrossrefScorable.scala')
-rw-r--r-- scalding/src/main/scala/sandcrawler/CrossrefScorable.scala 98
1 files changed, 85 insertions, 13 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
index 5d1eaf5..039fa85 100644
--- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
@@ -2,6 +2,7 @@ package sandcrawler
import scala.math
import scala.util.parsing.json.JSON
+import scala.util.parsing.json.JSONArray
import scala.util.parsing.json.JSONObject
import cascading.flow.FlowDef
@@ -19,29 +20,100 @@ class CrossrefScorable extends Scorable with HBasePipeConversions {
def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures] = {
getSource(args).read
.toTypedPipe[String](new Fields("line"))
+ .filter { CrossrefScorable.keepRecord(_) } // drop lines that fail JSON parsing or have no usable title (see keepRecord)
.map { CrossrefScorable.jsonToMapFeatures(_) }
}
}
object CrossrefScorable {
+
+  // Values of the record's "type" field that jsonToMapFeatures accepts; a record
+  // whose type is missing or not listed here is emitted with Scorable.NoSlug.
+  val ContentTypeWhitelist: Set[String] =
+    Set(
+      "book", "book-chapter", "dataset", "dissertation",
+      "journal-article", "letter", "monograph", "posted-content",
+      "pre-print", "proceedings-article", "report", "working-paper"
+    )
+
+  // Filter predicate for raw input lines: true only when the line parses as
+  // JSON and mapToTitle yields a title no longer than Scorable.MaxTitleLength.
+  def keepRecord(json : String) : Boolean = {
+    val maybeTitle = Scorable.jsonToMap(json).flatMap(record => mapToTitle(record))
+    maybeTitle.exists(candidate => candidate.length <= Scorable.MaxTitleLength)
+  }
+
+  // Returns the first entry of the "title" array. Returns None if the key is
+  // missing, or if the value or its first entry is null, empty, not of the
+  // expected type, or longer than Scorable.MaxTitleLength.
+  def mapToTitle(map : Map[String, Any]) : Option[String] = {
+    map.get("title") match {
+      // A type pattern never matches null, so `"title": null` and
+      // `"title": [ null ]` fall through to None instead of throwing.
+      case Some(titles : List[_]) =>
+        titles.headOption match {
+          case Some(title : String) if !title.isEmpty && title.length <= Scorable.MaxTitleLength =>
+            Some(title)
+          case _ => None
+        }
+      case _ => None
+    }
+  }
+
+  // Collects the "family" name of every entry in the "author" array, in input
+  // order. A missing, null, or non-array "author" value yields an empty list;
+  // entries that are not objects, or whose "family" is absent, null, or not a
+  // string, are skipped.
+  def mapToAuthorList(map : Map[String, Any]) : List[String] = {
+    map.get("author") match {
+      case Some(entries : List[_]) =>
+        // TODO(bnewbold): combine given and family names?
+        entries
+          // The cast only widens erased type parameters; the runtime class was
+          // already checked by the pattern.
+          .collect { case author : Map[_, _] => author.asInstanceOf[Map[String, Any]].get("family") }
+          .collect { case Some(family : String) => family }
+      case _ => List()
+    }
+  }
+
+  // Extracts the year as the first number of the first "date-parts" entry under
+  // "created", i.e. created -> date-parts -> [0] -> [0]. Returns None when any
+  // step of that path is missing, null, empty, or not of the expected type.
+  def mapToYear(map : Map[String, Any]) : Option[Int] = {
+    for {
+      created   <- map.get("created").collect { case m : Map[_, _] => m.asInstanceOf[Map[String, Any]] }
+      dateParts <- created.get("date-parts").collect { case parts : List[_] => parts }
+      firstDate <- dateParts.headOption.collect { case parts : List[_] => parts }
+      // JSON numbers are expected as Double (the scala.util.parsing.json default).
+      year      <- firstDate.headOption.collect { case y : Double => y.toInt }
+    } yield year
+  }
+
+ // Converts one Crossref JSON line into MapFeatures. Lines that cannot be parsed,
+ // that lack a usable title, DOI, or author, or whose "type" is not in
+ // ContentTypeWhitelist are emitted with Scorable.NoSlug and the raw JSON.
def jsonToMapFeatures(json : String) : MapFeatures = {
Scorable.jsonToMap(json) match {
case None => MapFeatures(Scorable.NoSlug, json)
- case Some(map) => {
- if ((map contains "title") && (map contains "DOI")) {
- val titles = map("title").asInstanceOf[List[String]]
- val doi = Scorable.getString(map, "DOI")
- if (titles.isEmpty || titles == null || doi.isEmpty || doi == null) {
- new MapFeatures(Scorable.NoSlug, json)
- } else {
- // bnewbold: not checking that titles(0) is non-null/non-empty; case would be, in JSON, "title": [ null ]
- val sf : ScorableFeatures = ScorableFeatures.create(title=titles(0), doi=doi)
- new MapFeatures(sf.toSlug, sf.toString)
+ case Some(map) =>
+ mapToTitle(map) match {
+ case None => MapFeatures(Scorable.NoSlug, json)
+ case Some(title) => {
+ val doi = Scorable.getString(map, "DOI")
+ val authors: List[String] = mapToAuthorList(map)
+ val year: Int = mapToYear(map).getOrElse(0)
+ // A non-string "type" is treated like a missing one instead of throwing on the cast.
+ val contentType: String = map.get("type").collect { case t : String => t }.getOrElse("MISSING-CONTENT-TYPE")
+ // The null check must precede isEmpty; otherwise a null DOI throws instead of being rejected.
+ if (doi == null || doi.isEmpty || authors.isEmpty || !(ContentTypeWhitelist contains contentType)) {
+ MapFeatures(Scorable.NoSlug, json)
+ } else {
+ val sf : ScorableFeatures = ScorableFeatures.create(title=title, authors=authors, doi=doi.toLowerCase(), year=year)
+ MapFeatures(sf.toSlug, sf.toString)
+ }
}
- } else {
- new MapFeatures(Scorable.NoSlug, json)
}
- }
}
}
}