diff options
Diffstat (limited to 'scalding')
5 files changed, 57 insertions, 7 deletions
| diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala index ab33d03..babb4f9 100644 --- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala +++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala @@ -2,6 +2,7 @@ package sandcrawler  import scala.math  import scala.util.parsing.json.JSON +import scala.util.parsing.json.JSONArray  import scala.util.parsing.json.JSONObject  import cascading.flow.FlowDef @@ -52,6 +53,33 @@ object CrossrefScorable {      }    } +  def mapToAuthorList(map : Map[String, Any]) : List[String] = { +    if (map contains "author") { +      val objArray = map("author").asInstanceOf[List[Any]].map(e => e.asInstanceOf[Map[String,Any]]) +      // TODO(bnewbold): combine given and family names? +      objArray +        .filter(e => e contains "family") +        .map(e => e.get("family").get.asInstanceOf[String]) +    } else { +      List() +    } +  } + +  def mapToYear(map : Map[String, Any]) : Option[Int] = { +    map.get("created") match { +      case None => None +      case Some(created) => { +        Some(created.asInstanceOf[Map[String,Any]] +                    .get("date-parts") +                    .get +                    .asInstanceOf[List[Any]](0) +                    .asInstanceOf[List[Any]](0) +                    .asInstanceOf[Double] +                    .toInt) +      } +    } +  } +    def jsonToMapFeatures(json : String) : MapFeatures = {      Scorable.jsonToMap(json) match {        case None => MapFeatures(Scorable.NoSlug, json) @@ -60,10 +88,12 @@ object CrossrefScorable {            case None => MapFeatures(Scorable.NoSlug, json)            case Some(title) => {              val doi = Scorable.getString(map, "DOI") +            val authors: List[String] = mapToAuthorList(map) +            val year: Int = mapToYear(map).getOrElse(0)              if (doi.isEmpty || doi == null) {                MapFeatures(Scorable.NoSlug, json)              } else { -              val sf : ScorableFeatures = ScorableFeatures.create(title=title, doi=doi) +              val sf : ScorableFeatures = ScorableFeatures.create(title=title, authors=authors, doi=doi.toLowerCase(), year=year)                MapFeatures(sf.toSlug, sf.toString)              }            } diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala index 76f4f22..c55cb40 100644 --- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala +++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala @@ -51,6 +51,16 @@ object GrobidScorable {      }    } +  def mapToAuthorList(map : Map[String, Any]) : List[String] = { +    if (map contains "authors") { +      val objArray = map("authors").asInstanceOf[List[Any]].map(e => e.asInstanceOf[Map[String,Any]]) +      objArray +        .filter(e => e contains "name") +        .map(e => e.get("name").get.asInstanceOf[String]) +    } else { +      List() +    } +  }    def getHBaseSource(table : String, host : String) : HBaseSource = {      HBaseBuilder.build(table, host, List("grobid0:metadata", "grobid0:status_code"), SourceMode.SCAN_ALL) @@ -61,7 +71,9 @@ object GrobidScorable {        case None => MapFeatures(Scorable.NoSlug, json)        case Some(map) => {          if (map contains "title") { -          ScorableFeatures.create(title=Scorable.getString(map, "title"), sha1=key).toMapFeatures +          val authors: List[String] = mapToAuthorList(map) +          val title = Scorable.getString(map, "title") +          ScorableFeatures.create(title=title, authors=authors, sha1=key).toMapFeatures          } else {            MapFeatures(Scorable.NoSlug, json)          } diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala index 9eb03f7..241db79 100644 --- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala +++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala @@ -3,6 +3,7 @@ package sandcrawler  import java.io.InputStream  import scala.io.Source +import scala.util.parsing.json.JSONArray  import scala.util.parsing.json.JSONObject  object ScorableFeatures { @@ -13,9 +14,10 @@ object ScorableFeatures {    val MinSlugLength = 8    // Static factory method -  def create(title : String, year : Int = 0, doi : String = "", sha1 : String = "") : ScorableFeatures = { +  def create(title : String, authors : List[Any] = List(), year : Int = 0, doi : String = "", sha1 : String = "") : ScorableFeatures = {      new ScorableFeatures(        title=if (title == null) "" else title, +      authors=if (authors == null) List() else authors.map(a => if (a == null) "" else a),        year=year,        doi=if (doi == null) "" else doi,        sha1=if (sha1 == null) "" else sha1) @@ -24,13 +26,14 @@ object ScorableFeatures {  // Contains features needed to make slug and to score (in combination  // with a second ScorableFeatures). Create with above static factory method. -class ScorableFeatures private(title : String, year: Int = 0, doi : String = "", sha1: String = "") { +class ScorableFeatures private(title : String, authors : List[Any] = List(), year: Int = 0, doi : String = "", sha1: String = "") {    def toMap() : Map[String, Any] = -    Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1) +    Map("title" -> title, "authors" -> JSONArray(authors), "year" -> year, "doi" -> doi, "sha1" -> sha1) -  override def toString() : String = +  override def toString() : String = {      JSONObject(toMap).toString +  }    def toSlug() : String = {      if (title == null) { diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala index 3d18a21..ac7cc70 100644 --- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala @@ -64,7 +64,7 @@ class CrossrefScorableTest extends FlatSpec with Matchers {    "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ],    "subject" : [ "Pediatrics, Perinatology, and Child Health" ]  } -""" +""".replace("<<DOI>>", "10.123/aBc")    // scalastyle:on    val CrossrefStringWithGoodTitle = CrossrefString.replace("<<TITLE>>", "Some Title")    val CrossrefStringWithMaximumTitle = CrossrefString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength) @@ -102,6 +102,10 @@ class CrossrefScorableTest extends FlatSpec with Matchers {        case None => fail()        case Some(map) => {          map("title").asInstanceOf[String] shouldBe "Some Title" +        map("doi").asInstanceOf[String] shouldBe "10.123/abc" +        // TODO: full name? not just a string? +        map("authors").asInstanceOf[List[String]] shouldBe List("Gaier") +        map("year").asInstanceOf[Double].toInt shouldBe 2002        }      }    } diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala index 6c45cc5..119cf90 100644 --- a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala @@ -90,6 +90,7 @@ class GrobidScorableTest extends FlatSpec with Matchers {        case Some(map) => {          map should contain key "title"          map("title").asInstanceOf[String] shouldBe "Dummy Example File" +        map("authors").asInstanceOf[List[String]] shouldBe List("Brewster Kahle", "J Doe")        }      }    } | 
