diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-08-23 17:50:43 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-08-23 19:36:20 -0700 |
commit | 6ea7b7fdb9330e69afbbe2d2afe3e6b8c83fb4fb (patch) | |
tree | 25394312b98ad4e139f07a9b882e1f42fc13e128 /scalding | |
parent | 2656af2686aa73d0061a581bef3b9ca9d4ad8451 (diff) | |
download | sandcrawler-6ea7b7fdb9330e69afbbe2d2afe3e6b8c83fb4fb.tar.gz sandcrawler-6ea7b7fdb9330e69afbbe2d2afe3e6b8c83fb4fb.zip |
author parsing (and year, for crossref)
Diffstat (limited to 'scalding')
5 files changed, 57 insertions, 7 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala index ab33d03..babb4f9 100644 --- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala +++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala @@ -2,6 +2,7 @@ package sandcrawler import scala.math import scala.util.parsing.json.JSON +import scala.util.parsing.json.JSONArray import scala.util.parsing.json.JSONObject import cascading.flow.FlowDef @@ -52,6 +53,33 @@ object CrossrefScorable { } } + def mapToAuthorList(map : Map[String, Any]) : List[String] = { + if (map contains "author") { + val objArray = map("author").asInstanceOf[List[Any]].map(e => e.asInstanceOf[Map[String,Any]]) + // TODO(bnewbold): combine given and family names? + objArray + .filter(e => e contains "family") + .map(e => e.get("family").get.asInstanceOf[String]) + } else { + List() + } + } + + def mapToYear(map : Map[String, Any]) : Option[Int] = { + map.get("created") match { + case None => None + case Some(created) => { + Some(created.asInstanceOf[Map[String,Any]] + .get("date-parts") + .get + .asInstanceOf[List[Any]](0) + .asInstanceOf[List[Any]](0) + .asInstanceOf[Double] + .toInt) + } + } + } + def jsonToMapFeatures(json : String) : MapFeatures = { Scorable.jsonToMap(json) match { case None => MapFeatures(Scorable.NoSlug, json) @@ -60,10 +88,12 @@ object CrossrefScorable { case None => MapFeatures(Scorable.NoSlug, json) case Some(title) => { val doi = Scorable.getString(map, "DOI") + val authors: List[String] = mapToAuthorList(map) + val year: Int = mapToYear(map).getOrElse(0) if (doi.isEmpty || doi == null) { MapFeatures(Scorable.NoSlug, json) } else { - val sf : ScorableFeatures = ScorableFeatures.create(title=title, doi=doi) + val sf : ScorableFeatures = ScorableFeatures.create(title=title, authors=authors, doi=doi.toLowerCase(), year=year) MapFeatures(sf.toSlug, sf.toString) } } diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala index 76f4f22..c55cb40 100644 --- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala +++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala @@ -51,6 +51,16 @@ object GrobidScorable { } } + def mapToAuthorList(map : Map[String, Any]) : List[String] = { + if (map contains "authors") { + val objArray = map("authors").asInstanceOf[List[Any]].map(e => e.asInstanceOf[Map[String,Any]]) + objArray + .filter(e => e contains "name") + .map(e => e.get("name").get.asInstanceOf[String]) + } else { + List() + } + } def getHBaseSource(table : String, host : String) : HBaseSource = { HBaseBuilder.build(table, host, List("grobid0:metadata", "grobid0:status_code"), SourceMode.SCAN_ALL) @@ -61,7 +71,9 @@ object GrobidScorable { case None => MapFeatures(Scorable.NoSlug, json) case Some(map) => { if (map contains "title") { - ScorableFeatures.create(title=Scorable.getString(map, "title"), sha1=key).toMapFeatures + val authors: List[String] = mapToAuthorList(map) + val title = Scorable.getString(map, "title") + ScorableFeatures.create(title=title, authors=authors, sha1=key).toMapFeatures } else { MapFeatures(Scorable.NoSlug, json) } diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala index 9eb03f7..241db79 100644 --- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala +++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala @@ -3,6 +3,7 @@ package sandcrawler import java.io.InputStream import scala.io.Source +import scala.util.parsing.json.JSONArray import scala.util.parsing.json.JSONObject object ScorableFeatures { @@ -13,9 +14,10 @@ object ScorableFeatures { val MinSlugLength = 8 // Static factory method - def create(title : String, year : Int = 0, doi : String = "", sha1 : String = "") : ScorableFeatures = { + def create(title : String, authors : List[Any] = List(), year : Int = 0, doi : String = "", sha1 : String = "") : ScorableFeatures = { new ScorableFeatures( title=if (title == null) "" else title, + authors=if (authors == null) List() else authors.map(a => if (a == null) "" else a), year=year, doi=if (doi == null) "" else doi, sha1=if (sha1 == null) "" else sha1) @@ -24,13 +26,14 @@ object ScorableFeatures { // Contains features needed to make slug and to score (in combination // with a second ScorableFeatures). Create with above static factory method. -class ScorableFeatures private(title : String, year: Int = 0, doi : String = "", sha1: String = "") { +class ScorableFeatures private(title : String, authors : List[Any] = List(), year: Int = 0, doi : String = "", sha1: String = "") { def toMap() : Map[String, Any] = - Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1) + Map("title" -> title, "authors" -> JSONArray(authors), "year" -> year, "doi" -> doi, "sha1" -> sha1) - override def toString() : String = + override def toString() : String = { JSONObject(toMap).toString + } def toSlug() : String = { if (title == null) { diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala index 3d18a21..ac7cc70 100644 --- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala @@ -64,7 +64,7 @@ class CrossrefScorableTest extends FlatSpec with Matchers { "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ], "subject" : [ "Pediatrics, Perinatology, and Child Health" ] } -""" +""".replace("<<DOI>>", "10.123/aBc") // scalastyle:on val CrossrefStringWithGoodTitle = CrossrefString.replace("<<TITLE>>", "Some Title") val CrossrefStringWithMaximumTitle = CrossrefString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength) @@ -102,6 +102,10 @@ class CrossrefScorableTest extends FlatSpec with Matchers { case None => fail() case Some(map) => { map("title").asInstanceOf[String] shouldBe "Some Title" + map("doi").asInstanceOf[String] shouldBe "10.123/abc" + // TODO: full name? not just a string? + map("authors").asInstanceOf[List[String]] shouldBe List("Gaier") + map("year").asInstanceOf[Double].toInt shouldBe 2002 } } } diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala index 6c45cc5..119cf90 100644 --- a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala @@ -90,6 +90,7 @@ class GrobidScorableTest extends FlatSpec with Matchers { case Some(map) => { map should contain key "title" map("title").asInstanceOf[String] shouldBe "Dummy Example File" + map("authors").asInstanceOf[List[String]] shouldBe List("Brewster Kahle", "J Doe") } } } |