aboutsummaryrefslogtreecommitdiffstats
path: root/scalding
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-08-23 17:50:43 -0700
committer Bryan Newbold <bnewbold@archive.org> 2018-08-23 19:36:20 -0700
commit 6ea7b7fdb9330e69afbbe2d2afe3e6b8c83fb4fb (patch)
tree 25394312b98ad4e139f07a9b882e1f42fc13e128 /scalding
parent 2656af2686aa73d0061a581bef3b9ca9d4ad8451 (diff)
download sandcrawler-6ea7b7fdb9330e69afbbe2d2afe3e6b8c83fb4fb.tar.gz
sandcrawler-6ea7b7fdb9330e69afbbe2d2afe3e6b8c83fb4fb.zip
author parsing (and year, for crossref)
Diffstat (limited to 'scalding')
-rw-r--r-- scalding/src/main/scala/sandcrawler/CrossrefScorable.scala 32
-rw-r--r-- scalding/src/main/scala/sandcrawler/GrobidScorable.scala 14
-rw-r--r-- scalding/src/main/scala/sandcrawler/ScorableFeatures.scala 11
-rw-r--r-- scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala 6
-rw-r--r-- scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala 1
5 files changed, 57 insertions, 7 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
index ab33d03..babb4f9 100644
--- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
@@ -2,6 +2,7 @@ package sandcrawler
import scala.math
import scala.util.parsing.json.JSON
+import scala.util.parsing.json.JSONArray
import scala.util.parsing.json.JSONObject
import cascading.flow.FlowDef
@@ -52,6 +53,33 @@ object CrossrefScorable {
}
}
+ /**
+  * Collects the "family" value of every entry listed under the "author" key.
+  *
+  * Malformed input is skipped instead of raising ClassCastException: a
+  * missing or non-list "author" value yields an empty list, and entries that
+  * are not JSON objects, lack a "family" key, or carry a non-string (or
+  * null) "family" value are dropped.
+  *
+  * @param map parsed JSON object for a single record
+  * @return the "family" strings, in their original order
+  */
+ def mapToAuthorList(map : Map[String, Any]) : List[String] = {
+   // TODO(bnewbold): combine given and family names?
+   map.get("author") match {
+     case Some(entries : List[_]) =>
+       entries
+         // Generic arguments are erased at runtime; only "is a Map" is checked here.
+         .collect { case author : Map[String, Any] @unchecked => author.get("family") }
+         // A typed pattern never matches null, so null family values are dropped too.
+         .collect { case Some(family : String) => family }
+     case _ => List()
+   }
+ }
+
+ /**
+  * Reads the first number of the first "date-parts" entry under "created"
+  * and returns it truncated to an Int.
+  *
+  * Returns None, rather than throwing, when "created" is absent or not a
+  * JSON object, when "date-parts" is missing, when either list level is
+  * empty, or when the leading value is not a number.
+  *
+  * @param map parsed JSON object for a single record
+  * @return the leading "date-parts" value as an Int, if present and numeric
+  */
+ def mapToYear(map : Map[String, Any]) : Option[Int] = {
+   map.get("created")
+     // Generic arguments are erased at runtime; only "is a Map" is checked here.
+     .collect { case created : Map[String, Any] @unchecked => created }
+     .flatMap(_.get("date-parts"))
+     // Expected shape: a non-empty list whose head is a non-empty list
+     // starting with a Double (the same numeric type the original cast required).
+     .collect { case ((year : Double) :: _) :: _ => year.toInt }
+ }
+
def jsonToMapFeatures(json : String) : MapFeatures = {
Scorable.jsonToMap(json) match {
case None => MapFeatures(Scorable.NoSlug, json)
@@ -60,10 +88,12 @@ object CrossrefScorable {
case None => MapFeatures(Scorable.NoSlug, json)
case Some(title) => {
val doi = Scorable.getString(map, "DOI")
+ val authors: List[String] = mapToAuthorList(map)
+ val year: Int = mapToYear(map).getOrElse(0)
if (doi.isEmpty || doi == null) {
MapFeatures(Scorable.NoSlug, json)
} else {
- val sf : ScorableFeatures = ScorableFeatures.create(title=title, doi=doi)
+ val sf : ScorableFeatures = ScorableFeatures.create(title=title, authors=authors, doi=doi.toLowerCase(), year=year)
MapFeatures(sf.toSlug, sf.toString)
}
}
diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
index 76f4f22..c55cb40 100644
--- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
@@ -51,6 +51,16 @@ object GrobidScorable {
}
}
+ /**
+  * Collects the "name" value of every entry listed under the "authors" key.
+  *
+  * Malformed input is skipped instead of raising ClassCastException: a
+  * missing or non-list "authors" value yields an empty list, and entries that
+  * are not JSON objects, lack a "name" key, or carry a non-string (or null)
+  * "name" value are dropped.
+  *
+  * @param map parsed JSON object for a single record
+  * @return the "name" strings, in their original order
+  */
+ def mapToAuthorList(map : Map[String, Any]) : List[String] = {
+   map.get("authors") match {
+     case Some(entries : List[_]) =>
+       entries
+         // Generic arguments are erased at runtime; only "is a Map" is checked here.
+         .collect { case author : Map[String, Any] @unchecked => author.get("name") }
+         // A typed pattern never matches null, so null names are dropped too.
+         .collect { case Some(name : String) => name }
+     case _ => List()
+   }
+ }
def getHBaseSource(table : String, host : String) : HBaseSource = {
HBaseBuilder.build(table, host, List("grobid0:metadata", "grobid0:status_code"), SourceMode.SCAN_ALL)
@@ -61,7 +71,9 @@ object GrobidScorable {
case None => MapFeatures(Scorable.NoSlug, json)
case Some(map) => {
if (map contains "title") {
- ScorableFeatures.create(title=Scorable.getString(map, "title"), sha1=key).toMapFeatures
+ val authors: List[String] = mapToAuthorList(map)
+ val title = Scorable.getString(map, "title")
+ ScorableFeatures.create(title=title, authors=authors, sha1=key).toMapFeatures
} else {
MapFeatures(Scorable.NoSlug, json)
}
diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
index 9eb03f7..241db79 100644
--- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
+++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
@@ -3,6 +3,7 @@ package sandcrawler
import java.io.InputStream
import scala.io.Source
+import scala.util.parsing.json.JSONArray
import scala.util.parsing.json.JSONObject
object ScorableFeatures {
@@ -13,9 +14,10 @@ object ScorableFeatures {
val MinSlugLength = 8
// Static factory method
- def create(title : String, year : Int = 0, doi : String = "", sha1 : String = "") : ScorableFeatures = {
+ def create(title : String, authors : List[Any] = List(), year : Int = 0, doi : String = "", sha1 : String = "") : ScorableFeatures = {
new ScorableFeatures(
title=if (title == null) "" else title,
+ authors=if (authors == null) List() else authors.map(a => if (a == null) "" else a),
year=year,
doi=if (doi == null) "" else doi,
sha1=if (sha1 == null) "" else sha1)
@@ -24,13 +26,14 @@ object ScorableFeatures {
// Contains features needed to make slug and to score (in combination
// with a second ScorableFeatures). Create with above static factory method.
-class ScorableFeatures private(title : String, year: Int = 0, doi : String = "", sha1: String = "") {
+class ScorableFeatures private(title : String, authors : List[Any] = List(), year: Int = 0, doi : String = "", sha1: String = "") {
def toMap() : Map[String, Any] =
- Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1)
+ Map("title" -> title, "authors" -> JSONArray(authors), "year" -> year, "doi" -> doi, "sha1" -> sha1)
- override def toString() : String =
+ override def toString() : String = {
JSONObject(toMap).toString
+ }
def toSlug() : String = {
if (title == null) {
diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
index 3d18a21..ac7cc70 100644
--- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
@@ -64,7 +64,7 @@ class CrossrefScorableTest extends FlatSpec with Matchers {
"issn-type" : [ { "value" : "0987-7983", "type" : "print" } ],
"subject" : [ "Pediatrics, Perinatology, and Child Health" ]
}
-"""
+""".replace("<<DOI>>", "10.123/aBc")
// scalastyle:on
val CrossrefStringWithGoodTitle = CrossrefString.replace("<<TITLE>>", "Some Title")
val CrossrefStringWithMaximumTitle = CrossrefString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength)
@@ -102,6 +102,10 @@ class CrossrefScorableTest extends FlatSpec with Matchers {
case None => fail()
case Some(map) => {
map("title").asInstanceOf[String] shouldBe "Some Title"
+ map("doi").asInstanceOf[String] shouldBe "10.123/abc"
+ // TODO: full name? not just a string?
+ map("authors").asInstanceOf[List[String]] shouldBe List("Gaier")
+ map("year").asInstanceOf[Double].toInt shouldBe 2002
}
}
}
diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
index 6c45cc5..119cf90 100644
--- a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
@@ -90,6 +90,7 @@ class GrobidScorableTest extends FlatSpec with Matchers {
case Some(map) => {
map should contain key "title"
map("title").asInstanceOf[String] shouldBe "Dummy Example File"
+ map("authors").asInstanceOf[List[String]] shouldBe List("Brewster Kahle", "J Doe")
}
}
}