aboutsummaryrefslogtreecommitdiffstats
path: root/scalding
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2019-08-02 17:11:57 -0700
committerBryan Newbold <bnewbold@archive.org>2019-08-10 19:50:21 -0700
commitea9e8990139973d6f5fdf52a470bf6516c7d8c2f (patch)
tree61e63509c28b7f280e7673c27276d28e8b0782ad /scalding
parentca725ffd9efe847905afb918ff324b421a4d8859 (diff)
downloadsandcrawler-ea9e8990139973d6f5fdf52a470bf6516c7d8c2f.tar.gz
sandcrawler-ea9e8990139973d6f5fdf52a470bf6516c7d8c2f.zip
FatcatScorable and ScoreSelfFatcat job
Diffstat (limited to 'scalding')
-rw-r--r--scalding/src/main/scala/sandcrawler/FatcatScorable.scala131
-rw-r--r--scalding/src/main/scala/sandcrawler/ScoreSelfFatcat.scala43
-rw-r--r--scalding/src/test/scala/sandcrawler/FatcatScorableTest.scala160
3 files changed, 334 insertions, 0 deletions
diff --git a/scalding/src/main/scala/sandcrawler/FatcatScorable.scala b/scalding/src/main/scala/sandcrawler/FatcatScorable.scala
new file mode 100644
index 0000000..cffc2c0
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/FatcatScorable.scala
@@ -0,0 +1,131 @@
+package sandcrawler
+
+import scala.math
+import scala.util.parsing.json.JSON
+import scala.util.parsing.json.JSONArray
+import scala.util.parsing.json.JSONObject
+
+import cascading.flow.FlowDef
+import cascading.tuple.Fields
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+import parallelai.spyglass.hbase.HBasePipeConversions
+
+/**
+ * Scorable whose input is a text file of JSON records, read line by line from
+ * the path given in the "fatcat-release-input" argument.
+ */
+class FatcatScorable extends Scorable with HBasePipeConversions {
+
+  // Line-oriented text source located at the "fatcat-release-input" argument.
+  def getSource(args : Args) : Source = TextLine(args("fatcat-release-input"))
+
+  // Reads every input line as a String, drops the lines rejected by
+  // FatcatScorable.keepRecord, and converts the remaining lines with
+  // FatcatScorable.jsonToMapFeatures (which may still yield None).
+  def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[Option[MapFeatures]] = {
+    val lines = getSource(args).read.toTypedPipe[String](new Fields("line"))
+    lines
+      .filter { line => FatcatScorable.keepRecord(line) }
+      .map { line => FatcatScorable.jsonToMapFeatures(line) }
+  }
+}
+
+object FatcatScorable {
+
+ // Note: removed ReleaseType filtering
+
+ // Keeps a record only when it parses as JSON, mapToTitle yields a title for
+ // it, and that title is no longer than Scorable.MaxTitleLength.
+ def keepRecord(json : String) : Boolean =
+   Scorable.jsonToMap(json)
+     .flatMap { parsed => mapToTitle(parsed) }
+     .exists { title => title.length <= Scorable.MaxTitleLength }
+
+ // Builds the title used for scoring from a parsed JSON record.
+ //
+ // Returns None if "title" is missing, null, empty, or not a string.
+ // When "subtitle" is a non-empty string it is appended as "title:subtitle";
+ // a missing, null, empty, or non-string subtitle is ignored.
+ // Title length is NOT checked here; keepRecord applies Scorable.MaxTitleLength.
+ def mapToTitle(map : Map[String, Any]) : Option[String] = {
+   // A type pattern never matches null and never throws, so null and
+   // non-string values yield None instead of a ClassCastException.
+   def nonEmptyString(key : String) : Option[String] = map.get(key) match {
+     case Some(text : String) if !text.isEmpty => Some(text)
+     case _ => None
+   }
+
+   nonEmptyString("title").map { baseTitle =>
+     nonEmptyString("subtitle") match {
+       case Some(baseSubtitle) => baseTitle.concat(":".concat(baseSubtitle))
+       case None => baseTitle
+     }
+   }
+ }
+
+ // Collects the "raw_name" of every entry in the record's "contribs" list.
+ //
+ // Returns an empty list when "contribs" is missing, null, or not a list.
+ // Entries that are not JSON objects, or whose "raw_name" is missing, null,
+ // or not a string, are skipped rather than raising or leaking a null name.
+ def mapToAuthorList(map : Map[String, Any]) : List[String] = {
+   map.get("contribs") match {
+     case Some(contribs : List[_]) =>
+       // TODO(bnewbold): better name stuff... contrib.surname, creator.surname,
+       // or raw_name split to last
+       contribs
+         // Key/value types are erased at runtime; JSON object keys are strings.
+         .collect { case contrib : Map[_, _] => contrib.asInstanceOf[Map[String, Any]].get("raw_name") }
+         .collect { case Some(rawName : String) => rawName }
+     case _ => List()
+   }
+ }
+
+ // Extracts "release_year" as an Int.
+ //
+ // Returns None when the key is missing, null, or not numeric. Only Double
+ // values are accepted: the spec for this object casts the parsed "year" to
+ // Double, so that is presumably how Scorable.jsonToMap represents numbers.
+ // A JSON null is no longer reported as year 0, and a non-numeric value no
+ // longer raises ClassCastException.
+ def mapToYear(map : Map[String, Any]) : Option[Int] = {
+   map.get("release_year").collect { case year : Double => year.toInt }
+ }
+
+ // Converts one JSON record into MapFeatures (slug plus serialized features).
+ //
+ // Returns None when the JSON does not parse, when mapToTitle finds no usable
+ // title, or when no slug can be derived from the features.
+ // The record's "type" field is no longer read: it was passed to a helper
+ // parameter that was never used, and casting it could throw on a non-string.
+ def jsonToMapFeatures(json : String) : Option[MapFeatures] = {
+   for {
+     map <- Scorable.jsonToMap(json)
+     title <- mapToTitle(map)
+     // NOTE: not doing any filtering here!
+     sf = ScorableFeatures.create(
+       title=title,
+       authors=mapToAuthorList(map),
+       // TODO: doi=Scorable.getString(map, "doi"),
+       doi=null,
+       fatcat_release=Scorable.getString(map, "ident"),
+       fatcat_work=Scorable.getString(map, "work_id"),
+       // A missing year is encoded as 0.
+       year=mapToYear(map).getOrElse(0))
+     slug <- sf.toSlug
+   } yield MapFeatures(slug, sf.toString)
+ }
+}
diff --git a/scalding/src/main/scala/sandcrawler/ScoreSelfFatcat.scala b/scalding/src/main/scala/sandcrawler/ScoreSelfFatcat.scala
new file mode 100644
index 0000000..d1a94fe
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/ScoreSelfFatcat.scala
@@ -0,0 +1,43 @@
+package sandcrawler
+
+import cascading.pipe.Pipe
+import com.twitter.scalding.Args
+import com.twitter.scalding.Stat
+import com.twitter.scalding.TypedPipe
+import com.twitter.scalding.TypedTsv
+import parallelai.spyglass.base.JobBase
+
+/**
+ * Joins the FatcatScorable input pipe with itself on its key and writes one
+ * scored row per joined pair that Scorable.selfMatchable accepts.
+ *
+ * Results go to the path in the "output" argument; a trap sink at
+ * "<output>.trapped" is attached to the left side of the join.
+ */
+class ScoreSelfFatcatJob(args: Args) extends JobBase(args) {
+
+  // Stats in the "sandcrawler" group: one increment per input row, and one per
+  // joined pair that passes the self-match filter.
+  val fatcatRowCount = Stat("fatcat-rows-filtered", "sandcrawler")
+  val joinedRowCount = Stat("joined-rows", "sandcrawler")
+
+  val fatcatScorable : Scorable = new FatcatScorable()
+  val fatcatPipe : TypedPipe[(String, ReduceFeatures)] =
+    fatcatScorable.getInputPipe(args).map { row =>
+      fatcatRowCount.inc
+      row
+    }
+
+  val joinedPipe = fatcatPipe
+    .addTrap(TypedTsv(args("output") + ".trapped"))
+    .join(fatcatPipe)
+
+  // TypedTsv doesn't work over case classes, so each ReduceOutput is
+  // flattened into a plain tuple before it is written.
+  joinedPipe
+    // filter out trivial self-matches (releases are identical)
+    .filter { case (_, (left, right)) => Scorable.selfMatchable(left, right) }
+    .map { case (slug, (left, right)) =>
+      joinedRowCount.inc
+      val score = Scorable.computeSimilarity(left, right)
+      new ReduceOutput(slug, score, left.json, right.json)
+    }
+    .map { out => (out.slug, out.score, out.json1, out.json2) }
+    .write(TypedTsv[(String, Int, String, String)](args("output")))
+}
diff --git a/scalding/src/test/scala/sandcrawler/FatcatScorableTest.scala b/scalding/src/test/scala/sandcrawler/FatcatScorableTest.scala
new file mode 100644
index 0000000..823e14a
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/FatcatScorableTest.scala
@@ -0,0 +1,160 @@
+package sandcrawler
+
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.TextLine
+import com.twitter.scalding.TupleConversions
+import com.twitter.scalding.TypedTsv
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import org.scalatest._
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+
+class FatcatScorableTest extends FlatSpec with Matchers {
+ // scalastyle:off
+ val FatcatString = // template record; the <<TITLE>> placeholder is filled in per fixture below
+"""
+{
+ "abstracts": [],
+ "refs": [],
+ "contribs": [
+ {
+ "index": 0,
+ "raw_name": "W Gaier",
+ "surname": "Gaier",
+ "role": "author",
+ "extra": {
+ "seq": "first"
+ }
+ }
+ ],
+ "publisher": "Elsevier BV",
+ "pages": "186-187",
+ "ext_ids": {
+ "doi": "<<DOI>>"
+ },
+ "release_year": 1996,
+ "release_stage": "published",
+ "release_type": "article-journal",
+ "container_id": "3nccslsn5jez3ixrp5skjyjxu4",
+ "title": "<<TITLE>>",
+ "state": "active",
+ "ident": "pnri57u66ffytigdmyybbmouni",
+ "work_id": "tdmqnfzm2nggrhfwzasyegvpyu",
+ "revision": "e50bd04e-d0d4-4ee7-b7a4-6b4f079de154",
+ "extra": {
+ "crossref": {
+ "alternative-id": [
+ "0987-7983(96)87729-2"
+ ],
+ "type": "journal-article"
+ }
+ }
+}
+""".replace("<<DOI>>", "10.123/aBc") // the DOI placeholder is fixed once, for every fixture
+ // scalastyle:on
+ val FatcatStringWithGoodTitle = FatcatString.replace("<<TITLE>>", "Some Title")
+ val FatcatStringWithMaximumTitle = FatcatString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength) // exactly MaxTitleLength characters
+ val FatcatStringWithExcessiveTitle = FatcatString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength + "0") // one character over MaxTitleLength
+ val FatcatStringWithNullTitle = FatcatString.replace("\"<<TITLE>>\"", "null") // replaces the quoted placeholder, leaving a bare JSON null
+ val FatcatStringWithEmptyTitle = FatcatString.replace("<<TITLE>>", "")
+ val FatcatStringWithoutTitle = FatcatString.replace("title", "nottitle") // renames the "title" key so the record has no title
+ val MalformedFatcatString = FatcatString.replace("}", "") // strips every closing brace
+ val FatcatStringWithNoAuthors = FatcatString.replace("<<TITLE>>", "Some Valid Title").replace("contribs", "no-contribs") // renames the "contribs" key
+ //val FatcatStringWrongType = FatcatString.replace("<<TITLE>>", "Some Valid Title").replace("journal-article", "other")
+ //val FatcatStringNoType = FatcatString.replace("<<TITLE>>", "Some Valid Title").replace("type", "not-type")
+
+ // Unit tests
+ "FatcatScorable.jsonToMapFeatures()" should "handle invalid JSON" in {
+   FatcatScorable.jsonToMapFeatures(MalformedFatcatString) shouldBe None
+ }
+
+ it should "handle missing title" in {
+   FatcatScorable.jsonToMapFeatures(FatcatStringWithoutTitle) shouldBe None
+ }
+
+ it should "handle null title" in {
+   FatcatScorable.jsonToMapFeatures(FatcatStringWithNullTitle) shouldBe None
+ }
+
+ it should "handle empty title" in {
+   FatcatScorable.jsonToMapFeatures(FatcatStringWithEmptyTitle) shouldBe None
+ }
+
+ // In the subtitle cases below, a None result fails the comparison against
+ // Some(expected slug), so a dropped record is still reported as a failure.
+ it should "handle subtitle" in {
+   val record =
+     """{"title": "short but not too short", "subtitle": "just right!", "ident": "pnri57u66ffytigdmyybbmouni", "work_id": "tdmqnfzm2nggrhfwzasyegvpyu", "DOI": "10.123/asdf", "type":"journal-article","contribs":[{ "raw_name" : "W Gaier", "surname" : "Gaier"}]}"""
+   FatcatScorable.jsonToMapFeatures(record).map(_.slug) shouldBe Some("shortbutnottooshortjustright")
+ }
+
+ it should "handle empty subtitle" in {
+   val record =
+     """{"title": "short but not too short", "subtitle": "", "ident": "pnri57u66ffytigdmyybbmouni", "work_id": "tdmqnfzm2nggrhfwzasyegvpyu", "DOI": "10.123/asdf", "type":"journal-article", "contribs":[{ "raw_name" : "W Gaier", "surname" : "Gaier"}]}"""
+   FatcatScorable.jsonToMapFeatures(record).map(_.slug) shouldBe Some("shortbutnottooshort")
+ }
+
+ it should "handle null subtitle" in {
+   val record =
+     """{"title": "short but not too short", "subtitle": null, "ident": "pnri57u66ffytigdmyybbmouni", "work_id": "tdmqnfzm2nggrhfwzasyegvpyu", "DOI": "10.123/asdf", "type":"journal-article", "contribs":[{ "raw_name" : "W Gaier", "surname" : "Gaier"}]}"""
+   FatcatScorable.jsonToMapFeatures(record).map(_.slug) shouldBe Some("shortbutnottooshort")
+ }
+
+ it should "handle missing authors" in {
+   // jsonToMapFeatures does no author-based filtering, so a record without
+   // "contribs" must still produce features. Previously this test made the
+   // call but asserted nothing, so it could only fail on an exception.
+   // TODO: not actually removing these
+   //FatcatScorable.jsonToMapFeatures(FatcatStringWithNoAuthors) should be (None)
+   FatcatScorable.jsonToMapFeatures(FatcatStringWithNoAuthors) should not be (None)
+ }
+
+ it should "handle valid input" in {
+   // fail() aborts the test if no features come back.
+   val result = FatcatScorable.jsonToMapFeatures(FatcatStringWithGoodTitle).getOrElse(fail())
+   result.slug shouldBe "sometitle"
+
+   // The serialized features must parse back into a field map.
+   val fields = Scorable.jsonToMap(result.json).getOrElse(fail())
+   fields("title").asInstanceOf[String] shouldBe "Some Title"
+   //fields("doi").asInstanceOf[String] shouldBe "10.123/abc"
+   fields("fatcat_release").asInstanceOf[String] shouldBe "pnri57u66ffytigdmyybbmouni"
+   fields("fatcat_work").asInstanceOf[String] shouldBe "tdmqnfzm2nggrhfwzasyegvpyu"
+   // TODO: full name? not just a string?
+   fields("authors").asInstanceOf[List[String]] shouldBe List("W Gaier")
+   fields("year").asInstanceOf[Double].toInt shouldBe 1996
+ }
+
+ "FatcatScorable.keepRecord()" should "return true for valid JSON with title" in {
+   FatcatScorable.keepRecord(FatcatStringWithGoodTitle) shouldBe true
+ }
+
+ it should "return true for valid JSON with a title of maximum permitted length" in {
+   FatcatScorable.keepRecord(FatcatStringWithMaximumTitle) shouldBe true
+ }
+
+ it should "return false for valid JSON with excessively long title" in {
+   FatcatScorable.keepRecord(FatcatStringWithExcessiveTitle) shouldBe false
+ }
+
+ it should "return false for valid JSON with null title" in {
+   FatcatScorable.keepRecord(FatcatStringWithNullTitle) shouldBe false
+ }
+
+ it should "return false for valid JSON with no title" in {
+   FatcatScorable.keepRecord(FatcatStringWithoutTitle) shouldBe false
+ }
+
+ it should "return false for invalid JSON" in {
+   // Uses the brace-stripped fixture so the parse-failure path is exercised;
+   // this case previously repeated the "no title" fixture.
+   FatcatScorable.keepRecord(MalformedFatcatString) shouldBe false
+ }
+
+}