diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-08-02 17:11:57 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-08-10 19:50:21 -0700 |
commit | ea9e8990139973d6f5fdf52a470bf6516c7d8c2f (patch) | |
tree | 61e63509c28b7f280e7673c27276d28e8b0782ad | |
parent | ca725ffd9efe847905afb918ff324b421a4d8859 (diff) | |
download | sandcrawler-ea9e8990139973d6f5fdf52a470bf6516c7d8c2f.tar.gz sandcrawler-ea9e8990139973d6f5fdf52a470bf6516c7d8c2f.zip |
FatcatScorable and ScoreSelfFatcat job
3 files changed, 334 insertions, 0 deletions
diff --git a/scalding/src/main/scala/sandcrawler/FatcatScorable.scala b/scalding/src/main/scala/sandcrawler/FatcatScorable.scala
new file mode 100644
index 0000000..cffc2c0
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/FatcatScorable.scala
@@ -0,0 +1,131 @@
+package sandcrawler
+
+import scala.math
+import scala.util.parsing.json.JSON
+import scala.util.parsing.json.JSONArray
+import scala.util.parsing.json.JSONObject
+
+import cascading.flow.FlowDef
+import cascading.tuple.Fields
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+import parallelai.spyglass.hbase.HBasePipeConversions
+
+// Scorable backed by fatcat release records: one JSON object per line of the
+// text input named by the "fatcat-release-input" argument.
+class FatcatScorable extends Scorable with HBasePipeConversions {
+
+  // Raw JSON lines, read as plain text.
+  def getSource(args : Args) : Source = {
+    TextLine(args("fatcat-release-input"))
+  }
+
+  // Drops lines rejected by keepRecord, then extracts features from the rest.
+  // Each surviving line is parsed twice: once by keepRecord and once by
+  // jsonToMapFeatures.
+  def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[Option[MapFeatures]] = {
+    getSource(args).read
+      .toTypedPipe[String](new Fields("line"))
+      .filter { FatcatScorable.keepRecord(_) }
+      .map { FatcatScorable.jsonToMapFeatures(_) }
+  }
+}
+
+// Helpers that turn one release JSON record into the title, author, and year
+// features used for scoring. They operate on a single record at a time, so
+// they can be unit tested without building a pipe.
+object FatcatScorable {
+
+  // Note: removed ReleaseType filtering
+
+  // True when json parses, has a usable title (see mapToTitle), and that
+  // title, including any subtitle, is at most Scorable.MaxTitleLength long.
+  def keepRecord(json : String) : Boolean = {
+    Scorable.jsonToMap(json)
+      .flatMap(map => mapToTitle(map))
+      .exists(title => title.length <= Scorable.MaxTitleLength)
+  }
+
+  // Returns the title, with ":" and the subtitle appended when a non-empty
+  // subtitle is present (title "A" with subtitle "B" gives "A:B"). Returns
+  // None if the title is missing, null, empty, or not a string. Length is not
+  // checked here; see keepRecord.
+  def mapToTitle(map : Map[String, Any]) : Option[String] = {
+    // Non-empty string stored under key, if any. Matching on the runtime type
+    // (instead of casting) keeps null and non-string values from throwing.
+    def getNonEmptyString(key : String) : Option[String] = {
+      map.get(key) match {
+        case Some(value : String) if !value.isEmpty => Some(value)
+        case _ => None
+      }
+    }
+
+    getNonEmptyString("title").map { baseTitle =>
+      getNonEmptyString("subtitle") match {
+        case None => baseTitle
+        case Some(baseSubtitle) => baseTitle.concat(":".concat(baseSubtitle))
+      }
+    }
+  }
+
+  // Returns the raw_name of every contrib that has one, in input order.
+  // Contribs that are not JSON objects, and raw_name values that are null or
+  // not strings, are skipped rather than throwing or putting null in the list.
+  def mapToAuthorList(map : Map[String, Any]) : List[String] = {
+    // TODO(bnewbold): better name stuff... contrib.surname, creator.surname,
+    // or raw_name split to last
+    map.get("contribs") match {
+      case Some(contribs : List[_]) =>
+        contribs
+          .collect { case contrib : Map[_, _] => contrib.asInstanceOf[Map[String, Any]] }
+          .flatMap(contrib => contrib.get("raw_name"))
+          .collect { case rawName : String => rawName }
+      case _ => List()
+    }
+  }
+
+  // Returns release_year as an Int, dropping any fractional part. The JSON
+  // parser is expected to deliver numbers as Double; a missing, null, or
+  // non-numeric year yields None instead of being cast unconditionally, which
+  // would throw ClassCastException on a string value.
+  def mapToYear(map : Map[String, Any]) : Option[Int] = {
+    map.get("release_year") match {
+      case Some(year : Double) => Some(year.toInt)
+      case _ => None
+    }
+  }
+
+  // Parses one release JSON record into MapFeatures (the slug plus the
+  // serialized ScorableFeatures). Returns None when the JSON is invalid, the
+  // record has no usable title, or the features produce no slug.
+  // The record's "type" value is intentionally not read; no filtering on it
+  // happens here (see the ReleaseType note at the top of this object).
+  def jsonToMapFeatures(json : String) : Option[MapFeatures] = {
+    // Builds the features and wraps them with their slug; None when there is no slug.
+    def makeMapFeatures(title : String, doi : String, fatcat_release: String, fatcat_work : String, authors : List[String], year : Int) : Option[MapFeatures] = {
+      // NOTE: not doing any filtering here!
+      val sf : ScorableFeatures = ScorableFeatures.create(title=title, authors=authors, doi=doi, fatcat_release=fatcat_release, fatcat_work=fatcat_work, year=year)
+      sf.toSlug match {
+        case None => None
+        case Some(slug) => Some(MapFeatures(slug, sf.toString))
+      }
+    }
+
+    Scorable.jsonToMap(json) match {
+      case None => None
+      case Some(map) =>
+        mapToTitle(map) match {
+          case None => None
+          case Some(title) => makeMapFeatures(
+            title=title,
+            // TODO: doi=Scorable.getString(map, "doi"),
+            doi=null,
+            fatcat_release=Scorable.getString(map, "ident"),
+            fatcat_work=Scorable.getString(map, "work_id"),
+            authors=mapToAuthorList(map),
+            // 0 stands in for a missing or unusable release_year
+            year=mapToYear(map).getOrElse(0))
+        }
+    }
+  }
+}
diff --git a/scalding/src/main/scala/sandcrawler/ScoreSelfFatcat.scala b/scalding/src/main/scala/sandcrawler/ScoreSelfFatcat.scala
new file mode 100644
index 0000000..d1a94fe
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/ScoreSelfFatcat.scala
@@ -0,0 +1,43 @@
+package sandcrawler
+
+import cascading.pipe.Pipe
+import com.twitter.scalding.Args
+import com.twitter.scalding.Stat
+import com.twitter.scalding.TypedPipe
+import com.twitter.scalding.TypedTsv
+import parallelai.spyglass.base.JobBase
+
+class ScoreSelfFatcatJob(args: Args) extends JobBase(args) {
+  // Joins the fatcat feature pipe with itself by slug and writes the scored pairs as TSV.
+  val fatcatRowCount = Stat("fatcat-rows-filtered", "sandcrawler")
+  val joinedRowCount = Stat("joined-rows", "sandcrawler")
+
+  val fatcatScorable : Scorable = new FatcatScorable()
+  val fatcatPipe : TypedPipe[(String, ReduceFeatures)] = fatcatScorable
+    .getInputPipe(args)
+    .map { r =>
+      fatcatRowCount.inc
+      r
+    }
+
+  val joinedPipe = fatcatPipe
+    .addTrap(TypedTsv(args("output") + ".trapped"))
+    .join(fatcatPipe)
+
+  // TypedTsv doesn't work over case classes.
+  joinedPipe
+    // filter out trivial self-matches (releases are identical)
+    .filter { case (slug, (fatcatFeaturesLeft, fatcatFeaturesRight)) =>
+      Scorable.selfMatchable(fatcatFeaturesLeft, fatcatFeaturesRight)
+    }
+    .map { case (slug, (fatcatFeaturesLeft, fatcatFeaturesRight)) =>
+      joinedRowCount.inc
+      new ReduceOutput(
+        slug,
+        Scorable.computeSimilarity(fatcatFeaturesLeft, fatcatFeaturesRight),
+        fatcatFeaturesLeft.json,
+        fatcatFeaturesRight.json)
+    }
+    .map { entry => (entry.slug, entry.score, entry.json1, entry.json2) }
+    .write(TypedTsv[(String, Int, String, String)](args("output")))
+}
diff --git a/scalding/src/test/scala/sandcrawler/FatcatScorableTest.scala b/scalding/src/test/scala/sandcrawler/FatcatScorableTest.scala
new file mode 100644
index 0000000..823e14a
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/FatcatScorableTest.scala
@@ -0,0 +1,160 @@
+package sandcrawler
+
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.TextLine
+import com.twitter.scalding.TupleConversions
+import com.twitter.scalding.TypedTsv
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import org.scalatest._
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+
+class FatcatScorableTest extends FlatSpec with Matchers {
+  // scalastyle:off
+  val FatcatString =
+"""
+{
+  "abstracts": [],
+  "refs": [],
+  "contribs": [
+    {
+      "index": 0,
+      "raw_name": "W Gaier",
+      "surname": "Gaier",
+      "role": "author",
+      "extra": {
+        "seq": "first"
+      }
+    }
+  ],
+  "publisher": "Elsevier BV",
+  "pages": "186-187",
+  "ext_ids": {
+    "doi": "<<DOI>>"
+  },
+  "release_year": 1996,
+  "release_stage": "published",
+  "release_type": "article-journal",
+  "container_id": "3nccslsn5jez3ixrp5skjyjxu4",
+  "title": "<<TITLE>>",
+  "state": "active",
+  "ident": "pnri57u66ffytigdmyybbmouni",
+  "work_id": "tdmqnfzm2nggrhfwzasyegvpyu",
+  "revision": "e50bd04e-d0d4-4ee7-b7a4-6b4f079de154",
+  "extra": {
+    "crossref": {
+      "alternative-id": [
+        "0987-7983(96)87729-2"
+      ],
+      "type": "journal-article"
+    }
+  }
+}
+""".replace("<<DOI>>", "10.123/aBc")
+  // scalastyle:on
+  val FatcatStringWithGoodTitle = FatcatString.replace("<<TITLE>>", "Some Title")
+  val FatcatStringWithMaximumTitle = FatcatString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength)
+  val FatcatStringWithExcessiveTitle = FatcatString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength + "0")
+  val FatcatStringWithNullTitle = FatcatString.replace("\"<<TITLE>>\"", "null")
+  val FatcatStringWithEmptyTitle = FatcatString.replace("<<TITLE>>", "")
+  val FatcatStringWithoutTitle = FatcatString.replace("title", "nottitle")
+  val MalformedFatcatString = FatcatString.replace("}", "")
+  val FatcatStringWithNoAuthors = FatcatString.replace("<<TITLE>>", "Some Valid Title").replace("contribs", "no-contribs")
+  //val FatcatStringWrongType = FatcatString.replace("<<TITLE>>", "Some Valid Title").replace("journal-article", "other")
+  //val FatcatStringNoType = FatcatString.replace("<<TITLE>>", "Some Valid Title").replace("type", "not-type")
+
+  // Unit tests
+  "FatcatScorable.jsonToMapFeatures()" should "handle invalid JSON" in {
+    FatcatScorable.jsonToMapFeatures(MalformedFatcatString) should be (None)
+  }
+
+  it should "handle missing title" in {
+    FatcatScorable.jsonToMapFeatures(FatcatStringWithoutTitle) should be (None)
+  }
+
+  it should "handle null title" in {
+    FatcatScorable.jsonToMapFeatures(FatcatStringWithNullTitle) should be (None)
+  }
+
+  it should "handle empty title" in {
+    FatcatScorable.jsonToMapFeatures(FatcatStringWithEmptyTitle) should be (None)
+  }
+
+  it should "handle subtitle" in {
+    FatcatScorable.jsonToMapFeatures(
+      """{"title": "short but not too short", "subtitle": "just right!", "ident": "pnri57u66ffytigdmyybbmouni", "work_id": "tdmqnfzm2nggrhfwzasyegvpyu", "DOI": "10.123/asdf", "type":"journal-article","contribs":[{ "raw_name" : "W Gaier", "surname" : "Gaier"}]}""") match {
+      case None => fail()
+      case Some(result) => result.slug shouldBe "shortbutnottooshortjustright"
+    }
+  }
+
+  it should "handle empty subtitle" in {
+    FatcatScorable.jsonToMapFeatures(
+      """{"title": "short but not too short", "subtitle": "", "ident": "pnri57u66ffytigdmyybbmouni", "work_id": "tdmqnfzm2nggrhfwzasyegvpyu", "DOI": "10.123/asdf", "type":"journal-article", "contribs":[{ "raw_name" : "W Gaier", "surname" : "Gaier"}]}""") match {
+      case None => fail()
+      case Some(result) => result.slug shouldBe "shortbutnottooshort"
+    }
+  }
+
+  it should "handle null subtitle" in {
+    FatcatScorable.jsonToMapFeatures(
+      """{"title": "short but not too short", "subtitle": null, "ident": "pnri57u66ffytigdmyybbmouni", "work_id": "tdmqnfzm2nggrhfwzasyegvpyu", "DOI": "10.123/asdf", "type":"journal-article", "contribs":[{ "raw_name" : "W Gaier", "surname" : "Gaier"}]}""") match {
+      case None => fail()
+      case Some(result) => result.slug shouldBe "shortbutnottooshort"
+    }
+  }
+
+  it should "handle missing authors" in {
+    // TODO: not actually removing these
+    //FatcatScorable.jsonToMapFeatures(FatcatStringWithNoAuthors) should be (None)
+    FatcatScorable.jsonToMapFeatures(FatcatStringWithNoAuthors)
+  }
+
+  it should "handle valid input" in {
+    FatcatScorable.jsonToMapFeatures(FatcatStringWithGoodTitle) match {
+      case None => fail()
+      case Some(result) => {
+        result.slug shouldBe "sometitle"
+        Scorable.jsonToMap(result.json) match {
+          case None => fail()
+          case Some(map) => {
+            map("title").asInstanceOf[String] shouldBe "Some Title"
+            //map("doi").asInstanceOf[String] shouldBe "10.123/abc"
+            map("fatcat_release").asInstanceOf[String] shouldBe "pnri57u66ffytigdmyybbmouni"
+            map("fatcat_work").asInstanceOf[String] shouldBe "tdmqnfzm2nggrhfwzasyegvpyu"
+            // TODO: full name? not just a string?
+            map("authors").asInstanceOf[List[String]] shouldBe List("W Gaier")
+            map("year").asInstanceOf[Double].toInt shouldBe 1996
+          }
+        }
+      }
+    }
+  }
+
+  "FatcatScorable.keepRecord()" should "return true for valid JSON with title" in {
+    FatcatScorable.keepRecord(FatcatStringWithGoodTitle) shouldBe true
+  }
+
+  it should "return true for valid JSON with a title of maximum permitted length" in {
+    FatcatScorable.keepRecord(FatcatStringWithMaximumTitle) shouldBe true
+  }
+
+  it should "return false for valid JSON with excessively long title" in {
+    FatcatScorable.keepRecord(FatcatStringWithExcessiveTitle) shouldBe false
+  }
+
+  it should "return false for valid JSON with null title" in {
+    FatcatScorable.keepRecord(FatcatStringWithNullTitle) shouldBe false
+  }
+
+  it should "return false for valid JSON with no title" in {
+    FatcatScorable.keepRecord(FatcatStringWithoutTitle) shouldBe false
+  }
+
+  it should "return false for invalid JSON" in {
+    FatcatScorable.keepRecord(MalformedFatcatString) shouldBe false
+  }
+
+}