aboutsummaryrefslogtreecommitdiffstats
path: root/scalding/src/test
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2019-08-02 17:11:57 -0700
committer Bryan Newbold <bnewbold@archive.org> 2019-08-10 19:50:21 -0700
commit ea9e8990139973d6f5fdf52a470bf6516c7d8c2f (patch)
tree 61e63509c28b7f280e7673c27276d28e8b0782ad /scalding/src/test
parent ca725ffd9efe847905afb918ff324b421a4d8859 (diff)
download sandcrawler-ea9e8990139973d6f5fdf52a470bf6516c7d8c2f.tar.gz
sandcrawler-ea9e8990139973d6f5fdf52a470bf6516c7d8c2f.zip
FatcatScorable and ScoreSelfFatcat job
Diffstat (limited to 'scalding/src/test')
-rw-r--r-- scalding/src/test/scala/sandcrawler/FatcatScorableTest.scala 160
1 files changed, 160 insertions, 0 deletions
diff --git a/scalding/src/test/scala/sandcrawler/FatcatScorableTest.scala b/scalding/src/test/scala/sandcrawler/FatcatScorableTest.scala
new file mode 100644
index 0000000..823e14a
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/FatcatScorableTest.scala
@@ -0,0 +1,160 @@
+package sandcrawler
+
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.TextLine
+import com.twitter.scalding.TupleConversions
+import com.twitter.scalding.TypedTsv
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import org.scalatest._
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+
+class FatcatScorableTest extends FlatSpec with Matchers {
+ // scalastyle:off
+ val FatcatString = // JSON record template; <<TITLE>> is substituted by each derived fixture
+"""
+{
+ "abstracts": [],
+ "refs": [],
+ "contribs": [
+ {
+ "index": 0,
+ "raw_name": "W Gaier",
+ "surname": "Gaier",
+ "role": "author",
+ "extra": {
+ "seq": "first"
+ }
+ }
+ ],
+ "publisher": "Elsevier BV",
+ "pages": "186-187",
+ "ext_ids": {
+ "doi": "<<DOI>>"
+ },
+ "release_year": 1996,
+ "release_stage": "published",
+ "release_type": "article-journal",
+ "container_id": "3nccslsn5jez3ixrp5skjyjxu4",
+ "title": "<<TITLE>>",
+ "state": "active",
+ "ident": "pnri57u66ffytigdmyybbmouni",
+ "work_id": "tdmqnfzm2nggrhfwzasyegvpyu",
+ "revision": "e50bd04e-d0d4-4ee7-b7a4-6b4f079de154",
+ "extra": {
+ "crossref": {
+ "alternative-id": [
+ "0987-7983(96)87729-2"
+ ],
+ "type": "journal-article"
+ }
+ }
+}
+""".replace("<<DOI>>", "10.123/aBc") // the DOI placeholder is filled once here, with a mixed-case DOI
+ // scalastyle:on
+ val FatcatStringWithGoodTitle = FatcatString.replace("<<TITLE>>", "Some Title") // ordinary short title
+ val FatcatStringWithMaximumTitle = FatcatString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength) // exactly MaxTitleLength chars
+ val FatcatStringWithExcessiveTitle = FatcatString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength + "0") // MaxTitleLength + 1 chars
+ val FatcatStringWithNullTitle = FatcatString.replace("\"<<TITLE>>\"", "null") // quoted placeholder becomes a JSON null
+ val FatcatStringWithEmptyTitle = FatcatString.replace("<<TITLE>>", "") // title is the empty string
+ val FatcatStringWithoutTitle = FatcatString.replace("title", "nottitle") // renames the "title" key; the JSON stays valid
+ val MalformedFatcatString = FatcatString.replace("}", "") // removes every closing brace, so objects are never closed
+ val FatcatStringWithNoAuthors = FatcatString.replace("<<TITLE>>", "Some Valid Title").replace("contribs", "no-contribs") // renames the "contribs" key
+ //val FatcatStringWrongType = FatcatString.replace("<<TITLE>>", "Some Valid Title").replace("journal-article", "other")
+ //val FatcatStringNoType = FatcatString.replace("<<TITLE>>", "Some Valid Title").replace("type", "not-type")
+
+ // Unit tests: the first cases feed jsonToMapFeatures() records it is expected to reject
+ "FatcatScorable.jsonToMapFeatures()" should "handle invalid JSON" in {
+ FatcatScorable.jsonToMapFeatures(MalformedFatcatString) shouldBe None
+ }
+
+ it should "handle missing title" in {
+ FatcatScorable.jsonToMapFeatures(FatcatStringWithoutTitle) shouldBe None // fixture has no "title" key
+ }
+
+ it should "handle null title" in {
+ FatcatScorable.jsonToMapFeatures(FatcatStringWithNullTitle) shouldBe None // fixture title is a JSON null
+ }
+
+ it should "handle empty title" in {
+ FatcatScorable.jsonToMapFeatures(FatcatStringWithEmptyTitle) shouldBe None // fixture title is ""
+ }
+
+ it should "handle subtitle" in {
+ // The expected slug is the title text followed by the subtitle text.
+ val releaseJson =
+ """{"title": "short but not too short", "subtitle": "just right!", "ident": "pnri57u66ffytigdmyybbmouni", "work_id": "tdmqnfzm2nggrhfwzasyegvpyu", "DOI": "10.123/asdf", "type":"journal-article","contribs":[{ "raw_name" : "W Gaier", "surname" : "Gaier"}]}"""
+ val features = FatcatScorable.jsonToMapFeatures(releaseJson)
+ features.getOrElse(fail()).slug shouldBe "shortbutnottooshortjustright"
+ }
+
+ it should "handle empty subtitle" in {
+ // With an empty subtitle the expected slug comes from the title text alone.
+ val releaseJson =
+ """{"title": "short but not too short", "subtitle": "", "ident": "pnri57u66ffytigdmyybbmouni", "work_id": "tdmqnfzm2nggrhfwzasyegvpyu", "DOI": "10.123/asdf", "type":"journal-article", "contribs":[{ "raw_name" : "W Gaier", "surname" : "Gaier"}]}"""
+ val features = FatcatScorable.jsonToMapFeatures(releaseJson)
+ features.getOrElse(fail()).slug shouldBe "shortbutnottooshort"
+ }
+
+ it should "handle null subtitle" in {
+ // With a null subtitle the expected slug comes from the title text alone.
+ val releaseJson =
+ """{"title": "short but not too short", "subtitle": null, "ident": "pnri57u66ffytigdmyybbmouni", "work_id": "tdmqnfzm2nggrhfwzasyegvpyu", "DOI": "10.123/asdf", "type":"journal-article", "contribs":[{ "raw_name" : "W Gaier", "surname" : "Gaier"}]}"""
+ val features = FatcatScorable.jsonToMapFeatures(releaseJson)
+ features.getOrElse(fail()).slug shouldBe "shortbutnottooshort"
+ }
+
+ it should "handle missing authors" in {
+ // TODO: author-less records are apparently not rejected yet, so the None expectation below stays disabled
+ //FatcatScorable.jsonToMapFeatures(FatcatStringWithNoAuthors) should be (None)
+ FatcatScorable.jsonToMapFeatures(FatcatStringWithNoAuthors) // result is not asserted; only a thrown exception fails this test
+ }
+
+ it should "handle valid input" in {
+ // Parse the well-formed fixture and check the slug derived from "Some Title".
+ // A None at either parsing step fails the test immediately.
+ val features = FatcatScorable.jsonToMapFeatures(FatcatStringWithGoodTitle)
+ .getOrElse(fail())
+ features.slug shouldBe "sometitle"
+
+ // Decode the JSON carried by the features and check each extracted field.
+ val fields = Scorable.jsonToMap(features.json)
+ .getOrElse(fail())
+ fields("title").asInstanceOf[String] shouldBe "Some Title"
+ // (the doi assertion is currently disabled)
+ //fields("doi").asInstanceOf[String] shouldBe "10.123/abc"
+ fields("fatcat_release").asInstanceOf[String] shouldBe "pnri57u66ffytigdmyybbmouni"
+ fields("fatcat_work").asInstanceOf[String] shouldBe "tdmqnfzm2nggrhfwzasyegvpyu"
+ // TODO: full name? not just a string?
+ fields("authors").asInstanceOf[List[String]] shouldBe List("W Gaier")
+ // The year value is cast to Double before being compared as an Int.
+ fields("year").asInstanceOf[Double].toInt shouldBe 1996
+ }
+
+ "FatcatScorable.keepRecord()" should "return true for valid JSON with title" in {
+ FatcatScorable.keepRecord(FatcatStringWithGoodTitle) shouldBe true // fixture title is "Some Title"
+ }
+
+ it should "return true for valid JSON with a title of maximum permitted length" in {
+ FatcatScorable.keepRecord(FatcatStringWithMaximumTitle) shouldBe true // title is exactly Scorable.MaxTitleLength characters
+ }
+
+ it should "return false for valid JSON with excessively long title" in {
+ FatcatScorable.keepRecord(FatcatStringWithExcessiveTitle) shouldBe false // title is one character over Scorable.MaxTitleLength
+ }
+
+ it should "return false for valid JSON with null title" in {
+ FatcatScorable.keepRecord(FatcatStringWithNullTitle) shouldBe false // "title" holds a JSON null in this fixture
+ }
+
+ it should "return false for valid JSON with no title" in {
+ FatcatScorable.keepRecord(FatcatStringWithoutTitle) shouldBe false // the "title" key is renamed, so no title field exists
+ }
+
+ it should "return false for invalid JSON" in {
+ FatcatScorable.keepRecord(MalformedFatcatString) shouldBe false // malformed fixture: every '}' is stripped, so the JSON cannot parse
+ }
+
+}