aboutsummaryrefslogtreecommitdiffstats
path: root/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
diff options
context:
space:
mode:
Diffstat (limited to 'scalding/src/test/scala/sandcrawler/ScoreJobTest.scala')
-rw-r--r--scalding/src/test/scala/sandcrawler/ScoreJobTest.scala177
1 files changed, 177 insertions, 0 deletions
diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
new file mode 100644
index 0000000..22cbdb8
--- /dev/null
+++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
@@ -0,0 +1,177 @@
+package sandcrawler
+
+import cascading.tuple.Fields
+import cascading.tuple.Tuple
+import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions}
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import org.scalatest._
+import parallelai.spyglass.hbase.HBaseConstants.SourceMode
+
+class ScoreJobTest extends FlatSpec with Matchers {
+ val GrobidString = """
+{
+ "title": "<<TITLE>>",
+ "authors": [
+ {"name": "Brewster Kahle"},
+ {"name": "J Doe"}
+ ],
+ "journal": {
+ "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
+ "eissn": null,
+ "issn": null,
+ "issue": null,
+ "publisher": null,
+ "volume": null
+ },
+ "date": "2000",
+ "doi": null,
+ "citations": [
+ { "authors": [{"name": "A Seaperson"}],
+ "date": "2001",
+ "id": "b0",
+ "index": 0,
+ "issue": null,
+ "journal": "Letters in the Alphabet",
+ "publisher": null,
+ "title": "Everything is Wonderful",
+ "url": null,
+ "volume": "20"},
+ { "authors": [],
+ "date": "2011-03-28",
+ "id": "b1",
+ "index": 1,
+ "issue": null,
+ "journal": "The Dictionary",
+ "publisher": null,
+ "title": "All about Facts",
+ "url": null,
+ "volume": "14"}
+ ],
+ "abstract": "Everything you ever wanted to know about nothing",
+ "body": "Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.",
+ "acknowledgement": null,
+ "annex": null
+}
+"""
+ val GrobidStringWithTitle = GrobidString.replace("<<TITLE>>", "Dummy Example File")
+ val GrobidStringWithoutTitle = GrobidString.replace("title", "nottitle")
+ val MalformedGrobidString = GrobidString.replace("}", "")
+
+ val CrossrefString =
+"""
+{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" },
+ "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ],
+ "date-time" : "2017-10-23T17:19:16Z",
+ "timestamp" : { "$numberLong" : "1508779156477" } },
+ "reference-count" : 0,
+ "publisher" : "Elsevier BV",
+ "issue" : "3",
+ "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/",
+ "start" : { "date-parts" : [ [ 1996, 1, 1 ] ],
+ "date-time" : "1996-01-01T00:00:00Z",
+ "timestamp" : { "$numberLong" : "820454400000" } },
+ "delay-in-days" : 0, "content-version" : "tdm" }],
+ "content-domain" : { "domain" : [], "crossmark-restriction" : false },
+ "published-print" : { "date-parts" : [ [ 1996 ] ] },
+ "DOI" : "<<DOI>>",
+ "type" : "journal-article",
+ "created" : { "date-parts" : [ [ 2002, 7, 25 ] ],
+ "date-time" : "2002-07-25T15:09:41Z",
+ "timestamp" : { "$numberLong" : "1027609781000" } },
+ "page" : "186-187",
+ "source" : "Crossref",
+ "is-referenced-by-count" : 0,
+ "title" : [ "<<TITLE>>" ],
+ "prefix" : "10.1016",
+ "volume" : "9",
+ "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ],
+ "member" : "78",
+ "container-title" : [ "Journal de Pédiatrie et de Puériculture" ],
+ "link" : [ { "URL" : "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/xml",
+ "content-type" : "text/xml",
+ "content-version" : "vor",
+ "intended-application" : "text-mining" },
+ { "URL" :
+ "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/plain",
+ "content-type" : "text/plain",
+ "content-version" : "vor",
+ "intended-application" : "text-mining" } ],
+ "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ],
+ "date-time" : "2015-09-03T10:03:43Z",
+ "timestamp" : { "$numberLong" : "1441274623000" } },
+ "score" : 1,
+ "issued" : { "date-parts" : [ [ 1996 ] ] },
+ "references-count" : 0,
+ "alternative-id" : [ "0987-7983(96)87729-2" ],
+ "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2",
+ "ISSN" : [ "0987-7983" ],
+ "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ],
+ "subject" : [ "Pediatrics, Perinatology, and Child Health" ]
+}
+"""
+ val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "SomeTitle")
+ val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle")
+ val MalformedCrossrefString = CrossrefString.replace("}", "")
+
+ // Pipeline tests
+ val output = "/tmp/testOutput"
+ val input = "/tmp/testInput"
+ val (testTable, testHost) = ("test-table", "dummy-host:2181")
+
+ val grobidSampleData = List(
+ List(Bytes.toBytes("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q"),
+ Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 1"))),
+ List(Bytes.toBytes("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"),
+ Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 2: TNG"))),
+ List(Bytes.toBytes("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT"),
+ Bytes.toBytes(GrobidString.replace("<<TITLE>>", "Title 3: The Sequel"))),
+ List(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56"),
+ Bytes.toBytes(MalformedGrobidString)))
+
+ // TODO: Make less yucky.
+ ScoreJob.setScorable1(new CrossrefScorable())
+ ScoreJob.setScorable2(new GrobidScorable())
+
+ JobTest("sandcrawler.ScoreJob")
+ .arg("test", "")
+ .arg("app.conf.path", "app.conf")
+ .arg("output", output)
+ .arg("hbase-table", testTable)
+ .arg("zookeeper-hosts", testHost)
+ .arg("crossref-input", input)
+ .arg("debug", "true")
+ .source[Tuple](GrobidScorable.getHBaseSource(testTable, testHost),
+ grobidSampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*)))
+ .source(TextLine(input), List(
+ 0 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG").replace("<<DOI>>", "DOI-0"),
+ 1 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2").replace("<<DOI>>", "DOI-0.5"),
+ 2 -> CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"),
+ 3 -> CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1")))
+ .sink[ReduceOutput](TypedTsv[ReduceOutput](output)) {
+ // Grobid titles:
+ // "Title 1", "Title 2: TNG", "Title 3: The Sequel"
+ // crossref slugs:
+ // "Title 1: TNG", "Title 1: TNG 2", "Title 1: TNG 3", "Title 2 Rebooted"
+ // Join should have 3 "Title 1" slugs and 1 "Title 2" slug
+ outputBuffer =>
+ "The pipeline" should "return a 4-element list" in {
+ outputBuffer should have length 4
+ }
+
+ /*
+ it should "return the right first entry" in {
+ outputBuffer(0) shouldBe ReduceOutput("slug", 50, "",
+ "")
+ val (slug, slug0, slug1, sha1, grobidJson, crossrefJson) = outputBuffer(0)
+ slug shouldBe "title 1"
+ slug shouldBe slug0
+ slug shouldBe slug1
+ sha1 shouldBe new String(grobidSampleData(0)(0), "UTF-8")
+ grobidJson shouldBe new String(grobidSampleData(0)(1), "UTF-8")
+ }
+ */
+ }
+ .run
+ .finish
+}