aboutsummaryrefslogtreecommitdiffstats
path: root/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
diff options
context:
space:
mode:
Diffstat (limited to 'scalding/src/test/scala/sandcrawler/ScoreJobTest.scala')
-rw-r--r--scalding/src/test/scala/sandcrawler/ScoreJobTest.scala97
1 files changed, 52 insertions, 45 deletions
diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
index 1c6ae83..34081a5 100644
--- a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
@@ -2,13 +2,17 @@ package sandcrawler
import cascading.tuple.Fields
import cascading.tuple.Tuple
-import com.twitter.scalding.{JobTest, TextLine, TypedTsv, TupleConversions}
+import com.twitter.scalding.JobTest
+import com.twitter.scalding.TextLine
+import com.twitter.scalding.TupleConversions
+import com.twitter.scalding.TypedTsv
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes
import org.scalatest._
import parallelai.spyglass.hbase.HBaseConstants.SourceMode
class ScoreJobTest extends FlatSpec with Matchers {
+ //scalastyle:off
val JsonString = """
{
"title": "<<TITLE>>",
@@ -54,62 +58,65 @@ class ScoreJobTest extends FlatSpec with Matchers {
"annex": null
}
"""
+ // scalastyle:on
val JsonStringWithTitle = JsonString.replace("<<TITLE>>", "Dummy Example File")
val JsonStringWithoutTitle = JsonString.replace("title", "nottitle")
val MalformedJsonString = JsonString.replace("}", "")
+ // scalastyle:off
val CrossrefString =
"""
-{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" },
- "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ],
- "date-time" : "2017-10-23T17:19:16Z",
- "timestamp" : { "$numberLong" : "1508779156477" } },
- "reference-count" : 0,
- "publisher" : "Elsevier BV",
- "issue" : "3",
- "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/",
- "start" : { "date-parts" : [ [ 1996, 1, 1 ] ],
- "date-time" : "1996-01-01T00:00:00Z",
- "timestamp" : { "$numberLong" : "820454400000" } },
+{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" },
+ "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ],
+ "date-time" : "2017-10-23T17:19:16Z",
+ "timestamp" : { "$numberLong" : "1508779156477" } },
+ "reference-count" : 0,
+ "publisher" : "Elsevier BV",
+ "issue" : "3",
+ "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/",
+ "start" : { "date-parts" : [ [ 1996, 1, 1 ] ],
+ "date-time" : "1996-01-01T00:00:00Z",
+ "timestamp" : { "$numberLong" : "820454400000" } },
"delay-in-days" : 0, "content-version" : "tdm" }],
- "content-domain" : { "domain" : [], "crossmark-restriction" : false },
- "published-print" : { "date-parts" : [ [ 1996 ] ] },
+ "content-domain" : { "domain" : [], "crossmark-restriction" : false },
+ "published-print" : { "date-parts" : [ [ 1996 ] ] },
"DOI" : "<<DOI>>",
- "type" : "journal-article",
- "created" : { "date-parts" : [ [ 2002, 7, 25 ] ],
- "date-time" : "2002-07-25T15:09:41Z",
- "timestamp" : { "$numberLong" : "1027609781000" } },
- "page" : "186-187",
- "source" : "Crossref",
- "is-referenced-by-count" : 0,
+ "type" : "journal-article",
+ "created" : { "date-parts" : [ [ 2002, 7, 25 ] ],
+ "date-time" : "2002-07-25T15:09:41Z",
+ "timestamp" : { "$numberLong" : "1027609781000" } },
+ "page" : "186-187",
+ "source" : "Crossref",
+ "is-referenced-by-count" : 0,
"title" : [ "<<TITLE>>" ],
- "prefix" : "10.1016",
- "volume" : "9",
- "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ],
- "member" : "78",
- "container-title" : [ "Journal de Pédiatrie et de Puériculture" ],
+ "prefix" : "10.1016",
+ "volume" : "9",
+ "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ],
+ "member" : "78",
+ "container-title" : [ "Journal de Pédiatrie et de Puériculture" ],
"link" : [ { "URL" : "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/xml",
- "content-type" : "text/xml",
- "content-version" : "vor",
- "intended-application" : "text-mining" },
+ "content-type" : "text/xml",
+ "content-version" : "vor",
+ "intended-application" : "text-mining" },
{ "URL" :
"http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/plain",
- "content-type" : "text/plain",
+ "content-type" : "text/plain",
"content-version" : "vor",
- "intended-application" : "text-mining" } ],
- "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ],
- "date-time" : "2015-09-03T10:03:43Z",
- "timestamp" : { "$numberLong" : "1441274623000" } },
- "score" : 1,
- "issued" : { "date-parts" : [ [ 1996 ] ] },
- "references-count" : 0,
- "alternative-id" : [ "0987-7983(96)87729-2" ],
- "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2",
- "ISSN" : [ "0987-7983" ],
- "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ],
+ "intended-application" : "text-mining" } ],
+ "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ],
+ "date-time" : "2015-09-03T10:03:43Z",
+ "timestamp" : { "$numberLong" : "1441274623000" } },
+ "score" : 1,
+ "issued" : { "date-parts" : [ [ 1996 ] ] },
+ "references-count" : 0,
+ "alternative-id" : [ "0987-7983(96)87729-2" ],
+ "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2",
+ "ISSN" : [ "0987-7983" ],
+ "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ],
"subject" : [ "Pediatrics, Perinatology, and Child Health" ]
}
"""
+ // scalastyle:on
val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "SomeTitle")
val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle")
val MalformedCrossrefString = CrossrefString.replace("}", "")
@@ -168,7 +175,7 @@ class ScoreJobTest extends FlatSpec with Matchers {
2 -> CrossrefStrings(2),
3 -> CrossrefStrings(3)))
.sink[(String, Int, String, String)](TypedTsv[(String, Int, String, String)](output)) {
- // Grobid titles and slugs (in parentheses):
+ // Grobid titles and slugs (in parentheses):
// Title 1 (title1)
// Title 2: TNG (title2)
// Title 3: The Sequel (title3)
@@ -178,7 +185,7 @@ class ScoreJobTest extends FlatSpec with Matchers {
// Title 1: TNG 3 (title1)
// Title 2: Rebooted (title2)
// Join should have 3 "title1" slugs and 1 "title2" slug
- outputBuffer =>
+ outputBuffer =>
"The pipeline" should "return a 4-element list" in {
outputBuffer should have length 4
}
@@ -190,9 +197,9 @@ class ScoreJobTest extends FlatSpec with Matchers {
countMap("title2") shouldBe 1
}
- def bundle(slug : String, grobidIndex : Int, crossrefIndex : Int) = {
+ def bundle(slug : String, grobidIndex : Int, crossrefIndex : Int) : (String, Int, String, String) = {
val mf1 : MapFeatures = GrobidScorable.jsonToMapFeatures(
- Sha1Strings(grobidIndex),
+ Sha1Strings(grobidIndex),
JsonStrings(grobidIndex))
val mf2 : MapFeatures = CrossrefScorable.jsonToMapFeatures(
CrossrefStrings(crossrefIndex))