1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
|
package sandcrawler
import cascading.tuple.Fields
import cascading.tuple.Tuple
import com.twitter.scalding.JobTest
import com.twitter.scalding.TextLine
import com.twitter.scalding.TupleConversions
import com.twitter.scalding.TypedTsv
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes
import org.scalatest._
import parallelai.spyglass.hbase.HBaseConstants.SourceMode
/**
 * Pipeline test for `sandcrawler.GrobidScorableDumpJob`.
 *
 * Builds seven in-memory rows shaped like HBase cells (sha1 key, GROBID JSON,
 * status code), feeds them to the job through Scalding's `JobTest`, and asserts
 * that exactly three rows reach the `TypedTsv` sink.
 *
 * NOTE(review): the job itself is not visible from this file; presumably the
 * three surviving rows are the ones annotated "good" below — confirm against
 * the job's filtering logic.
 */
class GrobidScorableDumpJobTest extends FlatSpec with Matchers {

  // GROBID-style JSON fixture. The "<<TITLE>>" marker is substituted per row.
  // The literal is kept flush-left so the string contents carry no indentation.
  //scalastyle:off
  val JsonString = """
{
"title": "<<TITLE>>",
"authors": [
{"name": "Brewster Kahle"},
{"name": "J Doe"}
],
"journal": {
"name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
"eissn": null,
"issn": null,
"issue": null,
"publisher": null,
"volume": null
},
"date": "2000",
"doi": null,
"citations": [
{ "authors": [{"name": "A Seaperson"}],
"date": "2001",
"id": "b0",
"index": 0,
"issue": null,
"journal": "Letters in the Alphabet",
"publisher": null,
"title": "Everything is Wonderful",
"url": null,
"volume": "20"},
{ "authors": [],
"date": "2011-03-28",
"id": "b1",
"index": 1,
"issue": null,
"journal": "The Dictionary",
"publisher": null,
"title": "All about Facts",
"url": null,
"volume": "14"}
],
"abstract": "Everything you ever wanted to know about nothing",
"body": "Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.",
"acknowledgement": null,
"annex": null
}
"""
  // scalastyle:on

  /** Returns the JSON fixture with its title marker replaced by `title`. */
  private def withTitle(title: String): String =
    JsonString.replace("<<TITLE>>", title)

  /** Wraps raw bytes in the writable type used for each tuple field. */
  private def cell(bytes: Array[Byte]): ImmutableBytesWritable =
    new ImmutableBytesWritable(bytes)

  // Variants of the fixture: a concrete title, every "title" key renamed,
  // and all closing braces stripped so the text is no longer valid JSON.
  val JsonStringWithTitle = withTitle("Dummy Example File")
  val JsonStringWithoutTitle = JsonString.replace("title", "nottitle")
  val MalformedJsonString = JsonString.replace("}", "")

  // Pipeline tests

  val output = "/tmp/testOutput"
  val input = "/tmp/testInput"
  val testTable = "test-table"
  val testHost = "dummy-host:2181"

  // Row keys; the trailing notes say how the row at the same position in
  // JsonStrings / StatusCodes is expected to be treated.
  val Sha1Strings: List[String] = List(
    "sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q", // good
    "sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU", // good
    "sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT", // good
    "sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56", // bad status
    "sha1:93187A85273589347598473894839443", // malformed
    "sha1:024937534094897039547e9824382943") // bad status

  // JSON payloads, position-aligned with Sha1Strings.
  val JsonStrings: List[String] = List(
    withTitle("Title 1"),
    withTitle("Title 2: TNG"),
    withTitle("Title 3: The Sequel"),
    withTitle("Title 1"), // This will have bad status.
    MalformedJsonString,
    withTitle("Title 2")) // This will have bad status.

  // Per bnewbold: status codes are uint64 values, not strings, so they are
  // typed as Long and serialized with the long overload of Bytes.toBytes.
  val Ok: Long = 200L
  val Bad: Long = 400L
  val StatusCodes = List(Ok, Ok, Ok, Bad, Ok, Bad)

  // One three-field tuple per (sha1, json, status) triple.
  val SampleDataHead: List[Tuple] =
    Sha1Strings.zip(JsonStrings).zip(StatusCodes).map {
      case ((sha, json), status) =>
        new Tuple(
          cell(Bytes.toBytes(sha)),
          cell(Bytes.toBytes(json)),
          cell(Bytes.toBytes(status)))
    }

  // scalastyle:off null
  // Add example of lines without GROBID data
  val SampleData = SampleDataHead :+ new Tuple(
    cell(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAA88888888888")), null, null)
  // scalastyle:on null

  // Runs the job against the in-memory source and checks the sink's row count.
  JobTest("sandcrawler.GrobidScorableDumpJob")
    .arg("test", "")
    .arg("app.conf.path", "app.conf")
    .arg("output", output)
    .arg("hbase-table", testTable)
    .arg("zookeeper-hosts", testHost)
    .arg("debug", "true")
    .source[Tuple](GrobidScorable.getHBaseSource(testTable, testHost), SampleData)
    .sink[(String, String)](TypedTsv[(String, String)](output)) { rows =>
      "The pipeline" should "return correct-length list" in {
        rows should have length 3
      }
    }
    .run
    .finish
}
|