aboutsummaryrefslogtreecommitdiffstats
path: root/scalding/src
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-05-29 11:18:19 -0700
committer Bryan Newbold <bnewbold@archive.org> 2018-05-29 11:18:19 -0700
commit a58a3b6aa05699b7621d856e8c6f35c25a9bc940 (patch)
tree 53f5805b205953915139d491892a8990f8bd2a47 /scalding/src
parent c14676635f39dd1bc0345e4df2d1fa06c298bfd7 (diff)
download sandcrawler-a58a3b6aa05699b7621d856e8c6f35c25a9bc940.tar.gz
sandcrawler-a58a3b6aa05699b7621d856e8c6f35c25a9bc940.zip
switch HBaseRowCountJob to SCAN_ALL
Diffstat (limited to 'scalding/src')
-rw-r--r-- scalding/src/main/scala/sandcrawler/HBaseRowCountJob.scala 2
-rw-r--r-- scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala 14
2 files changed, 11 insertions, 5 deletions
diff --git a/scalding/src/main/scala/sandcrawler/HBaseRowCountJob.scala b/scalding/src/main/scala/sandcrawler/HBaseRowCountJob.scala
index 98da239..d47fe60 100644
--- a/scalding/src/main/scala/sandcrawler/HBaseRowCountJob.scala
+++ b/scalding/src/main/scala/sandcrawler/HBaseRowCountJob.scala
@@ -24,7 +24,7 @@ class HBaseRowCountJob(args: Args) extends JobBase(args) with HBasePipeConversio
new Fields("key"),
List("file"),
List(new Fields("size", "mimetype")),
- sourceMode = SourceMode.GET_LIST, keyList = List("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q", "sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"))
+ sourceMode = SourceMode.SCAN_ALL)
.read
.debug
.groupAll { _.size('count) }
diff --git a/scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala b/scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala
index ac7cf18..94b3740 100644
--- a/scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala
+++ b/scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala
@@ -25,7 +25,13 @@ class HBaseRowCountTest extends FunSpec with TupleConversions {
val sampleData = List(
List("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q", "a", "b"),
- List("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU", "a", "b")
+ List("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU", "a", "b"),
+ List("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT", "a", "b"),
+ List("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56", "a", "b"),
+ List("sha1:885C3YNNEGH5WAG5ZAAXWA8BNXJWT6CZ", "a", "b"),
+ List("sha1:00904C3YNNEGH5WAG5ZA9XWAEBNXJWT6", "a", "b"),
+ List("sha1:249C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ", "a", "b"),
+ List("sha1:095893C3YNNEGH5WAG5ZAAXWAEBNXJWT", "a", "b")
)
JobTest("sandcrawler.HBaseRowCountJob")
@@ -42,9 +48,9 @@ class HBaseRowCountTest extends FunSpec with TupleConversions {
new Fields("key"),
List("file"),
List(new Fields("size", "mimetype")),
- sourceMode = SourceMode.GET_LIST, keyList = List("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q", "sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU")),
+ sourceMode = SourceMode.SCAN_ALL),
sampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(Bytes.toBytes(s))}):_*)))
- .sink[Tuple](Tsv(output format "get_list")) {
+ .sink[Tuple](Tsv(output)) {
outputBuffer =>
it("should return the test data provided.") {
@@ -54,7 +60,7 @@ class HBaseRowCountTest extends FunSpec with TupleConversions {
it("should return the correct count") {
println("raw output => " + outputBuffer)
- assert(outputBuffer(0).getObject(0) === 2)
+ assert(outputBuffer(0).getObject(0) === 8)
}
}
.run