diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-05-29 11:18:19 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-05-29 11:18:19 -0700 |
commit | a58a3b6aa05699b7621d856e8c6f35c25a9bc940 (patch) | |
tree | 53f5805b205953915139d491892a8990f8bd2a47 /scalding | |
parent | c14676635f39dd1bc0345e4df2d1fa06c298bfd7 (diff) | |
download | sandcrawler-a58a3b6aa05699b7621d856e8c6f35c25a9bc940.tar.gz sandcrawler-a58a3b6aa05699b7621d856e8c6f35c25a9bc940.zip |
switch HBaseRowCountJob to SCAN_ALL
Diffstat (limited to 'scalding')
-rw-r--r-- | scalding/src/main/scala/sandcrawler/HBaseRowCountJob.scala | 2 | ||||
-rw-r--r-- | scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala | 14 |
2 files changed, 11 insertions, 5 deletions
```diff
diff --git a/scalding/src/main/scala/sandcrawler/HBaseRowCountJob.scala b/scalding/src/main/scala/sandcrawler/HBaseRowCountJob.scala
index 98da239..d47fe60 100644
--- a/scalding/src/main/scala/sandcrawler/HBaseRowCountJob.scala
+++ b/scalding/src/main/scala/sandcrawler/HBaseRowCountJob.scala
@@ -24,7 +24,7 @@ class HBaseRowCountJob(args: Args) extends JobBase(args) with HBasePipeConversio
       new Fields("key"),
       List("file"),
       List(new Fields("size", "mimetype")),
-      sourceMode = SourceMode.GET_LIST, keyList = List("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q", "sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"))
+      sourceMode = SourceMode.SCAN_ALL)
     .read
     .debug
     .groupAll { _.size('count) }
diff --git a/scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala b/scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala
index ac7cf18..94b3740 100644
--- a/scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala
+++ b/scalding/src/test/scala/sandcrawler/HBaseRowCountTest.scala
@@ -25,7 +25,13 @@ class HBaseRowCountTest extends FunSpec with TupleConversions {
 
   val sampleData = List(
     List("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q", "a", "b"),
-    List("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU", "a", "b")
+    List("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU", "a", "b"),
+    List("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT", "a", "b"),
+    List("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56", "a", "b"),
+    List("sha1:885C3YNNEGH5WAG5ZAAXWA8BNXJWT6CZ", "a", "b"),
+    List("sha1:00904C3YNNEGH5WAG5ZA9XWAEBNXJWT6", "a", "b"),
+    List("sha1:249C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ", "a", "b"),
+    List("sha1:095893C3YNNEGH5WAG5ZAAXWAEBNXJWT", "a", "b")
   )
 
   JobTest("sandcrawler.HBaseRowCountJob")
@@ -42,9 +48,9 @@ class HBaseRowCountTest extends FunSpec with TupleConversions {
         new Fields("key"),
         List("file"),
         List(new Fields("size", "mimetype")),
-        sourceMode = SourceMode.GET_LIST, keyList = List("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q", "sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU")),
+        sourceMode = SourceMode.SCAN_ALL),
       sampleData.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(Bytes.toBytes(s))}):_*)))
-      .sink[Tuple](Tsv(output format "get_list")) {
+      .sink[Tuple](Tsv(output)) {
       outputBuffer =>
 
         it("should return the test data provided.") {
@@ -54,7 +60,7 @@ class HBaseRowCountTest extends FunSpec with TupleConversions {
 
         it("should return the correct count") {
           println("raw output => " + outputBuffer)
-          assert(outputBuffer(0).getObject(0) === 2)
+          assert(outputBuffer(0).getObject(0) === 8)
         }
     }
     .run
```