blob: 734abaae6ed510bbebf2912ea978e6b84e8b48ea (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
|
package sandcrawler
import com.twitter.scalding._
import parallelai.spyglass.hbase.{HBaseSource, HBasePipeConversions, HBaseConstants}
class HBaseRowCountJob(args: Args) extends Job(args) {
// For now doesn't actually count, just dumps a "word count"
val hbs = new HBaseSource(
"wbgrp-journal-extract-0-qa", // HBase Table Name
"mtrcs-zk1.us.archive.org:2181", // HBase Zookeeper server (to get runtime config info; can be array?)
'key, // ... then a list of column names
sourceMode = HBaseConstants.SourceMode.SCAN_ALL)
/*
.read
.map { word => (word, 1L) }
.sumByKey
.write(TypedTsv[(String, Long)](args("output")))
// The compiler will enforce the type coming out of the sumByKey is the same as the type we have for our sink
.flatMap { line => line.split("\\s+") }
*/
}
|