diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2018-05-21 17:51:43 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2018-05-21 17:51:43 -0700 | 
| commit | 4ebb3f83cc1a438abe2af0d79e6e55c83fdb431d (patch) | |
| tree | d3c2069abdf0a09f72afc8402270401059cb55a5 /scald-mvp/src/main | |
| parent | c67378fbb910540050955198cb5a8da6774bccdd (diff) | |
| download | sandcrawler-4ebb3f83cc1a438abe2af0d79e6e55c83fdb431d.tar.gz sandcrawler-4ebb3f83cc1a438abe2af0d79e6e55c83fdb431d.zip | |
building (but nullpointer) spyglass integration
Diffstat (limited to 'scald-mvp/src/main')
| -rw-r--r-- | scald-mvp/src/main/scala/sandcrawler/HBaseRowCountJob.scala | 23 | 
1 files changed, 23 insertions, 0 deletions
| diff --git a/scald-mvp/src/main/scala/sandcrawler/HBaseRowCountJob.scala b/scald-mvp/src/main/scala/sandcrawler/HBaseRowCountJob.scala new file mode 100644 index 0000000..734abaa --- /dev/null +++ b/scald-mvp/src/main/scala/sandcrawler/HBaseRowCountJob.scala @@ -0,0 +1,23 @@ +package sandcrawler + +import com.twitter.scalding._ +import parallelai.spyglass.hbase.{HBaseSource, HBasePipeConversions, HBaseConstants} + +class HBaseRowCountJob(args: Args) extends Job(args) { + +  // For now doesn't actually count, just dumps a "word count" + +  val hbs = new HBaseSource( +    "wbgrp-journal-extract-0-qa",     // HBase Table Name +    "mtrcs-zk1.us.archive.org:2181",  // HBase Zookeeper server (to get runtime config info; can be array?) +    'key,                             // ... then a list of column names +    sourceMode = HBaseConstants.SourceMode.SCAN_ALL) +/* +    .read +    .map { word => (word, 1L) } +    .sumByKey +    .write(TypedTsv[(String, Long)](args("output"))) +    // The compiler will enforce the type coming out of the sumByKey is the same as the type we have for our sink +    .flatMap { line => line.split("\\s+") } +*/ +} | 
