aboutsummaryrefslogtreecommitdiffstats
path: root/scald-mvp/src/main/scala/sandcrawler/HBaseRowCountJob.scala
blob: 734abaae6ed510bbebf2912ea978e6b84e8b48ea (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
package sandcrawler

import com.twitter.scalding._
import parallelai.spyglass.hbase.{HBaseSource, HBasePipeConversions, HBaseConstants}

/**
 * Scalding job intended to count the rows of an HBase table.
 *
 * Work in progress: at present the job only constructs the [[HBaseSource]];
 * the pipeline that would read from it and write a result is still commented
 * out below, so nothing is counted or dumped yet.
 *
 * Optional arguments (the defaults are the values that used to be hard-coded):
 *  - `--hbase-table`: name of the HBase table to scan
 *  - `--zookeeper-hosts`: HBase ZooKeeper server, as `host:port`
 *
 * @param args Scalding command-line arguments
 */
class HBaseRowCountJob(args: Args) extends Job(args) {

  // Let the table and the ZooKeeper server be overridden per run instead of
  // requiring a recompile; fall back to the original QA settings when absent.
  private val tableName: String =
    args.getOrElse("hbase-table", "wbgrp-journal-extract-0-qa")

  // HBase ZooKeeper server (to get runtime config info).
  // NOTE(review): open question carried over from the original comment --
  // can this be a list of several hosts? Confirm against SpyGlass.
  private val zookeeperHosts: String =
    args.getOrElse("zookeeper-hosts", "mtrcs-zk1.us.archive.org:2181")

  // Full-table scan exposing only the row key as a field.
  val hbs: HBaseSource = new HBaseSource(
    tableName,                        // HBase Table Name
    zookeeperHosts,                   // HBase Zookeeper server
    'key,                             // ... then a list of column names
    sourceMode = HBaseConstants.SourceMode.SCAN_ALL)

  // TODO: attach a real row-count pipeline to `hbs`.
  // The block below is the original placeholder sketch, kept for reference.
  // NOTE(review): it looks like it was adapted from a typed word-count example
  // (the trailing flatMap comes after the write); it is unlikely to work
  // against HBaseSource as written -- verify before re-enabling.
/*
    .read
    .map { word => (word, 1L) }
    .sumByKey
    .write(TypedTsv[(String, Long)](args("output")))
    // The compiler will enforce the type coming out of the sumByKey is the same as the type we have for our sink
    .flatMap { line => line.split("\\s+") }
*/
}