diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2018-05-21 19:18:52 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2018-05-21 19:18:52 -0700 | 
| commit | aafec7a8307342479cacfc5aa6a794068b799823 (patch) | |
| tree | 4f8e3534a17f76e1935ae676a37ca0b90883e4fd /scald-mvp/src/main | |
| parent | e906399e4bb54bfe5a2124bd13aa78733bcac03b (diff) | |
| download | sandcrawler-aafec7a8307342479cacfc5aa6a794068b799823.tar.gz sandcrawler-aafec7a8307342479cacfc5aa6a794068b799823.zip | |
another attempt at a simple job variation
Diffstat (limited to 'scald-mvp/src/main')
| -rw-r--r-- | scald-mvp/src/main/scala/sandcrawler/HBaseRowCountJob.scala | 19 | 
1 file changed, 16 insertions, 3 deletions
```diff
diff --git a/scald-mvp/src/main/scala/sandcrawler/HBaseRowCountJob.scala b/scald-mvp/src/main/scala/sandcrawler/HBaseRowCountJob.scala
index 23c4764..162f729 100644
--- a/scald-mvp/src/main/scala/sandcrawler/HBaseRowCountJob.scala
+++ b/scald-mvp/src/main/scala/sandcrawler/HBaseRowCountJob.scala
@@ -1,11 +1,16 @@
 package sandcrawler
 
 import com.twitter.scalding._
+import parallelai.spyglass.base.JobBase
 import parallelai.spyglass.hbase.{HBaseSource, HBasePipeConversions, HBaseConstants}
 import parallelai.spyglass.hbase.HBaseConstants.SourceMode
 import cascading.tuple.Fields
+import cascading.property.AppProps
+import java.util.Properties
+
+
+class HBaseRowCountJob(args: Args) extends JobBase(args) with HBasePipeConversions {
 
-class HBaseRowCountJob(args: Args) extends Job(args) with HBasePipeConversions {
 
   // For now doesn't actually count, just dumps a "word count"
 
@@ -14,11 +19,19 @@ class HBaseRowCountJob(args: Args) extends Job(args) with HBasePipeConversions {
   val hbs = new HBaseSource(
     "wbgrp-journal-extract-0-qa",     // HBase Table Name
     "mtrcs-zk1.us.archive.org:2181",  // HBase Zookeeper server (to get runtime config info; can be array?)
-     new Fields("key"),
-     List("column_family"),
+    new Fields("key"),
+    sourceMode = SourceMode.GET_LIST, keyList = List("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q", "sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"))
+    .read
+    .debug
+    .fromBytesWritable(new Fields("key"))
+    .write(Tsv(output format "get_list"))
+
+    /*
+    List("column_family"),
     sourceMode = SourceMode.SCAN_ALL)
     .read
     .debug
     .fromBytesWritable(new Fields("key"))
     .write(Tsv(output format "get_list"))
+    */
 }
```
