about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-05-21 19:18:52 -0700
committer Bryan Newbold <bnewbold@archive.org> 2018-05-21 19:18:52 -0700
commit aafec7a8307342479cacfc5aa6a794068b799823 (patch)
tree 4f8e3534a17f76e1935ae676a37ca0b90883e4fd
parent e906399e4bb54bfe5a2124bd13aa78733bcac03b (diff)
download sandcrawler-aafec7a8307342479cacfc5aa6a794068b799823.tar.gz
sandcrawler-aafec7a8307342479cacfc5aa6a794068b799823.zip
another attempt at a simple job variation
-rw-r--r-- scald-mvp/src/main/scala/sandcrawler/HBaseRowCountJob.scala 19
1 files changed, 16 insertions, 3 deletions
diff --git a/scald-mvp/src/main/scala/sandcrawler/HBaseRowCountJob.scala b/scald-mvp/src/main/scala/sandcrawler/HBaseRowCountJob.scala
index 23c4764..162f729 100644
--- a/scald-mvp/src/main/scala/sandcrawler/HBaseRowCountJob.scala
+++ b/scald-mvp/src/main/scala/sandcrawler/HBaseRowCountJob.scala
@@ -1,11 +1,16 @@
package sandcrawler
import com.twitter.scalding._
+import parallelai.spyglass.base.JobBase
import parallelai.spyglass.hbase.{HBaseSource, HBasePipeConversions, HBaseConstants}
import parallelai.spyglass.hbase.HBaseConstants.SourceMode
import cascading.tuple.Fields
+import cascading.property.AppProps
+import java.util.Properties
+
+
+class HBaseRowCountJob(args: Args) extends JobBase(args) with HBasePipeConversions {
-class HBaseRowCountJob(args: Args) extends Job(args) with HBasePipeConversions {
// For now doesn't actually count, just dumps a "word count"
@@ -14,11 +19,19 @@ class HBaseRowCountJob(args: Args) extends Job(args) with HBasePipeConversions {
val hbs = new HBaseSource(
"wbgrp-journal-extract-0-qa", // HBase Table Name
"mtrcs-zk1.us.archive.org:2181", // HBase Zookeeper server (to get runtime config info; can be array?)
- new Fields("key"),
- List("column_family"),
+ new Fields("key"),
+ sourceMode = SourceMode.GET_LIST, keyList = List("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q", "sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"))
+ .read
+ .debug
+ .fromBytesWritable(new Fields("key"))
+ .write(Tsv(output format "get_list"))
+
+ /*
+ List("column_family"),
sourceMode = SourceMode.SCAN_ALL)
.read
.debug
.fromBytesWritable(new Fields("key"))
.write(Tsv(output format "get_list"))
+ */
}