aboutsummaryrefslogtreecommitdiffstats
path: root/scald-mvp/src/main
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2018-05-21 17:51:43 -0700
committerBryan Newbold <bnewbold@archive.org>2018-05-21 17:51:43 -0700
commit4ebb3f83cc1a438abe2af0d79e6e55c83fdb431d (patch)
treed3c2069abdf0a09f72afc8402270401059cb55a5 /scald-mvp/src/main
parentc67378fbb910540050955198cb5a8da6774bccdd (diff)
downloadsandcrawler-4ebb3f83cc1a438abe2af0d79e6e55c83fdb431d.tar.gz
sandcrawler-4ebb3f83cc1a438abe2af0d79e6e55c83fdb431d.zip
building (but nullpointer) spyglass integration
Diffstat (limited to 'scald-mvp/src/main')
-rw-r--r--scald-mvp/src/main/scala/sandcrawler/HBaseRowCountJob.scala23
1 files changed, 23 insertions, 0 deletions
diff --git a/scald-mvp/src/main/scala/sandcrawler/HBaseRowCountJob.scala b/scald-mvp/src/main/scala/sandcrawler/HBaseRowCountJob.scala
new file mode 100644
index 0000000..734abaa
--- /dev/null
+++ b/scald-mvp/src/main/scala/sandcrawler/HBaseRowCountJob.scala
@@ -0,0 +1,23 @@
+package sandcrawler
+
+import com.twitter.scalding._
+import parallelai.spyglass.hbase.{HBaseSource, HBasePipeConversions, HBaseConstants}
+
+class HBaseRowCountJob(args: Args) extends Job(args) {
+
+ // For now doesn't actually count, just dumps a "word count"
+
+ val hbs = new HBaseSource(
+ "wbgrp-journal-extract-0-qa", // HBase Table Name
+ "mtrcs-zk1.us.archive.org:2181", // HBase Zookeeper server (to get runtime config info; can be array?)
+ 'key, // ... then a list of column names
+ sourceMode = HBaseConstants.SourceMode.SCAN_ALL)
+/*
+ .read
+ .map { word => (word, 1L) }
+ .sumByKey
+ .write(TypedTsv[(String, Long)](args("output")))
+ // The compiler will enforce the type coming out of the sumByKey is the same as the type we have for our sink
+ .flatMap { line => line.split("\\s+") }
+*/
+}