From 4ebb3f83cc1a438abe2af0d79e6e55c83fdb431d Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Mon, 21 May 2018 17:51:43 -0700
Subject: building (but nullpointer) spyglass integration

---
 .../main/scala/sandcrawler/HBaseRowCountJob.scala | 23 ++++++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 scald-mvp/src/main/scala/sandcrawler/HBaseRowCountJob.scala

diff --git a/scald-mvp/src/main/scala/sandcrawler/HBaseRowCountJob.scala b/scald-mvp/src/main/scala/sandcrawler/HBaseRowCountJob.scala
new file mode 100644
index 0000000..734abaa
--- /dev/null
+++ b/scald-mvp/src/main/scala/sandcrawler/HBaseRowCountJob.scala
@@ -0,0 +1,23 @@
+package sandcrawler
+
+import com.twitter.scalding._
+import parallelai.spyglass.hbase.{HBaseSource, HBasePipeConversions, HBaseConstants}
+
+class HBaseRowCountJob(args: Args) extends Job(args) {
+
+  // For now doesn't actually count, just dumps a "word count"
+
+  val hbs = new HBaseSource(
+    "wbgrp-journal-extract-0-qa",     // HBase Table Name
+    "mtrcs-zk1.us.archive.org:2181",  // HBase Zookeeper server (to get runtime config info; can be array?)
+    'key,                             // ... then a list of column names
+    sourceMode = HBaseConstants.SourceMode.SCAN_ALL)
+/*
+    .read
+    .map { word => (word, 1L) }
+    .sumByKey
+    .write(TypedTsv[(String, Long)](args("output")))
+    // The compiler will enforce the type coming out of the sumByKey is the same as the type we have for our sink
+    .flatMap { line => line.split("\\s+") }
+*/
+}
--
cgit v1.2.3
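
Note: as the subject says, this commit only wires up the SpyGlass HBaseSource; the pipeline itself is still the commented-out word-count leftover from the Scalding tutorial. Below is a minimal sketch of what an actual row count over that source might look like, reusing the HBaseSource arguments exactly as they appear in the patch. The class name HBaseRowCountSketchJob is hypothetical, and whether this particular SpyGlass constructor call works at runtime is an assumption (the commit message itself reports a NullPointerException); the groupAll/size/Tsv calls are standard Scalding fields-API.

    package sandcrawler

    import com.twitter.scalding._
    import parallelai.spyglass.hbase.{HBaseSource, HBaseConstants}

    // Hypothetical sketch: count all rows in the table instead of dumping a word count.
    class HBaseRowCountSketchJob(args: Args) extends Job(args) {

      // Same source wiring as in the patch above.
      val hbs = new HBaseSource(
        "wbgrp-journal-extract-0-qa",     // HBase table name (from the patch)
        "mtrcs-zk1.us.archive.org:2181",  // Zookeeper quorum (from the patch)
        'key,                             // row key field
        sourceMode = HBaseConstants.SourceMode.SCAN_ALL)

      hbs
        .read                         // read the table as a cascading Pipe
        .groupAll { _.size('count) }  // collapse all rows into a single count
        .write(Tsv(args("output")))   // write the count to the --output TSV
    }

Run (assuming the usual Scalding job runner and a built assembly): --hdfs --output hdfs:///tmp/row-count.tsv, with the class name passed as the job argument.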