From 4ebb3f83cc1a438abe2af0d79e6e55c83fdb431d Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Mon, 21 May 2018 17:51:43 -0700
Subject: building (but nullpointer) spyglass integration

---
 scald-mvp/build.sbt                                |  7 ++++---
 .../main/scala/sandcrawler/HBaseRowCountJob.scala  | 23 ++++++++++++++++++++++
 2 files changed, 27 insertions(+), 3 deletions(-)
 create mode 100644 scald-mvp/src/main/scala/sandcrawler/HBaseRowCountJob.scala

diff --git a/scald-mvp/build.sbt b/scald-mvp/build.sbt
index 5a777f1..213c78b 100644
--- a/scald-mvp/build.sbt
+++ b/scald-mvp/build.sbt
@@ -8,7 +8,7 @@ lazy val root = (project in file(".")).
   settings(
     inThisBuild(List(
       organization := "org.archive",
-      scalaVersion := "2.11.6",
+      scalaVersion := "2.11.8",
       version      := "0.1.0-SNAPSHOT",
       test in assembly := {},
     )),
@@ -20,12 +20,12 @@ lazy val root = (project in file(".")).
     resolvers += "Cloudera Maven Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos",
 
     libraryDependencies += scalaTest % Test,
-    libraryDependencies += "org.scala-lang" % "scala-library" % "2.11.6",
+    libraryDependencies += "org.scala-lang" % "scala-library" % "2.11.8",
     libraryDependencies += "com.twitter" % "scalding-core_2.11" % "0.17.2",
     libraryDependencies += "org.apache.hadoop" % "hadoop-common" % hadoopVersion,
     libraryDependencies += "org.apache.hadoop" % "hadoop-client" % hadoopVersion,
     libraryDependencies += "org.apache.hadoop" % "hadoop-mapreduce-client-jobclient" % hadoopVersion classifier "tests",
-    //libraryDependencies += "parallelai" % "parallelai.spyglass" % "2.10_0.10_CDH5_4.4",
+    libraryDependencies += "parallelai" % "parallelai.spyglass" % "2.11_0.17.2_5.3.0",
 
     // cargo-culted from twitter/scalding's build.sbt
     // hint via https://stackoverflow.com/questions/23280494/sbt-assembly-error-deduplicate-different-file-contents-found-in-the-following#23280952
@@ -41,6 +41,7 @@ lazy val root = (project in file(".")).
         case s if s.endsWith("jansi.dll") => MergeStrategy.rename
         case s if s.endsWith("libjansi.so") => MergeStrategy.rename
         case s if s.endsWith("properties") => MergeStrategy.filterDistinctLines
+        case s if s.endsWith("xml") => MergeStrategy.last
         case x => (mergeStrategy in assembly).value(x)
     },
   )
diff --git a/scald-mvp/src/main/scala/sandcrawler/HBaseRowCountJob.scala b/scald-mvp/src/main/scala/sandcrawler/HBaseRowCountJob.scala
new file mode 100644
index 0000000..734abaa
--- /dev/null
+++ b/scald-mvp/src/main/scala/sandcrawler/HBaseRowCountJob.scala
@@ -0,0 +1,23 @@
+package sandcrawler
+
+import com.twitter.scalding._
+import parallelai.spyglass.hbase.{HBaseSource, HBasePipeConversions, HBaseConstants}
+
+class HBaseRowCountJob(args: Args) extends Job(args) {
+
+  // For now doesn't actually count, just dumps a "word count"
+
+  val hbs = new HBaseSource(
+    "wbgrp-journal-extract-0-qa",     // HBase Table Name
+    "mtrcs-zk1.us.archive.org:2181",  // HBase Zookeeper server (to get runtime config info; can be array?)
+    'key,                             // ... then a list of column names
+    sourceMode = HBaseConstants.SourceMode.SCAN_ALL)
+/*
+    .read
+    .map { word => (word, 1L) }
+    .sumByKey
+    .write(TypedTsv[(String, Long)](args("output")))
+    // The compiler will enforce the type coming out of the sumByKey is the same as the type we have for our sink
+    .flatMap { line => line.split("\\s+") }
+*/
+}
-- 
cgit v1.2.3