aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--scald-mvp/build.sbt7
-rw-r--r--scald-mvp/src/main/scala/sandcrawler/HBaseRowCountJob.scala23
2 files changed, 27 insertions, 3 deletions
diff --git a/scald-mvp/build.sbt b/scald-mvp/build.sbt
index 5a777f1..213c78b 100644
--- a/scald-mvp/build.sbt
+++ b/scald-mvp/build.sbt
@@ -8,7 +8,7 @@ lazy val root = (project in file(".")).
settings(
inThisBuild(List(
organization := "org.archive",
- scalaVersion := "2.11.6",
+ scalaVersion := "2.11.8",
version := "0.1.0-SNAPSHOT",
test in assembly := {},
)),
@@ -20,12 +20,12 @@ lazy val root = (project in file(".")).
resolvers += "Cloudera Maven Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos",
libraryDependencies += scalaTest % Test,
- libraryDependencies += "org.scala-lang" % "scala-library" % "2.11.6",
+ libraryDependencies += "org.scala-lang" % "scala-library" % "2.11.8",
libraryDependencies += "com.twitter" % "scalding-core_2.11" % "0.17.2",
libraryDependencies += "org.apache.hadoop" % "hadoop-common" % hadoopVersion,
libraryDependencies += "org.apache.hadoop" % "hadoop-client" % hadoopVersion,
libraryDependencies += "org.apache.hadoop" % "hadoop-mapreduce-client-jobclient" % hadoopVersion classifier "tests",
- //libraryDependencies += "parallelai" % "parallelai.spyglass" % "2.10_0.10_CDH5_4.4",
+ libraryDependencies += "parallelai" % "parallelai.spyglass" % "2.11_0.17.2_5.3.0",
// cargo-culted from twitter/scalding's build.sbt
// hint via https://stackoverflow.com/questions/23280494/sbt-assembly-error-deduplicate-different-file-contents-found-in-the-following#23280952
@@ -41,6 +41,7 @@ lazy val root = (project in file(".")).
case s if s.endsWith("jansi.dll") => MergeStrategy.rename
case s if s.endsWith("libjansi.so") => MergeStrategy.rename
case s if s.endsWith("properties") => MergeStrategy.filterDistinctLines
+ case s if s.endsWith("xml") => MergeStrategy.last
case x => (mergeStrategy in assembly).value(x)
},
)
diff --git a/scald-mvp/src/main/scala/sandcrawler/HBaseRowCountJob.scala b/scald-mvp/src/main/scala/sandcrawler/HBaseRowCountJob.scala
new file mode 100644
index 0000000..734abaa
--- /dev/null
+++ b/scald-mvp/src/main/scala/sandcrawler/HBaseRowCountJob.scala
@@ -0,0 +1,23 @@
+package sandcrawler
+
+import com.twitter.scalding._
+import parallelai.spyglass.hbase.{HBaseSource, HBasePipeConversions, HBaseConstants}
+
+class HBaseRowCountJob(args: Args) extends Job(args) {
+
+ // For now doesn't actually count, just dumps a "word count"
+
+ val hbs = new HBaseSource(
+ "wbgrp-journal-extract-0-qa", // HBase Table Name
+ "mtrcs-zk1.us.archive.org:2181", // HBase Zookeeper server (to get runtime config info; can be array?)
+ 'key, // ... then a list of column names
+ sourceMode = HBaseConstants.SourceMode.SCAN_ALL)
+/*
+ .read
+ .map { word => (word, 1L) }
+ .sumByKey
+ .write(TypedTsv[(String, Long)](args("output")))
+ // The compiler will enforce the type coming out of the sumByKey is the same as the type we have for our sink
+ .flatMap { line => line.split("\\s+") }
+*/
+}