diff options
Diffstat (limited to 'scald-mvp')
-rw-r--r-- | scald-mvp/build.sbt | 7 | ||||
-rw-r--r-- | scald-mvp/src/main/scala/sandcrawler/HBaseRowCountJob.scala | 23 |
2 files changed, 27 insertions, 3 deletions
diff --git a/scald-mvp/build.sbt b/scald-mvp/build.sbt index 5a777f1..213c78b 100644 --- a/scald-mvp/build.sbt +++ b/scald-mvp/build.sbt @@ -8,7 +8,7 @@ lazy val root = (project in file(".")). settings( inThisBuild(List( organization := "org.archive", - scalaVersion := "2.11.6", + scalaVersion := "2.11.8", version := "0.1.0-SNAPSHOT", test in assembly := {}, )), @@ -20,12 +20,12 @@ lazy val root = (project in file(".")). resolvers += "Cloudera Maven Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos", libraryDependencies += scalaTest % Test, - libraryDependencies += "org.scala-lang" % "scala-library" % "2.11.6", + libraryDependencies += "org.scala-lang" % "scala-library" % "2.11.8", libraryDependencies += "com.twitter" % "scalding-core_2.11" % "0.17.2", libraryDependencies += "org.apache.hadoop" % "hadoop-common" % hadoopVersion, libraryDependencies += "org.apache.hadoop" % "hadoop-client" % hadoopVersion, libraryDependencies += "org.apache.hadoop" % "hadoop-mapreduce-client-jobclient" % hadoopVersion classifier "tests", - //libraryDependencies += "parallelai" % "parallelai.spyglass" % "2.10_0.10_CDH5_4.4", + libraryDependencies += "parallelai" % "parallelai.spyglass" % "2.11_0.17.2_5.3.0", // cargo-culted from twitter/scalding's build.sbt // hint via https://stackoverflow.com/questions/23280494/sbt-assembly-error-deduplicate-different-file-contents-found-in-the-following#23280952 @@ -41,6 +41,7 @@ lazy val root = (project in file(".")). case s if s.endsWith("jansi.dll") => MergeStrategy.rename case s if s.endsWith("libjansi.so") => MergeStrategy.rename case s if s.endsWith("properties") => MergeStrategy.filterDistinctLines + case s if s.endsWith("xml") => MergeStrategy.last case x => (mergeStrategy in assembly).value(x) }, ) diff --git a/scald-mvp/src/main/scala/sandcrawler/HBaseRowCountJob.scala b/scald-mvp/src/main/scala/sandcrawler/HBaseRowCountJob.scala new file mode 100644 index 0000000..734abaa --- /dev/null +++ b/scald-mvp/src/main/scala/sandcrawler/HBaseRowCountJob.scala @@ -0,0 +1,23 @@ +package sandcrawler + +import com.twitter.scalding._ +import parallelai.spyglass.hbase.{HBaseSource, HBasePipeConversions, HBaseConstants} + +class HBaseRowCountJob(args: Args) extends Job(args) { + + // For now doesn't actually count, just dumps a "word count" + + val hbs = new HBaseSource( + "wbgrp-journal-extract-0-qa", // HBase Table Name + "mtrcs-zk1.us.archive.org:2181", // HBase Zookeeper server (to get runtime config info; can be array?) + 'key, // ... then a list of column names + sourceMode = HBaseConstants.SourceMode.SCAN_ALL) +/* + .read + .map { word => (word, 1L) } + .sumByKey + .write(TypedTsv[(String, Long)](args("output"))) + // The compiler will enforce the type coming out of the sumByKey is the same as the type we have for our sink + .flatMap { line => line.split("\\s+") } +*/ +} |