aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2018-05-21 10:55:22 -0700
committerBryan Newbold <bnewbold@archive.org>2018-05-21 10:55:22 -0700
commit5d5b828730fdf34dcd2a6aeba64c7df2c1be23c5 (patch)
tree5f7322b2dc25317ab847d98e182ce801a1a5eae6
parentb18c68c81b4c426b5d83f2e6c31026b9febcb6e0 (diff)
downloadsandcrawler-5d5b828730fdf34dcd2a6aeba64c7df2c1be23c5.tar.gz
sandcrawler-5d5b828730fdf34dcd2a6aeba64c7df2c1be23c5.zip
copy in scalding learning example
-rw-r--r--scald-mvp/README.md30
-rw-r--r--scald-mvp/build.sbt33
-rw-r--r--scald-mvp/project/Dependencies.scala5
-rw-r--r--scald-mvp/project/assembly.sbt1
-rw-r--r--scald-mvp/project/build.properties1
-rw-r--r--scald-mvp/src/main/scala/example/WordCount.scala23
6 files changed, 93 insertions, 0 deletions
diff --git a/scald-mvp/README.md b/scald-mvp/README.md
new file mode 100644
index 0000000..10cac0f
--- /dev/null
+++ b/scald-mvp/README.md
@@ -0,0 +1,30 @@
+
+following https://medium.com/@gayani.nan/how-to-run-a-scalding-job-567160fa193
+
+
+running on my laptop:
+
+ openjdk version "1.8.0_171"
+ OpenJDK Runtime Environment (build 1.8.0_171-8u171-b11-1~deb9u1-b11)
+ OpenJDK 64-Bit Server VM (build 25.171-b11, mixed mode)
+
+ Scala code runner version 2.11.8 -- Copyright 2002-2016, LAMP/EPFL
+
+ sbt: 1.1.5
+
+ sbt new scala/scala-seed.g8
+
+ # inserted additional deps, tweaked versions
+ # hadoop 2.5.0 seems to conflict with cascading; sticking with 2.6.0
+
+ sbt assembly
+ scp target/scala-2.11/scald-mvp-assembly-0.1.0-SNAPSHOT.jar devbox:
+
+ # on cluster:
+ yarn jar scald-mvp-assembly-0.1.0-SNAPSHOT.jar WordCount --hdfs --input hdfs:///user/bnewbold/dummy.txt
+
+## ATTIC
+
+wrote build.sbt from scratch
+
+`sbt` command from `twitter/scalding` upstream repo
diff --git a/scald-mvp/build.sbt b/scald-mvp/build.sbt
new file mode 100644
index 0000000..5e3d9f7
--- /dev/null
+++ b/scald-mvp/build.sbt
@@ -0,0 +1,33 @@
+import Dependencies._
+
+lazy val root = (project in file(".")).
+ settings(
+ inThisBuild(List(
+ organization := "org.archive",
+ scalaVersion := "2.11.6",
+ version := "0.1.0-SNAPSHOT",
+ test in assembly := {},
+ )),
+ name := "scald-mvp",
+ libraryDependencies += scalaTest % Test,
+ libraryDependencies += "org.scala-lang" % "scala-library" % "2.11.6",
+ libraryDependencies += "com.twitter" % "scalding-core_2.11" % "0.17.2",
+ libraryDependencies += "org.apache.hadoop" % "hadoop-common" % "2.6.0",
+
+ // cargo-culted from twitter/scalding's build.sbt
+ // hint via https://stackoverflow.com/questions/23280494/sbt-assembly-error-deduplicate-different-file-contents-found-in-the-following#23280952
+ mergeStrategy in assembly := {
+ case s if s.endsWith(".class") => MergeStrategy.last
+ case s if s.endsWith("project.clj") => MergeStrategy.concat
+ case s if s.endsWith(".html") => MergeStrategy.last
+ case s if s.endsWith(".dtd") => MergeStrategy.last
+ case s if s.endsWith(".xsd") => MergeStrategy.last
+ case s if s.endsWith("pom.properties") => MergeStrategy.last
+ case s if s.endsWith("pom.xml") => MergeStrategy.last
+ case s if s.endsWith(".jnilib") => MergeStrategy.rename
+ case s if s.endsWith("jansi.dll") => MergeStrategy.rename
+ case s if s.endsWith("libjansi.so") => MergeStrategy.rename
+ case s if s.endsWith("properties") => MergeStrategy.filterDistinctLines
+ case x => (mergeStrategy in assembly).value(x)
+ },
+ )
diff --git a/scald-mvp/project/Dependencies.scala b/scald-mvp/project/Dependencies.scala
new file mode 100644
index 0000000..558929d
--- /dev/null
+++ b/scald-mvp/project/Dependencies.scala
@@ -0,0 +1,5 @@
+import sbt._
+
+object Dependencies {
+ lazy val scalaTest = "org.scalatest" %% "scalatest" % "3.0.5"
+}
diff --git a/scald-mvp/project/assembly.sbt b/scald-mvp/project/assembly.sbt
new file mode 100644
index 0000000..652a3b9
--- /dev/null
+++ b/scald-mvp/project/assembly.sbt
@@ -0,0 +1 @@
+addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.6")
diff --git a/scald-mvp/project/build.properties b/scald-mvp/project/build.properties
new file mode 100644
index 0000000..31334bb
--- /dev/null
+++ b/scald-mvp/project/build.properties
@@ -0,0 +1 @@
+sbt.version=1.1.1
diff --git a/scald-mvp/src/main/scala/example/WordCount.scala b/scald-mvp/src/main/scala/example/WordCount.scala
new file mode 100644
index 0000000..f7230c1
--- /dev/null
+++ b/scald-mvp/src/main/scala/example/WordCount.scala
@@ -0,0 +1,23 @@
+
+package example
+
+import com.twitter.scalding._
+
+object WordCountJob {
+
+ def main(args: Array[String]) {
+ (new WordCountJob(Args(List("--local", "", "--input", "dummy.txt", "--output", "dummy-out.txt")))).run
+
+ import io.Source
+ for (line <- Source.fromFile("dummy-out.txt").getLines())
+ println(line)
+ }
+}
+
+class WordCountJob(args : Args) extends Job(args) {
+ TypedPipe.from(TextLine(args("input")))
+ .flatMap { line => line.split("""\s+""") }
+ .groupBy { word => word }
+ .size
+ .write(TypedTsv(args("output")))
+}