diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-05-21 10:55:22 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-05-21 10:55:22 -0700 |
commit | 5d5b828730fdf34dcd2a6aeba64c7df2c1be23c5 (patch) | |
tree | 5f7322b2dc25317ab847d98e182ce801a1a5eae6 /scald-mvp/src/main | |
parent | b18c68c81b4c426b5d83f2e6c31026b9febcb6e0 (diff) | |
download | sandcrawler-5d5b828730fdf34dcd2a6aeba64c7df2c1be23c5.tar.gz sandcrawler-5d5b828730fdf34dcd2a6aeba64c7df2c1be23c5.zip |
copy in scalding learning example
Diffstat (limited to 'scald-mvp/src/main')
-rw-r--r-- | scald-mvp/src/main/scala/example/WordCount.scala | 23 |
1 files changed, 23 insertions, 0 deletions
// scald-mvp/src/main/scala/example/WordCount.scala

package example

import com.twitter.scalding._

/**
 * Stand-alone launcher for the [[WordCountJob]] learning example.
 *
 * Runs the job with a fixed set of arguments and then echoes the job's
 * output file to standard output.
 */
object WordCountJob {

  /** Path handed to the job as its `--input` argument. */
  private val InputPath = "dummy.txt"

  /** Path handed to the job as its `--output` argument, read back afterwards. */
  private val OutputPath = "dummy-out.txt"

  /**
   * Runs [[WordCountJob]] and prints every line of its output file.
   *
   * @param args command-line arguments; currently ignored, the job is always
   *             started with the hard-coded argument list built below
   */
  def main(args: Array[String]): Unit = {
    val job = new WordCountJob(
      Args(List("--local", "", "--input", InputPath, "--output", OutputPath)))
    // NOTE(review): the value returned by `run` is discarded, as in the
    // original; confirm whether a failed run should abort before reading output.
    job.run

    import scala.io.Source
    val output = Source.fromFile(OutputPath)
    // Close the file handle even if printing fails part-way through.
    try output.getLines().foreach(println)
    finally output.close()
  }
}

/**
 * Scalding job that counts how often each whitespace-separated token occurs
 * in a text file.
 *
 * Reads lines from the path given by the `input` argument, splits each line on
 * runs of whitespace, groups identical tokens, and writes one
 * (token, count) record per distinct token as TSV to the path given by the
 * `output` argument.
 *
 * @param args job arguments; must provide `input` and `output`
 */
class WordCountJob(args: Args) extends Job(args) {
  TypedPipe.from(TextLine(args("input")))
    .flatMap { line => line.split("""\s+""") }
    // Blank lines and lines with leading whitespace make `split` emit an
    // empty string; drop it so "" is not counted as a word.
    .filter { token => token.nonEmpty }
    .groupBy { word => word }
    .size
    .write(TypedTsv(args("output")))
}