diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-07-19 16:40:16 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-07-24 11:27:45 -0700 |
commit | f6c88b66cea8919fe8a0a438e60841ad682aa71d (patch) | |
tree | 3ffea7ef8c658c701b5060e849271b1420d3692f /scalding | |
parent | c36e59bf03e692d22d6d72aa5ae37977e3a13524 (diff) | |
download | sandcrawler-f6c88b66cea8919fe8a0a438e60841ad682aa71d.tar.gz sandcrawler-f6c88b66cea8919fe8a0a438e60841ad682aa71d.zip |
some scalastyle on CdxBackfillJob
Diffstat (limited to 'scalding')
-rw-r--r-- | scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala | 15 |
1 file changed, 8 insertions, 7 deletions
```diff
diff --git a/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala b/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala
index 0af3c9c..4a2eaba 100644
--- a/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala
+++ b/scalding/src/main/scala/sandcrawler/CdxBackfillJob.scala
@@ -1,15 +1,17 @@
 package sandcrawler
 
+import java.util.Properties
+
 import cascading.property.AppProps
+import cascading.tap.SinkMode
 import cascading.tuple.Fields
 import cascading.pipe.joiner._
 import com.twitter.scalding._
 import com.twitter.scalding.typed.TDsl._
-import java.util.Properties
-import cascading.tap.SinkMode
 import parallelai.spyglass.base.JobBase
 import parallelai.spyglass.hbase.HBaseConstants.SourceMode
-import parallelai.spyglass.hbase.{HBaseSource, HBasePipeConversions}
+import parallelai.spyglass.hbase.HBasePipeConversions
+import parallelai.spyglass.hbase.HBaseSource
 import scala.util.parsing.json.JSONObject
 
 // Type that represents a raw parsed CDX line
@@ -82,7 +84,7 @@ class CdxBackfillJob(args: Args) extends JobBase(args) with HBasePipeConversions
 object CdxBackfillJob {
 
   def getHBaseSource(hbase_table: String, zookeeper_hosts: String) : HBaseSource = {
-    return HBaseBuilder.build(
+    HBaseBuilder.build(
       hbase_table,
       zookeeper_hosts,
       List("file:size"), // not actually needed
@@ -90,7 +92,7 @@ object CdxBackfillJob {
   }
 
   def getHBaseSink(hbase_table: String, zookeeper_hosts: String) : HBaseSource = {
-    return HBaseBuilder.buildSink(
+    HBaseBuilder.buildSink(
       hbase_table,
       zookeeper_hosts,
       List("f:c", "file:cdx", "file:mime"),
@@ -118,8 +120,7 @@ object CdxBackfillJob {
     if (lower.startsWith("application/x-pdf")) {
       return "application/pdf"
     }
-    return lower
-
+    lower
   }
 
   def isCdxLine(line: String) : Boolean = {
```