diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-08-20 21:21:12 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-08-21 21:25:56 -0700 |
commit | 863481572c7e96cd3ee352332bf788cfda3e8a54 (patch) | |
tree | 15a09df597eaefa4e970a9f0f893bbb12c9fcfce /scalding | |
parent | f5c52210aa331b329cd36dbd711977b065a70eb2 (diff) | |
download | sandcrawler-863481572c7e96cd3ee352332bf788cfda3e8a54.tar.gz sandcrawler-863481572c7e96cd3ee352332bf788cfda3e8a54.zip |
scalastyle
Diffstat (limited to 'scalding')
-rw-r--r-- | scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala | 17 |
1 file changed, 8 insertions, 9 deletions
diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala b/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala index 05e7074..468b68e 100644 --- a/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala +++ b/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala @@ -1,17 +1,16 @@ package sandcrawler -import cascading.pipe.Pipe -import com.twitter.scalding._ -import com.twitter.scalding.typed.TDsl._ -import parallelai.spyglass.base.JobBase - import cascading.flow.FlowDef +import cascading.pipe.Pipe import cascading.tuple.Fields import com.twitter.scalding._ +import com.twitter.scalding._ +import com.twitter.scalding.typed.TDsl._ import com.twitter.scalding.typed.TDsl._ import org.apache.hadoop.hbase.io.ImmutableBytesWritable import org.apache.hadoop.hbase.util.Bytes +import parallelai.spyglass.base.JobBase import parallelai.spyglass.hbase.HBaseConstants.SourceMode import parallelai.spyglass.hbase.HBasePipeConversions import parallelai.spyglass.hbase.HBaseSource @@ -28,7 +27,7 @@ class GrobidScorableDumpJob(args: Args) extends JobBase(args) { // Can't just "fromBytesWritable" because we have multiple types? .toTypedPipe[(ImmutableBytesWritable,ImmutableBytesWritable,ImmutableBytesWritable)](new Fields("key", "metadata", "status_code")) .filter { case (_, metadata, status_code) => - grobidHbaseRows.inc + grobidHbaseRows.inc metadata != null && status_code != null } .map { case (key, metadata, status_code) => @@ -40,12 +39,12 @@ class GrobidScorableDumpJob(args: Args) extends JobBase(args) { (key, json) } .map { entry : (String, String) => - parsedGrobidRows.inc + parsedGrobidRows.inc GrobidScorable.jsonToMapFeatures(entry._1, entry._2) } .filter { entry => Scorable.isValidSlug(entry.slug) } - .map { entry => - validGrobidRows.inc + .map { entry => + validGrobidRows.inc entry } // XXX: this groupBy after the map? |