aboutsummaryrefslogtreecommitdiffstats
path: root/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala
blob: 9a8d701e9f4f52c7a66f038a1d873c0af76e2231 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18

package sandcrawler

import cascading.pipe.Pipe
import com.twitter.scalding.Args
import com.twitter.scalding.TypedPipe
import com.twitter.scalding.TypedTsv
import parallelai.spyglass.base.JobBase

/**
 * Scalding job that dumps the input pipe of a [[GrobidScorable]] to a TSV file.
 *
 * Each `(slug, features)` pair obtained from the scorable is written as one
 * two-column row: the slug, followed by `features.json`.
 *
 * @param args job arguments; the `output` argument names the TSV destination.
 *             The same `args` are passed on to `getInputPipe`, which
 *             presumably reads its own input-related arguments from them
 *             (not visible here; confirm against `GrobidScorable`).
 */
class GrobidScorableDumpJob(args: Args) extends JobBase(args) {

  // Scorable that supplies the (slug, features) pairs to be dumped.
  val sc1 : Scorable = new GrobidScorable()
  val pipe1 : TypedPipe[(String, ReduceFeatures)] = sc1.getInputPipe(args)

  // Keep the slug as-is, replace the features object by its `json` member,
  // and write the resulting string pairs to the `output` location as TSV.
  pipe1
    .map { entry =>
      val (slug, features) = entry
      (slug, features.json)
    }
    .write(TypedTsv[(String, String)](args("output")))
}