From b9faf4d90f630976deebad209ea2820e03281f87 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 20 Aug 2018 16:40:27 -0700 Subject: add GrobidScorableDumpJob and basic test --- .../main/scala/sandcrawler/GrobidScorableDumpJob.scala | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala (limited to 'scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala') diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala b/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala new file mode 100644 index 0000000..9a8d701 --- /dev/null +++ b/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala @@ -0,0 +1,18 @@ + +package sandcrawler + +import cascading.pipe.Pipe +import com.twitter.scalding.Args +import com.twitter.scalding.TypedPipe +import com.twitter.scalding.TypedTsv +import parallelai.spyglass.base.JobBase + /** Job that takes the input pipe of a GrobidScorable and writes one (slug, features.json) pair per record as a two-column TypedTsv to the location given by the "output" argument. NOTE(review): the arguments consumed by getInputPipe(args) are not visible here; confirm against GrobidScorable. */ +class GrobidScorableDumpJob(args: Args) extends JobBase(args) { + /* The scorable is declared with the Scorable type, so only getInputPipe(args) is relied on below. */ + val sc1 : Scorable = new GrobidScorable() + val pipe1 : TypedPipe[(String, ReduceFeatures)] = sc1.getInputPipe(args) + /* Swap each ReduceFeatures value for its json member so both columns are plain strings, then write the rows to args("output"). */ + pipe1 + .map { case (slug, features) => (slug, features.json) } + .write(TypedTsv[(String, String)](args("output"))) +} -- cgit v1.2.3