diff options
author | Ellen Spertus <ellen.spertus@gmail.com> | 2018-08-13 09:58:27 -0700 |
---|---|---|
committer | Ellen Spertus <ellen.spertus@gmail.com> | 2018-08-13 09:58:27 -0700 |
commit | 1c6e1234974d8b6e4480a13ff5c4ff861c6d1deb (patch) | |
tree | bf18ec3b4335403fc7f2a4ed9b9379e9cbf25634 /scalding/src/main/scala/sandcrawler/ScoreJob.scala | |
parent | 5615428921a45ba6a2fb005b255a28dcbb83b13f (diff) | |
download | sandcrawler-1c6e1234974d8b6e4480a13ff5c4ff861c6d1deb.tar.gz sandcrawler-1c6e1234974d8b6e4480a13ff5c4ff861c6d1deb.zip |
Pipeline works, all tests pass, no scalastyle errors.
Diffstat (limited to 'scalding/src/main/scala/sandcrawler/ScoreJob.scala')
-rw-r--r-- | scalding/src/main/scala/sandcrawler/ScoreJob.scala | 51 |
1 files changed, 7 insertions, 44 deletions
diff --git a/scalding/src/main/scala/sandcrawler/ScoreJob.scala b/scalding/src/main/scala/sandcrawler/ScoreJob.scala index 386b367..75d45e9 100644 --- a/scalding/src/main/scala/sandcrawler/ScoreJob.scala +++ b/scalding/src/main/scala/sandcrawler/ScoreJob.scala @@ -1,16 +1,12 @@ package sandcrawler -import cascading.flow.FlowDef -import cascading.tuple.Fields -import com.twitter.scalding.{Args,Source,TextLine,TypedPipe, TypedTsv} -//import com.twitter.scalding.source.TypedText -import parallelai.spyglass.base.JobBase -import parallelai.spyglass.hbase.HBasePipeConversions -import parallelai.spyglass.hbase.HBaseSource -import com.twitter.scalding.{ Dsl, RichPipe, IterableSource, TupleSetter, TupleConverter } import cascading.pipe.Pipe +import com.twitter.scalding.Args +import com.twitter.scalding.TypedPipe +import com.twitter.scalding.TypedTsv +import parallelai.spyglass.base.JobBase -class ScoreJob(args: Args) extends JobBase(args) { //with HBasePipeConversions { +class ScoreJob(args: Args) extends JobBase(args) { // TODO: Instantiate any subclass of Scorable specified in args. val sc1 : Scorable = new GrobidScorable() val sc2 : Scorable = new CrossrefScorable() @@ -27,10 +23,10 @@ class ScoreJob(args: Args) extends JobBase(args) { //with HBasePipeConversions { } //TypedTsv doesn't work over case classes. .map { entry => (entry.slug, entry.score, entry.json1, entry.json2) } - .write(TypedTsv[(String, Int, String, String)](args("output"))) } +/* // Ugly hack to get non-String information into ScoreJob above. object ScoreJob { var scorable1 : Option[Scorable] = None @@ -57,38 +53,5 @@ object ScoreJob { case None => null } } - - /* - implicit def sourceToRichPipe(src: Source): RichPipe = new RichPipe(src.read) - - // This converts an Iterable into a Pipe or RichPipe with index (int-based) fields - implicit def toPipe[T](iter: Iterable[T])(implicit set: TupleSetter[T], conv: TupleConverter[T]): Pipe = - IterableSource[T](iter)(set, conv).read - - implicit def iterableToRichPipe[T](iter: Iterable[T])(implicit set: TupleSetter[T], conv: TupleConverter[T]): RichPipe = - RichPipe(toPipe(iter)(set, conv)) - - // Provide args as an implicit val for extensions such as the Checkpoint extension. -// implicit protected def _implicitJobArgs: Args = args - - def getFeaturesPipe1(pipe : cascading.pipe.Pipe) : TypedPipe[String] = { - pipe - // The next line gives an error: value toTypedPipe is not a member of cascading.pipe.Pipe - .toTypedPipe[String](new Fields("line")) - } - - def getFeaturesPipe(pipe : cascading.pipe.Pipe) : TypedPipe[MapFeatures] = { - pipe - .fromBytesWritable(new Fields("key", "tei_json")) - // I needed to change symbols to strings when I pulled this out of ScoreJob. - .toTypedPipe[(String, String)](new Fields("key", "tei_json")) - .map { entry => - val (key : String, json : String) = (entry._1, entry._2) - GrobidScorable.grobidToSlug(json) match { - case Some(slug) => new MapFeatures(slug, json) - case None => new MapFeatures(Scorable.NoSlug, json) - } - } - } - */ } + */ |