diff options
author | Ellen Spertus <ellen.spertus@gmail.com> | 2018-08-10 19:59:40 -0700 |
---|---|---|
committer | Ellen Spertus <ellen.spertus@gmail.com> | 2018-08-10 19:59:40 -0700 |
commit | b7f77f6337b450406ae0a90b81faeba27394afb0 (patch) | |
tree | dfe2ffa3a606e6b82c69f821e4415d93ab6e0676 /scalding/src/main/scala/sandcrawler/ScoreJob.scala | |
parent | 5ce5e5dc98cdbb5a84c79313df93d670111e6a1d (diff) | |
download | sandcrawler-b7f77f6337b450406ae0a90b81faeba27394afb0.tar.gz sandcrawler-b7f77f6337b450406ae0a90b81faeba27394afb0.zip |
It compiles
Diffstat (limited to 'scalding/src/main/scala/sandcrawler/ScoreJob.scala')
-rw-r--r-- | scalding/src/main/scala/sandcrawler/ScoreJob.scala | 56 |
1 files changed, 34 insertions, 22 deletions
diff --git a/scalding/src/main/scala/sandcrawler/ScoreJob.scala b/scalding/src/main/scala/sandcrawler/ScoreJob.scala index 0dbe64d..bc5bf87 100644 --- a/scalding/src/main/scala/sandcrawler/ScoreJob.scala +++ b/scalding/src/main/scala/sandcrawler/ScoreJob.scala @@ -2,16 +2,32 @@ package sandcrawler import cascading.flow.FlowDef import cascading.tuple.Fields -import com.twitter.scalding._ -import com.twitter.scalding.typed.TDsl._ +import com.twitter.scalding.{Args,Source,TextLine,TypedPipe, TypedTsv} +//import com.twitter.scalding.typed.TDsl._ import parallelai.spyglass.base.JobBase import parallelai.spyglass.hbase.HBasePipeConversions import parallelai.spyglass.hbase.HBaseSource - -//case class MapFeatures(slug : String, json : String) +import com.twitter.scalding.{ Dsl, RichPipe, IterableSource, TupleSetter, TupleConverter } +import cascading.pipe.Pipe class ScoreJob(args: Args) extends JobBase(args) { //with HBasePipeConversions { + // TODO: Instantiate any subclass of Scorable specified in args. + val sc1 : Scorable = new GrobidScorable() + val sc2 : Scorable = new GrobidScorable() + val pipe1 : TypedPipe[(String, ReduceFeatures)] = sc1.getInputPipe(args) + val pipe2 : TypedPipe[(String, ReduceFeatures)] = sc2.getInputPipe(args) + + pipe1.join(pipe2).map { entry => + val (slug : String, (features1 : ReduceFeatures, features2 : ReduceFeatures)) = entry + new ReduceOutput( + slug, + Scorable.computeSimilarity(features1, features2), + features1.json, + features2.json) + } + .write(TypedTsv[ReduceOutput](args("output"))) + /* val grobidSource = HBaseCrossrefScore.getHBaseSource( args("hbase-table"), args("zookeeper-hosts")) @@ -34,7 +50,6 @@ class ScoreJob(args: Args) extends JobBase(args) { //with HBasePipeConversions { // This leads to a compile error: val pipe11 : TypedPipe[String] = ScoreJob.getFeaturesPipe1(pipe0) - /* val pipe : cascading.pipe.Pipe = grobidSource .read val grobidPipe : TypedPipe[(String, String)] = pipe @@ -46,22 +61,6 @@ class ScoreJob(args: Args) extends JobBase(args) { //with HBasePipeConversions { // Let's try making a method call. // ScoreJob.etFeaturesPipe(pipe) - // TODO: Instantiate any subclass of Scorable specified in args. - Scorable sc1 = new GrobidScorable() - Scorable sc2 = new CrossrefScorable() - val pipe1 : TypedPipe[(String, ReduceFeatures)] = sc1.getInputPipe(sc1.getSource().read) - val pipe2 : TypedPipe[(String, ReduceFeatures)] = sc2.getInputPipe(sc2.getSource().read) - - - pipe1.join(pipe2).map { entry => - val (slug : String, (features1 : ReduceFeatures, features2 : ReduceFeatures)) = entry - new ReduceOutput( - slug, - Scorable.computeSimilarity(features1, features2), - features1.json, - features2.json) - } - .write(TypedTsv[ReduceOutput](args("output"))) */ } @@ -93,12 +92,25 @@ object ScoreJob { } } + /* + implicit def sourceToRichPipe(src: Source): RichPipe = new RichPipe(src.read) + + // This converts an Iterable into a Pipe or RichPipe with index (int-based) fields + implicit def toPipe[T](iter: Iterable[T])(implicit set: TupleSetter[T], conv: TupleConverter[T]): Pipe = + IterableSource[T](iter)(set, conv).read + + implicit def iterableToRichPipe[T](iter: Iterable[T])(implicit set: TupleSetter[T], conv: TupleConverter[T]): RichPipe = + RichPipe(toPipe(iter)(set, conv)) + + // Provide args as an implicit val for extensions such as the Checkpoint extension. +// implicit protected def _implicitJobArgs: Args = args + def getFeaturesPipe1(pipe : cascading.pipe.Pipe) : TypedPipe[String] = { pipe // The next line gives an error: value toTypedPipe is not a member of cascading.pipe.Pipe .toTypedPipe[String](new Fields("line")) } -/* + def getFeaturesPipe(pipe : cascading.pipe.Pipe) : TypedPipe[MapFeatures] = { pipe .fromBytesWritable(new Fields("key", "tei_json")) |