about | summary | refs | log | tree | commit | diff | stats
path: root/scalding/src/main/scala/sandcrawler/ScoreJob.scala
diff options
context:
space:
mode:
author Ellen Spertus <ellen.spertus@gmail.com> 2018-08-10 19:59:40 -0700
committer Ellen Spertus <ellen.spertus@gmail.com> 2018-08-10 19:59:40 -0700
commit b7f77f6337b450406ae0a90b81faeba27394afb0 (patch)
tree dfe2ffa3a606e6b82c69f821e4415d93ab6e0676 /scalding/src/main/scala/sandcrawler/ScoreJob.scala
parent 5ce5e5dc98cdbb5a84c79313df93d670111e6a1d (diff)
download sandcrawler-b7f77f6337b450406ae0a90b81faeba27394afb0.tar.gz
sandcrawler-b7f77f6337b450406ae0a90b81faeba27394afb0.zip
It compiles
Diffstat (limited to 'scalding/src/main/scala/sandcrawler/ScoreJob.scala')
-rw-r--r-- scalding/src/main/scala/sandcrawler/ScoreJob.scala 56
1 files changed, 34 insertions, 22 deletions
diff --git a/scalding/src/main/scala/sandcrawler/ScoreJob.scala b/scalding/src/main/scala/sandcrawler/ScoreJob.scala
index 0dbe64d..bc5bf87 100644
--- a/scalding/src/main/scala/sandcrawler/ScoreJob.scala
+++ b/scalding/src/main/scala/sandcrawler/ScoreJob.scala
@@ -2,16 +2,32 @@ package sandcrawler
import cascading.flow.FlowDef
import cascading.tuple.Fields
-import com.twitter.scalding._
-import com.twitter.scalding.typed.TDsl._
+import com.twitter.scalding.{Args,Source,TextLine,TypedPipe, TypedTsv}
+//import com.twitter.scalding.typed.TDsl._
import parallelai.spyglass.base.JobBase
import parallelai.spyglass.hbase.HBasePipeConversions
import parallelai.spyglass.hbase.HBaseSource
-
-//case class MapFeatures(slug : String, json : String)
+import com.twitter.scalding.{ Dsl, RichPipe, IterableSource, TupleSetter, TupleConverter }
+import cascading.pipe.Pipe
class ScoreJob(args: Args) extends JobBase(args) { //with HBasePipeConversions {
+ // TODO: Instantiate any subclass of Scorable specified in args.
+ val sc1 : Scorable = new GrobidScorable()
+ val sc2 : Scorable = new GrobidScorable()
+ val pipe1 : TypedPipe[(String, ReduceFeatures)] = sc1.getInputPipe(args)
+ val pipe2 : TypedPipe[(String, ReduceFeatures)] = sc2.getInputPipe(args)
+
+ pipe1.join(pipe2).map { entry =>
+ val (slug : String, (features1 : ReduceFeatures, features2 : ReduceFeatures)) = entry
+ new ReduceOutput(
+ slug,
+ Scorable.computeSimilarity(features1, features2),
+ features1.json,
+ features2.json)
+ }
+ .write(TypedTsv[ReduceOutput](args("output")))
+ /*
val grobidSource = HBaseCrossrefScore.getHBaseSource(
args("hbase-table"),
args("zookeeper-hosts"))
@@ -34,7 +50,6 @@ class ScoreJob(args: Args) extends JobBase(args) { //with HBasePipeConversions {
// This leads to a compile error:
val pipe11 : TypedPipe[String] = ScoreJob.getFeaturesPipe1(pipe0)
- /*
val pipe : cascading.pipe.Pipe = grobidSource
.read
val grobidPipe : TypedPipe[(String, String)] = pipe
@@ -46,22 +61,6 @@ class ScoreJob(args: Args) extends JobBase(args) { //with HBasePipeConversions {
// Let's try making a method call.
// ScoreJob.etFeaturesPipe(pipe)
- // TODO: Instantiate any subclass of Scorable specified in args.
- Scorable sc1 = new GrobidScorable()
- Scorable sc2 = new CrossrefScorable()
- val pipe1 : TypedPipe[(String, ReduceFeatures)] = sc1.getInputPipe(sc1.getSource().read)
- val pipe2 : TypedPipe[(String, ReduceFeatures)] = sc2.getInputPipe(sc2.getSource().read)
-
-
- pipe1.join(pipe2).map { entry =>
- val (slug : String, (features1 : ReduceFeatures, features2 : ReduceFeatures)) = entry
- new ReduceOutput(
- slug,
- Scorable.computeSimilarity(features1, features2),
- features1.json,
- features2.json)
- }
- .write(TypedTsv[ReduceOutput](args("output")))
*/
}
@@ -93,12 +92,25 @@ object ScoreJob {
}
}
+ /*
+ implicit def sourceToRichPipe(src: Source): RichPipe = new RichPipe(src.read)
+
+ // This converts an Iterable into a Pipe or RichPipe with index (int-based) fields
+ implicit def toPipe[T](iter: Iterable[T])(implicit set: TupleSetter[T], conv: TupleConverter[T]): Pipe =
+ IterableSource[T](iter)(set, conv).read
+
+ implicit def iterableToRichPipe[T](iter: Iterable[T])(implicit set: TupleSetter[T], conv: TupleConverter[T]): RichPipe =
+ RichPipe(toPipe(iter)(set, conv))
+
+ // Provide args as an implicit val for extensions such as the Checkpoint extension.
+// implicit protected def _implicitJobArgs: Args = args
+
def getFeaturesPipe1(pipe : cascading.pipe.Pipe) : TypedPipe[String] = {
pipe
// The next line gives an error: value toTypedPipe is not a member of cascading.pipe.Pipe
.toTypedPipe[String](new Fields("line"))
}
-/*
+
def getFeaturesPipe(pipe : cascading.pipe.Pipe) : TypedPipe[MapFeatures] = {
pipe
.fromBytesWritable(new Fields("key", "tei_json"))