diff options
Diffstat (limited to 'scalding')
| -rw-r--r-- | scalding/src/main/scala/sandcrawler/DumpUnGrobidedJob.scala (renamed from scalding/src/main/scala/sandcrawler/UnGrobidedDumpJob.scala) | 10 | ||||
| -rw-r--r-- | scalding/src/test/scala/sandcrawler/DumpUnGrobidedJobTest.scala (renamed from scalding/src/test/scala/sandcrawler/UnGrobidedDumpJobTest.scala) | 8 | 
2 files changed, 9 insertions, 9 deletions
| diff --git a/scalding/src/main/scala/sandcrawler/UnGrobidedDumpJob.scala b/scalding/src/main/scala/sandcrawler/DumpUnGrobidedJob.scala index 0ce9167..7fd3ce0 100644 --- a/scalding/src/main/scala/sandcrawler/UnGrobidedDumpJob.scala +++ b/scalding/src/main/scala/sandcrawler/DumpUnGrobidedJob.scala @@ -15,19 +15,19 @@ import parallelai.spyglass.hbase.HBaseSource  // full CDX metadata, and dumps to a TSV for later extraction by the  // "extraction-ungrobided" job.  // -// Does the same horrible join thing that UnGrobidedDumpJob does. -class UnGrobidedDumpJob(args: Args) extends JobBase(args) with HBasePipeConversions { +// Does the same horrible join thing that DumpUnGrobidedJob does. +class DumpUnGrobidedJob(args: Args) extends JobBase(args) with HBasePipeConversions {    val output = args("output") -  val allKeys : TypedPipe[(String,String,String,String)] = UnGrobidedDumpJob.getHBaseKeySource( +  val allKeys : TypedPipe[(String,String,String,String)] = DumpUnGrobidedJob.getHBaseKeySource(      args("hbase-table"),      args("zookeeper-hosts"))      .read      .fromBytesWritable('key, 'c, 'mime, 'cdx)      .toTypedPipe[(String,String,String,String)]('key, 'c, 'mime, 'cdx) -  val existingKeys : TypedPipe[(String,Boolean)] = UnGrobidedDumpJob.getHBaseColSource( +  val existingKeys : TypedPipe[(String,Boolean)] = DumpUnGrobidedJob.getHBaseColSource(      args("hbase-table"),      args("zookeeper-hosts"))      .read @@ -46,7 +46,7 @@ class UnGrobidedDumpJob(args: Args) extends JobBase(args) with HBasePipeConversi  } -object UnGrobidedDumpJob { +object DumpUnGrobidedJob {    // eg, "wbgrp-journal-extract-0-qa",7 "mtrcs-zk1.us.archive.org:2181"    def getHBaseColSource(hbaseTable: String, zookeeperHosts: String) : HBaseSource = { diff --git a/scalding/src/test/scala/sandcrawler/UnGrobidedDumpJobTest.scala b/scalding/src/test/scala/sandcrawler/DumpUnGrobidedJobTest.scala index a847ebb..8dda5c8 100644 --- a/scalding/src/test/scala/sandcrawler/UnGrobidedDumpJobTest.scala +++ b/scalding/src/test/scala/sandcrawler/DumpUnGrobidedJobTest.scala @@ -17,7 +17,7 @@ import parallelai.spyglass.hbase.HBaseSource  import scala._  @RunWith(classOf[JUnitRunner]) -class UnGrobidedDumpJobTest extends FunSpec with TupleConversions { +class DumpUnGrobidedJobTest extends FunSpec with TupleConversions {    val output = "/tmp/testOutput"    val (testTable, testHost) = ("test-table", "dummy-host:2181") @@ -50,16 +50,16 @@ class UnGrobidedDumpJobTest extends FunSpec with TupleConversions {                          Bytes.toBytes(pair._3),                          Bytes.toBytes(pair._4))) -  JobTest("sandcrawler.UnGrobidedDumpJob") +  JobTest("sandcrawler.DumpUnGrobidedJob")      .arg("test", "")      .arg("app.conf.path", "app.conf")      .arg("output", output)      .arg("hbase-table", testTable)      .arg("zookeeper-hosts", testHost)      .arg("debug", "true") -    .source[Tuple](UnGrobidedDumpJob.getHBaseColSource(testTable, testHost), +    .source[Tuple](DumpUnGrobidedJob.getHBaseColSource(testTable, testHost),        sampleDataGrobid.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*))) -    .source[Tuple](UnGrobidedDumpJob.getHBaseKeySource(testTable, testHost), +    .source[Tuple](DumpUnGrobidedJob.getHBaseKeySource(testTable, testHost),        sampleDataFile.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*)))      .sink[Tuple](TypedTsv[(String,String,String,String)](output)) {        outputBuffer => | 
