aboutsummaryrefslogtreecommitdiffstats
path: root/scalding
diff options
context:
space:
mode:
Diffstat (limited to 'scalding')
-rw-r--r--scalding/src/main/scala/sandcrawler/DumpUnGrobidedJob.scala (renamed from scalding/src/main/scala/sandcrawler/UnGrobidedDumpJob.scala)10
-rw-r--r--scalding/src/test/scala/sandcrawler/DumpUnGrobidedJobTest.scala (renamed from scalding/src/test/scala/sandcrawler/UnGrobidedDumpJobTest.scala)8
2 files changed, 9 insertions, 9 deletions
diff --git a/scalding/src/main/scala/sandcrawler/UnGrobidedDumpJob.scala b/scalding/src/main/scala/sandcrawler/DumpUnGrobidedJob.scala
index 0ce9167..7fd3ce0 100644
--- a/scalding/src/main/scala/sandcrawler/UnGrobidedDumpJob.scala
+++ b/scalding/src/main/scala/sandcrawler/DumpUnGrobidedJob.scala
@@ -15,19 +15,19 @@ import parallelai.spyglass.hbase.HBaseSource
// full CDX metadata, and dumps to a TSV for later extraction by the
// "extraction-ungrobided" job.
//
-// Does the same horrible join thing that UnGrobidedDumpJob does.
-class UnGrobidedDumpJob(args: Args) extends JobBase(args) with HBasePipeConversions {
+// Does the same horrible join thing that DumpUnGrobidedJob does.
+class DumpUnGrobidedJob(args: Args) extends JobBase(args) with HBasePipeConversions {
val output = args("output")
- val allKeys : TypedPipe[(String,String,String,String)] = UnGrobidedDumpJob.getHBaseKeySource(
+ val allKeys : TypedPipe[(String,String,String,String)] = DumpUnGrobidedJob.getHBaseKeySource(
args("hbase-table"),
args("zookeeper-hosts"))
.read
.fromBytesWritable('key, 'c, 'mime, 'cdx)
.toTypedPipe[(String,String,String,String)]('key, 'c, 'mime, 'cdx)
- val existingKeys : TypedPipe[(String,Boolean)] = UnGrobidedDumpJob.getHBaseColSource(
+ val existingKeys : TypedPipe[(String,Boolean)] = DumpUnGrobidedJob.getHBaseColSource(
args("hbase-table"),
args("zookeeper-hosts"))
.read
@@ -46,7 +46,7 @@ class UnGrobidedDumpJob(args: Args) extends JobBase(args) with HBasePipeConversi
}
-object UnGrobidedDumpJob {
+object DumpUnGrobidedJob {
// eg, "wbgrp-journal-extract-0-qa", "mtrcs-zk1.us.archive.org:2181"
def getHBaseColSource(hbaseTable: String, zookeeperHosts: String) : HBaseSource = {
diff --git a/scalding/src/test/scala/sandcrawler/UnGrobidedDumpJobTest.scala b/scalding/src/test/scala/sandcrawler/DumpUnGrobidedJobTest.scala
index a847ebb..8dda5c8 100644
--- a/scalding/src/test/scala/sandcrawler/UnGrobidedDumpJobTest.scala
+++ b/scalding/src/test/scala/sandcrawler/DumpUnGrobidedJobTest.scala
@@ -17,7 +17,7 @@ import parallelai.spyglass.hbase.HBaseSource
import scala._
@RunWith(classOf[JUnitRunner])
-class UnGrobidedDumpJobTest extends FunSpec with TupleConversions {
+class DumpUnGrobidedJobTest extends FunSpec with TupleConversions {
val output = "/tmp/testOutput"
val (testTable, testHost) = ("test-table", "dummy-host:2181")
@@ -50,16 +50,16 @@ class UnGrobidedDumpJobTest extends FunSpec with TupleConversions {
Bytes.toBytes(pair._3),
Bytes.toBytes(pair._4)))
- JobTest("sandcrawler.UnGrobidedDumpJob")
+ JobTest("sandcrawler.DumpUnGrobidedJob")
.arg("test", "")
.arg("app.conf.path", "app.conf")
.arg("output", output)
.arg("hbase-table", testTable)
.arg("zookeeper-hosts", testHost)
.arg("debug", "true")
- .source[Tuple](UnGrobidedDumpJob.getHBaseColSource(testTable, testHost),
+ .source[Tuple](DumpUnGrobidedJob.getHBaseColSource(testTable, testHost),
sampleDataGrobid.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*)))
- .source[Tuple](UnGrobidedDumpJob.getHBaseKeySource(testTable, testHost),
+ .source[Tuple](DumpUnGrobidedJob.getHBaseKeySource(testTable, testHost),
sampleDataFile.map(l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*)))
.sink[Tuple](TypedTsv[(String,String,String,String)](output)) {
outputBuffer =>