package parallelai.spyglass.hbase

import java.io.{ ByteArrayOutputStream, DataOutputStream }

import cascading.scheme.Scheme
import cascading.tap.{ SinkMode, Tap }
import com.twitter.scalding._
import org.apache.hadoop.hbase.client.Scan
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.{ Base64, Bytes }
import org.apache.hadoop.mapred.{ JobConf, OutputCollector, RecordReader }
object HBaseRawSource {
  /**
   * Converts a Scan object to a base64 string that can be passed to HBaseRawSource.
   *
   * @param scan the Scan to serialize
   * @return base64 string representation of the scan
   */
  def convertScanToString(scan: Scan): String = {
    val out = new ByteArrayOutputStream
    val dos = new DataOutputStream(out)
    scan.write(dos)
    Base64.encodeBytes(out.toByteArray)
  }
}
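
// A minimal usage sketch for convertScanToString (illustrative only, not part
// of the library API): the column family "cf" and the caching value below are
// placeholder choices, not defaults of HBaseRawSource.
object HBaseRawSourceScanExample {
  def encodedScan: String = {
    val scan = new Scan
    scan.setCaching(1000)               // rows fetched per RPC; tune for the table
    scan.addFamily(Bytes.toBytes("cf")) // restrict the scan to one family
    HBaseRawSource.convertScanToString(scan)
  }
}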
/**
 * @author Rotem Hermon
 *
 * HBaseRawSource is a Scalding source that passes the original row (Result)
 * object to the mapper for customized processing.
 *
 * @param tableName   The name of the HBase table to read
 * @param quorumNames The HBase (ZooKeeper) quorum
 * @param familyNames Column families to read (as a source; if null, all families
 *                    are fetched) or to update (as a sink)
 * @param writeNulls  Whether the sink should write null values (default = true).
 *                    If false, null columns will not be written
 * @param base64Scan  An optional base64-encoded Scan object
 *                    (see HBaseRawSource.convertScanToString)
 * @param sinkMode    If REPLACE, the output table will be deleted before writing to it
 */
class HBaseRawSource(
  tableName: String,
  quorumNames: String = "localhost",
  familyNames: Array[String],
  writeNulls: Boolean = true,
  base64Scan: String = null,
  sinkMode: SinkMode = null) extends Source {

  override val hdfsScheme = new HBaseRawScheme(familyNames, writeNulls)
    .asInstanceOf[Scheme[JobConf, RecordReader[_, _], OutputCollector[_, _], _, _]]

  override def createTap(readOrWrite: AccessMode)(implicit mode: Mode): Tap[_, _, _] = {
    val hBaseScheme = hdfsScheme match {
      case hbase: HBaseRawScheme => hbase
      case _ => throw new ClassCastException("Failed casting from Scheme to HBaseRawScheme")
    }
    mode match {
      case Hdfs(_, _) =>
        // Default the sink mode per access direction: KEEP for reads,
        // UPDATE for writes, unless a mode was given explicitly.
        val effectiveSinkMode = Option(sinkMode).getOrElse(readOrWrite match {
          case Read => SinkMode.KEEP
          case Write => SinkMode.UPDATE
        })
        new HBaseRawTap(quorumNames, tableName, hBaseScheme, base64Scan, effectiveSinkMode)
          .asInstanceOf[Tap[_, _, _]]
      case _ => super.createTap(readOrWrite)(mode)
    }
  }
}
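
// A usage sketch (illustrative, not part of the library): reading raw rows in
// a Scalding job. The table "events", quorum "zk-host:2181", family "cf" and
// output argument are placeholders, and the field names ('rowkey, 'row) and
// tuple types are assumptions based on HBaseRawScheme's source fields.
class HBaseRawExampleJob(args: Args) extends Job(args) {
  val scan = new Scan
  scan.addFamily(Bytes.toBytes("cf")) // limit the scan to the family we read

  new HBaseRawSource(
    tableName = "events",
    quorumNames = "zk-host:2181",
    familyNames = Array("cf"),
    base64Scan = HBaseRawSource.convertScanToString(scan))
    .read
    // Each input tuple carries the row key and the raw Result; here we just
    // extract the key as a string and write it out.
    .mapTo(('rowkey, 'row) -> 'key) {
      in: (ImmutableBytesWritable, org.apache.hadoop.hbase.client.Result) =>
        Bytes.toString(in._1.get)
    }
    .write(Tsv(args("output")))
}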