path: root/src/main/java/parallelai/spyglass/hbase/HBaseRawScheme.java
/*
* Copyright (c) 2009 Concurrent, Inc.
*
* This work has been released into the public domain
* by the copyright holder. This applies worldwide.
*
* In case this is not legally possible:
* The copyright holder grants any entity the right
* to use this work for any purpose, without any
* conditions, unless such conditions are required by law.
*/

package parallelai.spyglass.hbase;

import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;

import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapred.TableOutputFormat;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.elephantbird.mapred.input.DeprecatedInputFormatValueCopier;
import com.twitter.elephantbird.mapred.input.DeprecatedInputFormatWrapper;

import cascading.flow.FlowProcess;
import cascading.scheme.Scheme;
import cascading.scheme.SinkCall;
import cascading.scheme.SourceCall;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
import cascading.util.Util;

/**
 * Provides the wiring between Cascading {@link Fields} and HBase column
 * families and column qualifiers, so that a field can be written to cf:column.
 *
 * For example, given the columns and data
 *
 *   data:name   data:surname   address:street
 *   name1       surname1       address1
 *
 * the HBaseSource would be initialized with the column families
 *   ("data", "data", "address")
 * the column qualifiers
 *   ("name", "surname", "street")
 * and the data
 *   ("name1", "surname1", "address1")
 *   ...
 *
 * The HBaseRawScheme class is a {@link Scheme} subclass. It is used in
 * conjunction with the {@link HBaseRawTap} to read data from and write data to
 * an HBase cluster.
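 *
 * A minimal usage sketch (scheme construction only; how the scheme is handed
 * to a tap depends on the {@link HBaseRawTap} constructor and is omitted here):
 *
 * <pre>{@code
 * // Source tuples carry two fields: "rowkey" (an ImmutableBytesWritable)
 * // and "row" (the full HBase Result for that key).
 * HBaseRawScheme scheme = new HBaseRawScheme(new String[] { "data", "address" });
 *
 * // When sinking, a field named "data:name" is written to family "data",
 * // qualifier "name". A bare field name falls back to the family declared
 * // at the same position, or to the last declared family.
 * }</pre>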
 *
 * @see HBaseRawTap
 */
@SuppressWarnings({ "rawtypes", "deprecation" })
public class HBaseRawScheme extends Scheme<JobConf, RecordReader, OutputCollector, Object[], Object[]> {
	/**
	 *
	 */
	private static final long serialVersionUID = 6248976486883281356L;

	/** Field LOG */
	private static final Logger LOG = LoggerFactory.getLogger(HBaseRawScheme.class);

	public final Fields RowKeyField = new Fields("rowkey");
	public final Fields RowField = new Fields("row");

	/** String familyNames */
	private String[] familyNames;

	private boolean writeNulls = true;

	/**
	 * Constructor HBaseRawScheme creates a new HBaseRawScheme instance.
	 *
	 * @param familyName
	 *            the single column family (type String) to source from and sink to
	 */
	public HBaseRawScheme(String familyName) {
		this(new String[] { familyName });
	}

	public HBaseRawScheme(String[] familyNames) {
		this.familyNames = familyNames;
		setSourceFields();
	}

	public HBaseRawScheme(String familyName, boolean writeNulls) {
		this(new String[] { familyName }, writeNulls);
	}

	public HBaseRawScheme(String[] familyNames, boolean writeNulls) {
		this.familyNames = familyNames;
		this.writeNulls = writeNulls;
		setSourceFields();
	}

	private void setSourceFields() {
		Fields sourceFields = Fields.join(RowKeyField, RowField);
		setSourceFields(sourceFields);
	}

	/**
	 * Method getFamilyNames returns the distinct column family names of this
	 * HBaseRawScheme object.
	 *
	 * @return the distinct familyNames (type String[]) of this HBaseRawScheme object.
	 */
	public String[] getFamilyNames() {
		HashSet<String> familyNameSet = new HashSet<String>();
		if (familyNames != null) {
			for (String familyName : familyNames) {
				familyNameSet.add(familyName);
			}
		}
		return familyNameSet.toArray(new String[0]);
	}

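	/**
	 * Creates the reusable key/value pair from the underlying RecordReader and
	 * stores it in the source call context, so that {@link #source} can reuse
	 * the same objects for every row.
	 */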
	@Override
	public void sourcePrepare(FlowProcess<JobConf> flowProcess, SourceCall<Object[], RecordReader> sourceCall) {
		Object[] pair = new Object[] { sourceCall.getInput().createKey(), sourceCall.getInput().createValue() };

		sourceCall.setContext(pair);
	}

	@Override
	public void sourceCleanup(FlowProcess<JobConf> flowProcess, SourceCall<Object[], RecordReader> sourceCall) {
		sourceCall.setContext(null);
	}

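	/**
	 * Reads the next row from the RecordReader and emits a two-field tuple: the
	 * row key as an {@link ImmutableBytesWritable} and the full HBase
	 * {@link Result}. Returns false when the input is exhausted; rows with a
	 * null key or value are skipped without updating the tuple.
	 */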
	@SuppressWarnings("unchecked")
	@Override
	public boolean source(FlowProcess<JobConf> flowProcess, SourceCall<Object[], RecordReader> sourceCall)
			throws IOException {
		Tuple result = new Tuple();

		Object key = sourceCall.getContext()[0];
		Object value = sourceCall.getContext()[1];
		boolean hasNext = sourceCall.getInput().next(key, value);
		if (!hasNext) {
			return false;
		}

		// Skip nulls
		if (key == null || value == null) {
			return true;
		}

		ImmutableBytesWritable keyWritable = (ImmutableBytesWritable) key;
		Result row = (Result) value;
		result.add(keyWritable);
		result.add(row);
		sourceCall.getIncomingEntry().setTuple(result);
		return true;
	}

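	/**
	 * Writes one tuple as an HBase {@link Put}. The "rowkey" field supplies the
	 * row key; every remaining field becomes one cell. A field named
	 * "family:qualifier" is written to that family and qualifier, while a bare
	 * field name falls back to the family declared at the same position (or the
	 * last declared family). Null values are written as empty byte arrays unless
	 * writeNulls is false, in which case they are skipped.
	 */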
	@SuppressWarnings("unchecked")
	@Override
	public void sink(FlowProcess<JobConf> flowProcess, SinkCall<Object[], OutputCollector> sinkCall) throws IOException {
		TupleEntry tupleEntry = sinkCall.getOutgoingEntry();
		OutputCollector outputCollector = sinkCall.getOutput();
		Tuple key = tupleEntry.selectTuple(RowKeyField);
		Object okey = key.getObject(0);
		ImmutableBytesWritable keyBytes = getBytes(okey);
		Put put = new Put(keyBytes.get());
		Fields outFields = tupleEntry.getFields().subtract(RowKeyField);
		if (null != outFields) {
			TupleEntry values = tupleEntry.selectEntry(outFields);
			for (int n = 0; n < values.getFields().size(); n++) {
				Object o = values.get(n);
				ImmutableBytesWritable valueBytes = getBytes(o);
				Comparable field = outFields.get(n);
				ColumnName cn = parseColumn((String) field);
				if (null == cn.family) {
					if (n >= familyNames.length)
						cn.family = familyNames[familyNames.length - 1];
					else
						cn.family = familyNames[n];
				}
				if (null != o || writeNulls)
					put.add(Bytes.toBytes(cn.family), Bytes.toBytes(cn.name), valueBytes.get());
			}
		}
		outputCollector.collect(null, put);
	}

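	/**
	 * Converts a tuple value to an {@link ImmutableBytesWritable}. Supports
	 * ImmutableBytesWritable (returned as-is), String, Long, Integer, Short,
	 * Boolean and Double; null becomes an empty byte array. Any other type
	 * results in an IllegalArgumentException.
	 */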
	private ImmutableBytesWritable getBytes(Object obj) {
		if (null == obj)
			return new ImmutableBytesWritable(new byte[0]);
		if (obj instanceof ImmutableBytesWritable)
			return (ImmutableBytesWritable) obj;
		else if (obj instanceof String)
			return new ImmutableBytesWritable(Bytes.toBytes((String) obj));
		else if (obj instanceof Long)
			return new ImmutableBytesWritable(Bytes.toBytes((Long) obj));
		else if (obj instanceof Integer)
			return new ImmutableBytesWritable(Bytes.toBytes((Integer) obj));
		else if (obj instanceof Short)
			return new ImmutableBytesWritable(Bytes.toBytes((Short) obj));
		else if (obj instanceof Boolean)
			return new ImmutableBytesWritable(Bytes.toBytes((Boolean) obj));
		else if (obj instanceof Double)
			return new ImmutableBytesWritable(Bytes.toBytes((Double) obj));
		else
			throw new IllegalArgumentException("cannot convert object to ImmutableBytesWritable, class="
					+ obj.getClass().getName());
	}

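	/**
	 * Splits a field name of the form "family:qualifier" into its parts. If no
	 * colon is present, only the qualifier is set and the family is resolved
	 * later from the configured family names.
	 */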
	private ColumnName parseColumn(String column) {
		ColumnName ret = new ColumnName();
		int pos = column.indexOf(":");
		if (pos > 0) {
			ret.name = column.substring(pos + 1);
			ret.family = column.substring(0, pos);
		} else {
			ret.name = column;
		}
		return ret;
	}

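	/** Simple holder for a parsed "family:qualifier" column name. */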
	private class ColumnName {
		String family;
		String name;

		ColumnName() {
		}
	}

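	/**
	 * Configures the job for sinking: uses the mapred {@link TableOutputFormat}
	 * with {@link ImmutableBytesWritable} keys and {@link Put} values.
	 */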
	@Override
	public void sinkConfInit(FlowProcess<JobConf> process, Tap<JobConf, RecordReader, OutputCollector> tap, JobConf conf) {
		conf.setOutputFormat(TableOutputFormat.class);
		conf.setOutputKeyClass(ImmutableBytesWritable.class);
		conf.setOutputValueClass(Put.class);
	}

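	/**
	 * Configures the job for sourcing: wraps the mapreduce TableInputFormat in a
	 * DeprecatedInputFormatWrapper so it can be driven through the mapred API,
	 * and restricts the scan to the configured column families via SCAN_COLUMNS.
	 */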
	@Override
	public void sourceConfInit(FlowProcess<JobConf> process, Tap<JobConf, RecordReader, OutputCollector> tap,
			JobConf conf) {

		DeprecatedInputFormatWrapper.setInputFormat(org.apache.hadoop.hbase.mapreduce.TableInputFormat.class, conf,
				ValueCopier.class);
		if (null != familyNames) {
			String columns = Util.join(this.familyNames, " ");
			LOG.debug("sourcing from column families: {}", columns);
			conf.set(org.apache.hadoop.hbase.mapreduce.TableInputFormat.SCAN_COLUMNS, columns);
		}
	}

	@Override
	public boolean equals(Object object) {
		if (this == object) {
			return true;
		}
		if (object == null || getClass() != object.getClass()) {
			return false;
		}
		if (!super.equals(object)) {
			return false;
		}

		HBaseRawScheme that = (HBaseRawScheme) object;

		if (!Arrays.equals(familyNames, that.familyNames)) {
			return false;
		}
		return true;
	}

	@Override
	public int hashCode() {
		int result = super.hashCode();
		result = 31 * result + (familyNames != null ? Arrays.hashCode(familyNames) : 0);
		return result;
	}

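	/**
	 * Copies a freshly read {@link Result} into the reusable value instance held
	 * by the DeprecatedInputFormatWrapper.
	 */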
	public static class ValueCopier implements DeprecatedInputFormatValueCopier<Result> {

		public ValueCopier() {
		}

		public void copyValue(Result oldValue, Result newValue) {
			if (null != oldValue && null != newValue) {
				oldValue.copyFrom(newValue);
			}
		}

	}
}