You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@crunch.apache.org by jw...@apache.org on 2012/08/20 09:31:49 UTC
git commit: CRUNCH-31: Crunch HBase example, contributed by Gauthier Ambard.
Updated Branches:
refs/heads/master 60d6ca5de -> 012e924e6
CRUNCH-31: Crunch HBase example, contributed by Gauthier Ambard.
Project: http://git-wip-us.apache.org/repos/asf/incubator-crunch/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-crunch/commit/012e924e
Tree: http://git-wip-us.apache.org/repos/asf/incubator-crunch/tree/012e924e
Diff: http://git-wip-us.apache.org/repos/asf/incubator-crunch/diff/012e924e
Branch: refs/heads/master
Commit: 012e924e6a2900065a7f9fefd2265136d65af2ee
Parents: 60d6ca5
Author: jwills <jw...@apache.org>
Authored: Mon Aug 20 00:30:26 2012 -0700
Committer: jwills <jw...@apache.org>
Committed: Mon Aug 20 00:30:26 2012 -0700
----------------------------------------------------------------------
crunch-examples/pom.xml | 5 +
.../crunch/examples/WordAggregationHBase.java | 250 +++++++++++++++
2 files changed, 255 insertions(+), 0 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/012e924e/crunch-examples/pom.xml
----------------------------------------------------------------------
diff --git a/crunch-examples/pom.xml b/crunch-examples/pom.xml
index 66465ca..c07207a 100644
--- a/crunch-examples/pom.xml
+++ b/crunch-examples/pom.xml
@@ -45,6 +45,11 @@ under the License.
<groupId>org.apache.crunch</groupId>
<artifactId>crunch</artifactId>
</dependency>
+
+ <dependency>
+ <groupId>org.apache.crunch</groupId>
+ <artifactId>crunch-hbase</artifactId>
+ </dependency>
</dependencies>
<build>
http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/012e924e/crunch-examples/src/main/java/org/apache/crunch/examples/WordAggregationHBase.java
----------------------------------------------------------------------
diff --git a/crunch-examples/src/main/java/org/apache/crunch/examples/WordAggregationHBase.java b/crunch-examples/src/main/java/org/apache/crunch/examples/WordAggregationHBase.java
new file mode 100644
index 0000000..680752f
--- /dev/null
+++ b/crunch-examples/src/main/java/org/apache/crunch/examples/WordAggregationHBase.java
@@ -0,0 +1,250 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.examples;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.crunch.CombineFn;
+import org.apache.crunch.DoFn;
+import org.apache.crunch.Emitter;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pair;
+import org.apache.crunch.Pipeline;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.io.hbase.HBaseSourceTarget;
+import org.apache.crunch.io.hbase.HBaseTarget;
+import org.apache.crunch.types.writable.Writables;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.hbase.HBaseConfiguration;
+import org.apache.hadoop.hbase.HColumnDescriptor;
+import org.apache.hadoop.hbase.HTableDescriptor;
+import org.apache.hadoop.hbase.MasterNotRunningException;
+import org.apache.hadoop.hbase.ZooKeeperConnectionException;
+import org.apache.hadoop.hbase.client.HBaseAdmin;
+import org.apache.hadoop.hbase.client.HTable;
+import org.apache.hadoop.hbase.client.Put;
+import org.apache.hadoop.hbase.client.Result;
+import org.apache.hadoop.hbase.client.Scan;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Example Crunch pipeline that reads play quotes from an HBase table and
+ * writes, for each play, the concatenation of all of its quotes into a
+ * second HBase table.
+ *
+ * A running HBase instance is required. The project must also declare a
+ * dependency on the HBase version it runs against:
+ * <pre>
+ * &lt;dependency&gt;
+ *   &lt;groupId&gt;org.apache.hbase&lt;/groupId&gt;
+ *   &lt;artifactId&gt;hbase&lt;/artifactId&gt;
+ *   &lt;version&gt;...&lt;/version&gt;
+ * &lt;/dependency&gt;
+ * </pre>
+ */
+@SuppressWarnings("serial")
+public class WordAggregationHBase extends Configured implements Tool, Serializable {
+ private final static Logger LOGGER = LoggerFactory.getLogger(WordAggregationHBase.class);
+
+ // Configuration parameters. Here configured for an HBase instance running
+ // locally
+ private static String HBASE_CONFIGURATION_ZOOKEEPER_QUORUM = "hbase.zookeeper.quorum";
+ private static String HBASE_CONFIGURATION_ZOOKEEPER_CLIENTPORT = "hbase.zookeeper.property.clientPort";
+ private static String hbaseZookeeperQuorum = "localhost";
+ private static String hbaseZookeeperClientPort = "2181";
+
+ // HBase parameters.
+ // Source table: one row per character, columns content:play and content:quote.
+ // Target table: one row per play, column aggregation:text.
+ private final String TABLE_SOURCE = "list";
+ private final String TABLE_TARGET = "aggregation";
+
+ private final byte[] COLUMN_FAMILY_SOURCE = Bytes.toBytes("content");
+ private final byte[] COLUMN_QUALIFIER_SOURCE_PLAY = Bytes.toBytes("play");
+ private final byte[] COLUMN_QUALIFIER_SOURCE_QUOTE = Bytes.toBytes("quote");
+
+ private final byte[] COLUMN_FAMILY_TARGET = Bytes.toBytes("aggregation");
+ private final byte[] COLUMN_QUALIFIER_TARGET_TEXT = Bytes.toBytes("text");
+
+ /**
+ * Runs the whole example: creates both tables, loads the sample rows into
+ * the source table, then runs a Crunch pipeline that concatenates all
+ * quotes belonging to the same play and writes the result to the target
+ * table.
+ *
+ * @param args command-line arguments (unused)
+ * @return 0 on success
+ */
+ @Override
+ public int run(final String[] args) throws Exception {
+ // We create the test rows first
+ String type1 = "romeo and juliet";
+ String type2 = "macbeth";
+
+ String quote1 = "That which we call a rose By any other word would smell as sweet";
+ String quote2 = "But, soft! what light through yonder window breaks? It is the east, and Juliet is the sun.";
+ // NOTE(review): "leadher" below is missing a space ("lead her"); left
+ // untouched here — confirm whether it is intentional sample data.
+ String quote3 = "But first, let me tell ye, if you should leadher in a fool's paradise, as they say,";
+ String quote4 = "Fair is foul, and foul is fair";
+ String quote5 = "But screw your courage to the sticking-place, And we'll not fail.";
+
+ String[] character = { "juliet", "romeo", "nurse", "witch", "macbeth" };
+ String[] type = { type1, type1, type1, type2, type2 };
+ String[] quote = { quote1, quote2, quote3, quote4, quote5 };
+
+ List<Put> putList = createPuts(Arrays.asList(character), Arrays.asList(type), Arrays.asList(quote));
+
+ // We create the tables and fill the source
+ Configuration configuration = getConf();
+
+ createTable(configuration, TABLE_SOURCE, Bytes.toString(COLUMN_FAMILY_SOURCE));
+ createTable(configuration, TABLE_TARGET, Bytes.toString(COLUMN_FAMILY_TARGET));
+
+ putInHbase(putList, configuration);
+
+ // We create the pipeline which will handle most of the job.
+ // NOTE(review): the pipeline is built from HBaseConfiguration.create()
+ // rather than getConf(), so settings injected through ToolRunner (e.g. the
+ // zookeeper quorum set in main) are not seen by the pipeline — confirm
+ // this is intended.
+ Pipeline pipeline = new MRPipeline(WordAggregationHBase.class, HBaseConfiguration.create());
+
+ // The scan which will retrieve the data from the source in hbase.
+ Scan scan = new Scan();
+ scan.addColumn(COLUMN_FAMILY_SOURCE, COLUMN_QUALIFIER_SOURCE_PLAY);
+ scan.addColumn(COLUMN_FAMILY_SOURCE, COLUMN_QUALIFIER_SOURCE_QUOTE);
+
+ // Our hbase source
+ HBaseSourceTarget source = new HBaseSourceTarget(TABLE_SOURCE, scan);
+
+ // Our source, in a format which can be used by Crunch
+ PTable<ImmutableBytesWritable, Result> rawText = pipeline.read(source);
+
+ // We extract (play, quote) pairs from the source HTable, then concatenate
+ // all quotes that share the same play name
+ PTable<String, String> textExtracted = extractText(rawText);
+ CombineFn<String, String> stringConcatCombine = CombineFn.STRING_CONCAT(" ", true);
+ PTable<String, String> result = textExtracted.groupByKey().combineValues(stringConcatCombine);
+
+ // We create the collection of puts from the concatenated data
+ PCollection<Put> resultPut = createPut(result);
+
+ // We write the puts in hbase, in the target table
+ pipeline.write(resultPut, new HBaseTarget(TABLE_TARGET));
+
+ pipeline.done();
+ return 0;
+ }
+
+ /**
+ * Put the puts in HBase
+ *
+ * @param putList the puts
+ * @param conf the hbase configuration
+ * @throws IOException if the source table cannot be reached or written to
+ */
+ private void putInHbase(final List<Put> putList, final Configuration conf) throws IOException {
+ HTable htable = new HTable(conf, TABLE_SOURCE);
+ try {
+ htable.put(putList);
+ } finally {
+ // Always release the table handle, even if the put fails.
+ htable.close();
+ }
+ }
+
+ /**
+ * Create the table if it does not already exist, with the given column
+ * families.
+ *
+ * @param conf the hbase configuration
+ * @param htableName the table name
+ * @param families the column family names
+ * @throws MasterNotRunningException if the HBase master is not running
+ * @throws ZooKeeperConnectionException if ZooKeeper cannot be reached
+ * @throws IOException if the table cannot be created
+ */
+ private void createTable(final Configuration conf, final String htableName, final String... families) throws MasterNotRunningException, ZooKeeperConnectionException,
+ IOException {
+ HBaseAdmin hbase = new HBaseAdmin(conf);
+ if (!hbase.tableExists(htableName)) {
+ HTableDescriptor desc = new HTableDescriptor(htableName);
+ for (String s : families) {
+ HColumnDescriptor meta = new HColumnDescriptor(s);
+ desc.addFamily(meta);
+ }
+ hbase.createTable(desc);
+ }
+ }
+
+ /**
+ * Create a list of puts, one per character.
+ *
+ * @param character the rowkeys
+ * @param play the play (in column COLUMN_QUALIFIER_SOURCE_PLAY)
+ * @param quote the quote (in column COLUMN_QUALIFIER_SOURCE_QUOTE)
+ * @return one put per character, carrying its play and quote
+ * @throws IllegalArgumentException if the three lists differ in size
+ */
+ private List<Put> createPuts(final List<String> character, final List<String> play, final List<String> quote) throws IllegalArgumentException {
+ List<Put> list = new ArrayList<Put>();
+ if (character.size() != play.size() || quote.size() != play.size()) {
+ LOGGER.error("Every list should have the same number of elements");
+ throw new IllegalArgumentException("Every list should have the same number of elements");
+ }
+ for (int i = 0; i < character.size(); i++) {
+ Put put = new Put(Bytes.toBytes(character.get(i)));
+ put.add(COLUMN_FAMILY_SOURCE, COLUMN_QUALIFIER_SOURCE_PLAY, Bytes.toBytes(play.get(i)));
+ put.add(COLUMN_FAMILY_SOURCE, COLUMN_QUALIFIER_SOURCE_QUOTE, Bytes.toBytes(quote.get(i)));
+ list.add(put);
+ }
+ return list;
+ }
+
+ /**
+ * Extract information from hbase
+ *
+ * @param words the source from hbase
+ * @return a <code>PTable</code> with the play name as key and the quote
+ * text as value; rows missing either column are skipped
+ */
+ public PTable<String, String> extractText(final PTable<ImmutableBytesWritable, Result> words) {
+ return words.parallelDo("Extract text", new DoFn<Pair<ImmutableBytesWritable, Result>, Pair<String, String>>() {
+ @Override
+ public void process(final Pair<ImmutableBytesWritable, Result> row, final Emitter<Pair<String, String>> emitter) {
+ byte[] type = row.second().getValue(COLUMN_FAMILY_SOURCE, COLUMN_QUALIFIER_SOURCE_PLAY);
+ byte[] def = row.second().getValue(COLUMN_FAMILY_SOURCE, COLUMN_QUALIFIER_SOURCE_QUOTE);
+ // Only emit when both columns are present on the row.
+ if (type != null && def != null) {
+ emitter.emit(new Pair<String, String>(Bytes.toString(type), Bytes.toString(def)));
+ }
+ }
+ }, Writables.tableOf(Writables.strings(), Writables.strings()));
+ }
+
+ /**
+ * Create puts in order to insert them in hbase.
+ *
+ * @param extractedText
+ * a PTable which contains the data in order to create the puts:
+ * keys of the PTable are rowkeys for the puts, values are the
+ * values for hbase.
+ * @return a PCollection formed by the puts.
+ */
+ public PCollection<Put> createPut(final PTable<String, String> extractedText) {
+ return extractedText.parallelDo("Convert to puts", new DoFn<Pair<String, String>, Put>() {
+ @Override
+ public void process(final Pair<String, String> input, final Emitter<Put> emitter) {
+ Put put;
+ put = new Put(Bytes.toBytes(input.first()));
+ put.add(COLUMN_FAMILY_TARGET, COLUMN_QUALIFIER_TARGET_TEXT, Bytes.toBytes(input.second()));
+ emitter.emit(put);
+ }
+ }, Writables.writables(Put.class));
+ }
+
+ /**
+ * Entry point: configures the ZooKeeper connection, then delegates to
+ * {@link #run(String[])} through {@link ToolRunner}.
+ */
+ public static void main(final String[] args) throws Exception {
+ Configuration conf = HBaseConfiguration.create();
+ // Configuration hbase
+ conf.set(HBASE_CONFIGURATION_ZOOKEEPER_QUORUM, hbaseZookeeperQuorum);
+ conf.set(HBASE_CONFIGURATION_ZOOKEEPER_CLIENTPORT, hbaseZookeeperClientPort);
+ // NOTE(review): the exit code returned by run() is discarded; consider
+ // System.exit(ToolRunner.run(...)).
+ ToolRunner.run(conf, new WordAggregationHBase(), args);
+ }
+}