You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by te...@apache.org on 2013/02/27 03:16:13 UTC
svn commit: r1450580 - in /hbase/trunk/hbase-server/src:
main/java/org/apache/hadoop/hbase/mapreduce/
test/java/org/apache/hadoop/hbase/mapreduce/
Author: tedyu
Date: Wed Feb 27 02:16:13 2013
New Revision: 1450580
URL: http://svn.apache.org/r1450580
Log:
HBASE-7747 Import tools should use a combiner to merge Puts (Nick Dimiduk)
Added:
hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/mapreduce/PutCombiner.java
Modified:
hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/mapreduce/ImportTsv.java
hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/mapreduce/TableMapReduceUtil.java
hbase/trunk/hbase-server/src/test/java/org/apache/hadoop/hbase/mapreduce/TestImportTsv.java
hbase/trunk/hbase-server/src/test/java/org/apache/hadoop/hbase/mapreduce/TestTableMapReduce.java
Modified: hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/mapreduce/ImportTsv.java
URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/mapreduce/ImportTsv.java?rev=1450580&r1=1450579&r2=1450580&view=diff
==============================================================================
--- hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/mapreduce/ImportTsv.java (original)
+++ hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/mapreduce/ImportTsv.java Wed Feb 27 02:16:13 2013
@@ -18,8 +18,6 @@
*/
package org.apache.hadoop.hbase.mapreduce;
-import org.apache.hadoop.hbase.util.Base64;
-
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
@@ -37,6 +35,7 @@ import org.apache.hadoop.hbase.client.HB
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+import org.apache.hadoop.hbase.util.Base64;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
@@ -283,6 +282,7 @@ public class ImportTsv {
FileOutputFormat.setOutputPath(job, outputDir);
job.setMapOutputKeyClass(ImmutableBytesWritable.class);
job.setMapOutputValueClass(Put.class);
+ job.setCombinerClass(PutCombiner.class);
HFileOutputFormat.configureIncrementalLoad(job, table);
} else {
// No reducers. Just write straight to table. Call initTableReducerJob
Added: hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/mapreduce/PutCombiner.java
URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/mapreduce/PutCombiner.java?rev=1450580&view=auto
==============================================================================
--- hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/mapreduce/PutCombiner.java (added)
+++ hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/mapreduce/PutCombiner.java Wed Feb 27 02:16:13 2013
@@ -0,0 +1,72 @@
+/**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.mapreduce;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.TreeMap;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.hbase.client.Put;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.mapreduce.Reducer;
+
+/**
+ * Combine Puts. Merges Put instances grouped by <code>K</code> into a single
+ * instance.
+ * @see TableMapReduceUtil
+ */
+@InterfaceAudience.Public
+@InterfaceStability.Evolving
+public class PutCombiner<K> extends Reducer<K, Put, K, Put> {
+  private static final Log LOG = LogFactory.getLog(PutCombiner.class);
+
+  /**
+   * Merges all {@link Put}s grouped under <code>row</code> into as few Put
+   * instances as possible (one per distinct rowkey) before emitting them.
+   * @param row the map-output key the Puts were grouped by
+   * @param vals the Put instances to merge
+   * @param context sink for the combined output
+   */
+  @Override
+  protected void reduce(K row, Iterable<Put> vals, Context context)
+    throws IOException, InterruptedException {
+
+    int cnt = 0;
+    // There's nothing to say <code>K row</code> is the same as the rowkey
+    // used to construct Puts (value) instances. Thus the map of put.getRow()
+    // to combined Put is necessary.
+    // NOTE: the map must be keyed on byte[] *content*, not array identity.
+    // byte[] inherits identity-based hashCode()/equals(), so a
+    // HashMap<byte[], Put> would never match two distinct row-key arrays and
+    // no combining would ever happen. A TreeMap ordered by
+    // Bytes.BYTES_COMPARATOR compares the bytes themselves.
+    // TODO: would be better if we knew <code>K row</code> and Put rowkey were
+    // identical. Then this whole Put buffering business goes away.
+    // TODO: Could use HeapSize to create an upper bound on the memory size of
+    // the puts map and flush some portion of the content while looping. This
+    // flush could result in multiple Puts for a single rowkey. That is
+    // acceptable because Combiner is run as an optimization and it's not
+    // critical that all Puts are grouped perfectly.
+    Map<byte[], Put> puts = new TreeMap<byte[], Put>(Bytes.BYTES_COMPARATOR);
+    for (Put p : vals) {
+      cnt++;
+      // Single lookup instead of containsKey() + get().
+      Put combined = puts.get(p.getRow());
+      if (combined == null) {
+        puts.put(p.getRow(), p);
+      } else {
+        combined.getFamilyMap().putAll(p.getFamilyMap());
+      }
+    }
+
+    for (Put p : puts.values()) {
+      context.write(row, p);
+    }
+    // Combiners run once per key group per spill; keep this at debug level
+    // to avoid flooding task logs on large jobs.
+    if (LOG.isDebugEnabled()) {
+      LOG.debug(String.format("Combined %d Put(s) into %d.", cnt, puts.size()));
+    }
+  }
+}
Modified: hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/mapreduce/TableMapReduceUtil.java
URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/mapreduce/TableMapReduceUtil.java?rev=1450580&r1=1450579&r2=1450580&view=diff
==============================================================================
--- hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/mapreduce/TableMapReduceUtil.java (original)
+++ hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/mapreduce/TableMapReduceUtil.java Wed Feb 27 02:16:13 2013
@@ -38,6 +38,7 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.catalog.MetaReader;
+import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
@@ -133,6 +134,9 @@ public class TableMapReduceUtil {
if (outputValueClass != null) job.setMapOutputValueClass(outputValueClass);
if (outputKeyClass != null) job.setMapOutputKeyClass(outputKeyClass);
job.setMapperClass(mapper);
+ if (Put.class.equals(outputValueClass)) {
+ job.setCombinerClass(PutCombiner.class);
+ }
Configuration conf = job.getConfiguration();
HBaseConfiguration.merge(conf, HBaseConfiguration.create(conf));
conf.set(TableInputFormat.INPUT_TABLE, table);
Modified: hbase/trunk/hbase-server/src/test/java/org/apache/hadoop/hbase/mapreduce/TestImportTsv.java
URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-server/src/test/java/org/apache/hadoop/hbase/mapreduce/TestImportTsv.java?rev=1450580&r1=1450579&r2=1450580&view=diff
==============================================================================
--- hbase/trunk/hbase-server/src/test/java/org/apache/hadoop/hbase/mapreduce/TestImportTsv.java (original)
+++ hbase/trunk/hbase-server/src/test/java/org/apache/hadoop/hbase/mapreduce/TestImportTsv.java Wed Feb 27 02:16:13 2013
@@ -289,6 +289,8 @@ public class TestImportTsv {
LOG.info("set the hbaseAdmin");
ImportTsv.createHbaseAdmin(conf);
}
+ // force use of combiner for testing purposes
+ conf.setInt("min.num.spills.for.combine", 1);
Job job = ImportTsv.createSubmittableJob(conf, args);
job.waitForCompletion(false);
assertTrue(job.isSuccessful());
Modified: hbase/trunk/hbase-server/src/test/java/org/apache/hadoop/hbase/mapreduce/TestTableMapReduce.java
URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-server/src/test/java/org/apache/hadoop/hbase/mapreduce/TestTableMapReduce.java?rev=1450580&r1=1450579&r2=1450580&view=diff
==============================================================================
--- hbase/trunk/hbase-server/src/test/java/org/apache/hadoop/hbase/mapreduce/TestTableMapReduce.java (original)
+++ hbase/trunk/hbase-server/src/test/java/org/apache/hadoop/hbase/mapreduce/TestTableMapReduce.java Wed Feb 27 02:16:13 2013
@@ -29,7 +29,9 @@ import org.apache.commons.logging.LogFac
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hbase.*;
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.KeyValue;
+import org.apache.hadoop.hbase.LargeTests;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
@@ -128,6 +130,15 @@ public class TestTableMapReduce {
MULTI_REGION_TABLE_NAME));
}
+ @Test
+ public void testCombiner()
+ throws IOException, InterruptedException, ClassNotFoundException {
+ Configuration conf = new Configuration(UTIL.getConfiguration());
+ // force use of combiner for testing purposes
+ conf.setInt("min.num.spills.for.combine", 1);
+ runTestOnTable(new HTable(conf, MULTI_REGION_TABLE_NAME));
+ }
+
private void runTestOnTable(HTable table)
throws IOException, InterruptedException, ClassNotFoundException {
Job job = null;