You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by vj...@apache.org on 2023/06/16 17:08:20 UTC
[hbase] branch branch-3 updated: HBASE-27904: A random data generator tool leveraging hbase bulk load (#5280)
This is an automated email from the ASF dual-hosted git repository.
vjasani pushed a commit to branch branch-3
in repository https://gitbox.apache.org/repos/asf/hbase.git
The following commit(s) were added to refs/heads/branch-3 by this push:
new 5231991a3a7 HBASE-27904: A random data generator tool leveraging hbase bulk load (#5280)
5231991a3a7 is described below
commit 5231991a3a7a9c863e9ae678003e9b0d1d37a355
Author: Himanshu Gwalani <hg...@gmail.com>
AuthorDate: Fri Jun 16 22:32:00 2023 +0530
HBASE-27904: A random data generator tool leveraging hbase bulk load (#5280)
Signed-off-by: Viraj Jasani <vj...@apache.org>
---
.../BulkDataGeneratorInputFormat.java | 87 ++++++
.../bulkdatagenerator/BulkDataGeneratorMapper.java | 138 ++++++++++
.../BulkDataGeneratorRecordReader.java | 75 +++++
.../bulkdatagenerator/BulkDataGeneratorTool.java | 301 +++++++++++++++++++++
.../hbase/util/bulkdatagenerator/Utility.java | 102 +++++++
.../_chapters/bulk_data_generator_tool.adoc | 132 +++++++++
6 files changed, 835 insertions(+)
diff --git a/hbase-mapreduce/src/test/java/org/apache/hadoop/hbase/util/bulkdatagenerator/BulkDataGeneratorInputFormat.java b/hbase-mapreduce/src/test/java/org/apache/hadoop/hbase/util/bulkdatagenerator/BulkDataGeneratorInputFormat.java
new file mode 100644
index 00000000000..f40951e945d
--- /dev/null
+++ b/hbase-mapreduce/src/test/java/org/apache/hadoop/hbase/util/bulkdatagenerator/BulkDataGeneratorInputFormat.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.util.bulkdatagenerator;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.InputFormat;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+
+import org.apache.hbase.thirdparty.com.google.common.base.Preconditions;
+
+/**
+ * {@link InputFormat} for {@link BulkDataGeneratorTool}. Creates one dummy input split per
+ * configured mapper task; each split is consumed by a {@link BulkDataGeneratorRecordReader},
+ * which emits the record indices to be generated by that mapper.
+ */
+public class BulkDataGeneratorInputFormat extends InputFormat<Text, NullWritable> {
+
+  public static final String MAPPER_TASK_COUNT_KEY =
+    BulkDataGeneratorInputFormat.class.getName() + ".mapper.task.count";
+
+  @Override
+  public List<InputSplit> getSplits(JobContext job) throws IOException {
+    // Get the number of mapper tasks configured
+    int mapperCount = job.getConfiguration().getInt(MAPPER_TASK_COUNT_KEY, -1);
+    // A single mapper is a valid configuration, so reject only non-positive values. (The
+    // earlier check of "mapperCount > 1" wrongly rejected a legal count of exactly 1, even
+    // though BulkDataGeneratorTool validates mapper-count with "> 0".)
+    Preconditions.checkArgument(mapperCount > 0,
+      MAPPER_TASK_COUNT_KEY + " is not set to a positive integer.");
+
+    // Create a number of input splits equal to the number of mapper tasks
+    ArrayList<InputSplit> splits = new ArrayList<InputSplit>();
+    for (int i = 0; i < mapperCount; ++i) {
+      splits.add(new FakeInputSplit());
+    }
+    return splits;
+  }
+
+  @Override
+  public RecordReader<Text, NullWritable> createRecordReader(InputSplit split,
+    TaskAttemptContext context) throws IOException, InterruptedException {
+    BulkDataGeneratorRecordReader bulkDataGeneratorRecordReader =
+      new BulkDataGeneratorRecordReader();
+    bulkDataGeneratorRecordReader.initialize(split, context);
+    return bulkDataGeneratorRecordReader;
+  }
+
+  /**
+   * Dummy input split to be used by {@link BulkDataGeneratorRecordReader}. Carries no data; it
+   * exists only so the framework launches the desired number of mapper tasks.
+   */
+  private static class FakeInputSplit extends InputSplit implements Writable {
+
+    @Override
+    public void readFields(DataInput arg0) throws IOException {
+    }
+
+    @Override
+    public void write(DataOutput arg0) throws IOException {
+    }
+
+    @Override
+    public long getLength() throws IOException, InterruptedException {
+      return 0;
+    }
+
+    @Override
+    public String[] getLocations() throws IOException, InterruptedException {
+      return new String[0];
+    }
+  }
+}
diff --git a/hbase-mapreduce/src/test/java/org/apache/hadoop/hbase/util/bulkdatagenerator/BulkDataGeneratorMapper.java b/hbase-mapreduce/src/test/java/org/apache/hadoop/hbase/util/bulkdatagenerator/BulkDataGeneratorMapper.java
new file mode 100644
index 00000000000..35f8b9c471e
--- /dev/null
+++ b/hbase-mapreduce/src/test/java/org/apache/hadoop/hbase/util/bulkdatagenerator/BulkDataGeneratorMapper.java
@@ -0,0 +1,138 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.util.bulkdatagenerator;
+
+import java.io.IOException;
+import java.math.BigDecimal;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import org.apache.commons.math3.util.Pair;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.KeyValue;
+import org.apache.hadoop.hbase.client.Admin;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+
+import org.apache.hbase.thirdparty.com.google.common.collect.Lists;
+import org.apache.hbase.thirdparty.com.google.common.collect.Maps;
+
+/**
+ * Mapper for {@link BulkDataGeneratorTool}. For every record index supplied as the input key it
+ * emits one synthetic row: a shared row key plus one {@link KeyValue} per column of
+ * {@link Utility.TableColumnNames}.
+ */
+public class BulkDataGeneratorMapper
+  extends Mapper<Text, NullWritable, ImmutableBytesWritable, KeyValue> {
+
+  /** Counter enumeration to count number of rows generated. */
+  public static enum Counters {
+    ROWS_GENERATED
+  }
+
+  public static final String SPLIT_COUNT_KEY =
+    BulkDataGeneratorMapper.class.getName() + ".split.count";
+
+  // Fixed organization id appended to every row key and written to the ORG_ID column
+  private static final String ORG_ID = "00D000000000062";
+  private static final int MAX_EVENT_ID = Integer.MAX_VALUE;
+  private static final int MAX_VEHICLE_ID = 100;
+  private static final int MAX_SPEED_KPH = 140;
+  private static final int NUM_LOCATIONS = 10;
+  // Number of table splits; refreshed from the job configuration in setup()
+  private static int splitCount = 1;
+  private static final Random random = new Random(System.currentTimeMillis());
+  // City name -> (latitude, longitude) pairs backing the LOCATION/LATITUDE/LONGITUDE columns
+  private static final Map<String, Pair<BigDecimal, BigDecimal>> LOCATIONS =
+    Maps.newHashMapWithExpectedSize(NUM_LOCATIONS);
+  private static final List<String> LOCATION_KEYS = Lists.newArrayListWithCapacity(NUM_LOCATIONS);
+  static {
+    LOCATIONS.put("Belém", new Pair<>(BigDecimal.valueOf(-01.45), BigDecimal.valueOf(-48.48)));
+    LOCATIONS.put("Brasília", new Pair<>(BigDecimal.valueOf(-15.78), BigDecimal.valueOf(-47.92)));
+    LOCATIONS.put("Campinas", new Pair<>(BigDecimal.valueOf(-22.90), BigDecimal.valueOf(-47.05)));
+    LOCATIONS.put("Cuiaba", new Pair<>(BigDecimal.valueOf(-07.25), BigDecimal.valueOf(-58.42)));
+    LOCATIONS.put("Manaus", new Pair<>(BigDecimal.valueOf(-03.10), BigDecimal.valueOf(-60.00)));
+    LOCATIONS.put("Porto Velho",
+      new Pair<>(BigDecimal.valueOf(-08.75), BigDecimal.valueOf(-63.90)));
+    LOCATIONS.put("Recife", new Pair<>(BigDecimal.valueOf(-08.10), BigDecimal.valueOf(-34.88)));
+    LOCATIONS.put("Rio de Janeiro",
+      new Pair<>(BigDecimal.valueOf(-22.90), BigDecimal.valueOf(-43.23)));
+    LOCATIONS.put("Santarém", new Pair<>(BigDecimal.valueOf(-02.43), BigDecimal.valueOf(-54.68)));
+    LOCATIONS.put("São Paulo", new Pair<>(BigDecimal.valueOf(-23.53), BigDecimal.valueOf(-46.62)));
+    LOCATION_KEYS.addAll(LOCATIONS.keySet());
+  }
+
+  // NOTE(review): getBytes() relies on the platform default charset here and below; the family
+  // name is ASCII so this is safe in practice, but an explicit charset would be more robust.
+  final static byte[] COLUMN_FAMILY_BYTES = Utility.COLUMN_FAMILY.getBytes();
+
+  /** {@inheritDoc} */
+  @Override
+  protected void setup(Context context) throws IOException, InterruptedException {
+    Configuration c = context.getConfiguration();
+    splitCount = c.getInt(SPLIT_COUNT_KEY, 1);
+  }
+
+  /**
+   * Generates a single record based on value set to the key by
+   * {@link BulkDataGeneratorRecordReader#getCurrentKey()}.
+   * {@link Utility.TableColumnNames#TOOL_EVENT_ID} is first part of row key. Keeping first
+   * {@link Utility#SPLIT_PREFIX_LENGTH} characters as index of the record to be generated ensures
+   * that records are equally distributed across all regions of the table since region boundaries
+   * are generated in similar fashion. Check {@link Utility#createTable(Admin, String, int, Map)}
+   * method for region split info.
+   * @param key - The key having index of next record to be generated
+   * @param value - Value associated with the key (not used)
+   * @param context - Context of the mapper container
+   */
+  @Override
+  protected void map(Text key, NullWritable value, Context context)
+    throws IOException, InterruptedException {
+
+    int recordIndex = Integer.parseInt(key.toString());
+
+    // Row key: <6-char-region-boundary-prefix>_<current-millis><15-digit-random>_<record-index>.
+    // The random middle part is cast to long so it renders as plain digits; without the cast the
+    // double was string-concatenated in scientific notation (e.g. "1.0E14"), producing row keys
+    // of inconsistent shape and length.
+    final String toolEventId =
+      String.format("%0" + Utility.SPLIT_PREFIX_LENGTH + "d", recordIndex % (splitCount + 1)) + "_"
+        + EnvironmentEdgeManager.currentTime() + ((long) (1e14 + (random.nextFloat() * 9e13)))
+        + "_" + recordIndex;
+    final String eventId = String.valueOf(Math.abs(random.nextInt(MAX_EVENT_ID)));
+    final String vehicleId = String.valueOf(Math.abs(random.nextInt(MAX_VEHICLE_ID)));
+    final String speed = String.valueOf(Math.abs(random.nextInt(MAX_SPEED_KPH)));
+    final String location = LOCATION_KEYS.get(random.nextInt(NUM_LOCATIONS));
+    final Pair<BigDecimal, BigDecimal> coordinates = LOCATIONS.get(location);
+    final BigDecimal latitude = coordinates.getFirst();
+    final BigDecimal longitude = coordinates.getSecond();
+
+    final ImmutableBytesWritable hKey =
+      new ImmutableBytesWritable(String.format("%s:%s", toolEventId, ORG_ID).getBytes());
+    addKeyValue(context, hKey, Utility.TableColumnNames.ORG_ID, ORG_ID);
+    addKeyValue(context, hKey, Utility.TableColumnNames.TOOL_EVENT_ID, toolEventId);
+    addKeyValue(context, hKey, Utility.TableColumnNames.EVENT_ID, eventId);
+    addKeyValue(context, hKey, Utility.TableColumnNames.VEHICLE_ID, vehicleId);
+    addKeyValue(context, hKey, Utility.TableColumnNames.SPEED, speed);
+    addKeyValue(context, hKey, Utility.TableColumnNames.LATITUDE, latitude.toString());
+    addKeyValue(context, hKey, Utility.TableColumnNames.LONGITUDE, longitude.toString());
+    addKeyValue(context, hKey, Utility.TableColumnNames.LOCATION, location);
+    addKeyValue(context, hKey, Utility.TableColumnNames.TIMESTAMP,
+      String.valueOf(EnvironmentEdgeManager.currentTime()));
+
+    context.getCounter(Counters.ROWS_GENERATED).increment(1);
+  }
+
+  /** Emits one KeyValue for the given column under the shared row key and column family. */
+  private void addKeyValue(final Context context, ImmutableBytesWritable key,
+    final Utility.TableColumnNames columnName, final String value)
+    throws IOException, InterruptedException {
+    KeyValue kv =
+      new KeyValue(key.get(), COLUMN_FAMILY_BYTES, columnName.getColumnName(), value.getBytes());
+    context.write(key, kv);
+  }
+}
diff --git a/hbase-mapreduce/src/test/java/org/apache/hadoop/hbase/util/bulkdatagenerator/BulkDataGeneratorRecordReader.java b/hbase-mapreduce/src/test/java/org/apache/hadoop/hbase/util/bulkdatagenerator/BulkDataGeneratorRecordReader.java
new file mode 100644
index 00000000000..f4ecc659e51
--- /dev/null
+++ b/hbase-mapreduce/src/test/java/org/apache/hadoop/hbase/util/bulkdatagenerator/BulkDataGeneratorRecordReader.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.util.bulkdatagenerator;
+
+import java.io.IOException;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+
+import org.apache.hbase.thirdparty.com.google.common.base.Preconditions;
+
+/**
+ * {@link RecordReader} used by {@link BulkDataGeneratorInputFormat}. It reads nothing from the
+ * split; instead it emits a monotonically increasing record index (1..N) as the key, with a
+ * {@link NullWritable} value. N is read from {@link #RECORDS_PER_MAPPER_TASK_KEY}.
+ */
+public class BulkDataGeneratorRecordReader extends RecordReader<Text, NullWritable> {
+
+  private int numRecordsToCreate = 0;
+  private int createdRecords = 0;
+  private Text key = new Text();
+  private NullWritable value = NullWritable.get();
+
+  // Deliberately namespaced under BulkDataGeneratorInputFormat, which owns the generator's
+  // job-level configuration keys (see MAPPER_TASK_COUNT_KEY there).
+  public static final String RECORDS_PER_MAPPER_TASK_KEY =
+    BulkDataGeneratorInputFormat.class.getName() + ".records.per.mapper.task";
+
+  @Override
+  public void initialize(InputSplit split, TaskAttemptContext context)
+    throws IOException, InterruptedException {
+    // Get the number of records to create from the configuration
+    this.numRecordsToCreate = context.getConfiguration().getInt(RECORDS_PER_MAPPER_TASK_KEY, -1);
+    Preconditions.checkArgument(numRecordsToCreate > 0,
+      "Number of records to be created by per mapper should be greater than 0.");
+  }
+
+  @Override
+  public boolean nextKeyValue() {
+    // Advance the counter first so emitted keys run 1..numRecordsToCreate
+    createdRecords++;
+    return createdRecords <= numRecordsToCreate;
+  }
+
+  @Override
+  public Text getCurrentKey() {
+    // Set the index of record to be created
+    key.set(String.valueOf(createdRecords));
+    return key;
+  }
+
+  @Override
+  public NullWritable getCurrentValue() {
+    return value;
+  }
+
+  @Override
+  public float getProgress() throws IOException, InterruptedException {
+    return (float) createdRecords / (float) numRecordsToCreate;
+  }
+
+  @Override
+  public void close() throws IOException {
+    // Nothing to close; no underlying resources are held.
+  }
+}
diff --git a/hbase-mapreduce/src/test/java/org/apache/hadoop/hbase/util/bulkdatagenerator/BulkDataGeneratorTool.java b/hbase-mapreduce/src/test/java/org/apache/hadoop/hbase/util/bulkdatagenerator/BulkDataGeneratorTool.java
new file mode 100644
index 00000000000..befa1486dec
--- /dev/null
+++ b/hbase-mapreduce/src/test/java/org/apache/hadoop/hbase/util/bulkdatagenerator/BulkDataGeneratorTool.java
@@ -0,0 +1,301 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.util.bulkdatagenerator;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.HBaseConfiguration;
+import org.apache.hadoop.hbase.KeyValue;
+import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.client.Admin;
+import org.apache.hadoop.hbase.client.Connection;
+import org.apache.hadoop.hbase.client.ConnectionFactory;
+import org.apache.hadoop.hbase.client.Table;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
+import org.apache.hadoop.hbase.tool.BulkLoadHFiles;
+import org.apache.hadoop.hbase.tool.BulkLoadHFilesTool;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.util.GenericOptionsParser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hbase.thirdparty.com.google.common.base.Preconditions;
+import org.apache.hbase.thirdparty.org.apache.commons.cli.CommandLine;
+import org.apache.hbase.thirdparty.org.apache.commons.cli.GnuParser;
+import org.apache.hbase.thirdparty.org.apache.commons.cli.HelpFormatter;
+import org.apache.hbase.thirdparty.org.apache.commons.cli.Option;
+import org.apache.hbase.thirdparty.org.apache.commons.cli.Options;
+import org.apache.hbase.thirdparty.org.apache.commons.cli.ParseException;
+import org.apache.hbase.thirdparty.org.apache.commons.cli.Parser;
+
+/**
+ * A command line utility to generate pre-splitted HBase Tables with large amount (TBs) of random
+ * data, equally distributed among all regions.
+ */
+/**
+ * A command line utility to generate pre-split HBase Tables with large amount (TBs) of random
+ * data, equally distributed among all regions.
+ */
+public class BulkDataGeneratorTool {
+
+  private static final Logger logger = LoggerFactory.getLogger(BulkDataGeneratorTool.class);
+
+  /**
+   * Prefix for the generated HFiles directory
+   */
+  private static final String OUTPUT_DIRECTORY_PREFIX = "/bulk_data_generator/";
+
+  /**
+   * Number of mapper container to be launched for generating of HFiles
+   */
+  private int mapperCount;
+
+  /**
+   * Number of rows to be generated by each mapper
+   */
+  private long rowsPerMapper;
+
+  /**
+   * Table for which random data needs to be generated
+   */
+  private String table;
+
+  /**
+   * Number of splits for the {@link #table}. Number of regions for the table will be
+   * ({@link #splitCount} + 1).
+   */
+  private int splitCount;
+
+  /**
+   * Flag to delete the table (before creating) if it already exists
+   */
+  private boolean deleteTableIfExist;
+
+  /**
+   * Additional HBase meta-data options to be set for the table
+   */
+  private final Map<String, String> tableOptions = new HashMap<>();
+
+  public static void main(String[] args) throws Exception {
+    Configuration conf = HBaseConfiguration.create();
+    BulkDataGeneratorTool bulkDataGeneratorTool = new BulkDataGeneratorTool();
+    bulkDataGeneratorTool.run(conf, args);
+  }
+
+  /**
+   * Parses CLI arguments, creates the pre-split table, runs the HFile-generating MR job and bulk
+   * loads the generated HFiles into the table.
+   * @return true if the data was generated and bulk loaded successfully, false otherwise
+   */
+  public boolean run(Configuration conf, String[] args) throws IOException {
+    // Read CLI arguments
+    CommandLine line = null;
+    try {
+      Parser parser = new GnuParser();
+      line = parser.parse(getOptions(), args);
+      readCommandLineParameters(conf, line);
+    } catch (ParseException | IOException exception) {
+      logger.error("Error while parsing CLI arguments.", exception);
+      printUsage();
+      return false;
+    }
+
+    if (line.hasOption("h")) {
+      printUsage();
+      return true;
+    }
+
+    Preconditions.checkArgument(!StringUtils.isEmpty(table), "Table name must not be empty");
+    Preconditions.checkArgument(mapperCount > 0, "Mapper count must be greater than 0");
+    Preconditions.checkArgument((splitCount > 0) && (splitCount < Utility.MAX_SPLIT_COUNT),
+      "Split count must be greater than 0 and less than " + Utility.MAX_SPLIT_COUNT);
+    Preconditions.checkArgument(rowsPerMapper > 0, "Rows per mapper must be greater than 0");
+
+    Path outputDirectory = generateOutputDirectory();
+    logger.info("HFiles will be generated at " + outputDirectory.toString());
+
+    try (Connection connection = ConnectionFactory.createConnection(conf)) {
+      final TableName tableName = TableName.valueOf(table);
+      // Close the Admin once existence check / creation is done (it was previously leaked)
+      try (Admin admin = connection.getAdmin()) {
+        if (admin.tableExists(tableName)) {
+          if (deleteTableIfExist) {
+            logger.info(
+              "Deleting the table since it already exist and delete-if-exist flag is set to true");
+            Utility.deleteTable(admin, table);
+          } else {
+            logger.info("Table already exists, cannot generate HFiles for existing table.");
+            return false;
+          }
+        }
+
+        // Creating the pre-split table
+        Utility.createTable(admin, table, splitCount, tableOptions);
+        logger.info(table + " created successfully");
+      }
+
+      Job job = createSubmittableJob(conf);
+
+      // Auto configure partitioner and reducer. The Table reference is only needed while
+      // configuring the job, so close it right after (it was previously leaked).
+      try (Table hbaseTable = connection.getTable(tableName)) {
+        HFileOutputFormat2.configureIncrementalLoad(job, hbaseTable,
+          hbaseTable.getRegionLocator());
+      }
+
+      FileOutputFormat.setOutputPath(job, outputDirectory);
+
+      boolean result = job.waitForCompletion(true);
+
+      if (result) {
+        logger.info("HFiles generated successfully. Starting bulk load to " + table);
+        BulkLoadHFilesTool bulkLoadHFilesTool = new BulkLoadHFilesTool(conf);
+        Map<BulkLoadHFiles.LoadQueueItem, ByteBuffer> bulkLoadedHFiles =
+          bulkLoadHFilesTool.bulkLoad(tableName, outputDirectory);
+        boolean status = !bulkLoadedHFiles.isEmpty();
+        logger.info("BulkLoadHFiles finished successfully with status " + status);
+        return status;
+      } else {
+        logger.info("Failed to generate HFiles.");
+        return false;
+      }
+    } catch (Exception e) {
+      logger.error("Failed to generate data", e);
+      return false;
+    } finally {
+      FileSystem.get(conf).deleteOnExit(outputDirectory);
+    }
+  }
+
+  /**
+   * Builds the map-only job that generates the HFiles. Partitioner/reducer configuration is done
+   * later via HFileOutputFormat2.configureIncrementalLoad().
+   */
+  protected Job createSubmittableJob(Configuration conf) throws IOException {
+
+    conf.setInt(BulkDataGeneratorMapper.SPLIT_COUNT_KEY, splitCount);
+    conf.setInt(BulkDataGeneratorInputFormat.MAPPER_TASK_COUNT_KEY, mapperCount);
+    conf.setLong(BulkDataGeneratorRecordReader.RECORDS_PER_MAPPER_TASK_KEY, rowsPerMapper);
+
+    // Job.getInstance replaces the deprecated Job(Configuration, String) constructor
+    Job job = Job.getInstance(conf, BulkDataGeneratorTool.class.getSimpleName() + " - " + table);
+
+    job.setJarByClass(BulkDataGeneratorMapper.class);
+    job.setInputFormatClass(BulkDataGeneratorInputFormat.class);
+
+    HBaseConfiguration.addHbaseResources(conf);
+
+    job.setMapperClass(BulkDataGeneratorMapper.class);
+    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
+    job.setMapOutputValueClass(KeyValue.class);
+
+    return job;
+  }
+
+  /** Returns Random output directory path where HFiles will be generated */
+  protected Path generateOutputDirectory() {
+    // OUTPUT_DIRECTORY_PREFIX already ends with a slash, so don't insert another one
+    final String outputDirectory =
+      OUTPUT_DIRECTORY_PREFIX + table + "-" + System.currentTimeMillis();
+    return new Path(outputDirectory);
+  }
+
+  /**
+   * This method parses the command line parameters into instance variables
+   */
+  protected void readCommandLineParameters(Configuration conf, CommandLine line)
+    throws ParseException, IOException {
+    final List<String> genericParameters = new ArrayList<String>();
+
+    // Parse the generic options
+    for (Map.Entry<Object, Object> entry : line.getOptionProperties("D").entrySet()) {
+      genericParameters.add("-D");
+      genericParameters.add(entry.getKey() + "=" + entry.getValue());
+    }
+
+    logger.info(
+      "Parsed generic parameters: " + Arrays.toString(genericParameters.toArray(new String[0])));
+
+    new GenericOptionsParser(conf, genericParameters.toArray(new String[0]));
+
+    table = line.getOptionValue("table");
+
+    if (line.hasOption("mapper-count")) {
+      mapperCount = Integer.parseInt(line.getOptionValue("mapper-count"));
+    }
+    if (line.hasOption("split-count")) {
+      splitCount = Integer.parseInt(line.getOptionValue("split-count"));
+    }
+    if (line.hasOption("rows-per-mapper")) {
+      rowsPerMapper = Long.parseLong(line.getOptionValue("rows-per-mapper"));
+    }
+
+    deleteTableIfExist = line.hasOption("delete-if-exist");
+
+    parseTableOptions(line);
+  }
+
+  /**
+   * Parses the comma-separated KEY=VALUE pairs supplied via -o/--table-options. Only the first
+   * '=' of each pair is treated as the separator, so values may themselves contain '='.
+   */
+  private void parseTableOptions(final CommandLine line) {
+    final String tableOptionsAsString = line.getOptionValue("table-options");
+    if (!StringUtils.isEmpty(tableOptionsAsString)) {
+      for (String tableOption : tableOptionsAsString.split(",")) {
+        // Limit of 2 keeps '=' inside the value intact (plain split("=") truncated such values
+        // and threw ArrayIndexOutOfBoundsException on options with no '=' at all)
+        final String[] keyValueSplit = tableOption.split("=", 2);
+        Preconditions.checkArgument(keyValueSplit.length == 2,
+          "Table option must be of the form KEY=VALUE, but got: " + tableOption);
+        final String key = keyValueSplit[0];
+        final String value = keyValueSplit[1];
+        tableOptions.put(key, value);
+      }
+    }
+  }
+
+  /** Returns the command line option for {@link BulkDataGeneratorTool} */
+  protected Options getOptions() {
+    final Options options = new Options();
+    Option option =
+      new Option("t", "table", true, "The table name for which data need to be generated.");
+    options.addOption(option);
+
+    option = new Option("d", "delete-if-exist", false,
+      "If it's set, the table will be deleted if already exist.");
+    options.addOption(option);
+
+    option =
+      new Option("mc", "mapper-count", true, "The number of mapper containers to be launched.");
+    options.addOption(option);
+
+    option = new Option("sc", "split-count", true,
+      "The number of regions/pre-splits to be created for the table.");
+    options.addOption(option);
+
+    option =
+      new Option("r", "rows-per-mapper", true, "The number of rows to be generated PER mapper.");
+    options.addOption(option);
+
+    option =
+      new Option("o", "table-options", true, "Table options to be set while creating the table.");
+    options.addOption(option);
+
+    option = new Option("h", "help", false, "Show help message for the tool");
+    options.addOption(option);
+
+    return options;
+  }
+
+  /** Prints tool usage (options plus worked examples) to stdout. */
+  protected void printUsage() {
+    final HelpFormatter helpFormatter = new HelpFormatter();
+    helpFormatter.setWidth(120);
+    final String helpMessageCommand = "hbase " + BulkDataGeneratorTool.class.getName();
+    final String commandSyntax = helpMessageCommand + " <OPTIONS> [-D<property=value>]*";
+    final String helpMessageSuffix = "Examples:\n" + helpMessageCommand
+      + " -t TEST_TABLE -mc 10 -r 100 -sc 10\n" + helpMessageCommand
+      + " -t TEST_TABLE -mc 10 -r 100 -sc 10 -d -o \"BACKUP=false,NORMALIZATION_ENABLED=false\"\n"
+      + helpMessageCommand + " -t TEST_TABLE -mc 10 -r 100 -sc 10 -Dmapreduce.map.memory.mb=8192\n";
+    helpFormatter.printHelp(commandSyntax, "", getOptions(), helpMessageSuffix);
+  }
+}
diff --git a/hbase-mapreduce/src/test/java/org/apache/hadoop/hbase/util/bulkdatagenerator/Utility.java b/hbase-mapreduce/src/test/java/org/apache/hadoop/hbase/util/bulkdatagenerator/Utility.java
new file mode 100644
index 00000000000..3db75239a64
--- /dev/null
+++ b/hbase-mapreduce/src/test/java/org/apache/hadoop/hbase/util/bulkdatagenerator/Utility.java
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.util.bulkdatagenerator;
+
+import java.io.IOException;
+import java.util.Map;
+import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.client.Admin;
+import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
+import org.apache.hadoop.hbase.client.TableDescriptor;
+import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
+
+import org.apache.hbase.thirdparty.com.google.common.base.Preconditions;
+
+public final class Utility {
+
+  /**
+   * Column names (schema) for the HBase table generated and populated by
+   * {@link BulkDataGeneratorTool}
+   */
+  public enum TableColumnNames {
+    // NOTE(review): String.getBytes() uses the platform default charset; these names are all
+    // ASCII so the result is the same everywhere, but an explicit charset would be more robust.
+    ORG_ID("orgId".getBytes()),
+    TOOL_EVENT_ID("toolEventId".getBytes()),
+    EVENT_ID("eventId".getBytes()),
+    VEHICLE_ID("vehicleId".getBytes()),
+    SPEED("speed".getBytes()),
+    LATITUDE("latitude".getBytes()),
+    LONGITUDE("longitude".getBytes()),
+    LOCATION("location".getBytes()),
+    TIMESTAMP("timestamp".getBytes());
+
+    // Raw column qualifier bytes for this column
+    private final byte[] columnName;
+
+    TableColumnNames(byte[] column) {
+      this.columnName = column;
+    }
+
+    /** Returns the column qualifier as bytes. */
+    public byte[] getColumnName() {
+      return this.columnName;
+    }
+  }
+
+  /** Single column family used for all generated columns. */
+  public static final String COLUMN_FAMILY = "cf";
+
+  /** Length of the zero-padded numeric prefix of every row key / region boundary. */
+  public static final int SPLIT_PREFIX_LENGTH = 6;
+
+  /** Upper bound on split count: the prefix can encode at most 10^SPLIT_PREFIX_LENGTH values. */
+  public static final int MAX_SPLIT_COUNT = (int) Math.pow(10, SPLIT_PREFIX_LENGTH);
+
+  /**
+   * Private Constructor
+   */
+  private Utility() {
+
+  }
+
+  /** Disables and then deletes the given table. */
+  public static void deleteTable(Admin admin, String tableName) throws IOException {
+    admin.disableTable(TableName.valueOf(tableName));
+    admin.deleteTable(TableName.valueOf(tableName));
+  }
+
+  /**
+   * Creates a pre-split HBase Table having single column family ({@link #COLUMN_FAMILY}) and
+   * sequential splits with {@link #SPLIT_PREFIX_LENGTH} length character prefix. Example: If a
+   * table (TEST_TABLE_1) needs to be generated with splitCount as 10, table would be created with
+   * (10+1) regions with boundaries end-keys as (000000-000001, 000001-000002, 000002-000003, ....,
+   * 000010-)
+   * @param admin - Admin object associated with HBase connection
+   * @param tableName - Name of table to be created
+   * @param splitCount - Number of splits for the table (Number of regions will be splitCount + 1)
+   * @param tableOptions - Additional HBase metadata properties to be set for the table
+   */
+  public static void createTable(Admin admin, String tableName, int splitCount,
+    Map<String, String> tableOptions) throws IOException {
+    Preconditions.checkArgument(splitCount > 0, "Split count must be greater than 0");
+    TableDescriptorBuilder tableDescriptorBuilder =
+      TableDescriptorBuilder.newBuilder(TableName.valueOf(tableName));
+    // Apply any user-supplied table metadata options (e.g. NORMALIZATION_ENABLED=false)
+    tableOptions.forEach(tableDescriptorBuilder::setValue);
+    TableDescriptor tableDescriptor = tableDescriptorBuilder
+      .setColumnFamily(ColumnFamilyDescriptorBuilder.of(COLUMN_FAMILY)).build();
+    // Pre-splitting table based on splitCount
+    byte[][] splitKeys = new byte[splitCount][];
+    for (int i = 0; i < splitCount; i++) {
+      splitKeys[i] = String.format("%0" + Utility.SPLIT_PREFIX_LENGTH + "d", i + 1).getBytes();
+    }
+    admin.createTable(tableDescriptor, splitKeys);
+  }
+}
diff --git a/src/main/asciidoc/_chapters/bulk_data_generator_tool.adoc b/src/main/asciidoc/_chapters/bulk_data_generator_tool.adoc
new file mode 100644
index 00000000000..3ac6ca69312
--- /dev/null
+++ b/src/main/asciidoc/_chapters/bulk_data_generator_tool.adoc
@@ -0,0 +1,132 @@
+////
+/**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+////
+
+== Bulk Data Generator Tool
+:doctype: book
+:numbered:
+:toc: left
+:icons: font
+:experimental:
+
+This is a random data generator tool for HBase tables leveraging HBase bulk load.
+It can create a pre-split HBase table, and the generated data is *uniformly distributed* to all the regions of the table.
+
+=== How to Use
+
+[source]
+----
+usage: hbase org.apache.hadoop.hbase.util.bulkdatagenerator.BulkDataGeneratorTool <OPTIONS> [-D<property=value>]*
+ -d,--delete-if-exist If it's set, the table will be deleted if already exist.
+ -h,--help Show help message for the tool
+ -mc,--mapper-count <arg> The number of mapper containers to be launched.
+ -o,--table-options <arg> Table options to be set while creating the table.
+ -r,--rows-per-mapper <arg> The number of rows to be generated PER mapper.
+ -sc,--split-count <arg> The number of regions/pre-splits to be created for the table.
+ -t,--table <arg> The table name for which data need to be generated.
+----
+
+----
+Examples:
+
+hbase org.apache.hadoop.hbase.util.bulkdatagenerator.BulkDataGeneratorTool -t TEST_TABLE -mc 10 -r 100 -sc 10
+
+hbase org.apache.hadoop.hbase.util.bulkdatagenerator.BulkDataGeneratorTool -t TEST_TABLE -mc 10 -r 100 -sc 10 -d -o "BACKUP=false,NORMALIZATION_ENABLED=false"
+
+hbase org.apache.hadoop.hbase.util.bulkdatagenerator.BulkDataGeneratorTool -t TEST_TABLE -mc 10 -r 100 -sc 10 -Dmapreduce.map.memory.mb=8192
+----
+
+=== How it Works
+
+==== Table Schema
+The tool generates an HBase table with a single column family, i.e. *cf*, and 9 columns i.e.
+----
+ORG_ID, TOOL_EVENT_ID, EVENT_ID, VEHICLE_ID, SPEED, LATITUDE, LONGITUDE, LOCATION, TIMESTAMP
+----
+with row key as
+----
+<TOOL_EVENT_ID>:<ORGANIZATION_ID>
+----
+
+==== Table Creation
+The tool creates a pre-split HBase table having "*split-count*" splits (i.e. *split-count* + 1 regions) with sequential 6 digit region boundary prefixes.
+Example: If a table is generated with "*split-count*" as 10, it will have (10+1) regions with the following start-end keys.
+----
+(-000001, 000001-000002, 000002-000003, ...., 000009-000010, 000010-)
+----
+
+==== Data Generation
+The tool creates and runs an MR job to generate the HFiles, which are bulk loaded to table regions via `org.apache.hadoop.hbase.tool.BulkLoadHFilesTool`.
+The number of mappers is defined in input as "*mapper-count*". Each mapper generates "*records-per-mapper*" rows.
+
+`org.apache.hadoop.hbase.util.bulkdatagenerator.BulkDataGeneratorRecordReader` ensures that each record generated by a mapper is associated with an index (added to the key) ranging from 1 to "*records-per-mapper*".
+
+The TOOL_EVENT_ID column for each row has a 6 digit prefix as
+----
+(index) mod ("split-count" + 1)
+----
+Example, if 10 records are to be generated by each mapper and "*split-count*" is 4, the TOOL_EVENT_IDs for each record will have a prefix as
+[options="header"]
+|===
+|Record Index|TOOL_EVENT_ID's first six characters
+//----------------------
+|1|000001
+|2|000002
+|3|000003
+|4|000004
+|5|000000
+|6|000001
+|7|000002
+|8|000003
+|9|000004
+|10|000000
+|===
+Since TOOL_EVENT_ID is the first attribute of the row key and the table region boundaries also have start-end keys with 6 digit sequential prefixes, this ensures that each mapper generates (nearly) the same number of rows for each region, making the data uniformly distributed.
+TOOL_EVENT_ID suffix and other columns of the row are populated with random data.
+
+Number of rows generated is
+----
+rows-per-mapper * mapper-count
+----
+
+The size of each row is (approximately)
+----
+850 B
+----
+
+=== Experiments
+These results are from an 11-node cluster running HBase and Hadoop services within a self-managed test environment.
+[options="header"]
+|===
+|Data Size|Time to Generate Data (mins)
+//----------------------
+|100 GB|6 minutes
+|340 GB|13 minutes
+|3.5 TB|3 hours 10 minutes
+|===
+
+
+:numbered:
+
+ifdef::backend-docbook[]
+[index]
+== Index
+// Generated automatically by the DocBook toolchain.
+endif::backend-docbook[]