You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by re...@apache.org on 2018/08/10 03:16:18 UTC
hbase git commit: HBASE-18201 add UT and docs for
DataBlockEncodingTool
Repository: hbase
Updated Branches:
refs/heads/master c6ff1de7e -> 28635d610
HBASE-18201 add UT and docs for DataBlockEncodingTool
Signed-off-by: Reid Chan <re...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/28635d61
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/28635d61
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/28635d61
Branch: refs/heads/master
Commit: 28635d6101956d51580a503270528d44ebbf8612
Parents: c6ff1de
Author: brandboat <br...@gmail.com>
Authored: Tue Aug 7 12:52:05 2018 +0800
Committer: Reid Chan <re...@apache.org>
Committed: Fri Aug 10 11:15:50 2018 +0800
----------------------------------------------------------------------
.../hbase/io/encoding/EncodedDataBlock.java | 41 +++++-
.../regionserver/DataBlockEncodingTool.java | 64 +++++---
.../regionserver/TestDataBlockEncodingTool.java | 147 +++++++++++++++++++
src/main/asciidoc/_chapters/ops_mgt.adoc | 24 +++
4 files changed, 251 insertions(+), 25 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hbase/blob/28635d61/hbase-common/src/main/java/org/apache/hadoop/hbase/io/encoding/EncodedDataBlock.java
----------------------------------------------------------------------
diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/io/encoding/EncodedDataBlock.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/io/encoding/EncodedDataBlock.java
index af68656..3c16cff 100644
--- a/hbase-common/src/main/java/org/apache/hadoop/hbase/io/encoding/EncodedDataBlock.java
+++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/io/encoding/EncodedDataBlock.java
@@ -23,7 +23,9 @@ import java.io.DataOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;
+import java.util.ArrayList;
import java.util.Iterator;
+import java.util.List;
import org.apache.commons.lang3.NotImplementedException;
import org.apache.hadoop.hbase.Cell;
@@ -57,6 +59,16 @@ public class EncodedDataBlock {
private final HFileBlockEncodingContext encodingCtx;
private HFileContext meta;
+ private final DataBlockEncoding encoding;
+
+ // This handles the situation where some cells include tags and others do not.
+ // isTagsLenZero stores whether each cell's tags length is zero before encoding, since we need
+ // to know whether the cell's tags length is zero or not after decoding.
+ // The ROW_INDEX_V1 encoder drops the tags segment when tagsLen is 0 after decoding cells to a
+ // byte array; the other encoders do not. So we have to find a way to add a zero tagsLen
+ // to the decoded byte array.
+ private List<Boolean> isTagsLenZero = new ArrayList<>();
+
/**
* Create a buffer which will be encoded using dataBlockEncoder.
* @param dataBlockEncoder Algorithm used for compression.
@@ -69,6 +81,7 @@ public class EncodedDataBlock {
Preconditions.checkNotNull(encoding,
"Cannot create encoded data block with null encoder");
this.dataBlockEncoder = dataBlockEncoder;
+ this.encoding = encoding;
encodingCtx = dataBlockEncoder.newDataBlockEncodingContext(encoding,
HConstants.HFILEBLOCK_DUMMY_HEADER, meta);
this.rawKVs = rawKVs;
@@ -90,6 +103,7 @@ public class EncodedDataBlock {
return new Iterator<Cell>() {
private ByteBuffer decompressedData = null;
+ private Iterator<Boolean> it = isTagsLenZero.iterator();
@Override
public boolean hasNext() {
@@ -116,10 +130,18 @@ public class EncodedDataBlock {
int vlen = decompressedData.getInt();
int tagsLen = 0;
ByteBufferUtils.skip(decompressedData, klen + vlen);
- // Read the tag length in case when steam contain tags
+ // Read the tag length in case when stream contain tags
if (meta.isIncludesTags()) {
- tagsLen = ((decompressedData.get() & 0xff) << 8) ^ (decompressedData.get() & 0xff);
- ByteBufferUtils.skip(decompressedData, tagsLen);
+ boolean noTags = true;
+ if (it.hasNext()) {
+ noTags = it.next();
+ }
+ // ROW_INDEX_V1 will not put tagsLen back in cell if it is zero, there is no need
+ // to read short here.
+ if (!(encoding.equals(DataBlockEncoding.ROW_INDEX_V1) && noTags)) {
+ tagsLen = ((decompressedData.get() & 0xff) << 8) ^ (decompressedData.get() & 0xff);
+ ByteBufferUtils.skip(decompressedData, tagsLen);
+ }
}
KeyValue kv = new KeyValue(decompressedData.array(), offset,
(int) KeyValue.getKeyValueDataStructureSize(klen, vlen, tagsLen));
@@ -247,6 +269,7 @@ public class EncodedDataBlock {
if (this.meta.isIncludesTags()) {
tagsLength = ((in.get() & 0xff) << 8) ^ (in.get() & 0xff);
ByteBufferUtils.skip(in, tagsLength);
+ this.isTagsLenZero.add(tagsLength == 0);
}
if (this.meta.isIncludesMvcc()) {
memstoreTS = ByteBufferUtils.readVLong(in);
@@ -260,6 +283,16 @@ public class EncodedDataBlock {
baos.flush();
baosBytes = baos.toByteArray();
this.dataBlockEncoder.endBlockEncoding(encodingCtx, out, baosBytes);
+ // In endBlockEncoding(encodingCtx, out, baosBytes), the ROW_INDEX_V1 encoder writes an integer
+ // to out, while the other encoders write the integer into baosBytes (the byte array). We need
+ // to call baos.toByteArray() again after endBlockEncoding to make sure the integer written to
+ // the output stream by the ROW_INDEX_V1 encoder is dumped to the byte array (baosBytes).
+ // The if branch is necessary because encoders other than ROW_INDEX_V1 write the integer into
+ // baosBytes directly; without the if branch, calling toByteArray() again would yield a
+ // baosBytes that does not contain the integer written in endBlockEncoding.
+ if (this.encoding.equals(DataBlockEncoding.ROW_INDEX_V1)) {
+ baosBytes = baos.toByteArray();
+ }
} catch (IOException e) {
throw new RuntimeException(String.format(
"Bug in encoding part of algorithm %s. " +
@@ -271,6 +304,6 @@ public class EncodedDataBlock {
@Override
public String toString() {
- return dataBlockEncoder.toString();
+ return encoding.name();
}
}
http://git-wip-us.apache.org/repos/asf/hbase/blob/28635d61/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/DataBlockEncodingTool.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/DataBlockEncodingTool.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/DataBlockEncodingTool.java
index 91435f3..59f6678 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/DataBlockEncodingTool.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/DataBlockEncodingTool.java
@@ -18,6 +18,7 @@ package org.apache.hadoop.hbase.regionserver;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
+import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.text.DecimalFormat;
@@ -44,6 +45,7 @@ import org.apache.hadoop.hbase.io.hfile.HFileContext;
import org.apache.hadoop.hbase.io.hfile.HFileContextBuilder;
import org.apache.hadoop.hbase.io.hfile.HFileReaderImpl;
import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.io.compress.Compressor;
import org.apache.hadoop.io.compress.Decompressor;
@@ -91,8 +93,8 @@ public class DataBlockEncodingTool {
/** If this is specified, no correctness testing will be done */
private static final String OPT_OMIT_CORRECTNESS_TEST = "c";
- /** What encoding algorithm to test */
- private static final String OPT_ENCODING_ALGORITHM = "a";
+ /** What compression algorithm to test */
+ private static final String OPT_COMPRESSION_ALGORITHM = "a";
/** Number of times to run each benchmark */
private static final String OPT_BENCHMARK_N_TIMES = "t";
@@ -132,7 +134,10 @@ public class DataBlockEncodingTool {
private final Compressor compressor;
private final Decompressor decompressor;
- private static enum Manipulation {
+ // Check if HFile use Tag.
+ private static boolean USE_TAG = false;
+
+ private enum Manipulation {
ENCODING,
DECODING,
COMPRESSION,
@@ -192,8 +197,25 @@ public class DataBlockEncodingTool {
}
}
- uncompressedOutputStream.write(currentKV.getBuffer(),
- currentKV.getOffset(), currentKV.getLength());
+ // Add a zero tagsLen to cells that do not include tags, because the scanner, when
+ // converting the byte array to a KV, drops the tagsLen part if tagsLen
+ // is zero. We still need the tagsLen part to check whether the current cell
+ // includes tags. If USE_TAG is true, the HFile contains cells with tags, so
+ // a cell whose tagsLen equals 0 means that other cells may have tags.
+ if (USE_TAG && currentKV.getTagsLength() == 0) {
+ uncompressedOutputStream.write(currentKV.getBuffer(),
+ currentKV.getOffset(), currentKV.getLength());
+ // write tagsLen = 0.
+ uncompressedOutputStream.write(Bytes.toBytes((short) 0));
+ } else {
+ uncompressedOutputStream.write(currentKV.getBuffer(),
+ currentKV.getOffset(), currentKV.getLength());
+ }
+
+ if(includesMemstoreTS) {
+ WritableUtils.writeVLong(
+ new DataOutputStream(uncompressedOutputStream), currentKV.getSequenceId());
+ }
previousKey = currentKey;
@@ -209,16 +231,16 @@ public class DataBlockEncodingTool {
}
rawKVs = uncompressedOutputStream.toByteArray();
- boolean useTag = (currentKV.getTagsLength() > 0);
for (DataBlockEncoding encoding : encodings) {
if (encoding == DataBlockEncoding.NONE) {
continue;
}
DataBlockEncoder d = encoding.getEncoder();
HFileContext meta = new HFileContextBuilder()
- .withCompression(Compression.Algorithm.NONE)
- .withIncludesMvcc(includesMemstoreTS)
- .withIncludesTags(useTag).build();
+ .withDataBlockEncoding(encoding)
+ .withCompression(Compression.Algorithm.NONE)
+ .withIncludesMvcc(includesMemstoreTS)
+ .withIncludesTags(USE_TAG).build();
codecs.add(new EncodedDataBlock(d, encoding, rawKVs, meta ));
}
}
@@ -396,8 +418,10 @@ public class DataBlockEncodingTool {
try {
for (int itTime = 0; itTime < benchmarkNTimes; ++itTime) {
final long startTime = System.nanoTime();
- compressingStream.resetState();
+ // compressedStream must be reset before compressingStream.resetState(), since with GZ
+ // resetState() writes a header into the output stream.
compressedStream.reset();
+ compressingStream.resetState();
compressingStream.write(buffer, offset, length);
compressingStream.flush();
compressedStream.toByteArray();
@@ -438,12 +462,6 @@ public class DataBlockEncodingTool {
}
decompressedStream.close();
- // iterate over KeyValues
- KeyValue kv;
- for (int pos = 0; pos < length; pos += kv.getLength()) {
- kv = new KeyValue(newBuf, pos);
- }
-
} catch (IOException e) {
throw new RuntimeException(String.format(
"Decoding path in '%s' algorithm cause exception ", name), e);
@@ -596,8 +614,9 @@ public class DataBlockEncodingTool {
hsf.initReader();
StoreFileReader reader = hsf.getReader();
reader.loadFileInfo();
- KeyValueScanner scanner = reader.getStoreFileScanner(true, true, false, 0, 0, false);
-
+ KeyValueScanner scanner = reader.getStoreFileScanner(true, true,
+ false, hsf.getMaxMemStoreTS(), 0, false);
+ USE_TAG = reader.getHFileReader().getFileContext().isIncludesTags();
// run the utilities
DataBlockEncodingTool comp = new DataBlockEncodingTool(compressionName);
int majorVersion = reader.getHFileVersion();
@@ -654,7 +673,7 @@ public class DataBlockEncodingTool {
"Measure read throughput");
options.addOption(OPT_OMIT_CORRECTNESS_TEST, false,
"Omit corectness tests.");
- options.addOption(OPT_ENCODING_ALGORITHM, true,
+ options.addOption(OPT_COMPRESSION_ALGORITHM, true,
"What kind of compression algorithm use for comparison.");
options.addOption(OPT_BENCHMARK_N_TIMES,
true, "Number of times to run each benchmark. Default value: " +
@@ -680,6 +699,9 @@ public class DataBlockEncodingTool {
int kvLimit = Integer.MAX_VALUE;
if (cmd.hasOption(OPT_KV_LIMIT)) {
kvLimit = Integer.parseInt(cmd.getOptionValue(OPT_KV_LIMIT));
+ if (kvLimit <= 0) {
+ LOG.error("KV_LIMIT should not less than 1.");
+ }
}
// basic argument sanity checks
@@ -692,9 +714,9 @@ public class DataBlockEncodingTool {
String pathName = cmd.getOptionValue(OPT_HFILE_NAME);
String compressionName = DEFAULT_COMPRESSION.getName();
- if (cmd.hasOption(OPT_ENCODING_ALGORITHM)) {
+ if (cmd.hasOption(OPT_COMPRESSION_ALGORITHM)) {
compressionName =
- cmd.getOptionValue(OPT_ENCODING_ALGORITHM).toLowerCase(Locale.ROOT);
+ cmd.getOptionValue(OPT_COMPRESSION_ALGORITHM).toLowerCase(Locale.ROOT);
}
boolean doBenchmark = cmd.hasOption(OPT_MEASURE_THROUGHPUT);
boolean doVerify = !cmd.hasOption(OPT_OMIT_CORRECTNESS_TEST);
http://git-wip-us.apache.org/repos/asf/hbase/blob/28635d61/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestDataBlockEncodingTool.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestDataBlockEncodingTool.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestDataBlockEncodingTool.java
new file mode 100644
index 0000000..7a6b3d2
--- /dev/null
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestDataBlockEncodingTool.java
@@ -0,0 +1,147 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.regionserver;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.ArrayBackedTag;
+import org.apache.hadoop.hbase.HBaseClassTestRule;
+import org.apache.hadoop.hbase.HBaseConfiguration;
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.KeyValue;
+import org.apache.hadoop.hbase.Tag;
+import org.apache.hadoop.hbase.io.compress.Compression;
+import org.apache.hadoop.hbase.io.hfile.HFileContext;
+import org.apache.hadoop.hbase.io.hfile.HFileContextBuilder;
+import org.apache.hadoop.hbase.testclassification.MiscTests;
+import org.apache.hadoop.hbase.testclassification.SmallTests;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.junit.Before;
+import org.junit.ClassRule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+/**
+ * Test DataBlockEncodingTool by running testCodecs on generated HFiles with all, no, or half of the cells tagged.
+ */
+@Category({MiscTests.class, SmallTests.class})
+public class TestDataBlockEncodingTool {
+
+ @ClassRule
+ public static final HBaseClassTestRule CLASS_RULE =
+ HBaseClassTestRule.forClass(TestDataBlockEncodingTool.class);
+
+ private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
+ private static final String ROOT_DIR =
+ TEST_UTIL.getDataTestDir("TestDataBlockEncodingTool").toString();
+ private static final Configuration conf = TEST_UTIL.getConfiguration();
+ private static FileSystem fs;
+ private static StoreFileWriter sfw;
+
+ @Before
+ public void setUp() throws IOException {
+ fs = TEST_UTIL.getTestFileSystem();
+ }
+
+ private void testHFile(String fileName, boolean useTags, boolean allTags) throws IOException { // write the HFile, run the tool on it, always delete it afterwards
+ Path path = new Path(ROOT_DIR, fileName);
+ try {
+ createHFileWithTags(path, useTags, allTags);
+ testDataBlockingTool(path);
+ } finally {
+ if (fs.exists(path)) {
+ fs.delete(path, false);
+ }
+ }
+ }
+
+ private void createHFileWithTags(Path path, boolean useTags, boolean allTags) throws IOException { // allTags is only consulted when useTags is true
+ HFileContext meta = new HFileContextBuilder()
+ .withBlockSize(64 * 1024)
+ .withIncludesTags(useTags).build();
+ sfw =
+ new StoreFileWriter.Builder(conf, fs)
+ .withFilePath(path)
+ .withFileContext(meta).build();
+ long now = System.currentTimeMillis();
+ byte[] FAMILY = Bytes.toBytes("cf");
+ byte[] QUALIFIER = Bytes.toBytes("q");
+ try {
+ for (char d = 'a'; d <= 'z'; d++) { // 26 x 26 cells, one per two-letter pair "aa".."zz"
+ for (char e = 'a'; e <= 'z'; e++) {
+ byte[] b = new byte[] { (byte) d, (byte) e };
+ KeyValue kv;
+ if (useTags) {
+ if (allTags) {
+ // Write every cell with two tags (both built from b) to HFile.
+ Tag[] tags = new Tag[]{
+ new ArrayBackedTag((byte) 0, Bytes.toString(b)),
+ new ArrayBackedTag((byte) 0, Bytes.toString(b))};
+ kv = new KeyValue(b, FAMILY, QUALIFIER, now, b, tags);
+ } else {
+ // Mixed file: cells whose second letter is an even offset from 'a' get no tags, the rest get two tags.
+ if ((e - 'a') % 2 == 0) {
+ kv = new KeyValue(b, FAMILY, QUALIFIER, now, b);
+ } else {
+ Tag[] tags = new Tag[]{
+ new ArrayBackedTag((byte) 0, Bytes.toString(b)),
+ new ArrayBackedTag((byte) 0, Bytes.toString(b))};
+ kv = new KeyValue(b, FAMILY, QUALIFIER, now, b, tags);
+ }
+ }
+ } else {
+ // Write cells without tags to HFile.
+ kv = new KeyValue(b, FAMILY, QUALIFIER, now, b);
+ }
+ sfw.append(kv);
+ }
+ }
+ sfw.appendMetadata(0, false);
+ } finally {
+ sfw.close();
+ }
+ }
+
+ private static void testDataBlockingTool(Path path) throws IOException { // runs testCodecs on the file with GZ, benchmark and verification both enabled
+ Configuration conf = HBaseConfiguration.create();
+ int maxKV = Integer.MAX_VALUE;
+ boolean doVerify = true;
+ boolean doBenchmark = true;
+ String testHFilePath = path.toString();
+ DataBlockEncodingTool.testCodecs(conf, maxKV, testHFilePath,
+ Compression.Algorithm.GZ.getName(), doBenchmark, doVerify);
+ }
+
+ @Test
+ public void testHFileAllCellsWithTags() throws IOException {
+ testHFile("1234567890", true, true); // useTags = true, allTags = true
+ }
+
+ @Test
+ public void testHFileAllCellsWithoutTags() throws IOException {
+ testHFile("1234567089", false, false); // useTags = false, allTags = false
+ }
+
+ @Test
+ public void testHFileHalfCellsWithTags() throws IOException {
+ testHFile("1234560789", true, false); // useTags = true, allTags = false
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/hbase/blob/28635d61/src/main/asciidoc/_chapters/ops_mgt.adoc
----------------------------------------------------------------------
diff --git a/src/main/asciidoc/_chapters/ops_mgt.adoc b/src/main/asciidoc/_chapters/ops_mgt.adoc
index f7ae38e..443d895 100644
--- a/src/main/asciidoc/_chapters/ops_mgt.adoc
+++ b/src/main/asciidoc/_chapters/ops_mgt.adoc
@@ -1005,6 +1005,30 @@ drop_namespace 'pre_upgrade_cleanup'
For further information, please refer to
link:https://issues.apache.org/jira/browse/HBASE-20649?focusedCommentId=16535476#comment-16535476[HBASE-20649].
+=== Data Block Encoding Tool
+
+Tests various compression algorithms with different data block encoders for key compression on an existing HFile.
+Useful for testing, debugging and benchmarking.
+
+You must specify `-f` which is the full path of the HFile.
+
+The result shows both the performance (MB/s) of compression/decompression and encoding/decoding, and the data savings on the HFile.
+
+----
+
+$ bin/hbase org.apache.hadoop.hbase.regionserver.DataBlockEncodingTool
+Usages: hbase org.apache.hadoop.hbase.regionserver.DataBlockEncodingTool
+Options:
+ -f HFile to analyse (REQUIRED)
+ -n Maximum number of key/value pairs to process in a single benchmark run.
+ -b Whether to run a benchmark to measure read throughput.
+ -c If this is specified, no correctness testing will be done.
+ -a What kind of compression algorithm to use for the test. Default value: GZ.
+ -t Number of times to run each benchmark. Default value: 12.
+ -omit Number of first runs of every benchmark to omit from statistics. Default value: 2.
+
+----
+
[[ops.regionmgt]]
== Region Management