You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by da...@apache.org on 2018/01/13 01:00:18 UTC

kudu git commit: KUDU-2253 Deltafile on-disk size is 3x larger than expected

Repository: kudu
Updated Branches:
  refs/heads/master 26fd153b0 -> 8058825ba


KUDU-2253 Deltafile on-disk size is 3x larger than expected

While looking into the performance of the integration test written for
KUDU-2251 (https://gerrit.cloudera.org/#/c/8951/ revision 6), Todd and I
found that the on-disk deltafiles written are about 3x larger than
expected. The culprit is an optimization in the CFile value index which
is turned off for delta files. The optimization truncates large keys
after the first unique byte between sequential values. The deltafile
values, in the case of this integration test, include the small
DeltaKey, and the 8KiB updated value. As a result the BTree interior
nodes are being completely filled by only ~4 values (32KiB cblock size
by default). This makes the BTree far less effective, and means that the
full updated data is written many times. We expect fixing this will
improve performance for update-heavy workloads with large values (for
example, YCSB).

Unfortunately, fixing the issue is not quite as simple as enabling the
optimization for deltafiles, since in the normal course of seeking
through deltafiles during a scan, we deserialze the value index keys
into a DeltaKey. If the values are truncated this deserialization step
can fail. Instead, this patch adds overridable value index key encoding
to CFileWriter, and delta file overrides it to only encode the delta
key, which is a pair of variable-length integers.

Change-Id: I4cea3371fcf57f89fe10a3b9262bc152023cb04c
Reviewed-on: http://gerrit.cloudera.org:8080/8982
Tested-by: Kudu Jenkins
Reviewed-by: Dan Burkert <da...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/8058825b
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/8058825b
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/8058825b

Branch: refs/heads/master
Commit: 8058825ba8cd02897e0a1b592d74bdfe6e2a3f62
Parents: 26fd153
Author: Dan Burkert <da...@apache.org>
Authored: Wed Jan 10 08:50:58 2018 -0800
Committer: Dan Burkert <da...@apache.org>
Committed: Sat Jan 13 01:00:01 2018 +0000

----------------------------------------------------------------------
 src/kudu/cfile/bloomfile.cc            |  5 +++-
 src/kudu/cfile/cfile_util.cc           | 10 ++++++++
 src/kudu/cfile/cfile_util.h            | 10 ++++++++
 src/kudu/cfile/cfile_writer.cc         | 39 +++++++++++++----------------
 src/kudu/cfile/cfile_writer.h          |  7 ++----
 src/kudu/tablet/deltafile.cc           | 32 +++++++++++++++++++----
 src/kudu/tablet/diskrowset.cc          |  2 +-
 src/kudu/tablet/multi_column_writer.cc |  2 +-
 8 files changed, 72 insertions(+), 35 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/8058825b/src/kudu/cfile/bloomfile.cc
----------------------------------------------------------------------
diff --git a/src/kudu/cfile/bloomfile.cc b/src/kudu/cfile/bloomfile.cc
index 2f13525..b3628ee 100644
--- a/src/kudu/cfile/bloomfile.cc
+++ b/src/kudu/cfile/bloomfile.cc
@@ -104,7 +104,10 @@ BloomFileWriter::BloomFileWriter(unique_ptr<WritableBlock> block,
   // bloom filters are high-entropy data structures by their nature.
   opts.storage_attributes.encoding  = PLAIN_ENCODING;
   opts.storage_attributes.compression = NO_COMPRESSION;
-  writer_.reset(new cfile::CFileWriter(opts, GetTypeInfo(BINARY), false, std::move(block)));
+  writer_.reset(new cfile::CFileWriter(std::move(opts),
+                                       GetTypeInfo(BINARY),
+                                       false,
+                                       std::move(block)));
 }
 
 Status BloomFileWriter::Start() {

http://git-wip-us.apache.org/repos/asf/kudu/blob/8058825b/src/kudu/cfile/cfile_util.cc
----------------------------------------------------------------------
diff --git a/src/kudu/cfile/cfile_util.cc b/src/kudu/cfile/cfile_util.cc
index 600970a..3bdb9a7 100644
--- a/src/kudu/cfile/cfile_util.cc
+++ b/src/kudu/cfile/cfile_util.cc
@@ -21,6 +21,7 @@
 #include <cstdint>
 #include <string>
 
+#include <boost/optional/optional.hpp>
 #include <glog/logging.h>
 
 #include "kudu/cfile/cfile_reader.h"
@@ -40,6 +41,15 @@ using std::string;
 
 static const int kBufSize = 1024*1024;
 
+WriterOptions::WriterOptions()
+  : index_block_size(32*1024),
+    block_restart_interval(16),
+    write_posidx(false),
+    write_validx(false),
+    optimize_index_keys(true),
+    validx_key_encoder(boost::none) {
+}
+
 Status DumpIterator(const CFileReader& reader,
                     CFileIterator* it,
                     std::ostream* out,

http://git-wip-us.apache.org/repos/asf/kudu/blob/8058825b/src/kudu/cfile/cfile_util.h
----------------------------------------------------------------------
diff --git a/src/kudu/cfile/cfile_util.h b/src/kudu/cfile/cfile_util.h
index 56f03ee..967569e 100644
--- a/src/kudu/cfile/cfile_util.h
+++ b/src/kudu/cfile/cfile_util.h
@@ -18,9 +18,12 @@
 #define CFILE_UTIL_H_
 
 #include <cstddef>
+#include <functional>
 #include <iostream>
 #include <memory>
 
+#include <boost/optional/optional.hpp>
+
 #include "kudu/common/schema.h"
 #include "kudu/util/slice.h"
 #include "kudu/util/status.h"
@@ -28,6 +31,7 @@
 namespace kudu {
 
 class MemTracker;
+class faststring;
 
 namespace cfile {
 
@@ -44,6 +48,8 @@ enum IncompatibleFeatures {
   SUPPORTED = NONE | CHECKSUM
 };
 
+typedef std::function<void(const void*, faststring*)> ValidxKeyEncoder;
+
 struct WriterOptions {
   // Approximate size of index blocks.
   //
@@ -75,6 +81,10 @@ struct WriterOptions {
   // schema.h
   ColumnStorageAttributes storage_attributes;
 
+  // An optional value index key encoder. If not set, the default encoder
+  // encodes the entire value.
+  boost::optional<ValidxKeyEncoder> validx_key_encoder;
+
   WriterOptions();
 };
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/8058825b/src/kudu/cfile/cfile_writer.cc
----------------------------------------------------------------------
diff --git a/src/kudu/cfile/cfile_writer.cc b/src/kudu/cfile/cfile_writer.cc
index 496317b..c5373f5 100644
--- a/src/kudu/cfile/cfile_writer.cc
+++ b/src/kudu/cfile/cfile_writer.cc
@@ -17,10 +17,12 @@
 
 #include "kudu/cfile/cfile_writer.h"
 
+#include <functional>
 #include <numeric>
 #include <ostream>
 #include <utility>
 
+#include <boost/optional/optional.hpp>
 #include <gflags/gflags.h>
 #include <glog/logging.h>
 
@@ -83,33 +85,20 @@ static CompressionType GetDefaultCompressionCodec() {
 }
 
 ////////////////////////////////////////////////////////////
-// Options
-////////////////////////////////////////////////////////////
-WriterOptions::WriterOptions()
-  : index_block_size(32*1024),
-    block_restart_interval(16),
-    write_posidx(false),
-    write_validx(false),
-    optimize_index_keys(true) {
-}
-
-
-////////////////////////////////////////////////////////////
 // CFileWriter
 ////////////////////////////////////////////////////////////
 
 
-CFileWriter::CFileWriter(const WriterOptions &options,
+CFileWriter::CFileWriter(WriterOptions options,
                          const TypeInfo* typeinfo,
                          bool is_nullable,
                          unique_ptr<WritableBlock> block)
   : block_(std::move(block)),
     off_(0),
     value_count_(0),
-    options_(options),
+    options_(std::move(options)),
     is_nullable_(is_nullable),
     typeinfo_(typeinfo),
-    key_encoder_(nullptr),
     state_(kWriterInitialized) {
   EncodingType encoding = options_.storage_attributes.encoding;
   Status s = TypeEncodingInfo::Get(typeinfo_, encoding, &type_encoding_info_);
@@ -138,12 +127,18 @@ CFileWriter::CFileWriter(const WriterOptions &options,
     options_.storage_attributes.cfile_block_size = kMinBlockSize;
   }
 
-  if (options.write_posidx) {
+  if (options_.write_posidx) {
     posidx_builder_.reset(new IndexTreeBuilder(&options_, this));
   }
 
-  if (options.write_validx) {
-    key_encoder_ = &GetKeyEncoder<faststring>(typeinfo_);
+  if (options_.write_validx) {
+    if (!options_.validx_key_encoder) {
+      auto key_encoder = &GetKeyEncoder<faststring>(typeinfo_);
+      options_.validx_key_encoder = [key_encoder] (const void* value, faststring* buffer) {
+        key_encoder->ResetAndEncode(value, buffer);
+      };
+    }
+
     validx_builder_.reset(new IndexTreeBuilder(&options_, this));
   }
 }
@@ -415,17 +410,17 @@ Status CFileWriter::FinishCurDataBlock() {
 
   if (validx_builder_ != nullptr) {
     RETURN_NOT_OK(data_block_->GetLastKey(key_tmp_space));
-    key_encoder_->ResetAndEncode(key_tmp_space, &last_key_);
+    (*options_.validx_key_encoder)(key_tmp_space, &last_key_);
   }
   data_block_->Reset();
 
   return s;
 }
 
-Status CFileWriter::AppendRawBlock(const vector<Slice> &data_slices,
+Status CFileWriter::AppendRawBlock(const vector<Slice>& data_slices,
                                    size_t ordinal_pos,
                                    const void *validx_curr,
-                                   const Slice &validx_prev,
+                                   const Slice& validx_prev,
                                    const char *name_for_log) {
   CHECK_EQ(state_, kWriterWriting);
 
@@ -447,7 +442,7 @@ Status CFileWriter::AppendRawBlock(const vector<Slice> &data_slices,
     CHECK(validx_curr != nullptr) <<
       "must pass a key for raw block if validx is configured";
 
-    key_encoder_->ResetAndEncode(validx_curr, &tmp_buf_);
+    (*options_.validx_key_encoder)(validx_curr, &tmp_buf_);
     Slice idx_key = Slice(tmp_buf_);
     if (options_.optimize_index_keys) {
       GetSeparatingKey(validx_prev, &idx_key);

http://git-wip-us.apache.org/repos/asf/kudu/blob/8058825b/src/kudu/cfile/cfile_writer.h
----------------------------------------------------------------------
diff --git a/src/kudu/cfile/cfile_writer.h b/src/kudu/cfile/cfile_writer.h
index d23ecef..191a073 100644
--- a/src/kudu/cfile/cfile_writer.h
+++ b/src/kudu/cfile/cfile_writer.h
@@ -103,10 +103,11 @@ class NullBitmapBuilder {
 // Main class used to write a CFile.
 class CFileWriter {
  public:
-  explicit CFileWriter(const WriterOptions &options,
+  explicit CFileWriter(WriterOptions options,
                        const TypeInfo* typeinfo,
                        bool is_nullable,
                        std::unique_ptr<fs::WritableBlock> block);
+
   ~CFileWriter();
 
   Status Start();
@@ -222,10 +223,6 @@ class CFileWriter {
   const TypeInfo* typeinfo_;
   const TypeEncodingInfo* type_encoding_info_;
 
-  // The key-encoder. Only set if the writer is writing an embedded
-  // value index.
-  const KeyEncoder<faststring>* key_encoder_;
-
   // The last key written to the block.
   // Only set if the writer is writing an embedded value index.
   faststring last_key_;

http://git-wip-us.apache.org/repos/asf/kudu/blob/8058825b/src/kudu/tablet/deltafile.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/deltafile.cc b/src/kudu/tablet/deltafile.cc
index 60ddedb..1d773f9 100644
--- a/src/kudu/tablet/deltafile.cc
+++ b/src/kudu/tablet/deltafile.cc
@@ -22,6 +22,7 @@
 #include <string>
 #include <utility>
 
+#include <boost/optional/optional.hpp>
 #include <gflags/gflags.h>
 #include <gflags/gflags_declare.h>
 
@@ -101,14 +102,35 @@ DeltaFileWriter::DeltaFileWriter(unique_ptr<WritableBlock> block)
   opts.write_validx = true;
   opts.storage_attributes.cfile_block_size = FLAGS_deltafile_default_block_size;
   opts.storage_attributes.encoding = PLAIN_ENCODING;
-  opts.storage_attributes.compression = GetCompressionCodecType(
-      FLAGS_deltafile_default_compression_codec);
-  // No optimization for deltafiles because a deltafile index key must decode into a DeltaKey
+  opts.storage_attributes.compression =
+      GetCompressionCodecType(FLAGS_deltafile_default_compression_codec);
+
+  // The CFile value index is 'compressed' by truncating delta values to only
+  // contain the delta key. The entire deltakey is required in order to support
+  // efficient seeks without deserializing the entire cblock. The generic value
+  // index optimization is disabled, since it could truncate portions of the
+  // deltakey.
+  //
+  // Note: The deltafile usage of the CFile value index is irregular, since it
+  // inserts values in non-sorted order (the timestamp portion of the deltakey
+  // in UNDO files is sorted in descending order). This doesn't appear to cause
+  // problems in practice.
   opts.optimize_index_keys = false;
-  writer_.reset(new cfile::CFileWriter(opts, GetTypeInfo(BINARY), false, std::move(block)));
+  opts.validx_key_encoder = [] (const void* value, faststring* buffer) {
+    buffer->clear();
+    const Slice* s1 = static_cast<const Slice*>(value);
+    Slice s2(*s1);
+    DeltaKey key;
+    CHECK_OK(key.DecodeFrom(&s2));
+    key.EncodeTo(buffer);
+  };
+
+  writer_.reset(new cfile::CFileWriter(std::move(opts),
+                                       GetTypeInfo(BINARY),
+                                       false,
+                                       std::move(block)));
 }
 
-
 Status DeltaFileWriter::Start() {
   return writer_->Start();
 }

http://git-wip-us.apache.org/repos/asf/kudu/blob/8058825b/src/kudu/tablet/diskrowset.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/diskrowset.cc b/src/kudu/tablet/diskrowset.cc
index 63909a2..37aede9 100644
--- a/src/kudu/tablet/diskrowset.cc
+++ b/src/kudu/tablet/diskrowset.cc
@@ -173,7 +173,7 @@ Status DiskRowSetWriter::InitAdHocIndexWriter() {
 
   // Create the CFile writer for the ad-hoc index.
   ad_hoc_index_writer_.reset(new cfile::CFileWriter(
-      opts,
+      std::move(opts),
       GetTypeInfo(BINARY),
       false,
       std::move(block)));

http://git-wip-us.apache.org/repos/asf/kudu/blob/8058825b/src/kudu/tablet/multi_column_writer.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/multi_column_writer.cc b/src/kudu/tablet/multi_column_writer.cc
index 15f70b5..126b63f 100644
--- a/src/kudu/tablet/multi_column_writer.cc
+++ b/src/kudu/tablet/multi_column_writer.cc
@@ -90,7 +90,7 @@ Status MultiColumnWriter::Open() {
 
     // Create the CFile writer itself.
     gscoped_ptr<CFileWriter> writer(new CFileWriter(
-        opts,
+        std::move(opts),
         col.type_info(),
         col.is_nullable(),
         std::move(block)));