You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by jd...@apache.org on 2017/12/22 20:08:53 UTC

kudu git commit: KUDU-2231: sparse column predicate can cause excessive data-block reads

Repository: kudu
Updated Branches:
  refs/heads/branch-1.5.x af119b68d -> 31807acf8


KUDU-2231: sparse column predicate can cause excessive data-block reads

When scanning with a sparsely-matching predicate, the CFileIterator can
repeatedly materialize non-predicate column blocks multiple times. The
result is huge amounts of CPU wasted in block decoding and poor
performance.

The root cause is that CFileIterator::SeekToOrdinal does not check
whether the currently materialized data block contains the ordinal index
being seeked to. Instead, it throws away the currently prepared blocks
(in CFileIterator::PrepareForNewSeek), and re-materializes the blocks
again.

This commit is a very targeted fix. Since I've had some time to get
familiar with this codepath in the past few days, I've found some things
that I think we could improve and simplify in follow-up commits, which
I've filed as KUDU-2243.

Change-Id: I8eb3be4a809f882ccd80c48612099b2071306ff7
Reviewed-on: http://gerrit.cloudera.org:8080/8869
Tested-by: Kudu Jenkins
Reviewed-by: Dan Burkert <da...@apache.org>
(cherry picked from commit 36ecb300901daf4006fae0c9be5657f2f6127233)
Reviewed-on: http://gerrit.cloudera.org:8080/8903
Reviewed-by: Jean-Daniel Cryans <jd...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/31807acf
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/31807acf
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/31807acf

Branch: refs/heads/branch-1.5.x
Commit: 31807acf80e8b9d92761592b04fbeaa463d7a4fc
Parents: af119b6
Author: Dan Burkert <da...@apache.org>
Authored: Fri Dec 15 17:03:25 2017 -0800
Committer: Jean-Daniel Cryans <jd...@apache.org>
Committed: Fri Dec 22 20:08:31 2017 +0000

----------------------------------------------------------------------
 src/kudu/cfile/cfile_reader.cc          | 62 ++++++++++++++++---------
 src/kudu/tablet/tablet-pushdown-test.cc | 67 +++++++++++++++++++++++++++-
 2 files changed, 106 insertions(+), 23 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/31807acf/src/kudu/cfile/cfile_reader.cc
----------------------------------------------------------------------
diff --git a/src/kudu/cfile/cfile_reader.cc b/src/kudu/cfile/cfile_reader.cc
index 6311b75..e1346d3 100644
--- a/src/kudu/cfile/cfile_reader.cc
+++ b/src/kudu/cfile/cfile_reader.cc
@@ -665,37 +665,57 @@ CFileIterator::~CFileIterator() {
 }
 
 Status CFileIterator::SeekToOrdinal(rowid_t ord_idx) {
-  RETURN_NOT_OK(PrepareForNewSeek());
-  if (PREDICT_FALSE(posidx_iter_ == nullptr)) {
-    return Status::NotSupported("no positional index in file");
-  }
+  // Check to see if we already have the required block prepared. Typically
+  // (when seeking forward during a scan), only the final block might be
+  // suitable, since all previous blocks will have been scanned in the prior
+  // batch. Only reusing the final block also ensures posidx_iter_ is already
+  // seeked to the correct block.
+  if (!prepared_blocks_.empty() &&
+      prepared_blocks_.back()->last_row_idx() >= ord_idx &&
+      prepared_blocks_.back()->first_row_idx() <= ord_idx) {
+    PreparedBlock* b = prepared_blocks_.back();
+    prepared_blocks_.pop_back();
+    for (PreparedBlock* pb : prepared_blocks_) {
+      prepared_block_pool_.Destroy(pb);
+    }
+    prepared_blocks_.clear();
 
-  tmp_buf_.clear();
-  KeyEncoderTraits<UINT32, faststring>::Encode(ord_idx, &tmp_buf_);
-  RETURN_NOT_OK(posidx_iter_->SeekAtOrBefore(Slice(tmp_buf_)));
+    prepared_blocks_.push_back(b);
+  } else {
+    // Otherwise, prepare a new cblock to scan.
+    RETURN_NOT_OK(PrepareForNewSeek());
+    if (PREDICT_FALSE(posidx_iter_ == nullptr)) {
+      return Status::NotSupported("no positional index in file");
+    }
 
-  // TODO: fast seek within block (without reseeking index)
-  pblock_pool_scoped_ptr b = prepared_block_pool_.make_scoped_ptr(
-    prepared_block_pool_.Construct());
-  RETURN_NOT_OK(ReadCurrentDataBlock(*posidx_iter_, b.get()));
-
-  // If the data block doesn't actually contain the data
-  // we're looking for, then we're probably in the last
-  // block in the file.
-  // TODO: could assert that each of the index layers is
-  // at its last entry (ie HasNext() is false for each)
-  if (PREDICT_FALSE(ord_idx > b->last_row_idx())) {
-    return Status::NotFound("trying to seek past highest ordinal in file");
+    tmp_buf_.clear();
+    KeyEncoderTraits<UINT32, faststring>::Encode(ord_idx, &tmp_buf_);
+    RETURN_NOT_OK(posidx_iter_->SeekAtOrBefore(Slice(tmp_buf_)));
+
+    pblock_pool_scoped_ptr b = prepared_block_pool_.make_scoped_ptr(
+      prepared_block_pool_.Construct());
+    RETURN_NOT_OK(ReadCurrentDataBlock(*posidx_iter_, b.get()));
+
+    // If the data block doesn't actually contain the data
+    // we're looking for, then we're probably in the last
+    // block in the file.
+    // TODO(unknown): could assert that each of the index layers is
+    // at its last entry (ie HasNext() is false for each)
+    if (PREDICT_FALSE(ord_idx > b->last_row_idx())) {
+      return Status::NotFound("trying to seek past highest ordinal in file");
+    }
+    prepared_blocks_.push_back(b.release());
   }
 
+  PreparedBlock* b = prepared_blocks_.back();
+
   // Seek data block to correct index
   DCHECK(ord_idx >= b->first_row_idx() &&
          ord_idx <= b->last_row_idx())
     << "got wrong data block. looking for ord_idx=" << ord_idx
     << " but got dblk " << b->ToString();
-         SeekToPositionInBlock(b.get(), ord_idx - b->first_row_idx());
+  SeekToPositionInBlock(b, ord_idx - b->first_row_idx());
 
-  prepared_blocks_.push_back(b.release());
   last_prepare_idx_ = ord_idx;
   last_prepare_count_ = 0;
   seeked_ = posidx_iter_.get();

http://git-wip-us.apache.org/repos/asf/kudu/blob/31807acf/src/kudu/tablet/tablet-pushdown-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/tablet-pushdown-test.cc b/src/kudu/tablet/tablet-pushdown-test.cc
index cabc2a3..0389928 100644
--- a/src/kudu/tablet/tablet-pushdown-test.cc
+++ b/src/kudu/tablet/tablet-pushdown-test.cc
@@ -35,7 +35,6 @@
 #include "kudu/common/scan_spec.h"
 #include "kudu/common/schema.h"
 #include "kudu/gutil/gscoped_ptr.h"
-#include "kudu/gutil/port.h"
 #include "kudu/gutil/stringprintf.h"
 #include "kudu/tablet/local_tablet_writer.h"
 #include "kudu/tablet/tablet-test-util.h"
@@ -68,7 +67,7 @@ class TabletPushdownTest : public KuduTabletTest,
                               ColumnSchema("string_val", STRING) }, 1)) {
   }
 
-  virtual void SetUp() OVERRIDE {
+  void SetUp() override {
     KuduTabletTest::SetUp();
 
     FillTestTablet();
@@ -226,5 +225,69 @@ INSTANTIATE_TEST_CASE_P(AllMemory, TabletPushdownTest, ::testing::Values(ALL_IN_
 INSTANTIATE_TEST_CASE_P(SplitMemoryDisk, TabletPushdownTest, ::testing::Values(SPLIT_MEMORY_DISK));
 INSTANTIATE_TEST_CASE_P(AllDisk, TabletPushdownTest, ::testing::Values(ALL_ON_DISK));
 
+class TabletSparsePushdownTest : public KuduTabletTest {
+ public:
+  TabletSparsePushdownTest()
+      : KuduTabletTest(Schema({ ColumnSchema("key", INT32),
+                                ColumnSchema("val", INT32, true) },
+                              1)) {
+  }
+
+  void SetUp() override {
+    KuduTabletTest::SetUp();
+
+    RowBuilder rb(client_schema_);
+
+    LocalTabletWriter writer(tablet().get(), &client_schema_);
+    KuduPartialRow row(&client_schema_);
+
+    // FLAGS_scanner_batch_size_rows * 2
+    int kWidth = 100 * 2;
+    for (int i = 0; i < kWidth * 2; i++) {
+      CHECK_OK(row.SetInt32(0, i));
+      if (i % 3 == 0) {
+        CHECK_OK(row.SetNull(1));
+      } else {
+        CHECK_OK(row.SetInt32(1, i % kWidth));
+      }
+      ASSERT_OK_FAST(writer.Insert(row));
+    }
+    ASSERT_OK(tablet()->Flush());
+  }
+};
+
+// The is a regression test for KUDU-2231, which fixed a CFileReader bug causing
+// cblocks to be repeatedly materialized when a scan had predicates on other
+// columns which caused sections of a column to be skipped. The setup for this
+// test creates a dataset and predicate which only match every-other batch of
+// 100 rows of the 'val' column.
+TEST_F(TabletSparsePushdownTest, Kudu2231) {
+  ScanSpec spec;
+  int32_t value = 50;
+  spec.AddPredicate(ColumnPredicate::Equality(schema_.column(1), &value));
+
+  gscoped_ptr<RowwiseIterator> iter;
+  ASSERT_OK(tablet()->NewRowIterator(client_schema_, &iter));
+  ASSERT_OK(iter->Init(&spec));
+  ASSERT_TRUE(spec.predicates().empty()) << "Should have accepted all predicates";
+
+  vector<string> results;
+  ASSERT_OK(IterateToStringList(iter.get(), &results));
+
+  EXPECT_EQ(results, vector<string>({
+      "(int32 key=50, int32 val=50)",
+      "(int32 key=250, int32 val=50)",
+  }));
+
+  vector<IteratorStats> stats;
+  iter->GetIteratorStats(&stats);
+
+  EXPECT_EQ(1, stats[0].data_blocks_read_from_disk);
+  EXPECT_EQ(1, stats[1].data_blocks_read_from_disk);
+
+  EXPECT_EQ(400, stats[0].cells_read_from_disk);
+  EXPECT_EQ(400, stats[1].cells_read_from_disk);
+}
+
 } // namespace tablet
 } // namespace kudu