You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by aw...@apache.org on 2019/02/20 04:25:48 UTC
[kudu] branch master updated: KUDU-2701 Fix compaction loop due to
using wrong rowset size
This is an automated email from the ASF dual-hosted git repository.
awong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push:
new 3e3bd1c KUDU-2701 Fix compaction loop due to using wrong rowset size
3e3bd1c is described below
commit 3e3bd1ccbc2b4b070c733b36b1971de63977428b
Author: Will Berkeley <wd...@gmail.org>
AuthorDate: Thu Feb 14 14:00:06 2019 -0800
KUDU-2701 Fix compaction loop due to using wrong rowset size
Compaction policy originally evaluated the size of a rowset to be the
size of the rowset's base data and its REDOs. This size is used to
calculate the probability mass of the rowset and the weight of the
rowset in the compaction knapsack problem. Mistakenly, it was also used
as the size of a rowset for KUDU-1400 small rowset compaction policy.
This is wrong- the size of the whole rowset should be used. The reason
for this is the following: small rowset compaction compacts
rowsets when doing so reduces the number of rowsets. The number of rowsets
produced depends on the total size of the data when written. If a
partial size of the rowset is used, this can lead to bad decisions being
made about compaction. For example, I discovered a case where a tablet
had 8 rowsets of "size" 16MB. In reality, this size was the base data
size plus REDOs. Small rowset compaction policy determined that it would
be good to compact these 8 rowsets, believing that it would produce 4
rowsets of size 32MB. However, the rowsets were actually 32MB in size,
and compacting them produced 8 rowsets of size 32MB identical to the
previous 8, and therefore 8 rowsets that appeared to be of size 16MB to
compaction policy. Thus these 8 were chosen to be compacted, and so
on...
This patch changes the small rowset compaction policy code to use the
full size of the rowsets. All other uses of size remain the same. It's
not totally clear to me why just base data and REDOs are used, but in
any case changing it would change CDFs and change knapsack calculations,
which could lead to pretty big differences in compaction behavior.
Since, outside this bug, compaction is working fine, and since this
patch is targeted to unblock 1.9, I opted to make a minimal change to
fix the bug and not evaluate any larger change to compaction policy.
I tested this code on the cluster where I originally encountered the
compaction loop. The loop did not reoccur and the updated compaction
policy successfully compacted small rowsets together for several hours
without becoming stuck. It finally converged to a stable end state. I
also ran a number of sequential and random insert workloads on this
cluster and saw no problems with compactions looping or getting stuck,
even under high load or when insertions ended and the tablet was free to
catch up on compactions.
Change-Id: I21b7ff6333137aaf1e98ef4849691dd08e24e007
Reviewed-on: http://gerrit.cloudera.org:8080/12488
Reviewed-by: Andrew Wong <aw...@cloudera.com>
Tested-by: Andrew Wong <aw...@cloudera.com>
Reviewed-by: Alexey Serbin <as...@cloudera.com>
---
src/kudu/tablet/compaction_policy.cc | 8 ++++----
src/kudu/tablet/rowset_info.cc | 22 +++++++++++++---------
src/kudu/tablet/rowset_info.h | 15 +++++++++++----
3 files changed, 28 insertions(+), 17 deletions(-)
diff --git a/src/kudu/tablet/compaction_policy.cc b/src/kudu/tablet/compaction_policy.cc
index 77af79b..e740ac4 100644
--- a/src/kudu/tablet/compaction_policy.cc
+++ b/src/kudu/tablet/compaction_policy.cc
@@ -124,7 +124,7 @@ struct KnapsackTraits {
typedef const RowSetInfo* item_type;
typedef double value_type;
static int get_weight(item_type item) {
- return item->size_mb();
+ return item->base_and_redos_size_mb();
}
static value_type get_value(item_type item) {
return item->value();
@@ -179,11 +179,11 @@ class BoundCalculator {
std::push_heap(fractional_solution_.begin(),
fractional_solution_.end(),
compareByDescendingDensity);
- current_weight_ += candidate.size_mb();
+ current_weight_ += candidate.base_and_redos_size_mb();
current_value_ += candidate.value();
const RowSetInfo* top = fractional_solution_.front();
- while (current_weight_ - top->size_mb() > max_weight_) {
- current_weight_ -= top->size_mb();
+ while (current_weight_ - top->base_and_redos_size_mb() > max_weight_) {
+ current_weight_ -= top->base_and_redos_size_mb();
current_value_ -= top->value();
std::pop_heap(fractional_solution_.begin(),
fractional_solution_.end(),
diff --git a/src/kudu/tablet/rowset_info.cc b/src/kudu/tablet/rowset_info.cc
index 7adb953..5089a1d 100644
--- a/src/kudu/tablet/rowset_info.cc
+++ b/src/kudu/tablet/rowset_info.cc
@@ -191,7 +191,7 @@ double WidthByDataSize(const Slice& prev, const Slice& next,
for (const auto& rs_rsi : active) {
double fraction = StringFractionInRange(rs_rsi.second, prev, next);
- weight += rs_rsi.second->size_bytes() * fraction;
+ weight += rs_rsi.second->base_and_redos_size_bytes() * fraction;
}
return weight;
@@ -449,9 +449,12 @@ RowSetInfo::RowSetInfo(RowSet* rs, double init_cdf)
cdf_max_key_(init_cdf),
extra_(new ExtraData()) {
extra_->rowset = rs;
- extra_->size_bytes = rs->OnDiskBaseDataSizeWithRedos();
+ extra_->base_and_redos_size_bytes = rs->OnDiskBaseDataSizeWithRedos();
+ extra_->size_bytes = rs->OnDiskSize();
extra_->has_bounds = rs->GetBounds(&extra_->min_key, &extra_->max_key).ok();
- size_mb_ = std::max(implicit_cast<int>(extra_->size_bytes / 1024 / 1024), kMinSizeMb);
+ base_and_redos_size_mb_ =
+ std::max(implicit_cast<int>(extra_->base_and_redos_size_bytes / 1024 / 1024),
+ kMinSizeMb);
}
uint64_t RowSetInfo::size_bytes(const ColumnId& col_id) const {
@@ -461,21 +464,22 @@ uint64_t RowSetInfo::size_bytes(const ColumnId& col_id) const {
void RowSetInfo::FinalizeCDFVector(double quot, vector<RowSetInfo>* vec) {
if (quot == 0) return;
for (RowSetInfo& cdf_rs : *vec) {
- CHECK_GT(cdf_rs.size_mb_, 0) << "Expected file size to be at least 1MB "
- << "for RowSet " << cdf_rs.rowset()->ToString()
- << ", was " << cdf_rs.size_bytes()
- << " bytes.";
+ CHECK_GT(cdf_rs.base_and_redos_size_mb_, 0)
+ << "Expected file size to be at least 1MB "
+ << "for RowSet " << cdf_rs.rowset()->ToString()
+ << ", was " << cdf_rs.base_and_redos_size_bytes()
+ << " bytes.";
cdf_rs.cdf_min_key_ /= quot;
cdf_rs.cdf_max_key_ /= quot;
cdf_rs.value_ = ComputeRowsetValue(cdf_rs.width(), cdf_rs.extra_->size_bytes);
- cdf_rs.density_ = cdf_rs.value_ / cdf_rs.size_mb_;
+ cdf_rs.density_ = cdf_rs.value_ / cdf_rs.base_and_redos_size_mb_;
}
}
string RowSetInfo::ToString() const {
string ret;
ret.append(rowset()->ToString());
- StringAppendF(&ret, "(% 3dM) [%.04f, %.04f]", size_mb_,
+ StringAppendF(&ret, "(% 3dM) [%.04f, %.04f]", base_and_redos_size_mb_,
cdf_min_key_, cdf_max_key_);
if (extra_->has_bounds) {
ret.append(" [").append(KUDU_REDACT(Slice(extra_->min_key).ToDebugString()));
diff --git a/src/kudu/tablet/rowset_info.h b/src/kudu/tablet/rowset_info.h
index a8290d3..af2d07d 100644
--- a/src/kudu/tablet/rowset_info.h
+++ b/src/kudu/tablet/rowset_info.h
@@ -69,8 +69,11 @@ class RowSetInfo {
std::vector<KeyRange>* ranges);
uint64_t size_bytes(const ColumnId& col_id) const;
+ uint64_t base_and_redos_size_bytes() const {
+ return extra_->base_and_redos_size_bytes;
+ }
uint64_t size_bytes() const { return extra_->size_bytes; }
- int size_mb() const { return size_mb_; }
+ int base_and_redos_size_mb() const { return base_and_redos_size_mb_; }
// Return the value of the CDF at the minimum key of this candidate.
double cdf_min_key() const { return cdf_min_key_; }
@@ -113,9 +116,10 @@ class RowSetInfo {
static void FinalizeCDFVector(double quot, std::vector<RowSetInfo>* vec);
- // The size in MB, already clamped so that all rowsets have size at least
- // 1MB. This is cached to avoid the branch during the selection hot path.
- int size_mb_;
+ // The size of the base data and redos in MB, already clamped so that all
+ // rowsets have size at least 1MB. This is cached to avoid the branch during
+ // the selection hot path.
+ int base_and_redos_size_mb_;
double cdf_min_key_, cdf_max_key_;
@@ -139,6 +143,9 @@ class RowSetInfo {
// These are ref-counted so that RowSetInfo is copyable.
struct ExtraData : public RefCounted<ExtraData> {
// Cached version of rowset_->OnDiskBaseDataSizeWithRedos().
+ uint64_t base_and_redos_size_bytes;
+
+ // Cached version of rowset_->OnDiskSize().
uint64_t size_bytes;
// True if the RowSet has known bounds.