You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by al...@apache.org on 2023/01/17 16:12:42 UTC

[kudu] branch master updated: KUDU-3406 corrected estimate for ancient UNDO delta size

This is an automated email from the ASF dual-hosted git repository.

alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git


The following commit(s) were added to refs/heads/master by this push:
     new ef2a5c39d KUDU-3406 corrected estimate for ancient UNDO delta size
ef2a5c39d is described below

commit ef2a5c39dff75736c11ae51a9b86ba943796e873
Author: Alexey Serbin <al...@apache.org>
AuthorDate: Tue Jan 10 15:32:08 2023 -0800

    KUDU-3406 corrected estimate for ancient UNDO delta size
    
    When looking into micro-benchmark results produced by the
    $KUDU_HOME/src/kudu/scripts/benchmarks.sh script, I noticed that
    dense_node-itest showed 9-fold increase in the number of blocks under
    management.  Even if the test disables GC of the ancient UNDO deltas
    (i.e. it runs with --enable_undo_delta_block_gc=false), that's not the
    expected behavior.  It turned out the issue was in the way how
    DeltaTracker::EstimateBytesInPotentiallyAncientUndoDeltas() operated: it
    always treated a delta to be ancient if no stats were present.  So, if a
    delta file was lazily loaded without stats being read, DeltaTracker
    assumed all its deltas were ancient.  With the new behavior introduced
    in 1556a353e, it led to rowset merge compactions skipping the newly
    generated UNDO deltas since the estimate reported 100% of those deltas
    being ancient.
    
    While this was not detected by prior testing using various real-world
    scenarios involving some tangible amount of data written, tracking
    the history of stats emitted by dense_node-itest allowed to spot the
    issue.
    
    This patch addresses the issue, introducing different estimate modes for
    the method mentioned above and using proper modes in various contexts.
    I verified that with this patch added, the benchmark based on
    dense_node-itest now reports the number of blocks as it has been
    reporting for the longest span of its history.  So I'm not adding any
    new tests with this patch.
    
    This is a follow-up to 1556a353e60c5d555996347cbd46d5e5a6661266.
    
    Change-Id: I17bddae86f84792caf14fb1e11a6e1c0d7a92b56
    Reviewed-on: http://gerrit.cloudera.org:8080/19413
    Tested-by: Kudu Jenkins
    Reviewed-by: Attila Bukor <ab...@apache.org>
---
 src/kudu/tablet/delta_tracker.cc | 21 +++++++++++++++------
 src/kudu/tablet/delta_tracker.h  |  9 +++++----
 src/kudu/tablet/diskrowset.cc    |  9 ++++++---
 src/kudu/tablet/diskrowset.h     |  6 ++++--
 src/kudu/tablet/memrowset.h      |  6 ++++--
 src/kudu/tablet/mock-rowsets.h   |  1 +
 src/kudu/tablet/rowset.h         | 27 +++++++++++++++++++++------
 src/kudu/tablet/tablet.cc        | 23 +++++++++++++++++------
 8 files changed, 73 insertions(+), 29 deletions(-)

diff --git a/src/kudu/tablet/delta_tracker.cc b/src/kudu/tablet/delta_tracker.cc
index 7f836db95..166eebe4c 100644
--- a/src/kudu/tablet/delta_tracker.cc
+++ b/src/kudu/tablet/delta_tracker.cc
@@ -505,7 +505,9 @@ bool DeltaTracker::EstimateAllRedosAreAncient(Timestamp ancient_history_mark) {
 }
 
 Status DeltaTracker::EstimateBytesInPotentiallyAncientUndoDeltas(
-    Timestamp ancient_history_mark, int64_t* bytes) const {
+    Timestamp ancient_history_mark,
+    RowSet::EstimateType estimate_type,
+    int64_t* bytes) const {
   DCHECK_NE(Timestamp::kInvalidTimestamp, ancient_history_mark);
   DCHECK(bytes);
   SharedDeltaStoreVector undos_newest_first;
@@ -513,12 +515,19 @@ Status DeltaTracker::EstimateBytesInPotentiallyAncientUndoDeltas(
 
   int64_t tmp_bytes = 0;
   for (const auto& undo : boost::adaptors::reverse(undos_newest_first)) {
-    // Short-circuit once we hit an initialized delta block with 'max_timestamp' > AHM.
-    if (undo->has_delta_stats() &&
-        undo->delta_stats().max_timestamp() >= ancient_history_mark) {
-      break;
+    if (undo->has_delta_stats()) {
+      if (undo->delta_stats().max_timestamp() >= ancient_history_mark) {
+        // Short-circuit once we hit an initialized delta block with
+        // 'max_timestamp' > AHM.
+        break;
+      }
+      tmp_bytes += undo->EstimateSize();
+    } else if (estimate_type == RowSet::OVERESTIMATE) {
+      // If delta stats are absent, add up the estimate to the total only when
+      // an overestimate is requested. There is no certainty whether the delta
+      // is actually ancient since no information is available in this context.
+      tmp_bytes += undo->EstimateSize();  // can be called without delta stats
     }
-    tmp_bytes += undo->EstimateSize(); // Can be called before Init().
   }
 
   *bytes = tmp_bytes;
diff --git a/src/kudu/tablet/delta_tracker.h b/src/kudu/tablet/delta_tracker.h
index 7feadcc64..17717d1f4 100644
--- a/src/kudu/tablet/delta_tracker.h
+++ b/src/kudu/tablet/delta_tracker.h
@@ -33,6 +33,7 @@
 #include "kudu/tablet/delta_key.h"
 #include "kudu/tablet/delta_stats.h"
 #include "kudu/tablet/delta_store.h"
+#include "kudu/tablet/rowset.h"
 #include "kudu/tablet/tablet_mem_trackers.h"
 #include "kudu/util/locks.h"
 #include "kudu/util/mutex.h"
@@ -67,8 +68,6 @@ class DeltaMemStore;
 class OperationResultPB;
 class RowSetMetadata;
 class RowSetMetadataUpdate;
-struct ProbeStats;
-struct RowIteratorOptions;
 
 typedef std::pair<BlockId, std::unique_ptr<DeltaStats>> DeltaBlockIdAndStats;
 
@@ -176,8 +175,10 @@ class DeltaTracker {
   Status CompactStores(const fs::IOContext* io_context, int start_idx, int end_idx);
 
   // See RowSet::EstimateBytesInPotentiallyAncientUndoDeltas().
-  Status EstimateBytesInPotentiallyAncientUndoDeltas(Timestamp ancient_history_mark,
-                                                     int64_t* bytes) const;
+  Status EstimateBytesInPotentiallyAncientUndoDeltas(
+      Timestamp ancient_history_mark,
+      RowSet::EstimateType estimate_type,
+      int64_t* bytes) const;
 
   // Returns whether all redo (DMS and newest redo delta file) are ancient
   // (i.e. that the redo with the highest timestamp is older than the AHM).
diff --git a/src/kudu/tablet/diskrowset.cc b/src/kudu/tablet/diskrowset.cc
index c9d03a2a9..cfbaf27f1 100644
--- a/src/kudu/tablet/diskrowset.cc
+++ b/src/kudu/tablet/diskrowset.cc
@@ -866,9 +866,12 @@ double DiskRowSet::DeltaStoresCompactionPerfImprovementScore(DeltaCompactionType
   return std::min(1.0, perf_improv);
 }
 
-Status DiskRowSet::EstimateBytesInPotentiallyAncientUndoDeltas(Timestamp ancient_history_mark,
-                                                               int64_t* bytes) {
-  return delta_tracker_->EstimateBytesInPotentiallyAncientUndoDeltas(ancient_history_mark, bytes);
+Status DiskRowSet::EstimateBytesInPotentiallyAncientUndoDeltas(
+    Timestamp ancient_history_mark,
+    EstimateType estimate_type,
+    int64_t* bytes) {
+  return delta_tracker_->EstimateBytesInPotentiallyAncientUndoDeltas(
+      ancient_history_mark, estimate_type, bytes);
 }
 
 Status DiskRowSet::IsDeletedAndFullyAncient(Timestamp ancient_history_mark,
diff --git a/src/kudu/tablet/diskrowset.h b/src/kudu/tablet/diskrowset.h
index 228311bb9..ce7184822 100644
--- a/src/kudu/tablet/diskrowset.h
+++ b/src/kudu/tablet/diskrowset.h
@@ -411,8 +411,10 @@ class DiskRowSet :
 
   double DeltaStoresCompactionPerfImprovementScore(DeltaCompactionType type) const override;
 
-  Status EstimateBytesInPotentiallyAncientUndoDeltas(Timestamp ancient_history_mark,
-                                                     int64_t* bytes) override;
+  Status EstimateBytesInPotentiallyAncientUndoDeltas(
+      Timestamp ancient_history_mark,
+      EstimateType estimate_type,
+      int64_t* bytes) override;
 
   Status IsDeletedAndFullyAncient(Timestamp ancient_history_mark,
                                   bool* deleted_and_ancient) override;
diff --git a/src/kudu/tablet/memrowset.h b/src/kudu/tablet/memrowset.h
index 291ca41d8..1aaa66fe8 100644
--- a/src/kudu/tablet/memrowset.h
+++ b/src/kudu/tablet/memrowset.h
@@ -412,8 +412,10 @@ class MemRowSet : public RowSet,
     return 0;
   }
 
-  Status EstimateBytesInPotentiallyAncientUndoDeltas(Timestamp /*ancient_history_mark*/,
-                                                     int64_t* bytes) override {
+  Status EstimateBytesInPotentiallyAncientUndoDeltas(
+      Timestamp /*ancient_history_mark*/,
+      EstimateType /*estimate_type*/,
+      int64_t* bytes) override {
     DCHECK(bytes);
     *bytes = 0;
     return Status::OK();
diff --git a/src/kudu/tablet/mock-rowsets.h b/src/kudu/tablet/mock-rowsets.h
index 701d2f8db..eedeb95f9 100644
--- a/src/kudu/tablet/mock-rowsets.h
+++ b/src/kudu/tablet/mock-rowsets.h
@@ -154,6 +154,7 @@ class MockRowSet : public RowSet {
   }
 
   Status EstimateBytesInPotentiallyAncientUndoDeltas(Timestamp /*ancient_history_mark*/,
+                                                     EstimateType /*estimate_type*/,
                                                      int64_t* /*bytes*/) override {
     LOG(FATAL) << "Unimplemented";
     return Status::OK();
diff --git a/src/kudu/tablet/rowset.h b/src/kudu/tablet/rowset.h
index e04f8a686..26db0d73b 100644
--- a/src/kudu/tablet/rowset.h
+++ b/src/kudu/tablet/rowset.h
@@ -110,6 +110,16 @@ class RowSet {
     MINOR_DELTA_COMPACTION
   };
 
+  // Estimate types used in EstimateBytesInPotentiallyAncientUndoDeltas().
+  // The information on the age of a delta might be absent in some contexts,
+  // and such a delta is treated differently depending on the requested estimate
+  // type: it's considered as ancient when an overestimate is appropriate,
+  // and not so when an underestimate is desired.
+  enum EstimateType {
+    OVERESTIMATE,
+    UNDERESTIMATE,
+  };
+
   // Check if a given row key is present in this rowset.
   // Sets *present and returns Status::OK, unless an error
   // occurs.
@@ -231,10 +241,13 @@ class RowSet {
                                           bool* deleted_and_ancient) = 0;
 
   // Estimate the number of bytes in ancient undo delta stores. This may be an
-  // overestimate. The argument 'ancient_history_mark' must be valid (it may
-  // not be equal to Timestamp::kInvalidTimestamp).
-  virtual Status EstimateBytesInPotentiallyAncientUndoDeltas(Timestamp ancient_history_mark,
-                                                             int64_t* bytes) = 0;
+  // overestimate or an underestimate depending on 'estimate_type,. The argument
+  // 'ancient_history_mark' must be valid: it must not be equal to
+  // Timestamp::kInvalidTimestamp.
+  virtual Status EstimateBytesInPotentiallyAncientUndoDeltas(
+      Timestamp ancient_history_mark,
+      EstimateType estimate_type,
+      int64_t* bytes) = 0;
 
   // Initialize undo delta blocks until the given 'deadline' is passed, or
   // until all undo delta blocks with a max timestamp older than
@@ -476,8 +489,10 @@ class DuplicatingRowSet : public RowSet {
     return Status::OK();
   }
 
-  Status EstimateBytesInPotentiallyAncientUndoDeltas(Timestamp /*ancient_history_mark*/,
-                                                     int64_t* bytes) override {
+  Status EstimateBytesInPotentiallyAncientUndoDeltas(
+      Timestamp /*ancient_history_mark*/,
+      EstimateType /*estimate_type*/,
+      int64_t* bytes) override {
     DCHECK(bytes);
     *bytes = 0;
     return Status::OK();
diff --git a/src/kudu/tablet/tablet.cc b/src/kudu/tablet/tablet.cc
index d33c653c5..ef6b77cee 100644
--- a/src/kudu/tablet/tablet.cc
+++ b/src/kudu/tablet/tablet.cc
@@ -2315,12 +2315,18 @@ void Tablet::UpdateCompactionStats(MaintenanceOpStats* stats) {
     const auto* drs = down_cast<const DiskRowSet*>(rs);
     const auto& dt = drs->delta_tracker();
 
+    // Using RowSet::UNDERESTIMATE as the estimate type in this context: it's
+    // necessary to be 100% sure the delta is ancient to apply the criterion
+    // based on the --rowset_compaction_ancient_delta_max_ratio threshold.
+    // Otherwise, the rowset merge compaction task might skip over rowsets that
+    // contain mostly recent deltas until they indeed become ancient and GC-ed
+    // by the UNDO delta GC maintenance task.
     int64_t size = 0;
     {
       Timestamp ancient_history_mark;
       if (Tablet::GetTabletAncientHistoryMark(&ancient_history_mark)) {
         WARN_NOT_OK(dt.EstimateBytesInPotentiallyAncientUndoDeltas(
-            ancient_history_mark, &size),
+            ancient_history_mark, RowSet::UNDERESTIMATE, &size),
             "could not estimate size of ancient UNDO deltas");
       }
     }
@@ -2841,8 +2847,11 @@ Status Tablet::EstimateBytesInPotentiallyAncientUndoDeltas(int64_t* bytes) {
   int64_t tablet_bytes = 0;
   for (const auto& rowset : comps->rowsets->all_rowsets()) {
     int64_t rowset_bytes;
-    RETURN_NOT_OK(rowset->EstimateBytesInPotentiallyAncientUndoDeltas(ancient_history_mark,
-                                                                      &rowset_bytes));
+    // Since the estimate is for "potentially ancient" deltas, the deltas
+    // with no information on their age should be included into the result:
+    // Row::OVERESTIMATE provides the desired type of estimate in this case.
+    RETURN_NOT_OK(rowset->EstimateBytesInPotentiallyAncientUndoDeltas(
+        ancient_history_mark, RowSet::OVERESTIMATE, &rowset_bytes));
     tablet_bytes += rowset_bytes;
   }
 
@@ -2868,14 +2877,16 @@ Status Tablet::InitAncientUndoDeltas(MonoDelta time_budget, int64_t* bytes_in_an
   RowSetVector rowsets = comps->rowsets->all_rowsets();
 
   // Estimate the size of the ancient undos in each rowset so that we can
-  // initialize them greedily.
+  // initialize them greedily. Using the RowSet::OVERESTIMATE estimate type here
+  // as in Tablet::EstimateBytesInPotentiallyAncientUndoDeltas() above
+  // for the same reason.
   vector<pair<size_t, int64_t>> rowset_ancient_undos_est_sizes; // index, bytes
   rowset_ancient_undos_est_sizes.reserve(rowsets.size());
   for (size_t i = 0; i < rowsets.size(); i++) {
     const auto& rowset = rowsets[i];
     int64_t bytes;
-    RETURN_NOT_OK(rowset->EstimateBytesInPotentiallyAncientUndoDeltas(ancient_history_mark,
-                                                                      &bytes));
+    RETURN_NOT_OK(rowset->EstimateBytesInPotentiallyAncientUndoDeltas(
+      ancient_history_mark, RowSet::OVERESTIMATE, &bytes));
     rowset_ancient_undos_est_sizes.emplace_back(i, bytes);
   }