You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by zh...@apache.org on 2020/08/14 07:03:27 UTC
[kudu] branch master updated: KUDU-3180: prioritize larger mem-stores in time-based flushing

This is an automated email from the ASF dual-hosted git repository.

zhangyifan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git


The following commit(s) were added to refs/heads/master by this push:
     new 42aff29  KUDU-3180: prioritize larger mem-stores in time-based flushing
42aff29 is described below

commit 42aff29360bf5be0a141902dac707e865933473f
Author: zhangyifan27 <ch...@163.com>
AuthorDate: Mon Aug 10 19:39:21 2020 +0800

    KUDU-3180: prioritize larger mem-stores in time-based flushing
    
    The current time-based flush policy always picks a mem-store
    that hasn't been flushed in a long time instead of a mem-store
    anchoring more memory. This may lead to:
    - more memory used by mem-stores.
    - more small rowsets on disk, so more compaction is needed.
    
    This patch improves the current flush policy by considering both
    a mem-store's size and the time since its last flush. When a mem-store
    becomes large or old enough, it is more likely to be flushed,
    so we avoid anchoring large (but below-threshold)
    mem-stores or WALs for too long.
    
    Change-Id: I0a826643709a4990e40b0a49f89f4ea34f14163b
    Reviewed-on: http://gerrit.cloudera.org:8080/16319
    Tested-by: Kudu Jenkins
    Reviewed-by: Andrew Wong <aw...@cloudera.com>
---
 src/kudu/tablet/tablet_replica-test.cc   | 22 ++++++++++++-----
 src/kudu/tablet/tablet_replica_mm_ops.cc | 42 ++++++++++++++++----------------
 2 files changed, 37 insertions(+), 27 deletions(-)

diff --git a/src/kudu/tablet/tablet_replica-test.cc b/src/kudu/tablet/tablet_replica-test.cc
index ecd4b1a..e6f31ef 100644
--- a/src/kudu/tablet/tablet_replica-test.cc
+++ b/src/kudu/tablet/tablet_replica-test.cc
@@ -517,16 +517,18 @@ TEST_F(TabletReplicaTest, TestFlushOpsPerfImprovements) {
 
   MaintenanceOpStats stats;
 
-  // Just on the threshold and not enough time has passed for a time-based flush.
+  // Just on the threshold and not enough time has passed for a time-based flush,
+  // we'll expect improvement equal to '1'.
   stats.set_ram_anchored(64 * 1024 * 1024);
   FlushOpPerfImprovementPolicy::SetPerfImprovementForFlush(&stats, 1);
-  ASSERT_EQ(0.0, stats.perf_improvement());
+  ASSERT_EQ(1.0, stats.perf_improvement());
   stats.Clear();
 
-  // Just on the threshold and enough time has passed, we'll have a low improvement.
-  stats.set_ram_anchored(64 * 1024 * 1024);
+  // Below the threshold and enough time has passed, we'll have a low improvement.
+  stats.set_ram_anchored(2 * 1024 * 1024);
   FlushOpPerfImprovementPolicy::SetPerfImprovementForFlush(&stats, 3 * 60 * 1000);
-  ASSERT_GT(stats.perf_improvement(), 0.01);
+  ASSERT_LT(0.01, stats.perf_improvement());
+  ASSERT_GT(0.1, stats.perf_improvement());
   stats.Clear();
 
   // Over the threshold, we expect improvement equal to the excess MB.
@@ -536,11 +538,19 @@ TEST_F(TabletReplicaTest, TestFlushOpsPerfImprovements) {
   stats.Clear();
 
   // Below the threshold but have been there a long time, closing in to 1.0.
-  stats.set_ram_anchored(30 * 1024 * 1024);
+  stats.set_ram_anchored(1);
   FlushOpPerfImprovementPolicy::SetPerfImprovementForFlush(&stats, 60 * 50 * 1000);
   ASSERT_LT(0.7, stats.perf_improvement());
   ASSERT_GT(1.0, stats.perf_improvement());
   stats.Clear();
+
+  // Approaching threshold, enough time has passed but haven't been there a long time,
+  // closing in to 1.0.
+  stats.set_ram_anchored(63 * 1024 * 1024);
+  FlushOpPerfImprovementPolicy::SetPerfImprovementForFlush(&stats, 3 * 60 * 1000);
+  ASSERT_LT(0.9, stats.perf_improvement());
+  ASSERT_GT(1.0, stats.perf_improvement());
+  stats.Clear();
 }
 
 // Test that the schema of a tablet will be rolled forward upon replaying an
diff --git a/src/kudu/tablet/tablet_replica_mm_ops.cc b/src/kudu/tablet/tablet_replica_mm_ops.cc
index 61243a7..ea53453 100644
--- a/src/kudu/tablet/tablet_replica_mm_ops.cc
+++ b/src/kudu/tablet/tablet_replica_mm_ops.cc
@@ -17,11 +17,11 @@
 
 #include "kudu/tablet/tablet_replica_mm_ops.h"
 
+#include <algorithm>
 #include <map>
 #include <mutex>
 #include <ostream>
 #include <string>
-#include <utility>
 
 #include <boost/optional/optional.hpp>
 #include <gflags/gflags.h>
@@ -30,7 +30,6 @@
 #include "kudu/common/common.pb.h"
 #include "kudu/gutil/macros.h"
 #include "kudu/gutil/port.h"
-#include "kudu/gutil/strings/substitute.h"
 #include "kudu/tablet/tablet_metadata.h"
 #include "kudu/tablet/tablet_metrics.h"
 #include "kudu/util/flag_tags.h"
@@ -63,18 +62,25 @@ TAG_FLAG(enable_log_gc, runtime);
 TAG_FLAG(enable_log_gc, unsafe);
 
 DEFINE_int32(flush_threshold_mb, 1024,
-             "Size at which MemRowSet flushes are triggered. "
+             "Size at which MRS/DMS flushes are triggered. "
              "A MRS can still flush below this threshold if it hasn't flushed in a while, "
              "or if the server-wide memory limit has been reached.");
 TAG_FLAG(flush_threshold_mb, experimental);
 TAG_FLAG(flush_threshold_mb, runtime);
 
 DEFINE_int32(flush_threshold_secs, 2 * 60,
-             "Number of seconds after which a non-empty MemRowSet will become flushable "
+             "Number of seconds after which a non-empty MRS/DMS will become flushable "
              "even if it is not large.");
 TAG_FLAG(flush_threshold_secs, experimental);
 TAG_FLAG(flush_threshold_secs, runtime);
 
+DEFINE_int32(flush_upper_bound_ms, 60 * 60 * 1000,
+             "Number of milliseconds after which the time-based performance improvement "
+             "score of a non-empty MRS/DMS flush op will reach its maximum value. "
+             "The score may further increase as the MRS/DMS grows in size.");
+TAG_FLAG(flush_upper_bound_ms, experimental);
+TAG_FLAG(flush_upper_bound_ms, runtime);
+
 DECLARE_bool(enable_workload_score_for_perf_improvement_ops);
 
 METRIC_DEFINE_gauge_uint32(tablet, log_gc_running,
@@ -93,10 +99,6 @@ namespace kudu {
 namespace tablet {
 
 using std::map;
-using strings::Substitute;
-
-// Upper bound for how long it takes to reach "full perf improvement" in time-based flushing.
-const double kFlushUpperBoundMs = 60 * 60 * 1000;
 
 //
 // FlushOpPerfImprovementPolicy.
@@ -106,27 +108,25 @@ void FlushOpPerfImprovementPolicy::SetPerfImprovementForFlush(MaintenanceOpStats
                                                               double elapsed_ms) {
   double anchored_mb = static_cast<double>(stats->ram_anchored()) / (1024 * 1024);
   const double threshold_mb = FLAGS_flush_threshold_mb;
-  if (anchored_mb > threshold_mb) {
+  const double upper_bound_ms = FLAGS_flush_upper_bound_ms;
+  if (anchored_mb >= threshold_mb) {
     // If we're over the user-specified flush threshold, then consider the perf
-    // improvement to be 1 for every extra MB.  This produces perf_improvement results
-    // which are much higher than most compactions would produce, and means that, when
-    // there is an MRS over threshold, a flush will almost always be selected instead of
-    // a compaction.  That's not necessarily a good thing, but in the absence of better
+    // improvement to be 1 for every extra MB (at least 1). This produces perf_improvement
+    // results which are much higher than most compactions would produce, and means that,
+    // when there is an MRS over threshold, a flush will almost always be selected instead of
+    // a compaction. That's not necessarily a good thing, but in the absence of better
     // heuristics, it will do for now.
     double extra_mb = anchored_mb - threshold_mb;
     DCHECK_GE(extra_mb, 0);
-    stats->set_perf_improvement(extra_mb);
+    stats->set_perf_improvement(std::max(1.0, extra_mb));
   } else if (elapsed_ms > FLAGS_flush_threshold_secs * 1000) {
     // Even if we aren't over the threshold, consider flushing if we haven't flushed
     // in a long time. But, don't give it a large perf_improvement score. We should
     // only do this if we really don't have much else to do, and if we've already waited a bit.
-    // The following will give an improvement that's between 0.0 and 1.0, gradually growing
-    // as 'elapsed_ms' approaches 'kFlushUpperBoundMs'.
-    double perf = elapsed_ms / kFlushUpperBoundMs;
-    if (perf > 1.0) {
-      perf = 1.0;
-    }
-    stats->set_perf_improvement(perf);
+    // The following will give an improvement that's between 0.0 and 1.0, gradually growing as
+    // 'elapsed_ms' approaches 'upper_bound_ms' or 'anchored_mb' approaches 'threshold_mb'.
+    double perf = std::max(elapsed_ms / upper_bound_ms, anchored_mb / threshold_mb);
+    stats->set_perf_improvement(std::min(1.0, perf));
   }
 }