You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by zh...@apache.org on 2020/08/14 07:03:27 UTC
[kudu] branch master updated: KUDU-3180: prioritize larger
mem-stores in time-based flushing
This is an automated email from the ASF dual-hosted git repository.
zhangyifan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push:
new 42aff29 KUDU-3180: prioritize larger mem-stores in time-based flushing
42aff29 is described below
commit 42aff29360bf5be0a141902dac707e865933473f
Author: zhangyifan27 <ch...@163.com>
AuthorDate: Mon Aug 10 19:39:21 2020 +0800
KUDU-3180: prioritize larger mem-stores in time-based flushing
The current time-based flush policy always picks a mem-store
that hasn't been flushed in a long time instead of a mem-store
anchoring more memory. This may lead to:
- more memory used by mem-stores.
- more small rowsets on disk so we need to do more compaction.
This patch improves the current flush policy by considering both
a mem-store's size and the time since its last flush. When a mem-store
becomes large or old enough, it is more likely to be flushed,
so we can avoid anchoring large (but below the threshold)
mem-stores or WALs for too long.
Change-Id: I0a826643709a4990e40b0a49f89f4ea34f14163b
Reviewed-on: http://gerrit.cloudera.org:8080/16319
Tested-by: Kudu Jenkins
Reviewed-by: Andrew Wong <aw...@cloudera.com>
---
src/kudu/tablet/tablet_replica-test.cc | 22 ++++++++++++-----
src/kudu/tablet/tablet_replica_mm_ops.cc | 42 ++++++++++++++++----------------
2 files changed, 37 insertions(+), 27 deletions(-)
diff --git a/src/kudu/tablet/tablet_replica-test.cc b/src/kudu/tablet/tablet_replica-test.cc
index ecd4b1a..e6f31ef 100644
--- a/src/kudu/tablet/tablet_replica-test.cc
+++ b/src/kudu/tablet/tablet_replica-test.cc
@@ -517,16 +517,18 @@ TEST_F(TabletReplicaTest, TestFlushOpsPerfImprovements) {
MaintenanceOpStats stats;
- // Just on the threshold and not enough time has passed for a time-based flush.
+ // Just on the threshold and not enough time has passed for a time-based flush,
+ // we'll expect improvement equal to '1'.
stats.set_ram_anchored(64 * 1024 * 1024);
FlushOpPerfImprovementPolicy::SetPerfImprovementForFlush(&stats, 1);
- ASSERT_EQ(0.0, stats.perf_improvement());
+ ASSERT_EQ(1.0, stats.perf_improvement());
stats.Clear();
- // Just on the threshold and enough time has passed, we'll have a low improvement.
- stats.set_ram_anchored(64 * 1024 * 1024);
+ // Below the threshold and enough time has passed, we'll have a low improvement.
+ stats.set_ram_anchored(2 * 1024 * 1024);
FlushOpPerfImprovementPolicy::SetPerfImprovementForFlush(&stats, 3 * 60 * 1000);
- ASSERT_GT(stats.perf_improvement(), 0.01);
+ ASSERT_LT(0.01, stats.perf_improvement());
+ ASSERT_GT(0.1, stats.perf_improvement());
stats.Clear();
// Over the threshold, we expect improvement equal to the excess MB.
@@ -536,11 +538,19 @@ TEST_F(TabletReplicaTest, TestFlushOpsPerfImprovements) {
stats.Clear();
// Below the threshold but have been there a long time, closing in to 1.0.
- stats.set_ram_anchored(30 * 1024 * 1024);
+ stats.set_ram_anchored(1);
FlushOpPerfImprovementPolicy::SetPerfImprovementForFlush(&stats, 60 * 50 * 1000);
ASSERT_LT(0.7, stats.perf_improvement());
ASSERT_GT(1.0, stats.perf_improvement());
stats.Clear();
+
+ // Approaching threshold, enough time has passed but haven't been there a long time,
+ // closing in to 1.0.
+ stats.set_ram_anchored(63 * 1024 * 1024);
+ FlushOpPerfImprovementPolicy::SetPerfImprovementForFlush(&stats, 3 * 60 * 1000);
+ ASSERT_LT(0.9, stats.perf_improvement());
+ ASSERT_GT(1.0, stats.perf_improvement());
+ stats.Clear();
}
// Test that the schema of a tablet will be rolled forward upon replaying an
diff --git a/src/kudu/tablet/tablet_replica_mm_ops.cc b/src/kudu/tablet/tablet_replica_mm_ops.cc
index 61243a7..ea53453 100644
--- a/src/kudu/tablet/tablet_replica_mm_ops.cc
+++ b/src/kudu/tablet/tablet_replica_mm_ops.cc
@@ -17,11 +17,11 @@
#include "kudu/tablet/tablet_replica_mm_ops.h"
+#include <algorithm>
#include <map>
#include <mutex>
#include <ostream>
#include <string>
-#include <utility>
#include <boost/optional/optional.hpp>
#include <gflags/gflags.h>
@@ -30,7 +30,6 @@
#include "kudu/common/common.pb.h"
#include "kudu/gutil/macros.h"
#include "kudu/gutil/port.h"
-#include "kudu/gutil/strings/substitute.h"
#include "kudu/tablet/tablet_metadata.h"
#include "kudu/tablet/tablet_metrics.h"
#include "kudu/util/flag_tags.h"
@@ -63,18 +62,25 @@ TAG_FLAG(enable_log_gc, runtime);
TAG_FLAG(enable_log_gc, unsafe);
DEFINE_int32(flush_threshold_mb, 1024,
- "Size at which MemRowSet flushes are triggered. "
+ "Size at which MRS/DMS flushes are triggered. "
"A MRS can still flush below this threshold if it hasn't flushed in a while, "
"or if the server-wide memory limit has been reached.");
TAG_FLAG(flush_threshold_mb, experimental);
TAG_FLAG(flush_threshold_mb, runtime);
DEFINE_int32(flush_threshold_secs, 2 * 60,
- "Number of seconds after which a non-empty MemRowSet will become flushable "
+ "Number of seconds after which a non-empty MRS/DMS will become flushable "
"even if it is not large.");
TAG_FLAG(flush_threshold_secs, experimental);
TAG_FLAG(flush_threshold_secs, runtime);
+DEFINE_int32(flush_upper_bound_ms, 60 * 60 * 1000,
+ "Number of milliseconds after which the time-based performance improvement "
+ "score of a non-empty MRS/DMS flush op will reach its maximum value. "
+ "The score may further increase as the MRS/DMS grows in size.");
+TAG_FLAG(flush_upper_bound_ms, experimental);
+TAG_FLAG(flush_upper_bound_ms, runtime);
+
DECLARE_bool(enable_workload_score_for_perf_improvement_ops);
METRIC_DEFINE_gauge_uint32(tablet, log_gc_running,
@@ -93,10 +99,6 @@ namespace kudu {
namespace tablet {
using std::map;
-using strings::Substitute;
-
-// Upper bound for how long it takes to reach "full perf improvement" in time-based flushing.
-const double kFlushUpperBoundMs = 60 * 60 * 1000;
//
// FlushOpPerfImprovementPolicy.
@@ -106,27 +108,25 @@ void FlushOpPerfImprovementPolicy::SetPerfImprovementForFlush(MaintenanceOpStats
double elapsed_ms) {
double anchored_mb = static_cast<double>(stats->ram_anchored()) / (1024 * 1024);
const double threshold_mb = FLAGS_flush_threshold_mb;
- if (anchored_mb > threshold_mb) {
+ const double upper_bound_ms = FLAGS_flush_upper_bound_ms;
+ if (anchored_mb >= threshold_mb) {
// If we're over the user-specified flush threshold, then consider the perf
- // improvement to be 1 for every extra MB. This produces perf_improvement results
- // which are much higher than most compactions would produce, and means that, when
- // there is an MRS over threshold, a flush will almost always be selected instead of
- // a compaction. That's not necessarily a good thing, but in the absence of better
+ // improvement to be 1 for every extra MB (at least 1). This produces perf_improvement
+ // results which are much higher than most compactions would produce, and means that,
+ // when there is an MRS over threshold, a flush will almost always be selected instead of
+ // a compaction. That's not necessarily a good thing, but in the absence of better
// heuristics, it will do for now.
double extra_mb = anchored_mb - threshold_mb;
DCHECK_GE(extra_mb, 0);
- stats->set_perf_improvement(extra_mb);
+ stats->set_perf_improvement(std::max(1.0, extra_mb));
} else if (elapsed_ms > FLAGS_flush_threshold_secs * 1000) {
// Even if we aren't over the threshold, consider flushing if we haven't flushed
// in a long time. But, don't give it a large perf_improvement score. We should
// only do this if we really don't have much else to do, and if we've already waited a bit.
- // The following will give an improvement that's between 0.0 and 1.0, gradually growing
- // as 'elapsed_ms' approaches 'kFlushUpperBoundMs'.
- double perf = elapsed_ms / kFlushUpperBoundMs;
- if (perf > 1.0) {
- perf = 1.0;
- }
- stats->set_perf_improvement(perf);
+ // The following will give an improvement that's between 0.0 and 1.0, gradually growing as
+ // 'elapsed_ms' approaches 'upper_bound_ms' or 'anchored_mb' approaches 'threshold_mb'.
+ double perf = std::max(elapsed_ms / upper_bound_ms, anchored_mb / threshold_mb);
+ stats->set_perf_improvement(std::min(1.0, perf));
}
}