You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by al...@apache.org on 2019/02/07 22:16:47 UTC

[kudu] 02/03: [tools] --load_imbalance_threshold flag for LA rebalancer

This is an automated email from the ASF dual-hosted git repository.

alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit 02792a8a1a740d8eeef98c35bebb492666c143eb
Author: Alexey Serbin <al...@apache.org>
AuthorDate: Mon Feb 4 21:11:17 2019 -0800

    [tools] --load_imbalance_threshold flag for LA rebalancer
    
    Introduced the '--load_imbalance_threshold' run-time flag
    for the Kudu CLI rebalancer tool.
    
    The threshold represents a policy with regard to what to
    prefer during cross-location rebalancing: ideal balance of the
    cross-location load on per-table basis (lower threshold value)
    or minimum number of replica moves between locations
    (greater threshold value).  To some extent, this new flag might
    be considered as an extension of the already existing
    '--disable_cross_location_rebalancing' flag, whereas the new
    flag provides more control over the 'perfect balance vs minimum
    replica movements' choice for the cross-location rebalancing phase.
    
    Change-Id: I81867f168dac9908a701b50205be7bfaacefd554
    Reviewed-on: http://gerrit.cloudera.org:8080/12366
    Tested-by: Kudu Jenkins
    Reviewed-by: Will Berkeley <wd...@gmail.com>
---
 src/kudu/tools/rebalance_algo-test.cc |  2 +-
 src/kudu/tools/rebalance_algo.cc      | 18 ++++++++++--------
 src/kudu/tools/rebalance_algo.h       | 10 +++++++---
 src/kudu/tools/rebalancer.cc          | 18 ++++++++++++------
 src/kudu/tools/rebalancer.h           | 21 ++++++++++++++-------
 src/kudu/tools/tool_action_cluster.cc | 20 +++++++++++++++++++-
 6 files changed, 63 insertions(+), 26 deletions(-)

diff --git a/src/kudu/tools/rebalance_algo-test.cc b/src/kudu/tools/rebalance_algo-test.cc
index c8ba5c7..2f7ddf3 100644
--- a/src/kudu/tools/rebalance_algo-test.cc
+++ b/src/kudu/tools/rebalance_algo-test.cc
@@ -220,7 +220,7 @@ void VerifyLocationRebalancingMoves(const TestClusterConfig& cfg) {
   {
     ClusterInfo ci;
     ClusterConfigToClusterInfo(cfg, &ci);
-    LocationBalancingAlgo algo;
+    LocationBalancingAlgo algo(1.0);
     ASSERT_OK(algo.GetNextMoves(ci, 0, &moves));
   }
   switch (cfg.ref_comparison_options.moves_ordering) {
diff --git a/src/kudu/tools/rebalance_algo.cc b/src/kudu/tools/rebalance_algo.cc
index a006258..8065f51 100644
--- a/src/kudu/tools/rebalance_algo.cc
+++ b/src/kudu/tools/rebalance_algo.cc
@@ -403,6 +403,10 @@ Status TwoDimensionalGreedyAlgo::GetMinMaxLoadedServers(
   return Status::OK();
 }
 
+LocationBalancingAlgo::LocationBalancingAlgo(double load_imbalance_threshold)
+    : load_imbalance_threshold_(load_imbalance_threshold) {
+}
+
 Status LocationBalancingAlgo::GetNextMove(
     const ClusterInfo& cluster_info,
     boost::optional<TableReplicaMove>* move) {
@@ -499,7 +503,7 @@ Status LocationBalancingAlgo::GetNextMove(
 
 bool LocationBalancingAlgo::IsBalancingNeeded(
     const TableByLoadImbalance& imbalance_info,
-    string* most_imbalanced_table_id) {
+    string* most_imbalanced_table_id) const {
   if (PREDICT_FALSE(VLOG_IS_ON(1))) {
     ostringstream ss;
     ss << "Table imbalance report: " << endl;
@@ -517,18 +521,16 @@ bool LocationBalancingAlgo::IsBalancingNeeded(
   // Evaluate the maximum existing imbalance: is it possible to move replicas
   // between tablet servers in different locations to make the skew less?
   //
-  // TODO(aserbin): detect 'good enough' vs ideal cases, like (b) vs (a) in
-  //                the class-wide comment. In other words, find the minimum
-  //                load imbalance down to which it makes sense to try
-  //                cross-location rebalancing. Probably, it should be a policy
-  //                wrt what to prefer: ideal location-wide balance or minimum
-  //                number of replica moves between locations?
+  // Empirically, the imbalance threshold is to detect 'good enough' vs 'ideal'
+  // cases, like (b) vs (a) in the class-wide comment. In other words, this
+  // delta is the minimum load imbalance down to which it makes sense to try
+  // cross-location rebalancing.
   //
   // The information on the most imbalanced table is in the last element
   // of the map.
   const auto it = imbalance_info.crbegin();
   const auto imbalance = it->first;
-  if (imbalance > 1) {
+  if (imbalance > load_imbalance_threshold_) {
     *most_imbalanced_table_id = it->second;
     return true;
   }
diff --git a/src/kudu/tools/rebalance_algo.h b/src/kudu/tools/rebalance_algo.h
index af9dde0..3b101c8 100644
--- a/src/kudu/tools/rebalance_algo.h
+++ b/src/kudu/tools/rebalance_algo.h
@@ -254,6 +254,9 @@ class TwoDimensionalGreedyAlgo : public RebalancingAlgo {
 // We want to move replicas to make the distribution (c) more balanced;
 // 2 movements gives us the 'ideal' location-wise replica placement.
 class LocationBalancingAlgo : public RebalancingAlgo {
+ public:
+  explicit LocationBalancingAlgo(double load_imbalance_threshold);
+
  protected:
   Status GetNextMove(const ClusterInfo& cluster_info,
                      boost::optional<TableReplicaMove>* move) override;
@@ -266,9 +269,8 @@ class LocationBalancingAlgo : public RebalancingAlgo {
   // if rebalancing is needed, 'false' otherwise. Upon returning 'true',
   // the identifier of the most cross-location imbalanced table is output into
   // the 'most_imbalanced_table_id' parameter (which must not be null).
-  static bool IsBalancingNeeded(
-      const TableByLoadImbalance& imbalance_info,
-      std::string* most_imbalanced_table_id);
+  bool IsBalancingNeeded(const TableByLoadImbalance& imbalance_info,
+                         std::string* most_imbalanced_table_id) const;
 
   // Given the set of the most and the least table-wise loaded locations, choose
   // the source and destination tablet server to move a replica of the specified
@@ -280,6 +282,8 @@ class LocationBalancingAlgo : public RebalancingAlgo {
       const std::vector<std::string>& loc_loaded_most,
       const ClusterInfo& cluster_info,
       boost::optional<TableReplicaMove>* move);
+
+  const double load_imbalance_threshold_;
 };
 
 } // namespace tools
diff --git a/src/kudu/tools/rebalancer.cc b/src/kudu/tools/rebalancer.cc
index 72b81c5..934d42e 100644
--- a/src/kudu/tools/rebalancer.cc
+++ b/src/kudu/tools/rebalancer.cc
@@ -89,7 +89,8 @@ Rebalancer::Config::Config(
     bool output_replica_distribution_details,
     bool run_policy_fixer,
     bool run_cross_location_rebalancing,
-    bool run_intra_location_rebalancing)
+    bool run_intra_location_rebalancing,
+    double load_imbalance_threshold)
     : master_addresses(std::move(master_addresses)),
       table_filters(std::move(table_filters)),
       max_moves_per_server(max_moves_per_server),
@@ -99,7 +100,8 @@ Rebalancer::Config::Config(
       output_replica_distribution_details(output_replica_distribution_details),
       run_policy_fixer(run_policy_fixer),
       run_cross_location_rebalancing(run_cross_location_rebalancing),
-      run_intra_location_rebalancing(run_intra_location_rebalancing) {
+      run_intra_location_rebalancing(run_intra_location_rebalancing),
+      load_imbalance_threshold(load_imbalance_threshold) {
   DCHECK_GE(max_moves_per_server, 0);
 }
 
@@ -225,7 +227,10 @@ Status Rebalancer::Run(RunStatus* result_status, size_t* moves_count) {
     if (config_.run_cross_location_rebalancing) {
       // Run the rebalancing across locations (inter-location rebalancing).
       LOG(INFO) << "running cross-location rebalancing";
-      CrossLocationRunner runner(this, config_.max_moves_per_server, deadline);
+      CrossLocationRunner runner(this,
+                                 config_.max_moves_per_server,
+                                 config_.load_imbalance_threshold,
+                                 deadline);
       RETURN_NOT_OK(runner.Init(config_.master_addresses));
       RETURN_NOT_OK(RunWith(&runner, result_status));
       moves_count_total += runner.moves_count();
@@ -1400,11 +1405,12 @@ Rebalancer::IntraLocationRunner::IntraLocationRunner(
       location_(std::move(location)) {
 }
 
-Rebalancer::CrossLocationRunner::CrossLocationRunner(
-    Rebalancer* rebalancer,
+Rebalancer::CrossLocationRunner::CrossLocationRunner(Rebalancer* rebalancer,
     size_t max_moves_per_server,
+    double load_imbalance_threshold,
     boost::optional<MonoTime> deadline)
-    : AlgoBasedRunner(rebalancer, max_moves_per_server, std::move(deadline)) {
+    : AlgoBasedRunner(rebalancer, max_moves_per_server, std::move(deadline)),
+      algorithm_(load_imbalance_threshold) {
 }
 
 Rebalancer::PolicyFixer::PolicyFixer(
diff --git a/src/kudu/tools/rebalancer.h b/src/kudu/tools/rebalancer.h
index cbaef49..d42cda5 100644
--- a/src/kudu/tools/rebalancer.h
+++ b/src/kudu/tools/rebalancer.h
@@ -63,6 +63,8 @@ class Rebalancer {
  public:
   // Configuration parameters for the rebalancer aggregated into a struct.
   struct Config {
+    static constexpr double kLoadImbalanceThreshold = 1.0;
+
     Config(std::vector<std::string> master_addresses = {},
            std::vector<std::string> table_filters = {},
            size_t max_moves_per_server = 5,
@@ -72,7 +74,8 @@ class Rebalancer {
            bool output_replica_distribution_details = false,
            bool run_policy_fixer = true,
            bool run_cross_location_rebalancing = true,
-           bool run_intra_location_rebalancing = true);
+           bool run_intra_location_rebalancing = true,
+           double load_imbalance_threshold = kLoadImbalanceThreshold);
 
     // Kudu masters' RPC endpoints.
     std::vector<std::string> master_addresses;
@@ -108,18 +111,22 @@ class Rebalancer {
     // policy violations. Fixing placement policy violations involves moving
     // tablet replicas across different locations in the cluster.
     // This setting is applicable to multi-location clusters only.
-    bool run_policy_fixer = true;
+    bool run_policy_fixer;
 
     // In case of multi-location cluster, whether to move tablet replicas
     // between locations in attempt to spread tablet replicas among location
     // evenly (equalizing loads of locations throughout the cluster).
     // This setting is applicable to multi-location clusters only.
-    bool run_cross_location_rebalancing = true;
+    bool run_cross_location_rebalancing;
 
     // In case of multi-location cluster, whether to rebalance tablet replica
     // distribution within each location.
     // This setting is applicable to multi-location clusters only.
-    bool run_intra_location_rebalancing = true;
+    bool run_intra_location_rebalancing;
+
+    // The per-table location load imbalance threshold for the cross-location
+    // balancing algorithm.
+    double load_imbalance_threshold;
   };
 
   // Represents a concrete move of a replica from one tablet server to another.
@@ -353,10 +360,13 @@ class Rebalancer {
    public:
     // The 'max_moves_per_server' specifies the maximum number of operations
     // per tablet server (both the source and the destination are counted in).
+    // The 'load_imbalance_threshold' specifies the threshold for the
+    // balancing algorithm used for finding the optimal replica movements.
     // The 'deadline' specifies the deadline for the run, 'boost::none'
     // if no timeout is set.
     CrossLocationRunner(Rebalancer* rebalancer,
                         size_t max_moves_per_server,
+                        double load_imbalance_threshold,
                         boost::optional<MonoTime> deadline);
 
     RebalancingAlgo* algorithm() override {
@@ -396,9 +406,6 @@ class Rebalancer {
 
     bool FindNextMove(ReplicaMove* move);
 
-    // An instance of the balancing algorithm.
-    LocationBalancingAlgo algorithm_;
-
     // Moves yet to schedule.
     MovesToSchedule moves_to_schedule_;
   };
diff --git a/src/kudu/tools/tool_action_cluster.cc b/src/kudu/tools/tool_action_cluster.cc
index 59d5f00..db295e5 100644
--- a/src/kudu/tools/tool_action_cluster.cc
+++ b/src/kudu/tools/tool_action_cluster.cc
@@ -125,6 +125,22 @@ DEFINE_bool(disable_intra_location_rebalancing, false,
             "replica distribution within each location. "
             "This setting is applicable to multi-location clusters only.");
 
+DEFINE_double(load_imbalance_threshold,
+              kudu::tools::Rebalancer::Config::kLoadImbalanceThreshold,
+              "The threshold for the per-table location load imbalance. "
+              "The threshold is used during the cross-location rebalancing "
+              "phase. If the measured cross-location load imbalance for a "
+              "table is greater than the specified threshold, the rebalancer "
+              "tries to move table's replicas to reduce the imbalance. "
+              "The recommended range for the threshold is [0.5, ...) with the "
+              "default value of 1.0. The threshold represents a policy "
+              "wrt what to prefer: either ideal balance of the cross-location "
+              "load on per-table basis (lower threshold value) or minimum "
+              "number of replica movements between locations "
+              "(greater threshold value). The default value is empirically "
+              "proven to be a good choice between 'ideal' and 'good enough' "
+              "replica distributions.");
+
 static bool ValidateMoveSingleReplicas(const char* flag_name,
                                        const string& flag_value) {
   const vector<string> allowed_values = { "auto", "enabled", "disabled" };
@@ -288,7 +304,8 @@ Status RunRebalance(const RunnerContext& context) {
       FLAGS_output_replica_distribution_details,
       !FLAGS_disable_policy_fixer,
       !FLAGS_disable_cross_location_rebalancing,
-      !FLAGS_disable_intra_location_rebalancing));
+      !FLAGS_disable_intra_location_rebalancing,
+      FLAGS_load_imbalance_threshold));
 
   // Print info on pre-rebalance distribution of replicas.
   RETURN_NOT_OK(rebalancer.PrintStats(cout));
@@ -378,6 +395,7 @@ unique_ptr<Mode> BuildClusterMode() {
         .AddOptionalParameter("disable_policy_fixer")
         .AddOptionalParameter("disable_cross_location_rebalancing")
         .AddOptionalParameter("disable_intra_location_rebalancing")
+        .AddOptionalParameter("load_imbalance_threshold")
         .AddOptionalParameter("max_moves_per_server")
         .AddOptionalParameter("max_run_time_sec")
         .AddOptionalParameter("max_staleness_interval_sec")