You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by al...@apache.org on 2019/11/20 07:03:42 UTC

[kudu] branch master updated: rebalancer: fix on Runner::UpdateMovesInProgressStatus()

This is an automated email from the ASF dual-hosted git repository.

alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git


The following commit(s) were added to refs/heads/master by this push:
     new 93b4561  rebalancer: fix on Runner::UpdateMovesInProgressStatus()
93b4561 is described below

commit 93b4561e36a3b06003b75f0c5f1ba691811605bc
Author: zhangyifan27 <ch...@163.com>
AuthorDate: Tue Nov 19 19:17:51 2019 +0800

    rebalancer: fix on Runner::UpdateMovesInProgressStatus()
    
    This patch added a new parameter 'has_pending_moves' for
    Runner::UpdateMovesInProgressStatus(), see the comments for
    this method.
    
    Before this patch, the return value of this method was ambiguous,
    and there was a bug in PolicyFixer: When no pending operation
    completed, the rebalancer tool would try to get a new set of moves
    from the algorithm and perform it. Now, the rebalancer would do this
    only in case of an error or no pending movement.
    
    Change-Id: I8a25574d13f36a8f4f98f4340aa2086711b1c741
    Reviewed-on: http://gerrit.cloudera.org:8080/14742
    Reviewed-by: Alexey Serbin <as...@cloudera.com>
    Tested-by: Kudu Jenkins
---
 src/kudu/rebalance/rebalancer.h   | 19 +++++++++++++------
 src/kudu/tools/rebalancer_tool.cc | 29 +++++++++++++++++------------
 src/kudu/tools/rebalancer_tool.h  |  8 ++++++--
 3 files changed, 36 insertions(+), 20 deletions(-)

diff --git a/src/kudu/rebalance/rebalancer.h b/src/kudu/rebalance/rebalancer.h
index 50be60c..ed6fcfc 100644
--- a/src/kudu/rebalance/rebalancer.h
+++ b/src/kudu/rebalance/rebalancer.h
@@ -170,12 +170,19 @@ class Rebalancer {
     virtual bool ScheduleNextMove(bool* has_errors, bool* timed_out) = 0;
 
     // Update statuses and auxiliary information on in-progress replica move
-    // operations. The 'timed_out' parameter is set to 'true' if not all
-    // in-progress operations were processed by the deadline specified by
-    // the 'deadline_' member field. The method returns 'true' if it's necessary
-    // to clear the state of the in-progress operations, i.e. 'forget'
-    // those, starting from a clean state.
-    virtual bool UpdateMovesInProgressStatus(bool* has_errors, bool* timed_out) = 0;
+    // operations.
+    // The 'timed_out' parameter is set to 'true' if not all in-progress
+    // operations were processed by the deadline specified by the 'deadline_'
+    // member field.
+    // The 'has_errors' parameter is set to 'true' if there were errors while
+    // trying to get the statuses of in-progress operations.
+    // The 'has_pending_moves' parameter is set to 'true' if there were any
+    // in-progress operations and we could check their statuses.
+    // The method returns 'true' if at least one in-progress move operation was
+    // completed (success or failure).
+    virtual bool UpdateMovesInProgressStatus(bool* has_errors,
+                                             bool* timed_out,
+                                             bool* has_pending_moves) = 0;
 
     virtual Status GetNextMoves(bool* has_moves) = 0;
 
diff --git a/src/kudu/tools/rebalancer_tool.cc b/src/kudu/tools/rebalancer_tool.cc
index a7ebfe5..d05e790 100644
--- a/src/kudu/tools/rebalancer_tool.cc
+++ b/src/kudu/tools/rebalancer_tool.cc
@@ -545,21 +545,25 @@ Status RebalancerTool::RunWith(Runner* runner, RunStatus* result_status) {
       }
 
       // Poll for the status of pending operations. If some of the in-flight
-      // operations are complete, it might be possible to schedule new ones
+      // operations are completed, it might be possible to schedule new ones
       // by calling Runner::ScheduleNextMove().
+      auto has_pending_moves = false;
       auto has_updates = runner->UpdateMovesInProgressStatus(&has_errors,
-                                                             &is_timed_out);
+                                                             &is_timed_out,
+                                                             &has_pending_moves);
       if (has_updates) {
         // Reset the start of the staleness interval: there was some updates
         // on the status of scheduled move operations.
         staleness_start = MonoTime::Now();
+        // Continue scheduling available move operations.
+        continue;
       }
       resync_state |= has_errors;
-      if (resync_state || is_timed_out || !has_updates) {
+      if (resync_state || is_timed_out || !has_pending_moves) {
         // If there were errors while trying to get the statuses of pending
         // operations it's necessary to re-synchronize the state of the cluster:
         // most likely something has changed, so it's better to get a new set
-        // of planned moves.
+        // of planned moves. Also, do the same if not a single pending move has left.
         break;
       }
 
@@ -792,11 +796,10 @@ bool RebalancerTool::AlgoBasedRunner::ScheduleNextMove(bool* has_errors,
 }
 
 bool RebalancerTool::AlgoBasedRunner::UpdateMovesInProgressStatus(
-    bool* has_errors, bool* timed_out) {
+    bool* has_errors, bool* timed_out, bool* has_pending_moves) {
   DCHECK(has_errors);
   DCHECK(timed_out);
-  *has_errors = false;
-  *timed_out = false;
+  DCHECK(has_pending_moves);
 
   // Update the statuses of the in-progress move operations.
   auto has_updates = false;
@@ -815,7 +818,7 @@ bool RebalancerTool::AlgoBasedRunner::UpdateMovesInProgressStatus(
     const Status s = CheckCompleteMove(master_addresses_, client_,
                                        tablet_id, src_ts_uuid, dst_ts_uuid,
                                        &is_complete, &move_status);
-    has_updates |= s.ok();
+    *has_pending_moves |= s.ok();
     if (!s.ok()) {
       // There was an error while fetching the status of this move operation.
       // Since the actual status of the move is not known, don't update the
@@ -833,6 +836,7 @@ bool RebalancerTool::AlgoBasedRunner::UpdateMovesInProgressStatus(
       // The move has completed (success or failure): update the stats on the
       // pending operations per server.
       ++moves_count_;
+      has_updates = true;
       UpdateOnMoveCompleted(it->second.ts_uuid_from);
       UpdateOnMoveCompleted(it->second.ts_uuid_to);
       LOG(INFO) << Substitute("tablet $0: '$1' -> '$2' move completed: $3",
@@ -1167,16 +1171,17 @@ bool RebalancerTool::PolicyFixer::ScheduleNextMove(bool* has_errors,
 }
 
 bool RebalancerTool::PolicyFixer::UpdateMovesInProgressStatus(
-    bool* has_errors, bool* timed_out) {
+    bool* has_errors, bool* timed_out, bool* has_pending_moves) {
   DCHECK(has_errors);
   DCHECK(timed_out);
+  DCHECK(has_pending_moves);
 
+  // Update the statuses of the in-progress move operations.
   auto has_updates = false;
   auto error_count = 0;
-  auto out_of_time = false;
   for (auto it = scheduled_moves_.begin(); it != scheduled_moves_.end(); ) {
     if (deadline_ && MonoTime::Now() >= *deadline_) {
-      out_of_time = true;
+      *timed_out = true;
       break;
     }
     bool is_complete;
@@ -1185,6 +1190,7 @@ bool RebalancerTool::PolicyFixer::UpdateMovesInProgressStatus(
     const auto& ts_uuid = it->second.ts_uuid_from;
     auto s = CheckCompleteReplace(client_, tablet_id, ts_uuid,
                                   &is_complete, &completion_status);
+    *has_pending_moves |= s.ok();
     if (!s.ok()) {
       // Update on the movement status has failed: remove the move operation
       // as if it didn't exist. Once the cluster status is re-synchronized,
@@ -1209,7 +1215,6 @@ bool RebalancerTool::PolicyFixer::UpdateMovesInProgressStatus(
     ++it;
   }
 
-  *timed_out = out_of_time;
   *has_errors = (error_count > 0);
   return has_updates;
 }
diff --git a/src/kudu/tools/rebalancer_tool.h b/src/kudu/tools/rebalancer_tool.h
index ac7dbe3..371031f 100644
--- a/src/kudu/tools/rebalancer_tool.h
+++ b/src/kudu/tools/rebalancer_tool.h
@@ -155,7 +155,9 @@ class RebalancerTool : public rebalance::Rebalancer {
 
     bool ScheduleNextMove(bool* has_errors, bool* timed_out) override;
 
-    bool UpdateMovesInProgressStatus(bool* has_errors, bool* timed_out) override;
+    bool UpdateMovesInProgressStatus(bool* has_errors,
+                                     bool* timed_out,
+                                     bool* has_pending_moves) override;
 
     // Get the cluster location the runner is slated to run/running at.
     // 'boost::none' means all the cluster.
@@ -278,7 +280,9 @@ class RebalancerTool : public rebalance::Rebalancer {
 
     bool ScheduleNextMove(bool* has_errors, bool* timed_out) override;
 
-    bool UpdateMovesInProgressStatus(bool* has_errors, bool* timed_out) override;
+    bool UpdateMovesInProgressStatus(bool* has_errors,
+                                     bool* timed_out,
+                                     bool* has_pending_moves) override;
 
    private:
     // Key is tserver UUID which corresponds to value.ts_uuid_from.