You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by al...@apache.org on 2019/11/20 07:03:42 UTC
[kudu] branch master updated: rebalancer: fix on
Runner::UpdateMovesInProgressStatus()
This is an automated email from the ASF dual-hosted git repository.
alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push:
new 93b4561 rebalancer: fix on Runner::UpdateMovesInProgressStatus()
93b4561 is described below
commit 93b4561e36a3b06003b75f0c5f1ba691811605bc
Author: zhangyifan27 <ch...@163.com>
AuthorDate: Tue Nov 19 19:17:51 2019 +0800
rebalancer: fix on Runner::UpdateMovesInProgressStatus()
This patch added a new parameter 'has_pending_moves' for
Runner::UpdateMovesInProgressStatus(), see the comments for
this method.
Before this patch, the return value of this method was ambiguous,
and there was a bug in PolicyFixer: When no pending operation
completed, the rebalancer tool would try to get a new set of moves
from the algorithm and perform it. Now, the rebalancer would do this
only in case of an error or no pending movement.
Change-Id: I8a25574d13f36a8f4f98f4340aa2086711b1c741
Reviewed-on: http://gerrit.cloudera.org:8080/14742
Reviewed-by: Alexey Serbin <as...@cloudera.com>
Tested-by: Kudu Jenkins
---
src/kudu/rebalance/rebalancer.h | 19 +++++++++++++------
src/kudu/tools/rebalancer_tool.cc | 29 +++++++++++++++++------------
src/kudu/tools/rebalancer_tool.h | 8 ++++++--
3 files changed, 36 insertions(+), 20 deletions(-)
diff --git a/src/kudu/rebalance/rebalancer.h b/src/kudu/rebalance/rebalancer.h
index 50be60c..ed6fcfc 100644
--- a/src/kudu/rebalance/rebalancer.h
+++ b/src/kudu/rebalance/rebalancer.h
@@ -170,12 +170,19 @@ class Rebalancer {
virtual bool ScheduleNextMove(bool* has_errors, bool* timed_out) = 0;
// Update statuses and auxiliary information on in-progress replica move
- // operations. The 'timed_out' parameter is set to 'true' if not all
- // in-progress operations were processed by the deadline specified by
- // the 'deadline_' member field. The method returns 'true' if it's necessary
- // to clear the state of the in-progress operations, i.e. 'forget'
- // those, starting from a clean state.
- virtual bool UpdateMovesInProgressStatus(bool* has_errors, bool* timed_out) = 0;
+ // operations.
+ // The 'timed_out' parameter is set to 'true' if not all in-progress
+ // operations were processed by the deadline specified by the 'deadline_'
+ // member field.
+ // The 'has_errors' parameter is set to 'true' if there were errors while
+ // trying to get the statuses of in-progress operations.
+ // The 'has_pending_moves' parameter is set to 'true' if there were any
+ // in-progress operations and we could check their statuses.
+ // The method returns 'true' if at least one in-progress move operation was
+ // completed (success or failure).
+ virtual bool UpdateMovesInProgressStatus(bool* has_errors,
+ bool* timed_out,
+ bool* has_pending_moves) = 0;
virtual Status GetNextMoves(bool* has_moves) = 0;
diff --git a/src/kudu/tools/rebalancer_tool.cc b/src/kudu/tools/rebalancer_tool.cc
index a7ebfe5..d05e790 100644
--- a/src/kudu/tools/rebalancer_tool.cc
+++ b/src/kudu/tools/rebalancer_tool.cc
@@ -545,21 +545,25 @@ Status RebalancerTool::RunWith(Runner* runner, RunStatus* result_status) {
}
// Poll for the status of pending operations. If some of the in-flight
- // operations are complete, it might be possible to schedule new ones
+ // operations are completed, it might be possible to schedule new ones
// by calling Runner::ScheduleNextMove().
+ auto has_pending_moves = false;
auto has_updates = runner->UpdateMovesInProgressStatus(&has_errors,
- &is_timed_out);
+ &is_timed_out,
+ &has_pending_moves);
if (has_updates) {
// Reset the start of the staleness interval: there was some updates
// on the status of scheduled move operations.
staleness_start = MonoTime::Now();
+ // Continue scheduling available move operations.
+ continue;
}
resync_state |= has_errors;
- if (resync_state || is_timed_out || !has_updates) {
+ if (resync_state || is_timed_out || !has_pending_moves) {
// If there were errors while trying to get the statuses of pending
// operations it's necessary to re-synchronize the state of the cluster:
// most likely something has changed, so it's better to get a new set
- // of planned moves.
+ // of planned moves. Also, do the same if not a single pending move has left.
break;
}
@@ -792,11 +796,10 @@ bool RebalancerTool::AlgoBasedRunner::ScheduleNextMove(bool* has_errors,
}
bool RebalancerTool::AlgoBasedRunner::UpdateMovesInProgressStatus(
- bool* has_errors, bool* timed_out) {
+ bool* has_errors, bool* timed_out, bool* has_pending_moves) {
DCHECK(has_errors);
DCHECK(timed_out);
- *has_errors = false;
- *timed_out = false;
+ DCHECK(has_pending_moves);
// Update the statuses of the in-progress move operations.
auto has_updates = false;
@@ -815,7 +818,7 @@ bool RebalancerTool::AlgoBasedRunner::UpdateMovesInProgressStatus(
const Status s = CheckCompleteMove(master_addresses_, client_,
tablet_id, src_ts_uuid, dst_ts_uuid,
&is_complete, &move_status);
- has_updates |= s.ok();
+ *has_pending_moves |= s.ok();
if (!s.ok()) {
// There was an error while fetching the status of this move operation.
// Since the actual status of the move is not known, don't update the
@@ -833,6 +836,7 @@ bool RebalancerTool::AlgoBasedRunner::UpdateMovesInProgressStatus(
// The move has completed (success or failure): update the stats on the
// pending operations per server.
++moves_count_;
+ has_updates = true;
UpdateOnMoveCompleted(it->second.ts_uuid_from);
UpdateOnMoveCompleted(it->second.ts_uuid_to);
LOG(INFO) << Substitute("tablet $0: '$1' -> '$2' move completed: $3",
@@ -1167,16 +1171,17 @@ bool RebalancerTool::PolicyFixer::ScheduleNextMove(bool* has_errors,
}
bool RebalancerTool::PolicyFixer::UpdateMovesInProgressStatus(
- bool* has_errors, bool* timed_out) {
+ bool* has_errors, bool* timed_out, bool* has_pending_moves) {
DCHECK(has_errors);
DCHECK(timed_out);
+ DCHECK(has_pending_moves);
+ // Update the statuses of the in-progress move operations.
auto has_updates = false;
auto error_count = 0;
- auto out_of_time = false;
for (auto it = scheduled_moves_.begin(); it != scheduled_moves_.end(); ) {
if (deadline_ && MonoTime::Now() >= *deadline_) {
- out_of_time = true;
+ *timed_out = true;
break;
}
bool is_complete;
@@ -1185,6 +1190,7 @@ bool RebalancerTool::PolicyFixer::UpdateMovesInProgressStatus(
const auto& ts_uuid = it->second.ts_uuid_from;
auto s = CheckCompleteReplace(client_, tablet_id, ts_uuid,
&is_complete, &completion_status);
+ *has_pending_moves |= s.ok();
if (!s.ok()) {
// Update on the movement status has failed: remove the move operation
// as if it didn't exist. Once the cluster status is re-synchronized,
@@ -1209,7 +1215,6 @@ bool RebalancerTool::PolicyFixer::UpdateMovesInProgressStatus(
++it;
}
- *timed_out = out_of_time;
*has_errors = (error_count > 0);
return has_updates;
}
diff --git a/src/kudu/tools/rebalancer_tool.h b/src/kudu/tools/rebalancer_tool.h
index ac7dbe3..371031f 100644
--- a/src/kudu/tools/rebalancer_tool.h
+++ b/src/kudu/tools/rebalancer_tool.h
@@ -155,7 +155,9 @@ class RebalancerTool : public rebalance::Rebalancer {
bool ScheduleNextMove(bool* has_errors, bool* timed_out) override;
- bool UpdateMovesInProgressStatus(bool* has_errors, bool* timed_out) override;
+ bool UpdateMovesInProgressStatus(bool* has_errors,
+ bool* timed_out,
+ bool* has_pending_moves) override;
// Get the cluster location the runner is slated to run/running at.
// 'boost::none' means all the cluster.
@@ -278,7 +280,9 @@ class RebalancerTool : public rebalance::Rebalancer {
bool ScheduleNextMove(bool* has_errors, bool* timed_out) override;
- bool UpdateMovesInProgressStatus(bool* has_errors, bool* timed_out) override;
+ bool UpdateMovesInProgressStatus(bool* has_errors,
+ bool* timed_out,
+ bool* has_pending_moves) override;
private:
// Key is tserver UUID which corresponds to value.ts_uuid_from.