You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by aw...@apache.org on 2020/01/23 21:56:00 UTC
[kudu] branch master updated: KUDU-3011 p7: add tool to quiesce server
This is an automated email from the ASF dual-hosted git repository.
awong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push:
new 14cbaee KUDU-3011 p7: add tool to quiesce server
14cbaee is described below
commit 14cbaeea009d5787dbf1b830a7e62964dad7a53a
Author: Andrew Wong <aw...@apache.org>
AuthorDate: Wed Jan 8 23:29:44 2020 -0800
KUDU-3011 p7: add tool to quiesce server
Adds the following commands:
$ kudu tserver quiesce start <tserver_addr>
- sets the server to start quiescing
- optionally, if --error_if_not_fully_quiesced is true, the server
will respond with the number of leaders and active scanners, and if
either is non-zero, the tool will return an error
- users can orchestrate this to wait for a fully quiesced server. An
example of this is shown in RollingRestartITest.
$ kudu tserver quiesce stop <tserver_addr>
- sets the server to not be quiescing
Tests:
- Added some tests to exercise the quiescing tool in the context of a
rolling restart alongside the maintenance mode tooling.
- Also added some basic testing for the quiescing tooling alone.
Change-Id: I89657808cc2b0afc4e1b37ce75937ab12e098d9c
Reviewed-on: http://gerrit.cloudera.org:8080/15091
Tested-by: Kudu Jenkins
Reviewed-by: Adar Dembo <ad...@cloudera.com>
Reviewed-by: Alexey Serbin <as...@cloudera.com>
---
src/kudu/integration-tests/CMakeLists.txt | 3 +-
.../integration-tests/maintenance_mode-itest.cc | 348 ++++++++++++++++++++-
.../tablet_server_quiescing-itest.cc | 90 +++++-
src/kudu/integration-tests/test_workload.cc | 20 +-
src/kudu/integration-tests/test_workload.h | 26 +-
src/kudu/tools/kudu-tool-test.cc | 8 +
src/kudu/tools/tool_action_tserver.cc | 80 ++++-
src/kudu/tools/tool_test_util.cc | 14 +
src/kudu/tools/tool_test_util.h | 4 +
src/kudu/tserver/tablet_service.cc | 23 ++
src/kudu/tserver/tablet_service.h | 64 ++--
src/kudu/tserver/tserver_admin.proto | 26 ++
12 files changed, 651 insertions(+), 55 deletions(-)
diff --git a/src/kudu/integration-tests/CMakeLists.txt b/src/kudu/integration-tests/CMakeLists.txt
index 4dd2f4c..dbbd80c 100644
--- a/src/kudu/integration-tests/CMakeLists.txt
+++ b/src/kudu/integration-tests/CMakeLists.txt
@@ -84,7 +84,8 @@ ADD_KUDU_TEST(fuzz-itest RUN_SERIAL true)
ADD_KUDU_TEST(heavy-update-compaction-itest RUN_SERIAL true)
ADD_KUDU_TEST(linked_list-test RUN_SERIAL true)
ADD_KUDU_TEST(log-rolling-itest)
-ADD_KUDU_TEST(maintenance_mode-itest)
+ADD_KUDU_TEST(maintenance_mode-itest NUM_SHARDS 8
+ DATA_FILES ../scripts/assign-location.py)
ADD_KUDU_TEST(master_cert_authority-itest PROCESSORS 2)
ADD_KUDU_TEST(master_failover-itest NUM_SHARDS 4 PROCESSORS 3)
ADD_KUDU_TEST_DEPENDENCIES(master_failover-itest
diff --git a/src/kudu/integration-tests/maintenance_mode-itest.cc b/src/kudu/integration-tests/maintenance_mode-itest.cc
index 958c177..648802c 100644
--- a/src/kudu/integration-tests/maintenance_mode-itest.cc
+++ b/src/kudu/integration-tests/maintenance_mode-itest.cc
@@ -16,9 +16,10 @@
// under the License.
#include <cstdint>
-#include <cstdio>
+#include <functional>
#include <memory>
#include <string>
+#include <thread>
#include <unordered_map>
#include <utility>
#include <vector>
@@ -29,7 +30,10 @@
#include "kudu/consensus/consensus.pb.h"
#include "kudu/consensus/metadata.pb.h"
#include "kudu/gutil/map-util.h"
+#include "kudu/gutil/port.h"
#include "kudu/gutil/stl_util.h"
+#include "kudu/gutil/strings/join.h"
+#include "kudu/gutil/strings/substitute.h"
#include "kudu/integration-tests/cluster_itest_util.h"
#include "kudu/integration-tests/cluster_verifier.h"
#include "kudu/integration-tests/external_mini_cluster-itest-base.h"
@@ -42,8 +46,8 @@
#include "kudu/tools/tool_test_util.h"
#include "kudu/util/metrics.h"
#include "kudu/util/monotime.h"
-#include "kudu/util/path_util.h"
#include "kudu/util/net/sockaddr.h"
+#include "kudu/util/path_util.h"
#include "kudu/util/scoped_cleanup.h"
#include "kudu/util/status.h"
#include "kudu/util/test_macros.h"
@@ -58,19 +62,23 @@ using kudu::master::ChangeTServerStateResponsePB;
using kudu::cluster::ExternalDaemon;
using kudu::cluster::ExternalMiniClusterOptions;
using kudu::cluster::ExternalTabletServer;
+using kudu::cluster::LocationInfo;
using kudu::consensus::ConsensusStatePB;
using kudu::consensus::HealthReportPB;
using kudu::consensus::IncludeHealthReport;
using kudu::itest::GetInt64Metric;
using kudu::master::MasterServiceProxy;
using kudu::master::TServerStateChangePB;
-using kudu::tools::RunKuduTool;
+using kudu::tools::RunActionPrependStdoutStderr;
+using std::function;
using std::pair;
using std::shared_ptr;
using std::string;
+using std::thread;
using std::unique_ptr;
using std::unordered_map;
using std::vector;
+using strings::Substitute;
namespace kudu {
namespace itest {
@@ -82,7 +90,7 @@ static const vector<string> kTServerFlags = {
"--raft_heartbeat_interval_ms=100",
"--follower_unavailable_considered_failed_sec=2",
// Disable log GC in case our write workloads lead to eviction because
- // consensus will consider replicas that are too fare behind unrecoverable
+ // consensus will consider replicas that are too far behind unrecoverable
// and will evict them regardless of maintenance mode.
"--enable_log_gc=false",
};
@@ -334,19 +342,12 @@ TEST_F(MaintenanceModeRF3ITest, TestMaintenanceModeDoesntObstructMove) {
// While the maintenance mode tserver is still online, move a tablet from it.
// This should succeed, because maintenance mode will not obstruct manual
// movement of replicas.
- {
- vector<string> move_cmd = {
- "tablet",
- "change_config",
- "move_replica",
+ ASSERT_OK(RunActionPrependStdoutStderr(Substitute(
+ "tablet change_config move_replica $0 $1 $2 $3",
cluster_->master()->bound_rpc_addr().ToString(),
mnt_tablet_ids[0],
maintenance_uuid,
- added_uuid,
- };
- string stdout, stderr;
- ASSERT_OK(RunKuduTool(move_cmd, &stdout, &stderr));
- }
+ added_uuid)));
const TServerDetails* added_details = FindOrDie(ts_map, added_uuid);
ASSERT_EVENTUALLY([&] {
vector<string> added_tablet_ids;
@@ -487,5 +488,324 @@ TEST_F(MaintenanceModeRF5ITest, TestBackgroundFailureDuringMaintenanceMode) {
});
}
+namespace {
+
+// Runs every task in 'tasks' on its own thread and waits for all of them to
+// finish. Each failure is logged; if at least one task failed, returns an
+// IllegalState error that mentions 'task_description', and OK otherwise.
+Status DoInParallel(vector<function<Status()>> tasks,
+                    const string& task_description) {
+  const int num_tasks = tasks.size();
+  // One slot per task, so no two threads ever write to the same element.
+  vector<Status> statuses(num_tasks);
+  vector<thread> workers;
+  for (int idx = 0; idx < num_tasks; idx++) {
+    workers.emplace_back([&, idx] {
+      Status task_status = tasks[idx]();
+      if (PREDICT_FALSE(!task_status.ok())) {
+        statuses[idx] = task_status;
+      }
+    });
+  }
+  for (auto& worker : workers) {
+    worker.join();
+  }
+  bool any_failed = false;
+  for (const auto& task_status : statuses) {
+    if (task_status.ok()) {
+      continue;
+    }
+    LOG(ERROR) << task_status.ToString();
+    any_failed = true;
+  }
+  if (!any_failed) {
+    return Status::OK();
+  }
+  return Status::IllegalState(
+      Substitute("errors while running $0 $1 tasks in parallel",
+                 num_tasks, task_description));
+}
+
+// Runs the tool action 'cmd' over and over, sleeping 'repeat_interval_secs'
+// between attempts, until it succeeds. Gives up with a TimedOut error carrying
+// the last failure once another sleep would overshoot 'timeout_secs'.
+Status RunUntilSuccess(const string& cmd, int timeout_secs, int repeat_interval_secs) {
+  const MonoTime give_up_at = MonoTime::Now() + MonoDelta::FromSeconds(timeout_secs);
+  const MonoDelta pause = MonoDelta::FromSeconds(repeat_interval_secs);
+  for (;;) {
+    const Status attempt = RunActionPrependStdoutStderr(cmd);
+    if (attempt.ok()) {
+      return Status::OK();
+    }
+    // Don't bother sleeping if we would wake up past the deadline.
+    if (MonoTime::Now() + pause > give_up_at) {
+      return Status::TimedOut(
+          Substitute("Running '$0' did not succeed in $1 seconds: $2",
+                     cmd, timeout_secs, attempt.ToString()));
+    }
+    SleepFor(pause);
+  }
+}
+
+// Builds cluster options for 'num_tservers' tablet servers spread as evenly as
+// possible across 'num_locs' locations named "/L0", "/L1", and so on. When the
+// split is uneven, the first 'num_tservers % num_locs' locations each take one
+// extra tablet server, so the per-location counts always sum to
+// 'num_tservers'. There must be at least one location and at least one tablet
+// server per location.
+ExternalMiniClusterOptions GenerateOpts(int num_tservers, int num_locs) {
+  // Validate before dividing so a non-positive location count fails loudly
+  // instead of dividing by zero.
+  CHECK_LT(0, num_locs);
+  const int tservers_per_loc = num_tservers / num_locs;
+  const int tservers_with_one_more = num_tservers % num_locs;
+  CHECK_LT(0, tservers_per_loc);
+  LocationInfo locations;
+  for (int l = 0; l < num_locs; l++) {
+    // Hand the remainder out one tablet server at a time to the first
+    // 'tservers_with_one_more' locations.
+    EmplaceOrDie(&locations, Substitute("/L$0", l),
+                 l < tservers_with_one_more ? tservers_per_loc + 1 : tservers_per_loc);
+  }
+
+  ExternalMiniClusterOptions opts;
+  opts.num_tablet_servers = num_tservers;
+  opts.extra_master_flags = {
+    // Don't bother assigning locations to clients; it's hard to do that
+    // correctly with external mini clusters.
+    "--master_client_location_assignment_enabled=false",
+  };
+  opts.extra_tserver_flags = {
+    // To speed up leadership transfers from quiescing, let's make heartbeats
+    // more frequent.
+    "--raft_heartbeat_interval_ms=100",
+    // Let's emulate long-running scans by setting a low scan batch size.
+    "--scanner_default_batch_size_bytes=100",
+  };
+  opts.location_info = std::move(locations);
+  return opts;
+}
+
+// Parameters describing a single rolling restart scenario. The scalar fields
+// carry default member initializers so that a builder chain that skips a
+// setter yields well-defined values rather than indeterminate ones.
+struct RollingRestartTestArgs {
+  // Options used to start the external mini cluster.
+  ExternalMiniClusterOptions opts;
+  // Upper bound on the number of tablet servers to restart at the same time.
+  int batch_size = 0;
+  // Replication factor to use for the test workload.
+  int num_replicas = 0;
+  // Whether the rolling restart is expected to fail.
+  bool restart_fails = false;
+};
+
+// Fluent builder for RollingRestartTestArgs, so that each test
+// parameterization reads as a list of named settings. The tablet server and
+// location counts are only turned into cluster options when build() is called.
+class ArgsBuilder {
+ public:
+  // Settings stored directly in the args.
+  ArgsBuilder& batch_size(int batch_size) { args_.batch_size = batch_size; return *this; }
+  ArgsBuilder& num_replicas(int num_replicas) { args_.num_replicas = num_replicas; return *this; }
+  ArgsBuilder& restart_fails(bool fails) { args_.restart_fails = fails; return *this; }
+
+  // Settings that feed the cluster options generated by build().
+  ArgsBuilder& num_locations(int num_locations) { num_locations_ = num_locations; return *this; }
+  ArgsBuilder& num_tservers(int num_tservers) { num_tservers_ = num_tservers; return *this; }
+
+  // Generates the cluster options from the requested topology and returns a
+  // copy of the assembled args.
+  RollingRestartTestArgs build() {
+    args_.opts = GenerateOpts(num_tservers_, num_locations_);
+    return args_;
+  }
+
+ private:
+  RollingRestartTestArgs args_;
+  int num_tservers_ = 0;
+  int num_locations_ = 0;
+};
+
+} // anonymous namespace
+
+// Parameterized fixture that starts an external mini cluster from the test's
+// RollingRestartTestArgs and offers helpers for restarting tablet servers in
+// batches while a workload is running.
+class RollingRestartITest : public MaintenanceModeITest,
+                            public ::testing::WithParamInterface<RollingRestartTestArgs> {
+ public:
+  void SetUp() override {
+    SKIP_IF_SLOW_NOT_ALLOWED();
+    NO_FATALS(MaintenanceModeITest::SetUp());
+    // Start the cluster from a copy of the parameterized options.
+    ExternalMiniClusterOptions cluster_opts = GetParam().opts;
+    NO_FATALS(StartClusterWithOpts(std::move(cluster_opts)));
+    const auto& master_addr = cluster_->master(0)->bound_rpc_addr();
+    m_proxy_.reset(
+        new MasterServiceProxy(cluster_->messenger(), master_addr, master_addr.host()));
+    NO_FATALS(GenerateTServerMap(&ts_map_and_deleter_));
+  }
+
+  void TearDown() override {
+    SKIP_IF_SLOW_NOT_ALLOWED();
+    NO_FATALS(MaintenanceModeITest::TearDown());
+  }
+
+  // Builds a workload with three writer and three reader threads whose
+  // scanners are not fault-tolerant, replicated 'num_replicas' times. Row
+  // count verification is turned off.
+  unique_ptr<TestWorkload> CreateFaultIntolerantRWWorkload(int num_replicas) {
+    unique_ptr<TestWorkload> workload(new TestWorkload(cluster_.get()));
+    workload->set_num_replicas(num_replicas);
+    workload->set_num_write_threads(3);
+    workload->set_num_read_threads(3);
+    workload->set_scanner_fault_tolerant(false);
+    workload->set_verify_num_rows(false);
+    return workload;
+  }
+
+  // Groups the cluster's tablet server UUIDs into batches that can be
+  // restarted together. A batch never mixes locations and holds at most
+  // 'batch_size' UUIDs.
+  vector<vector<string>> GetRestartBatches(int batch_size) {
+    vector<vector<string>> batches;
+    // The partially filled batch of each location.
+    unordered_map<string, vector<string>> pending_by_loc;
+    for (const auto& uuid_and_details : ts_map_and_deleter_.first) {
+      const auto& loc = uuid_and_details.second->location;
+      auto& pending = LookupOrInsert(&pending_by_loc, loc, {});
+      pending.emplace_back(uuid_and_details.first);
+      if (pending.size() < batch_size) {
+        continue;
+      }
+      // This location's batch is full: emit it and start afresh for 'loc'.
+      batches.emplace_back(std::move(pending));
+      CHECK_EQ(1, pending_by_loc.erase(loc));
+    }
+    // Whatever is left over becomes undersized batches.
+    for (auto& loc_and_pending : pending_by_loc) {
+      batches.emplace_back(std::move(loc_and_pending.second));
+    }
+    return batches;
+  }
+
+  // Restarts the given tablet servers as one batch, in a way that shouldn't
+  // disturb on-going workloads: enter maintenance mode and quiesce each
+  // server, restart them, wait for ksck to pass, then exit maintenance mode.
+  Status RollingRestartTServers(const vector<string>& ts_to_restart) {
+    const string master_addr = cluster_->master()->bound_rpc_addr().ToString();
+
+    // Maintenance mode stops replicas from being assigned to the servers;
+    // quiescing makes the servers themselves stop accepting more work. Keep
+    // retrying the latter until each server reports it is fully quiesced.
+    vector<function<Status()>> prepare_tasks;
+    for (const auto& ts_id : ts_to_restart) {
+      prepare_tasks.emplace_back([&, ts_id] {
+        RETURN_NOT_OK(RunActionPrependStdoutStderr(
+            Substitute("tserver state enter_maintenance $0 $1", master_addr, ts_id)));
+        const string ts_addr =
+            cluster_->tablet_server_by_uuid(ts_id)->bound_rpc_addr().ToString();
+        return RunUntilSuccess(
+            Substitute("tserver quiesce start $0 --error_if_not_fully_quiesced", ts_addr),
+            /*timeout_secs*/30, /*repeat_interval_secs*/1);
+      });
+    }
+    RETURN_NOT_OK(DoInParallel(std::move(prepare_tasks), "rolling restart setup"));
+
+    // Restart the tservers. Note: we don't do this in parallel because
+    // wrangling multiple processes from different threads is messy.
+    for (const auto& ts_id : ts_to_restart) {
+      ExternalTabletServer* tserver = cluster_->tablet_server_by_uuid(ts_id);
+      tserver->Shutdown();
+      RETURN_NOT_OK(tserver->Restart());
+    }
+
+    // Wait for ksck to become healthy.
+    RETURN_NOT_OK_PREPEND(
+        RunUntilSuccess(Substitute("cluster ksck $0", master_addr),
+                        /*timeout_secs*/60, /*repeat_interval_secs*/5),
+        "cluster didn't become healthy");
+
+    // Now clean up persistent state by taking the servers out of maintenance
+    // mode.
+    vector<function<Status()>> exit_maintenance_tasks;
+    for (const auto& ts_id : ts_to_restart) {
+      exit_maintenance_tasks.emplace_back([&, ts_id] {
+        return RunActionPrependStdoutStderr(
+            Substitute("tserver state exit_maintenance $0 $1", master_addr, ts_id));
+      });
+    }
+    return DoInParallel(std::move(exit_maintenance_tasks), "rolling restart cleanup");
+  }
+
+ protected:
+  MapAndDeleter ts_map_and_deleter_;
+};
+
+// Runs a read-write workload while restarting the cluster's tablet servers
+// batch by batch, checking that each batch's restart succeeds or fails as the
+// test parameters say it should.
+TEST_P(RollingRestartITest, TestWorkloads) {
+  SKIP_IF_SLOW_NOT_ALLOWED();
+  const auto& params = GetParam();
+  const bool expect_failure = params.restart_fails;
+  unique_ptr<TestWorkload> workload = CreateFaultIntolerantRWWorkload(params.num_replicas);
+  workload->set_read_timeout_millis(10000);
+  // If we're expecting the rolling restart to fail, e.g. because we can't
+  // fully quiesce our servers, chances are our workload can't complete either
+  // because too many servers are quiescing and none can serve scans or writes.
+  workload->set_read_errors_allowed(expect_failure);
+  workload->set_timeout_allowed(expect_failure);
+  workload->Setup();
+  workload->Start();
+  const vector<vector<string>> batches = GetRestartBatches(params.batch_size);
+  for (const auto& batch : batches) {
+    LOG(INFO) << Substitute("Restarting batch of $0 tservers: $1",
+                            batch.size(), JoinStrings(batch, ","));
+    if (!expect_failure) {
+      ASSERT_OK(RollingRestartTServers(batch));
+      continue;
+    }
+    ASSERT_EVENTUALLY([&] {
+      Status restart_status = RollingRestartTServers(batch);
+      ASSERT_FALSE(restart_status.ok());
+    });
+  }
+  NO_FATALS(workload->StopAndJoin());
+  if (expect_failure) {
+    ASSERT_FALSE(workload->read_errors().empty());
+  }
+}
+
+// Scenarios for RollingRestartITest. Each entry lists the topology first
+// (tablet servers, locations, replication factor), then the restart batch size
+// and whether the restart is expected to fail.
+INSTANTIATE_TEST_CASE_P(RollingRestartArgs, RollingRestartITest, ::testing::Values(
+    // Basic RF=3 case.
+    ArgsBuilder().num_tservers(4)
+                 .num_locations(1)
+                 .num_replicas(3)
+                 .batch_size(1)
+                 .restart_fails(false)
+                 .build(),
+    // Basic RF=5 case. The larger replication factor lets us increase the
+    // restart batch size.
+    ArgsBuilder().num_tservers(6)
+                 .num_locations(1)
+                 .num_replicas(5)
+                 .batch_size(2)
+                 .restart_fails(false)
+                 .build(),
+    // RF=3 case with location awareness. The location awareness lets us
+    // increase the restart batch size.
+    ArgsBuilder().num_tservers(6)
+                 .num_locations(3)
+                 .num_replicas(3)
+                 .batch_size(2)
+                 .restart_fails(false)
+                 .build(),
+    // RF=3 case with location awareness, but with an even larger batch size.
+    ArgsBuilder().num_tservers(9)
+                 .num_locations(3)
+                 .num_replicas(3)
+                 .batch_size(3)
+                 .restart_fails(false)
+                 .build(),
+    // Basic RF=3 case, but with too large a batch size. With too large a batch
+    // size, the tablet servers won't be able to fully relinquish leadership
+    // and quiesce; the restart process should thus fail.
+    ArgsBuilder().num_tservers(4)
+                 .num_locations(1)
+                 .num_replicas(3)
+                 .batch_size(4)
+                 .restart_fails(true)
+                 .build(),
+    // The same goes for the case with a single replica.
+    ArgsBuilder().num_tservers(1)
+                 .num_locations(1)
+                 .num_replicas(1)
+                 .batch_size(1)
+                 .restart_fails(true)
+                 .build()
+));
+
} // namespace itest
} // namespace kudu
diff --git a/src/kudu/integration-tests/tablet_server_quiescing-itest.cc b/src/kudu/integration-tests/tablet_server_quiescing-itest.cc
index dde8883..73c6e40 100644
--- a/src/kudu/integration-tests/tablet_server_quiescing-itest.cc
+++ b/src/kudu/integration-tests/tablet_server_quiescing-itest.cc
@@ -37,12 +37,14 @@
#include "kudu/integration-tests/test_workload.h"
#include "kudu/mini-cluster/internal_mini_cluster.h"
#include "kudu/tablet/metadata.pb.h"
+#include "kudu/tools/tool_test_util.h"
#include "kudu/tserver/mini_tablet_server.h"
#include "kudu/tserver/scanners.h"
#include "kudu/tserver/tablet_server.h"
#include "kudu/tserver/ts_tablet_manager.h"
#include "kudu/util/metrics.h"
#include "kudu/util/monotime.h"
+#include "kudu/util/net/sockaddr.h"
#include "kudu/util/status.h"
#include "kudu/util/test_macros.h"
#include "kudu/util/test_util.h"
@@ -64,6 +66,7 @@ using kudu::client::KuduScanBatch;
using kudu::client::KuduScanner;
using kudu::client::KuduTable;
using kudu::client::sp::shared_ptr;
+using kudu::tools::RunActionPrependStdoutStderr;
using kudu::tserver::MiniTabletServer;
using std::string;
using std::unique_ptr;
@@ -353,6 +356,89 @@ TEST_F(TServerQuiescingITest, TestQuiesceLeaderWhileFollowersCatchingUp) {
});
}
+// Checks that the 'quiesce start' and 'quiesce stop' tools flip the server's
+// quiescing state, and that starting with --error_if_not_fully_quiesced
+// reports an error while still leaving the server quiescing.
+TEST_F(TServerQuiescingITest, TestQuiescingToolBasics) {
+  NO_FATALS(StartCluster(1));
+  const auto* ts = cluster_->mini_tablet_server(0);
+  auto rw_workload = CreateFaultIntolerantRWWorkload();
+  rw_workload->Setup();
+  ASSERT_FALSE(ts->server()->quiescing());
+
+  const string ts_addr = ts->bound_rpc_addr().ToString();
+  const string start_cmd = Substitute("tserver quiesce start $0", ts_addr);
+  // Starting is repeatable: run the start tool a couple of times.
+  for (int attempt = 0; attempt < 2; attempt++) {
+    ASSERT_OK(RunActionPrependStdoutStderr(start_cmd));
+    ASSERT_TRUE(ts->server()->quiescing());
+  }
+  ASSERT_OK(RunActionPrependStdoutStderr(Substitute("tserver quiesce stop $0", ts_addr)));
+  ASSERT_FALSE(ts->server()->quiescing());
+
+  // Now try starting again, but this time expect an error.
+  Status s = RunActionPrependStdoutStderr(
+      Substitute("tserver quiesce start $0 --error_if_not_fully_quiesced", ts_addr));
+  ASSERT_FALSE(s.ok());
+  ASSERT_STR_CONTAINS(s.ToString(), "not fully quiesced");
+  ASSERT_TRUE(ts->server()->quiescing());
+}
+
+// Quiesces a tablet server that has leaders and active scanners through the
+// tool, retries until the tool reports it fully quiesced, then stops quiescing
+// again.
+TEST_F(TServerQuiescingITest, TestQuiesceAndStopTool) {
+  const int kNumReplicas = 3;
+  // Set a tiny batch size to encourage many batches for a single scan. This
+  // will emulate long-running scans.
+  FLAGS_scanner_default_batch_size_bytes = 100;
+  NO_FATALS(StartCluster(kNumReplicas));
+  auto rw_workload = CreateFaultIntolerantRWWorkload();
+  rw_workload->set_scanner_selection(client::KuduClient::LEADER_ONLY);
+  rw_workload->Setup();
+  rw_workload->Start();
+  while (rw_workload->rows_inserted() < 10000) {
+    SleepFor(MonoDelta::FromMilliseconds(50));
+  }
+
+  // Pick a tablet server with a leader.
+  TServerDetails* leader_details = nullptr;
+  const auto kTimeout = MonoDelta::FromSeconds(10);
+  const string tablet_id = cluster_->mini_tablet_server(0)->ListTablets()[0];
+  ASSERT_OK(FindTabletLeader(ts_map_, tablet_id, kTimeout, &leader_details));
+  const string leader_uuid = leader_details->uuid();
+
+  // The tablet server should have some leaders, and will eventually serve some scans.
+  MiniTabletServer* leader_ts = cluster_->mini_tablet_server_by_uuid(leader_uuid);
+  ASSERT_LT(0, leader_ts->server()->num_raft_leaders()->value());
+  ASSERT_EVENTUALLY([&] {
+    ASSERT_LT(0, leader_ts->server()->scanner_manager()->CountActiveScanners());
+  });
+
+  const string leader_addr = leader_ts->bound_rpc_addr().ToString();
+  const string strict_start_cmd =
+      Substitute("tserver quiesce start $0 --error_if_not_fully_quiesced", leader_addr);
+
+  // Now quiesce the server. At first, the tool should fail because there are
+  // still leaders on the server, though it should have successfully begun
+  // quiescing.
+  Status s = RunActionPrependStdoutStderr(strict_start_cmd);
+  ASSERT_FALSE(s.ok()) << s.ToString();
+  ASSERT_STR_CONTAINS(s.ToString(), "not fully quiesced");
+  ASSERT_TRUE(leader_ts->server()->quiescing());
+  // We must retry until the tool returns success, indicating the server is
+  // fully quiesced.
+  ASSERT_EVENTUALLY([&] {
+    ASSERT_OK(RunActionPrependStdoutStderr(strict_start_cmd));
+  });
+
+  // The server should be quiesced fully.
+  ASSERT_EQ(0, leader_ts->server()->num_raft_leaders()->value());
+  ASSERT_EQ(0, leader_ts->server()->scanner_manager()->CountActiveScanners());
+
+  // The 'quiesce stop' tool should yield a non-quiescing server.
+  ASSERT_OK(RunActionPrependStdoutStderr(
+      Substitute("tserver quiesce stop $0", leader_addr)));
+  ASSERT_FALSE(leader_ts->server()->quiescing());
+  NO_FATALS(rw_workload->StopAndJoin());
+}
+
class TServerQuiescingParamITest : public TServerQuiescingITest,
public testing::WithParamInterface<int> {};
@@ -394,8 +480,8 @@ TEST_P(TServerQuiescingParamITest, TestQuiescingServerRejectsElectionRequests) {
// Test that if all tservers are quiescing, there will be no leaders elected.
TEST_P(TServerQuiescingParamITest, TestNoElectionsForNewReplicas) {
// NOTE: this test will prevent leaders of our new tablets. In practice,
- // users should have tablet creation not wait to finish if there all tservers
- // are being quiesced.
+ // users should have tablet creation not wait to finish if all tservers are
+ // being quiesced.
FLAGS_catalog_manager_wait_for_new_tablets_to_elect_leader = false;
const int kNumReplicas = GetParam();
const int kNumTablets = 10;
diff --git a/src/kudu/integration-tests/test_workload.cc b/src/kudu/integration-tests/test_workload.cc
index 3695b83..43d3797 100644
--- a/src/kudu/integration-tests/test_workload.cc
+++ b/src/kudu/integration-tests/test_workload.cc
@@ -72,6 +72,7 @@ TestWorkload::TestWorkload(MiniCluster* cluster)
write_timeout_millis_(20000),
fault_tolerant_(true),
verify_num_rows_(true),
+ read_errors_allowed_(false),
timeout_allowed_(false),
not_found_allowed_(false),
network_error_allowed_(false),
@@ -212,6 +213,19 @@ void TestWorkload::WriteThread() {
}
}
+// Checks the Status of a read operation inside a TestWorkload member function
+// that returns void. If read errors are not allowed, this is a plain
+// CHECK_OK. Otherwise, a non-OK status is appended to 'read_errors_' under
+// 'read_error_lock_' and the enclosing function returns.
+#define CHECK_READ_OK(s) do { \
+  const Status& read_status = (s); \
+  if (!read_errors_allowed_) { \
+    CHECK_OK(read_status); \
+  } else if (PREDICT_FALSE(!read_status.ok())) { \
+    std::lock_guard<simple_spinlock> guard(read_error_lock_); \
+    read_errors_.emplace_back(read_status); \
+    return; \
+  } \
+} while (0)
+
void TestWorkload::ReadThread() {
shared_ptr<KuduTable> table;
OpenTable(&table);
@@ -235,10 +249,10 @@ void TestWorkload::ReadThread() {
}
size_t row_count = 0;
- CHECK_OK(scanner.Open());
+ CHECK_READ_OK(scanner.Open());
while (scanner.HasMoreRows()) {
KuduScanBatch batch;
- CHECK_OK(scanner.NextBatch(&batch));
+ CHECK_READ_OK(scanner.NextBatch(&batch));
row_count += batch.NumRows();
}
@@ -246,6 +260,8 @@ void TestWorkload::ReadThread() {
}
}
+#undef CHECK_READ_OK
+
size_t TestWorkload::GetNumberOfErrors(KuduSession* session) {
vector<KuduError*> errors;
ElementDeleter d(&errors);
diff --git a/src/kudu/integration-tests/test_workload.h b/src/kudu/integration-tests/test_workload.h
index 5afa625..48892b7 100644
--- a/src/kudu/integration-tests/test_workload.h
+++ b/src/kudu/integration-tests/test_workload.h
@@ -14,11 +14,11 @@
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
-#ifndef KUDU_INTEGRATION_TESTS_TEST_WORKLOAD_H
-#define KUDU_INTEGRATION_TESTS_TEST_WORKLOAD_H
+#pragma once
#include <cstddef>
#include <cstdint>
+#include <mutex>
#include <ostream>
#include <string>
#include <thread>
@@ -33,13 +33,13 @@
#include "kudu/gutil/macros.h"
#include "kudu/util/atomic.h"
#include "kudu/util/countdown_latch.h"
+#include "kudu/util/locks.h"
#include "kudu/util/monotime.h"
#include "kudu/util/random.h"
+#include "kudu/util/status.h"
namespace kudu {
-class Status;
-
namespace cluster {
class MiniCluster;
} // namespace cluster
@@ -61,6 +61,13 @@ class TestWorkload {
explicit TestWorkload(cluster::MiniCluster* cluster);
~TestWorkload();
+ // Sets whether scan errors are tolerated. If false (the default), a read
+ // thread crashes on a failed scan. If true, it instead records the error
+ // (retrievable via read_errors()) and exits.
+ void set_read_errors_allowed(bool allowed) {
+ read_errors_allowed_ = allowed;
+ }
+
void set_scanner_fault_tolerant(bool fault_tolerant) {
fault_tolerant_ = fault_tolerant;
}
@@ -239,6 +246,12 @@ class TestWorkload {
return batches_completed_.Load();
}
+  // Returns a snapshot of the errors recorded by the read threads so far. The
+  // copy is made while holding 'read_error_lock_'.
+  std::vector<Status> read_errors() const {
+    std::lock_guard<simple_spinlock> guard(read_error_lock_);
+    std::vector<Status> snapshot(read_errors_);
+    return snapshot;
+  }
+
client::sp::shared_ptr<client::KuduClient> client() const { return client_; }
private:
@@ -261,6 +274,7 @@ class TestWorkload {
int write_timeout_millis_;
bool fault_tolerant_;
bool verify_num_rows_;
+ bool read_errors_allowed_;
bool timeout_allowed_;
bool not_found_allowed_;
bool already_present_allowed_;
@@ -283,8 +297,10 @@ class TestWorkload {
std::vector<std::thread> threads_;
+ mutable simple_spinlock read_error_lock_;
+ std::vector<Status> read_errors_;
+
DISALLOW_COPY_AND_ASSIGN(TestWorkload);
};
} // namespace kudu
-#endif /* KUDU_INTEGRATION_TESTS_TEST_WORKLOAD_H */
diff --git a/src/kudu/tools/kudu-tool-test.cc b/src/kudu/tools/kudu-tool-test.cc
index fd8b66c..dc81b6a 100644
--- a/src/kudu/tools/kudu-tool-test.cc
+++ b/src/kudu/tools/kudu-tool-test.cc
@@ -1165,6 +1165,7 @@ TEST_F(ToolTest, TestModeHelp) {
"set_flag.*Change a gflag value",
"state.*Operate on the state",
"status.*Get the status",
+ "quiesce.*Operate on the quiescing state",
"timestamp.*Get the current timestamp",
"list.*List tablet servers"
};
@@ -1178,6 +1179,13 @@ TEST_F(ToolTest, TestModeHelp) {
NO_FATALS(RunTestHelp("tserver state", kTServerSetStateModeRegexes));
}
{
+ const vector<string> kTServerQuiesceModeRegexes = {
+ "start.*Start quiescing the given Tablet Server",
+ "stop.*Stop quiescing a Tablet Server",
+ };
+ NO_FATALS(RunTestHelp("tserver quiesce", kTServerQuiesceModeRegexes));
+ }
+ {
const vector<string> kWalModeRegexes = {
"dump.*Dump a WAL",
};
diff --git a/src/kudu/tools/tool_action_tserver.cc b/src/kudu/tools/tool_action_tserver.cc
index a89a566..f2875d4 100644
--- a/src/kudu/tools/tool_action_tserver.cc
+++ b/src/kudu/tools/tool_action_tserver.cc
@@ -35,10 +35,14 @@
#include "kudu/gutil/strings/substitute.h"
#include "kudu/master/master.pb.h"
#include "kudu/master/master.proxy.h"
+#include "kudu/rpc/rpc_controller.h"
#include "kudu/tools/tool_action.h"
#include "kudu/tools/tool_action_common.h"
#include "kudu/tserver/tablet_server.h"
#include "kudu/tserver/tablet_server_runner.h"
+#include "kudu/tserver/tserver.pb.h"
+#include "kudu/tserver/tserver_admin.pb.h"
+#include "kudu/tserver/tserver_admin.proxy.h"
#include "kudu/util/init.h"
#include "kudu/util/status.h"
@@ -46,12 +50,17 @@ DEFINE_bool(allow_missing_tserver, false, "If true, performs the action on the "
"tserver even if it has not been registered with the master and has no "
"existing tserver state records associated with it.");
+DEFINE_bool(error_if_not_fully_quiesced, false, "If true, the command to start "
+ "quiescing will return an error if the tserver is not fully quiesced, i.e. "
+ "there are still tablet leaders or active scanners on it.");
+
DECLARE_string(columns);
using std::cout;
using std::string;
using std::unique_ptr;
using std::vector;
+using strings::Substitute;
namespace kudu {
@@ -61,7 +70,11 @@ using master::ListTabletServersRequestPB;
using master::ListTabletServersResponsePB;
using master::MasterServiceProxy;
using master::TServerStateChangePB;
+using rpc::RpcController;
+using tserver::QuiesceTabletServerRequestPB;
+using tserver::QuiesceTabletServerResponsePB;
using tserver::TabletServer;
+using tserver::TabletServerAdminServiceProxy;
namespace tools {
namespace {
@@ -134,7 +147,7 @@ Status ListTServers(const RunnerContext& context) {
const auto& servers = resp.servers();
auto hostport_to_string = [](const HostPortPB& hostport) {
- return strings::Substitute("$0:$1", hostport.host(), hostport.port());
+ return Substitute("$0:$1", hostport.host(), hostport.port());
};
for (const auto& column : cols) {
@@ -165,7 +178,7 @@ Status ListTServers(const RunnerContext& context) {
}
} else if (boost::iequals(column, "heartbeat")) {
for (const auto& server : servers) {
- values.emplace_back(strings::Substitute("$0ms", server.millis_since_heartbeat()));
+ values.emplace_back(Substitute("$0ms", server.millis_since_heartbeat()));
}
} else if (boost::iequals(column, "location")) {
for (const auto& server : servers) {
@@ -223,6 +236,48 @@ Status ExitMaintenance(const RunnerContext& context) {
return TServerSetState(context, TServerStateChangePB::EXIT_MAINTENANCE_MODE);
}
+// Tool action that asks the tablet server at the required address argument to
+// start quiescing. With --error_if_not_fully_quiesced, the request also asks
+// for the server's stats, and an Incomplete error is returned if the response
+// reports any tablet leaders or active scanners.
+Status StartQuiescingTServer(const RunnerContext& context) {
+  const auto& address = FindOrDie(context.required_args, kTServerAddressArg);
+  unique_ptr<TabletServerAdminServiceProxy> proxy;
+  RETURN_NOT_OK(BuildProxy(address, TabletServer::kDefaultPort, &proxy));
+
+  // Only ask for stats when we intend to act on them.
+  const bool check_fully_quiesced = FLAGS_error_if_not_fully_quiesced;
+  QuiesceTabletServerRequestPB req;
+  req.set_quiesce(true);
+  req.set_return_stats(check_fully_quiesced);
+  RpcController rpc;
+  QuiesceTabletServerResponsePB resp;
+  RETURN_NOT_OK(proxy->Quiesce(req, &resp, &rpc));
+  if (resp.has_error()) {
+    return StatusFromPB(resp.error().status());
+  }
+  if (!check_fully_quiesced) {
+    return Status::OK();
+  }
+  if (resp.num_leaders() == 0 && resp.num_active_scanners() == 0) {
+    return Status::OK();
+  }
+  return Status::Incomplete(
+      Substitute("Tablet server not fully quiesced: $0 tablet leaders and $1 active "
+                 "scanners remain", resp.num_leaders(), resp.num_active_scanners()));
+}
+
+// Tool action that asks the tablet server at the required address argument to
+// stop quiescing. No stats are requested.
+Status StopQuiescingTServer(const RunnerContext& context) {
+  const auto& address = FindOrDie(context.required_args, kTServerAddressArg);
+  unique_ptr<TabletServerAdminServiceProxy> proxy;
+  RETURN_NOT_OK(BuildProxy(address, TabletServer::kDefaultPort, &proxy));
+
+  QuiesceTabletServerRequestPB req;
+  req.set_quiesce(false);
+  req.set_return_stats(false);
+
+  QuiesceTabletServerResponsePB resp;
+  RpcController rpc;
+  RETURN_NOT_OK(proxy->Quiesce(req, &resp, &rpc));
+  return resp.has_error() ? StatusFromPB(resp.error().status()) : Status::OK();
+}
+
} // anonymous namespace
unique_ptr<Mode> BuildTServerMode() {
@@ -301,6 +356,26 @@ unique_ptr<Mode> BuildTServerMode() {
.AddOptionalParameter("timeout_ms")
.Build();
+ unique_ptr<Action> start_quiescing =
+ ActionBuilder("start", &StartQuiescingTServer)
+ .Description("Start quiescing the given Tablet Server. While a Tablet "
+ "Server is quiescing, Tablet replicas on it will no longer "
+ "attempt to become leader, and new scan requests will be "
+ "retried at other servers.")
+ .AddRequiredParameter({ kTServerAddressArg, kTServerAddressDesc })
+ .AddOptionalParameter("error_if_not_fully_quiesced")
+ .Build();
+ unique_ptr<Action> stop_quiescing =
+ ActionBuilder("stop", &StopQuiescingTServer)
+ .Description("Stop quiescing a Tablet Server.")
+ .AddRequiredParameter({ kTServerAddressArg, kTServerAddressDesc })
+ .Build();
+ unique_ptr<Mode> quiesce = ModeBuilder("quiesce")
+ .Description("Operate on the quiescing state of a Kudu Tablet Server.")
+ .AddAction(std::move(start_quiescing))
+ .AddAction(std::move(stop_quiescing))
+ .Build();
+
unique_ptr<Action> enter_maintenance =
ActionBuilder("enter_maintenance", &EnterMaintenance)
.Description("Begin maintenance on the Tablet Server. While under "
@@ -334,6 +409,7 @@ unique_ptr<Mode> BuildTServerMode() {
.AddAction(std::move(status))
.AddAction(std::move(timestamp))
.AddAction(std::move(list_tservers))
+ .AddMode(std::move(quiesce))
.AddMode(std::move(state))
.Build();
}
diff --git a/src/kudu/tools/tool_test_util.cc b/src/kudu/tools/tool_test_util.cc
index 8f7318f..a0cbbcb 100644
--- a/src/kudu/tools/tool_test_util.cc
+++ b/src/kudu/tools/tool_test_util.cc
@@ -19,11 +19,14 @@
#include "kudu/tools/tool_test_util.h"
+#include <cstdio>
#include <ostream>
#include <vector>
#include <glog/logging.h>
+#include "kudu/gutil/strings/split.h"
+#include "kudu/gutil/strings/substitute.h"
#include "kudu/util/env.h"
#include "kudu/util/path_util.h"
#include "kudu/util/status.h"
@@ -31,6 +34,8 @@
using std::string;
using std::vector;
+using strings::Split;
+using strings::Substitute;
namespace kudu {
namespace tools {
@@ -64,5 +69,14 @@ Status RunKuduTool(const vector<string>& args, string* out, string* err,
return Subprocess::Call(total_args, in, out, err);
}
+Status RunActionPrependStdoutStderr(const string& arg_str) {
+  string out;  // Not named 'stdout'/'stderr': <cstdio> defines those names as macros.
+  string err;
+  RETURN_NOT_OK_PREPEND(RunKuduTool(Split(arg_str, " ", strings::SkipEmpty()),
+                                    &out, &err),
+      Substitute("error running '$0': stdout: $1, stderr: $2", arg_str, out, err));
+  return Status::OK();
+}
+
} // namespace tools
} // namespace kudu
diff --git a/src/kudu/tools/tool_test_util.h b/src/kudu/tools/tool_test_util.h
index 1d7eae7..b511c60 100644
--- a/src/kudu/tools/tool_test_util.h
+++ b/src/kudu/tools/tool_test_util.h
@@ -44,5 +44,9 @@ Status RunKuduTool(const std::vector<std::string>& args,
std::string* err = nullptr,
const std::string& in = "");
+// Runs the 'kudu' tool with 'arg_str' split on spaces (empty tokens skipped). On
+// failure, returns the error prepended with the tool's stdout and stderr.
+Status RunActionPrependStdoutStderr(const std::string& arg_str);
+
} // namespace tools
} // namespace kudu
diff --git a/src/kudu/tserver/tablet_service.cc b/src/kudu/tserver/tablet_service.cc
index bde51f5..48b5543 100644
--- a/src/kudu/tserver/tablet_service.cc
+++ b/src/kudu/tserver/tablet_service.cc
@@ -18,6 +18,7 @@
#include "kudu/tserver/tablet_service.h"
#include <algorithm>
+#include <atomic>
#include <cstdint>
#include <cstring>
#include <functional>
@@ -26,6 +27,7 @@
#include <ostream>
#include <string>
#include <unordered_set>
+#include <utility>
#include <vector>
#include <boost/optional/optional.hpp>
@@ -53,6 +55,7 @@
#include "kudu/consensus/raft_consensus.h"
#include "kudu/consensus/replica_management.pb.h"
#include "kudu/consensus/time_manager.h"
+#include "kudu/fs/fs_manager.h"
#include "kudu/gutil/basictypes.h"
#include "kudu/gutil/casts.h"
#include "kudu/gutil/gscoped_ptr.h"
@@ -89,6 +92,7 @@
#include "kudu/tserver/tserver_admin.pb.h"
#include "kudu/tserver/tserver_service.pb.h"
#include "kudu/util/auto_release_pool.h"
+#include "kudu/util/bitset.h"
#include "kudu/util/crc.h"
#include "kudu/util/debug/trace_event.h"
#include "kudu/util/faststring.h"
@@ -1009,6 +1013,25 @@ void TabletServiceAdminImpl::AlterSchema(const AlterSchemaRequestPB* req,
}
}
+void TabletServiceAdminImpl::Quiesce(const QuiesceTabletServerRequestPB* req,
+                                     QuiesceTabletServerResponsePB* resp,
+                                     rpc::RpcContext* context) {
+  if (req->has_quiesce()) {  // An unset field leaves the quiescing state untouched.
+    const bool should_quiesce = req->quiesce();
+    *server_->mutable_quiescing() = should_quiesce;
+    const char* const state_desc = should_quiesce ? "quiescing" : "not quiescing";
+    LOG(INFO) << Substitute("Tablet server $0 set to $1",
+                            server_->fs_manager()->uuid(), state_desc);
+  }
+  if (req->return_stats()) {  // Fill in leader/scanner counts only on request.
+    resp->set_num_leaders(server_->num_raft_leaders()->value());
+    resp->set_num_active_scanners(server_->scanner_manager()->CountActiveScanners());
+    LOG(INFO) << Substitute("Tablet server has $0 leaders and $1 scanners",
+                            resp->num_leaders(), resp->num_active_scanners());
+  }
+  context->RespondSuccess();
+}
+
void TabletServiceAdminImpl::CreateTablet(const CreateTabletRequestPB* req,
CreateTabletResponsePB* resp,
rpc::RpcContext* context) {
diff --git a/src/kudu/tserver/tablet_service.h b/src/kudu/tserver/tablet_service.h
index 143b9ad..a82e09a 100644
--- a/src/kudu/tserver/tablet_service.h
+++ b/src/kudu/tserver/tablet_service.h
@@ -92,6 +92,8 @@ class CreateTabletRequestPB;
class CreateTabletResponsePB;
class DeleteTabletRequestPB;
class DeleteTabletResponsePB;
+class QuiesceTabletServerRequestPB;
+class QuiesceTabletServerResponsePB;
class ScanResultCollector;
class TabletReplicaLookupIf;
class TabletServer;
@@ -117,32 +119,32 @@ class TabletServiceImpl : public TabletServerServiceIf {
google::protobuf::Message* resp,
rpc::RpcContext* context) override;
- virtual void Ping(const PingRequestPB* req,
- PingResponsePB* resp,
- rpc::RpcContext* context) OVERRIDE;
+ void Ping(const PingRequestPB* req,
+ PingResponsePB* resp,
+ rpc::RpcContext* context) override;
- virtual void Write(const WriteRequestPB* req, WriteResponsePB* resp,
- rpc::RpcContext* context) OVERRIDE;
+ void Write(const WriteRequestPB* req, WriteResponsePB* resp,
+ rpc::RpcContext* context) override;
- virtual void Scan(const ScanRequestPB* req,
- ScanResponsePB* resp,
- rpc::RpcContext* context) OVERRIDE;
+ void Scan(const ScanRequestPB* req,
+ ScanResponsePB* resp,
+ rpc::RpcContext* context) override;
- virtual void ScannerKeepAlive(const ScannerKeepAliveRequestPB *req,
- ScannerKeepAliveResponsePB *resp,
- rpc::RpcContext *context) OVERRIDE;
+ void ScannerKeepAlive(const ScannerKeepAliveRequestPB *req,
+ ScannerKeepAliveResponsePB *resp,
+ rpc::RpcContext *context) override;
- virtual void ListTablets(const ListTabletsRequestPB* req,
- ListTabletsResponsePB* resp,
- rpc::RpcContext* context) OVERRIDE;
+ void ListTablets(const ListTabletsRequestPB* req,
+ ListTabletsResponsePB* resp,
+ rpc::RpcContext* context) override;
- virtual void SplitKeyRange(const SplitKeyRangeRequestPB* req,
- SplitKeyRangeResponsePB* resp,
- rpc::RpcContext* context) OVERRIDE;
+ void SplitKeyRange(const SplitKeyRangeRequestPB* req,
+ SplitKeyRangeResponsePB* resp,
+ rpc::RpcContext* context) override;
- virtual void Checksum(const ChecksumRequestPB* req,
- ChecksumResponsePB* resp,
- rpc::RpcContext* context) OVERRIDE;
+ void Checksum(const ChecksumRequestPB* req,
+ ChecksumResponsePB* resp,
+ rpc::RpcContext* context) override;
bool SupportsFeature(uint32_t feature) const override;
@@ -199,17 +201,21 @@ class TabletServiceAdminImpl : public TabletServerAdminServiceIf {
google::protobuf::Message* resp,
rpc::RpcContext* context) override;
- virtual void CreateTablet(const CreateTabletRequestPB* req,
- CreateTabletResponsePB* resp,
- rpc::RpcContext* context) OVERRIDE;
+ void CreateTablet(const CreateTabletRequestPB* req,
+ CreateTabletResponsePB* resp,
+ rpc::RpcContext* context) override;
- virtual void DeleteTablet(const DeleteTabletRequestPB* req,
- DeleteTabletResponsePB* resp,
- rpc::RpcContext* context) OVERRIDE;
+ void DeleteTablet(const DeleteTabletRequestPB* req,
+ DeleteTabletResponsePB* resp,
+ rpc::RpcContext* context) override;
- virtual void AlterSchema(const AlterSchemaRequestPB* req,
- AlterSchemaResponsePB* resp,
- rpc::RpcContext* context) OVERRIDE;
+ void AlterSchema(const AlterSchemaRequestPB* req,
+ AlterSchemaResponsePB* resp,
+ rpc::RpcContext* context) override;
+
+ void Quiesce(const QuiesceTabletServerRequestPB* req,
+ QuiesceTabletServerResponsePB* resp,
+ rpc::RpcContext* context) override;
private:
TabletServer* server_;
diff --git a/src/kudu/tserver/tserver_admin.proto b/src/kudu/tserver/tserver_admin.proto
index e68c7f4..e61db11 100644
--- a/src/kudu/tserver/tserver_admin.proto
+++ b/src/kudu/tserver/tserver_admin.proto
@@ -144,4 +144,30 @@ service TabletServerAdminService {
// Alter a tablet's schema.
rpc AlterSchema(AlterSchemaRequestPB) returns (AlterSchemaResponsePB);
+
+ // Quiesce the tablet server.
+ rpc Quiesce(QuiesceTabletServerRequestPB) returns (QuiesceTabletServerResponsePB);
+}
+
+message QuiesceTabletServerRequestPB {
+ // Indicates whether the request is to start quiescing (true) or to stop
+ // quiescing (false). If not set, the tserver's quiescing state will not be changed.
+ optional bool quiesce = 1;
+
+ // Indicates whether the response should include the number of tablet leaders
+ // and active scanners on the tablet server.
+ optional bool return_stats = 2;
+}
+
+message QuiesceTabletServerResponsePB {
+ // The error, if an error occurred with this request.
+ optional TabletServerErrorPB error = 1;
+
+ // The number of active scanners on the given tablet server. Only set if the
+ // request asked for stats via 'return_stats'.
+ optional int32 num_active_scanners = 2;
+
+ // The number of tablet leaders hosted on the given tablet server. Only set
+ // if the request asked for stats via 'return_stats'.
+ optional int32 num_leaders = 3;
}