You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by jd...@apache.org on 2016/07/22 22:02:22 UTC

[2/9] incubator-kudu git commit: KUDU-1516 ksck should check for more raft-related status issues (partial)

KUDU-1516 ksck should check for more raft-related status issues (partial)

This patch improves ksck. The main way it does so is by adding "tablet
server POV" information. ksck now gathers information about tablet
replicas from the tablet servers and cross-references this information
with the master metadata. This adds the following checks:

* each tablet has a majority of replicas on live tablet servers
* if a tablet has a majority of replicas on live tablet
  servers, then a majority of its replicas are in RUNNING state
* the assignment of tablets to tablet servers in the master agrees with
  the assignment of tablet replicas reported by the tablet servers

This patch does not include other desiderata from KUDU-1516, like a consensus
canary or a write op canary.

The code is also restructured quite a bit, so that all of the "fetch
information from tablet servers" work happens up front in a single call. This
paves the way a bit for a future enhancement in which all of these RPCs are
done on a thread-pool (since it can be somewhat slow for large clusters).

To try to improve performance for clusters with a lot of data, I also added a
flag to the ListTablets RPC so that the response does not include schema
information, which is both large and irrelevant for this use case.

An example of the new output against a cluster with some dead tablet servers
and broken tablets is available at:
https://gist.github.com/toddlipcon/7ae677214988d064627bf1325f04dfac

This patch is based on some earlier work by Will Berkeley.

Change-Id: Iec6590ba52548a9ee11d63269b134320b10809da
Reviewed-on: http://gerrit.cloudera.org:8080/3632
Tested-by: Kudu Jenkins
Reviewed-by: Jean-Daniel Cryans <jd...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/incubator-kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-kudu/commit/513d6e9f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-kudu/tree/513d6e9f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-kudu/diff/513d6e9f

Branch: refs/heads/master
Commit: 513d6e9f5d42242edb3c3f40f0c5c968873160ea
Parents: c0b4f50
Author: Todd Lipcon <to...@apache.org>
Authored: Tue Jul 19 18:37:47 2016 -0700
Committer: Jean-Daniel Cryans <jd...@apache.org>
Committed: Fri Jul 22 20:22:43 2016 +0000

----------------------------------------------------------------------
 src/kudu/integration-tests/cluster_verifier.cc |   6 +-
 src/kudu/master/master.proto                   |   2 +-
 src/kudu/tools/CMakeLists.txt                  |   1 +
 src/kudu/tools/ksck-test.cc                    | 209 +++++++++++++++-----
 src/kudu/tools/ksck.cc                         | 147 ++++++++++----
 src/kudu/tools/ksck.h                          |  81 ++++++--
 src/kudu/tools/ksck_remote-test.cc             |   7 +-
 src/kudu/tools/ksck_remote.cc                  |  52 +++--
 src/kudu/tools/ksck_remote.h                   |   5 +-
 src/kudu/tools/kudu-ksck.cc                    |   4 +-
 src/kudu/tserver/tablet_service.cc             |   9 +-
 src/kudu/tserver/tserver.proto                 |   9 +-
 12 files changed, 402 insertions(+), 130 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/513d6e9f/src/kudu/integration-tests/cluster_verifier.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/cluster_verifier.cc b/src/kudu/integration-tests/cluster_verifier.cc
index e876875..64d406a 100644
--- a/src/kudu/integration-tests/cluster_verifier.cc
+++ b/src/kudu/integration-tests/cluster_verifier.cc
@@ -85,10 +85,14 @@ Status ClusterVerifier::DoKsck() {
   std::shared_ptr<KsckCluster> cluster(new KsckCluster(master));
   std::shared_ptr<Ksck> ksck(new Ksck(cluster));
 
+  // Some unit tests create or remove replicas of tablets, which
+  // we shouldn't consider fatal.
+  ksck->set_check_replica_count(false);
+
   // This is required for everything below.
   RETURN_NOT_OK(ksck->CheckMasterRunning());
   RETURN_NOT_OK(ksck->FetchTableAndTabletInfo());
-  RETURN_NOT_OK(ksck->CheckTabletServersRunning());
+  RETURN_NOT_OK(ksck->FetchInfoFromTabletServers());
   RETURN_NOT_OK(ksck->CheckTablesConsistency());
 
   vector<string> tables;

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/513d6e9f/src/kudu/master/master.proto
----------------------------------------------------------------------
diff --git a/src/kudu/master/master.proto b/src/kudu/master/master.proto
index 35a3354..402ab2b 100644
--- a/src/kudu/master/master.proto
+++ b/src/kudu/master/master.proto
@@ -112,7 +112,7 @@ message SysTabletsEntryPB {
   // Tablet partition.
   optional PartitionPB partition = 7;
 
-  // The latest committed consensus configuration consensus configuration reported to the Master.
+  // The latest committed consensus configuration reported to the Master.
   optional consensus.ConsensusStatePB committed_consensus_state = 3;
 
   // Debug state for the tablet.

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/513d6e9f/src/kudu/tools/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/kudu/tools/CMakeLists.txt b/src/kudu/tools/CMakeLists.txt
index 1a6a570..ab7c0c5 100644
--- a/src/kudu/tools/CMakeLists.txt
+++ b/src/kudu/tools/CMakeLists.txt
@@ -74,6 +74,7 @@ add_library(ksck
     ksck_remote.cc
 )
 target_link_libraries(ksck
+  consensus
   master_proto
   server_base_proto
   tserver_proto

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/513d6e9f/src/kudu/tools/ksck-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck-test.cc b/src/kudu/tools/ksck-test.cc
index f64ff94..a9184ab 100644
--- a/src/kudu/tools/ksck-test.cc
+++ b/src/kudu/tools/ksck-test.cc
@@ -22,6 +22,7 @@
 #include "kudu/gutil/map-util.h"
 #include "kudu/gutil/strings/substitute.h"
 #include "kudu/tools/ksck.h"
+#include "kudu/util/scoped_cleanup.h"
 #include "kudu/util/test_util.h"
 
 namespace kudu {
@@ -31,18 +32,28 @@ using std::shared_ptr;
 using std::static_pointer_cast;
 using std::string;
 using std::unordered_map;
-using std::vector;
+using strings::Substitute;
+
+// Import this symbol from ksck.cc so we can introspect the
+// errors being written to stderr.
+extern std::ostream* g_err_stream;
 
 class MockKsckTabletServer : public KsckTabletServer {
  public:
   explicit MockKsckTabletServer(const string& uuid)
       : KsckTabletServer(uuid),
-        connect_status_(Status::OK()),
+        fetch_info_status_(Status::OK()),
         address_("<mock>") {
   }
 
-  virtual Status Connect() const OVERRIDE {
-    return connect_status_;
+  Status FetchInfo() override {
+    timestamp_ = 0;
+    if (fetch_info_status_.ok()) {
+      state_ = kFetched;
+    } else {
+      state_ = kFetchFailed;
+    }
+    return fetch_info_status_;
   }
 
   virtual void RunTabletChecksumScanAsync(
@@ -53,17 +64,12 @@ class MockKsckTabletServer : public KsckTabletServer {
     callback.Run(Status::OK(), 0);
   }
 
-  virtual Status CurrentTimestamp(uint64_t* timestamp) const OVERRIDE {
-    *timestamp = 0;
-    return Status::OK();
-  }
-
   virtual const std::string& address() const OVERRIDE {
     return address_;
   }
 
   // Public because the unit tests mutate this variable directly.
-  Status connect_status_;
+  Status fetch_info_status_;
 
  private:
   const string address_;
@@ -72,11 +78,11 @@ class MockKsckTabletServer : public KsckTabletServer {
 class MockKsckMaster : public KsckMaster {
  public:
   MockKsckMaster()
-      : connect_status_(Status::OK()) {
+      : fetch_info_status_(Status::OK()) {
   }
 
   virtual Status Connect() const OVERRIDE {
-    return connect_status_;
+    return fetch_info_status_;
   }
 
   virtual Status RetrieveTabletServers(TSMap* tablet_servers) OVERRIDE {
@@ -94,7 +100,7 @@ class MockKsckMaster : public KsckMaster {
   }
 
   // Public because the unit tests mutate these variables directly.
-  Status connect_status_;
+  Status fetch_info_status_;
   TSMap tablet_servers_;
   vector<shared_ptr<KsckTable>> tables_;
 };
@@ -107,11 +113,17 @@ class KsckTest : public KuduTest {
         ksck_(new Ksck(cluster_)) {
     unordered_map<string, shared_ptr<KsckTabletServer>> tablet_servers;
     for (int i = 0; i < 3; i++) {
-      string name = strings::Substitute("$0", i);
+      string name = Substitute("ts-id-$0", i);
       shared_ptr<MockKsckTabletServer> ts(new MockKsckTabletServer(name));
       InsertOrDie(&tablet_servers, ts->uuid(), ts);
     }
     master_->tablet_servers_.swap(tablet_servers);
+
+    g_err_stream = &err_stream_;
+  }
+
+  ~KsckTest() {
+    g_err_stream = NULL;
   }
 
  protected:
@@ -128,8 +140,8 @@ class KsckTest : public KuduTest {
     CreateDefaultAssignmentPlan(1);
 
     auto table = CreateAndAddTable("test", 1);
-    shared_ptr<KsckTablet> tablet(new KsckTablet(table.get(), "1"));
-    CreateAndFillTablet(tablet, 1, true);
+    shared_ptr<KsckTablet> tablet(new KsckTablet(table.get(), "tablet-id-1"));
+    CreateAndFillTablet(tablet, 1, true, true);
     table->set_tablets({ tablet });
   }
 
@@ -141,8 +153,25 @@ class KsckTest : public KuduTest {
 
     vector<shared_ptr<KsckTablet>> tablets;
     for (int i = 0; i < num_tablets; i++) {
-      shared_ptr<KsckTablet> tablet(new KsckTablet(table.get(), std::to_string(i)));
-      CreateAndFillTablet(tablet, num_replicas, true);
+      shared_ptr<KsckTablet> tablet(new KsckTablet(
+          table.get(), Substitute("tablet-id-$0", i)));
+      CreateAndFillTablet(tablet, num_replicas, true, true);
+      tablets.push_back(tablet);
+    }
+    table->set_tablets(tablets);
+  }
+
+  void CreateOneSmallReplicatedTableWithTabletNotRunning() {
+    int num_replicas = 3;
+    int num_tablets = 3;
+    CreateDefaultAssignmentPlan(num_replicas * num_tablets);
+    auto table = CreateAndAddTable("test", num_replicas);
+
+    vector<shared_ptr<KsckTablet>> tablets;
+    for (int i = 0; i < num_tablets; i++) {
+      shared_ptr<KsckTablet> tablet(new KsckTablet(
+          table.get(), Substitute("tablet-id-$0", i)));
+      CreateAndFillTablet(tablet, num_replicas, true, i != 0);
       tablets.push_back(tablet);
     }
     table->set_tablets(tablets);
@@ -154,10 +183,9 @@ class KsckTest : public KuduTest {
 
     auto table = CreateAndAddTable("test", 3);
 
-    shared_ptr<KsckTablet> tablet(new KsckTablet(table.get(), "1"));
-    CreateAndFillTablet(tablet, 2, false);
+    shared_ptr<KsckTablet> tablet(new KsckTablet(table.get(), "tablet-id-1"));
+    CreateAndFillTablet(tablet, 2, false, true);
     table->set_tablets({ tablet });
-
   }
 
   shared_ptr<KsckTable> CreateAndAddTable(const string& name, int num_replicas) {
@@ -167,25 +195,51 @@ class KsckTest : public KuduTest {
     return table;
   }
 
-  void CreateAndFillTablet(shared_ptr<KsckTablet>& tablet, int num_replicas, bool has_leader) {
+  void CreateAndFillTablet(shared_ptr<KsckTablet>& tablet, int num_replicas,
+                           bool has_leader, bool is_running) {
     vector<shared_ptr<KsckTabletReplica>> replicas;
     if (has_leader) {
-      CreateReplicaAndAdd(replicas, true);
+      CreateReplicaAndAdd(replicas, tablet->id(), true, is_running);
       num_replicas--;
     }
     for (int i = 0; i < num_replicas; i++) {
-      CreateReplicaAndAdd(replicas, false);
+      CreateReplicaAndAdd(replicas, tablet->id(), false, is_running);
     }
     tablet->set_replicas(replicas);
   }
 
-  void CreateReplicaAndAdd(vector<shared_ptr<KsckTabletReplica>>& replicas, bool is_leader) {
+  void CreateReplicaAndAdd(vector<shared_ptr<KsckTabletReplica>>& replicas,
+                           string tablet_id,
+                           bool is_leader,
+                           bool is_running) {
     shared_ptr<KsckTabletReplica> replica(new KsckTabletReplica(assignment_plan_.back(),
                                                                 is_leader, !is_leader));
+    shared_ptr<MockKsckTabletServer> ts = static_pointer_cast<MockKsckTabletServer>(
+            master_->tablet_servers_.at(assignment_plan_.back()));
+
     assignment_plan_.pop_back();
     replicas.push_back(replica);
+
+    // Add the equivalent replica on the tablet server.
+    tablet::TabletStatusPB pb;
+    pb.set_tablet_id(tablet_id);
+    pb.set_table_name("fake-table");
+    pb.set_state(is_running ? tablet::RUNNING : tablet::FAILED);
+    InsertOrDie(&ts->tablet_status_map_, tablet_id, std::move(pb));
+  }
+
+  Status RunKsck() {
+    auto c = MakeScopedCleanup([this]() {
+        LOG(INFO) << "Ksck output:\n" << err_stream_.str();
+      });
+    RETURN_NOT_OK(ksck_->CheckMasterRunning());
+    RETURN_NOT_OK(ksck_->FetchTableAndTabletInfo());
+    RETURN_NOT_OK(ksck_->FetchInfoFromTabletServers());
+    RETURN_NOT_OK(ksck_->CheckTablesConsistency());
+    return Status::OK();
   }
 
+
   shared_ptr<MockKsckMaster> master_;
   shared_ptr<KsckCluster> cluster_;
   shared_ptr<Ksck> ksck_;
@@ -195,6 +249,8 @@ class KsckTest : public KuduTest {
   // you should have a list that looks like ts1,ts2,ts3,ts3,ts2,ts1 so that the two LEADERS, which
   // are assigned first, end up on ts1 and ts3.
   vector<string> assignment_plan_;
+
+  std::stringstream err_stream_;
 };
 
 TEST_F(KsckTest, TestMasterOk) {
@@ -203,55 +259,110 @@ TEST_F(KsckTest, TestMasterOk) {
 
 TEST_F(KsckTest, TestMasterUnavailable) {
   Status error = Status::NetworkError("Network failure");
-  master_->connect_status_ = error;
+  master_->fetch_info_status_ = error;
   ASSERT_TRUE(ksck_->CheckMasterRunning().IsNetworkError());
 }
 
 TEST_F(KsckTest, TestTabletServersOk) {
-  ASSERT_OK(ksck_->CheckMasterRunning());
-  ASSERT_OK(ksck_->FetchTableAndTabletInfo());
-  ASSERT_OK(ksck_->CheckTabletServersRunning());
+  ASSERT_OK(RunKsck());
 }
 
 TEST_F(KsckTest, TestBadTabletServer) {
-  ASSERT_OK(ksck_->CheckMasterRunning());
+  CreateOneSmallReplicatedTable();
+
+  // Mock a failure to connect to one of the tablet servers.
   Status error = Status::NetworkError("Network failure");
-  static_pointer_cast<MockKsckTabletServer>(master_->tablet_servers_.begin()->second)
-      ->connect_status_ = error;
+  static_pointer_cast<MockKsckTabletServer>(master_->tablet_servers_["ts-id-1"])
+      ->fetch_info_status_ = error;
+
+  ASSERT_OK(ksck_->CheckMasterRunning());
   ASSERT_OK(ksck_->FetchTableAndTabletInfo());
-  Status s = ksck_->CheckTabletServersRunning();
+  Status s = ksck_->FetchInfoFromTabletServers();
   ASSERT_TRUE(s.IsNetworkError()) << "Status returned: " << s.ToString();
+
+  s = ksck_->CheckTablesConsistency();
+  EXPECT_EQ("Corruption: 1 table(s) are bad", s.ToString());
+  ASSERT_STR_CONTAINS(
+      err_stream_.str(),
+      "WARNING: Unable to connect to Tablet Server "
+      "ts-id-1 (<mock>): Network error: Network failure");
+  ASSERT_STR_CONTAINS(
+      err_stream_.str(),
+      "WARNING: Detected problems with Tablet tablet-id-0 of table 'test'\n"
+      "------------------------------------------------------------\n"
+      "WARNING: Should have a replica on TS ts-id-1 (<mock>), but TS is unavailable\n\n");
+  ASSERT_STR_CONTAINS(
+      err_stream_.str(),
+      "WARNING: Detected problems with Tablet tablet-id-1 of table 'test'\n"
+      "------------------------------------------------------------\n"
+      "WARNING: Should have a replica on TS ts-id-1 (<mock>), but TS is unavailable\n\n");
+  ASSERT_STR_CONTAINS(
+      err_stream_.str(),
+      "WARNING: Detected problems with Tablet tablet-id-2 of table 'test'\n"
+      "------------------------------------------------------------\n"
+      "WARNING: Should have a replica on TS ts-id-1 (<mock>), but TS is unavailable\n\n");
+}
+
+TEST_F(KsckTest, TestZeroTabletReplicasCheck) {
+  ASSERT_OK(RunKsck());
 }
 
 TEST_F(KsckTest, TestZeroTableCheck) {
-  ASSERT_OK(ksck_->CheckMasterRunning());
-  ASSERT_OK(ksck_->FetchTableAndTabletInfo());
-  ASSERT_OK(ksck_->CheckTabletServersRunning());
-  ASSERT_OK(ksck_->CheckTablesConsistency());
+  ASSERT_OK(RunKsck());
 }
 
 TEST_F(KsckTest, TestOneTableCheck) {
   CreateOneTableOneTablet();
-  ASSERT_OK(ksck_->CheckMasterRunning());
-  ASSERT_OK(ksck_->FetchTableAndTabletInfo());
-  ASSERT_OK(ksck_->CheckTabletServersRunning());
-  ASSERT_OK(ksck_->CheckTablesConsistency());
+  ASSERT_OK(RunKsck());
 }
 
 TEST_F(KsckTest, TestOneSmallReplicatedTable) {
   CreateOneSmallReplicatedTable();
-  ASSERT_OK(ksck_->CheckMasterRunning());
-  ASSERT_OK(ksck_->FetchTableAndTabletInfo());
-  ASSERT_OK(ksck_->CheckTabletServersRunning());
-  ASSERT_OK(ksck_->CheckTablesConsistency());
+  ASSERT_OK(RunKsck());
 }
 
 TEST_F(KsckTest, TestOneOneTabletBrokenTable) {
   CreateOneOneTabletReplicatedBrokenTable();
-  ASSERT_OK(ksck_->CheckMasterRunning());
-  ASSERT_OK(ksck_->FetchTableAndTabletInfo());
-  ASSERT_OK(ksck_->CheckTabletServersRunning());
-  ASSERT_TRUE(ksck_->CheckTablesConsistency().IsCorruption());
+  Status s = RunKsck();
+  EXPECT_EQ("Corruption: 1 table(s) are bad", s.ToString());
+  ASSERT_STR_CONTAINS(err_stream_.str(),
+                      "Tablet tablet-id-1 of table 'test' has 2 instead of 3 replicas");
+}
+
+TEST_F(KsckTest, TestMismatchedAssignments) {
+  CreateOneSmallReplicatedTable();
+  shared_ptr<MockKsckTabletServer> ts = static_pointer_cast<MockKsckTabletServer>(
+      master_->tablet_servers_.at(Substitute("ts-id-$0", 0)));
+  ts->tablet_status_map_.erase(ts->tablet_status_map_.begin()->first);
+
+  Status s = RunKsck();
+  EXPECT_EQ("Corruption: 1 table(s) are bad", s.ToString());
+  ASSERT_STR_CONTAINS(err_stream_.str(),
+                      "WARNING: Detected problems with Tablet tablet-id-2 of table 'test'\n"
+                      "------------------------------------------------------------\n"
+                      "WARNING: Missing a tablet replica on tablet server ts-id-0 (<mock>)\n");
+}
+
+TEST_F(KsckTest, TestTabletNotRunning) {
+  CreateOneSmallReplicatedTableWithTabletNotRunning();
+
+  Status s = RunKsck();
+  EXPECT_EQ("Corruption: 1 table(s) are bad", s.ToString());
+  ASSERT_STR_CONTAINS(
+      err_stream_.str(),
+      "WARNING: Detected problems with Tablet tablet-id-0 of table 'test'\n"
+      "------------------------------------------------------------\n"
+      "WARNING: Bad state on TS ts-id-0 (<mock>): FAILED\n"
+      "  Last status: \n"
+      "  Data state:  TABLET_DATA_UNKNOWN\n"
+      "WARNING: Bad state on TS ts-id-1 (<mock>): FAILED\n"
+      "  Last status: \n"
+      "  Data state:  TABLET_DATA_UNKNOWN\n"
+      "WARNING: Bad state on TS ts-id-2 (<mock>): FAILED\n"
+      "  Last status: \n"
+      "  Data state:  TABLET_DATA_UNKNOWN\n"
+      "ERROR: Tablet tablet-id-0 of table 'test' does not have a majority of "
+      "replicas in RUNNING state\n");
 }
 
 } // namespace tools

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/513d6e9f/src/kudu/tools/ksck.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck.cc b/src/kudu/tools/ksck.cc
index 481141c..cb95f3a 100644
--- a/src/kudu/tools/ksck.cc
+++ b/src/kudu/tools/ksck.cc
@@ -22,6 +22,7 @@
 #include <mutex>
 #include <unordered_set>
 
+#include "kudu/consensus/quorum_util.h"
 #include "kudu/gutil/map-util.h"
 #include "kudu/gutil/ref_counted.h"
 #include "kudu/gutil/strings/join.h"
@@ -52,22 +53,26 @@ DEFINE_uint64(checksum_snapshot_timestamp, ChecksumOptions::kCurrentTimestamp,
               "timestamp to use for snapshot checksum scans, defaults to 0, which "
               "uses the current timestamp of a tablet server involved in the scan");
 
+// The stream to write output to. If this is NULL, defaults to cerr.
+// This is used by tests to capture output.
+ostream* g_err_stream = NULL;
+
 // Print an informational message to cerr.
+static ostream& Out() {
+  return (g_err_stream ? *g_err_stream : cerr);
+}
 static ostream& Info() {
-  cerr << "INFO: ";
-  return cerr;
+  return Out() << "INFO: ";
 }
 
 // Print a warning message to cerr.
 static ostream& Warn() {
-  cerr << "WARNING: ";
-  return cerr;
+  return Out() << "WARNING: ";
 }
 
 // Print an error message to cerr.
 static ostream& Error() {
-  cerr << "ERROR: ";
-  return cerr;
+  return Out() << "ERROR: ";
 }
 
 ChecksumOptions::ChecksumOptions()
@@ -86,6 +91,14 @@ ChecksumOptions::ChecksumOptions(MonoDelta timeout, int scan_concurrency,
 
 const uint64_t ChecksumOptions::kCurrentTimestamp = 0;
 
+tablet::TabletStatePB KsckTabletServer::ReplicaState(const std::string& tablet_id) const {
+  CHECK_EQ(state_, kFetched);
+  if (!ContainsKey(tablet_status_map_, tablet_id)) {
+    return tablet::UNKNOWN;
+  }
+  return tablet_status_map_.at(tablet_id).state();
+}
+
 KsckCluster::~KsckCluster() {
 }
 
@@ -126,7 +139,7 @@ Status Ksck::FetchTableAndTabletInfo() {
   return cluster_->FetchTableAndTabletInfo();
 }
 
-Status Ksck::CheckTabletServersRunning() {
+Status Ksck::FetchInfoFromTabletServers() {
   VLOG(1) << "Getting the Tablet Servers list";
   int servers_count = cluster_->tablet_servers().size();
   VLOG(1) << Substitute("List of $0 Tablet Servers retrieved", servers_count);
@@ -136,7 +149,7 @@ Status Ksck::CheckTabletServersRunning() {
   }
 
   int bad_servers = 0;
-  VLOG(1) << "Connecting to all the Tablet Servers";
+  VLOG(1) << "Fetching info from all the Tablet Servers";
   for (const KsckMaster::TSMap::value_type& entry : cluster_->tablet_servers()) {
     Status s = ConnectToTabletServer(entry.second);
     if (!s.ok()) {
@@ -144,10 +157,10 @@ Status Ksck::CheckTabletServersRunning() {
     }
   }
   if (bad_servers == 0) {
-    Info() << Substitute("Connected to all $0 Tablet Servers", servers_count) << endl;
+    Info() << Substitute("Fetched info from all $0 Tablet Servers", servers_count) << endl;
     return Status::OK();
   } else {
-    Warn() << Substitute("Connected to $0 Tablet Servers, $1 weren't reachable",
+    Warn() << Substitute("Fetched info from $0 Tablet Servers, $1 weren't reachable",
                          servers_count - bad_servers, bad_servers) << endl;
     return Status::NetworkError("Not all Tablet Servers are reachable");
   }
@@ -155,12 +168,12 @@ Status Ksck::CheckTabletServersRunning() {
 
 Status Ksck::ConnectToTabletServer(const shared_ptr<KsckTabletServer>& ts) {
   VLOG(1) << "Going to connect to Tablet Server: " << ts->uuid();
-  Status s = ts->Connect();
+  Status s = ts->FetchInfo();
   if (s.ok()) {
     VLOG(1) << "Connected to Tablet Server: " << ts->uuid();
   } else {
-    Warn() << Substitute("Unable to connect to Tablet Server $0 ($1) because $2",
-                         ts->uuid(), ts->address(), s.ToString()) << endl;
+    Warn() << Substitute("Unable to connect to Tablet Server $0: $1",
+                         ts->ToString(), s.ToString()) << endl;
   }
   return s;
 }
@@ -168,7 +181,7 @@ Status Ksck::ConnectToTabletServer(const shared_ptr<KsckTabletServer>& ts) {
 Status Ksck::CheckTablesConsistency() {
   VLOG(1) << "Getting the tables list";
   int tables_count = cluster_->tables().size();
-  VLOG(1) << Substitute("List of $0 tables retrieved", tables_count);
+  VLOG(1) << Substitute("List of $0 table(s) retrieved", tables_count);
 
   if (tables_count == 0) {
     Info() << "The cluster doesn't have any tables" << endl;
@@ -183,12 +196,12 @@ Status Ksck::CheckTablesConsistency() {
     }
   }
   if (bad_tables_count == 0) {
-    Info() << Substitute("The metadata for $0 tables is HEALTHY", tables_count) << endl;
+    Info() << Substitute("The metadata for $0 table(s) is HEALTHY", tables_count) << endl;
     return Status::OK();
   } else {
-    Warn() << Substitute("$0 out of $1 tables are not in a healthy state",
+    Warn() << Substitute("$0 out of $1 table(s) are not in a healthy state",
                          bad_tables_count, tables_count) << endl;
-    return Status::Corruption(Substitute("$0 tables are bad", bad_tables_count));
+    return Status::Corruption(Substitute("$0 table(s) are bad", bad_tables_count));
   }
 }
 
@@ -335,8 +348,18 @@ Status Ksck::ChecksumData(const vector<string>& tables,
   }
 
   if (options.use_snapshot && options.snapshot_timestamp == ChecksumOptions::kCurrentTimestamp) {
-    // Set the snapshot timestamp to the current timestamp of an arbitrary tablet server.
-    tablet_server_queues.begin()->first->CurrentTimestamp(&options.snapshot_timestamp);
+    // Set the snapshot timestamp to the current timestamp of the first healthy tablet server
+    // we can find.
+    for (const auto& ts : tablet_server_queues) {
+      if (ts.first->is_healthy()) {
+        options.snapshot_timestamp = ts.first->current_timestamp();
+        break;
+      }
+    }
+    if (options.snapshot_timestamp == ChecksumOptions::kCurrentTimestamp) {
+      return Status::ServiceUnavailable(
+          "No tablet servers were available to fetch the current timestamp");
+    }
     Info() << "Using snapshot timestamp: " << options.snapshot_timestamp << endl;
   }
 
@@ -462,18 +485,58 @@ bool Ksck::VerifyTablet(const shared_ptr<KsckTablet>& tablet, int table_num_repl
   string tablet_str = Substitute("Tablet $0 of table '$1'",
                                  tablet->id(), tablet->table()->name());
   vector<shared_ptr<KsckTabletReplica> > replicas = tablet->replicas();
-  bool good_tablet = true;
-  if (replicas.size() != table_num_replicas) {
-    Warn() << Substitute("$0 has $1 instead of $2 replicas",
-                         tablet_str, replicas.size(), table_num_replicas) << endl;
-    // We only fail the "goodness" check if the tablet is under-replicated.
-    if (replicas.size() < table_num_replicas) {
-      good_tablet = false;
-    }
+  vector<string> warnings, errors;
+  if (check_replica_count_ && replicas.size() != table_num_replicas) {
+    warnings.push_back(Substitute("$0 has $1 instead of $2 replicas",
+                                  tablet_str, replicas.size(), table_num_replicas));
   }
   int leaders_count = 0;
   int followers_count = 0;
+  int alive_count = 0;
+  int running_count = 0;
   for (const shared_ptr<KsckTabletReplica> replica : replicas) {
+    VLOG(1) << Substitute("A replica of tablet $0 is on live tablet server $1",
+                          tablet->id(), replica->ts_uuid());
+    // Check for agreement on tablet assignment and state between the master
+    // and the tablet server.
+    auto ts = FindPtrOrNull(cluster_->tablet_servers(), replica->ts_uuid());
+    if (ts && ts->is_healthy()) {
+      alive_count++;
+      auto state = ts->ReplicaState(tablet->id());
+      if (state != tablet::UNKNOWN) {
+        VLOG(1) << Substitute("Tablet server $0 agrees that it hosts a replica of $1",
+                              ts->ToString(), tablet_str);
+      }
+
+      switch (state) {
+        case tablet::RUNNING:
+          VLOG(1) << Substitute("Tablet replica for $0 on TS $1 is RUNNING",
+                                tablet_str, ts->ToString());
+          running_count++;
+          break;
+
+        case tablet::UNKNOWN:
+          warnings.push_back(Substitute("Missing a tablet replica on tablet server $0",
+                                        ts->ToString()));
+          break;
+
+        default: {
+          const auto& status_pb = ts->tablet_status_map().at(tablet->id());
+          warnings.push_back(
+              Substitute("Bad state on TS $0: $1\n"
+                         "  Last status: $2\n"
+                         "  Data state:  $3",
+                         ts->ToString(), tablet::TabletStatePB_Name(state),
+                         status_pb.last_status(),
+                         tablet::TabletDataState_Name(status_pb.tablet_data_state())));
+          break;
+        }
+      }
+    } else {
+      // no TS or unhealthy TS
+      warnings.push_back(Substitute("Should have a replica on TS $0, but TS is unavailable",
+                                    ts ? ts->ToString() : replica->ts_uuid()));
+    }
     if (replica->is_leader()) {
       VLOG(1) << Substitute("Replica at $0 is a LEADER", replica->ts_uuid());
       leaders_count++;
@@ -483,17 +546,33 @@ bool Ksck::VerifyTablet(const shared_ptr<KsckTablet>& tablet, int table_num_repl
     }
   }
   if (leaders_count == 0) {
-    Warn() << Substitute("$0 doesn't have a leader", tablet_str) << endl;
-    good_tablet = false;
+    errors.push_back("No leader detected");
   }
   VLOG(1) << Substitute("$0 has $1 leader and $2 followers",
                         tablet_str, leaders_count, followers_count);
-  return good_tablet;
-}
+  int majority_size = consensus::MajoritySize(table_num_replicas);
+  if (alive_count < majority_size) {
+    errors.push_back(Substitute("$0 does not have a majority of replicas on live tablet servers",
+                                tablet_str));
+  } else if (running_count < majority_size) {
+    errors.push_back(Substitute("$0 does not have a majority of replicas in RUNNING state",
+                                tablet_str));
+  }
+
+  bool has_issues = !warnings.empty() || !errors.empty();
+  if (has_issues) {
+    Warn() << "Detected problems with " << tablet_str << endl
+           << "------------------------------------------------------------" << endl;
+    for (const auto& s : warnings) {
+      Warn() << s << endl;
+    }
+    for (const auto& s : errors) {
+      Error() << s << endl;
+    }
+    Out() << endl;
+  }
 
-Status Ksck::CheckAssignments() {
-  // TODO
-  return Status::NotSupported("CheckAssignments hasn't been implemented");
+  return !has_issues;
 }
 
 } // namespace tools

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/513d6e9f/src/kudu/tools/ksck.h
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck.h b/src/kudu/tools/ksck.h
index 92e1951..502f68e 100644
--- a/src/kudu/tools/ksck.h
+++ b/src/kudu/tools/ksck.h
@@ -20,6 +20,7 @@
 #ifndef KUDU_TOOLS_KSCK_H
 #define KUDU_TOOLS_KSCK_H
 
+#include <gtest/gtest_prod.h>
 #include <memory>
 #include <string>
 #include <unordered_map>
@@ -27,6 +28,7 @@
 #include <vector>
 
 #include "kudu/common/schema.h"
+#include "kudu/tablet/tablet.pb.h"
 #include "kudu/util/countdown_latch.h"
 #include "kudu/util/locks.h"
 #include "kudu/util/status.h"
@@ -70,6 +72,7 @@ class KsckTabletReplica {
   KsckTabletReplica(const std::string ts_uuid, const bool is_leader, const bool is_follower)
       : is_leader_(is_leader),
         is_follower_(is_follower),
+        is_running_(false),
         ts_uuid_(ts_uuid) {
   }
 
@@ -88,6 +91,7 @@ class KsckTabletReplica {
  private:
   const bool is_leader_;
   const bool is_follower_;
+  bool is_running_;
   const std::string ts_uuid_;
   DISALLOW_COPY_AND_ASSIGN(KsckTabletReplica);
 };
@@ -168,13 +172,14 @@ typedef Callback<void(const Status& status, uint64_t checksum)> ReportResultCall
 // Class that must be extended to represent a tablet server.
 class KsckTabletServer {
  public:
+  // Map from tablet id to tablet replicas.
+  typedef std::unordered_map<std::string, tablet::TabletStatusPB > TabletStatusMap;
+
   explicit KsckTabletServer(std::string uuid) : uuid_(std::move(uuid)) {}
   virtual ~KsckTabletServer() { }
 
-  // Connects to the configured Tablet Server.
-  virtual Status Connect() const = 0;
-
-  virtual Status CurrentTimestamp(uint64_t* timestamp) const = 0;
+  // Connects to the configured tablet server and populates the fields of this class.
+  virtual Status FetchInfo() = 0;
 
   // Executes a checksum scan on the associated tablet, and runs the callback
   // with the result. The callback must be threadsafe and non-blocking.
@@ -188,10 +193,46 @@ class KsckTabletServer {
     return uuid_;
   }
 
+  std::string ToString() const {
+    return strings::Substitute("$0 ($1)", uuid(), address());
+  }
+
   virtual const std::string& address() const = 0;
 
+  bool is_healthy() const {
+    CHECK_NE(state_, kUninitialized);
+    return state_ == kFetched;
+  }
+
+  // Gets the mapping of tablet id to tablet status for this tablet server.
+  const TabletStatusMap& tablet_status_map() const {
+    CHECK_EQ(state_, kFetched);
+    return tablet_status_map_;
+  }
+
+  tablet::TabletStatePB ReplicaState(const std::string& tablet_id) const;
+
+  uint64_t current_timestamp() const {
+    CHECK_EQ(state_, kFetched);
+    return timestamp_;
+  }
+
+ protected:
+  friend class KsckTest;
+  FRIEND_TEST(KsckTest, TestMismatchedAssignments);
+
+  enum State {
+    kUninitialized,
+    kFetchFailed,
+    kFetched
+  };
+  State state_ = kUninitialized;
+  TabletStatusMap tablet_status_map_;
+  uint64_t timestamp_;
+
  private:
   const std::string uuid_;
+
   DISALLOW_COPY_AND_ASSIGN(KsckTabletServer);
 };
 
@@ -240,8 +281,7 @@ class KsckCluster {
     return master_;
   }
 
-  const std::unordered_map<std::string,
-                           std::shared_ptr<KsckTabletServer> >& tablet_servers() {
+  const KsckMaster::TSMap& tablet_servers() {
     return tablet_servers_;
   }
 
@@ -260,7 +300,7 @@ class KsckCluster {
   Status RetrieveTabletsList(const std::shared_ptr<KsckTable>& table);
 
   const std::shared_ptr<KsckMaster> master_;
-  std::unordered_map<std::string, std::shared_ptr<KsckTabletServer> > tablet_servers_;
+  KsckMaster::TSMap tablet_servers_;
   std::vector<std::shared_ptr<KsckTable> > tables_;
   DISALLOW_COPY_AND_ASSIGN(KsckCluster);
 };
@@ -272,15 +312,22 @@ class Ksck {
       : cluster_(std::move(cluster)) {}
   ~Ksck() {}
 
-  // Verifies that it can connect to the Master.
+  // Set whether ksck should verify that each tablet's raft configuration
+  // has the same number of replicas as is specified by the tablet metadata.
+  // (default: true)
+  void set_check_replica_count(bool check) {
+    check_replica_count_ = check;
+  }
+
+  // Verifies that it can connect to the master.
   Status CheckMasterRunning();
 
-  // Populates all the cluster table and tablet info from the Master.
+  // Populates all the cluster table and tablet info from the master.
   Status FetchTableAndTabletInfo();
 
-  // Verifies that it can connect to all the Tablet Servers reported by the master.
-  // Must first call FetchTableAndTabletInfo().
-  Status CheckTabletServersRunning();
+  // Connects to all tablet servers, checks that they are alive, and fetches
+  // their current status and tablet information.
+  Status FetchInfoFromTabletServers();
 
   // Establishes a connection with the specified Tablet Server.
   // Must first call FetchTableAndTabletInfo().
@@ -288,7 +335,8 @@ class Ksck {
 
   // Verifies that all the tables have contiguous tablets and that each tablet has enough replicas
   // and a leader.
-  // Must first call FetchTableAndTabletInfo().
+  // Must first call FetchTableAndTabletInfo() and, if doing checks against tablet
+  // servers (the default), must first call FetchInfoFromTabletServers().
   Status CheckTablesConsistency();
 
   // Verifies data checksums on all tablets by doing a scan of the database on each replica.
@@ -301,11 +349,6 @@ class Ksck {
                       const std::vector<std::string>& tablets,
                       const ChecksumOptions& options);
 
-  // Verifies that the assignments reported by the master are the same reported by the
-  // Tablet Servers.
-  // Must first call FetchTableAndTabletInfo().
-  Status CheckAssignments();
-
  private:
   bool VerifyTable(const std::shared_ptr<KsckTable>& table);
   bool VerifyTableWithTimeout(const std::shared_ptr<KsckTable>& table,
@@ -314,6 +357,8 @@ class Ksck {
   bool VerifyTablet(const std::shared_ptr<KsckTablet>& tablet, int table_num_replicas);
 
   const std::shared_ptr<KsckCluster> cluster_;
+
+  bool check_replica_count_ = true;
   DISALLOW_COPY_AND_ASSIGN(Ksck);
 };
 } // namespace tools

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/513d6e9f/src/kudu/tools/ksck_remote-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck_remote-test.cc b/src/kudu/tools/ksck_remote-test.cc
index 7305005..c10fabb 100644
--- a/src/kudu/tools/ksck_remote-test.cc
+++ b/src/kudu/tools/ksck_remote-test.cc
@@ -184,10 +184,9 @@ TEST_F(RemoteKsckTest, TestMasterOk) {
 }
 
 TEST_F(RemoteKsckTest, TestTabletServersOk) {
-  LOG(INFO) << "Fetching table and tablet info...";
+  ASSERT_OK(ksck_->CheckMasterRunning());
   ASSERT_OK(ksck_->FetchTableAndTabletInfo());
-  LOG(INFO) << "Checking tablet servers are running...";
-  ASSERT_OK(ksck_->CheckTabletServersRunning());
+  ASSERT_OK(ksck_->FetchInfoFromTabletServers());
 }
 
 TEST_F(RemoteKsckTest, TestTableConsistency) {
@@ -195,7 +194,9 @@ TEST_F(RemoteKsckTest, TestTableConsistency) {
   deadline.AddDelta(MonoDelta::FromSeconds(30));
   Status s;
   while (MonoTime::Now(MonoTime::FINE).ComesBefore(deadline)) {
+    ASSERT_OK(ksck_->CheckMasterRunning());
     ASSERT_OK(ksck_->FetchTableAndTabletInfo());
+    ASSERT_OK(ksck_->FetchInfoFromTabletServers());
     s = ksck_->CheckTablesConsistency();
     if (s.ok()) {
       break;

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/513d6e9f/src/kudu/tools/ksck_remote.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck_remote.cc b/src/kudu/tools/ksck_remote.cc
index fcfae31..325865a 100644
--- a/src/kudu/tools/ksck_remote.cc
+++ b/src/kudu/tools/ksck_remote.cc
@@ -45,22 +45,44 @@ MonoDelta GetDefaultTimeout() {
   return MonoDelta::FromMilliseconds(FLAGS_timeout_ms);
 }
 
-Status RemoteKsckTabletServer::Connect() const {
-  tserver::PingRequestPB req;
-  tserver::PingResponsePB resp;
-  RpcController rpc;
-  rpc.set_timeout(GetDefaultTimeout());
-  return ts_proxy_->Ping(req, &resp, &rpc);
-}
+Status RemoteKsckTabletServer::FetchInfo() {
+  state_ = kFetchFailed;
+
+  {
+    tserver::PingRequestPB req;
+    tserver::PingResponsePB resp;
+    RpcController rpc;
+    rpc.set_timeout(GetDefaultTimeout());
+    RETURN_NOT_OK_PREPEND(ts_proxy_->Ping(req, &resp, &rpc),
+                          "could not send Ping RPC to server");
+  }
 
-Status RemoteKsckTabletServer::CurrentTimestamp(uint64_t* timestamp) const {
-  server::ServerClockRequestPB req;
-  server::ServerClockResponsePB resp;
-  RpcController rpc;
-  rpc.set_timeout(GetDefaultTimeout());
-  RETURN_NOT_OK(generic_proxy_->ServerClock(req, &resp, &rpc));
-  CHECK(resp.has_timestamp());
-  *timestamp = resp.timestamp();
+  {
+    tserver::ListTabletsRequestPB req;
+    tserver::ListTabletsResponsePB resp;
+    RpcController rpc;
+    rpc.set_timeout(GetDefaultTimeout());
+    req.set_need_schema_info(false);
+    RETURN_NOT_OK_PREPEND(ts_proxy_->ListTablets(req, &resp, &rpc),
+                          "could not list tablets");
+    tablet_status_map_.clear();
+    for (auto& status : *resp.mutable_status_and_schema()) {
+      tablet_status_map_[status.tablet_status().tablet_id()].Swap(status.mutable_tablet_status());
+    }
+  }
+
+  {
+    server::ServerClockRequestPB req;
+    server::ServerClockResponsePB resp;
+    RpcController rpc;
+    rpc.set_timeout(GetDefaultTimeout());
+    RETURN_NOT_OK_PREPEND(generic_proxy_->ServerClock(req, &resp, &rpc),
+                          "could not fetch timestamp");
+    CHECK(resp.has_timestamp());
+    timestamp_ = resp.timestamp();
+  }
+
+  state_ = kFetched;
   return Status::OK();
 }
 

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/513d6e9f/src/kudu/tools/ksck_remote.h
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck_remote.h b/src/kudu/tools/ksck_remote.h
index 5835b25..5ca2a3b 100644
--- a/src/kudu/tools/ksck_remote.h
+++ b/src/kudu/tools/ksck_remote.h
@@ -51,9 +51,7 @@ class RemoteKsckTabletServer : public KsckTabletServer {
         ts_proxy_(new tserver::TabletServerServiceProxy(messenger, address)) {
   }
 
-  virtual Status Connect() const OVERRIDE;
-
-  virtual Status CurrentTimestamp(uint64_t* timestamp) const OVERRIDE;
+  virtual Status FetchInfo() OVERRIDE;
 
   virtual void RunTabletChecksumScanAsync(
       const std::string& tablet_id,
@@ -71,6 +69,7 @@ class RemoteKsckTabletServer : public KsckTabletServer {
   const std::shared_ptr<rpc::Messenger> messenger_;
   const std::shared_ptr<server::GenericServiceProxy> generic_proxy_;
   const std::shared_ptr<tserver::TabletServerServiceProxy> ts_proxy_;
+
 };
 
 // This implementation connects to a Master via RPC.

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/513d6e9f/src/kudu/tools/kudu-ksck.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/kudu-ksck.cc b/src/kudu/tools/kudu-ksck.cc
index dd98049..e704f2e 100644
--- a/src/kudu/tools/kudu-ksck.cc
+++ b/src/kudu/tools/kudu-ksck.cc
@@ -100,8 +100,8 @@ static void RunKsck(vector<string>* error_messages) {
                       "Error fetching the cluster metadata from the Master server");
   if (!error_messages->empty()) return;
 
-  PUSH_PREPEND_NOT_OK(ksck->CheckTabletServersRunning(), error_messages,
-                      "Tablet server aliveness check error");
+  PUSH_PREPEND_NOT_OK(ksck->FetchInfoFromTabletServers(), error_messages,
+                      "Error fetching info from tablet servers");
 
   // TODO: Add support for tables / tablets filter in the consistency check.
   PUSH_PREPEND_NOT_OK(ksck->CheckTablesConsistency(), error_messages,

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/513d6e9f/src/kudu/tserver/tablet_service.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/tablet_service.cc b/src/kudu/tserver/tablet_service.cc
index 85e24a2..5a9c04b 100644
--- a/src/kudu/tserver/tablet_service.cc
+++ b/src/kudu/tserver/tablet_service.cc
@@ -1106,9 +1106,12 @@ void TabletServiceImpl::ListTablets(const ListTabletsRequestPB* req,
   for (const scoped_refptr<TabletPeer>& peer : peers) {
     StatusAndSchemaPB* status = peer_status->Add();
     peer->GetTabletStatusPB(status->mutable_tablet_status());
-    CHECK_OK(SchemaToPB(peer->status_listener()->schema(),
-                        status->mutable_schema()));
-    peer->tablet_metadata()->partition_schema().ToPB(status->mutable_partition_schema());
+
+    if (req->need_schema_info()) {
+      CHECK_OK(SchemaToPB(peer->status_listener()->schema(),
+                          status->mutable_schema()));
+      peer->tablet_metadata()->partition_schema().ToPB(status->mutable_partition_schema());
+    }
   }
   context->RespondSuccess();
 }

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/513d6e9f/src/kudu/tserver/tserver.proto
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/tserver.proto b/src/kudu/tserver/tserver.proto
index 9df226a..e44c1bc 100644
--- a/src/kudu/tserver/tserver.proto
+++ b/src/kudu/tserver/tserver.proto
@@ -156,6 +156,10 @@ message WriteResponsePB {
 
 // A list tablets request
 message ListTabletsRequestPB {
+  // Whether the server should include schema information in the response.
+  // These fields can be relatively large, so not including them can make this call
+  // less heavy-weight.
+  optional bool need_schema_info = 1 [default = true];
 }
 
 // A list tablets response
@@ -164,7 +168,10 @@ message ListTabletsResponsePB {
 
   message StatusAndSchemaPB {
     required tablet.TabletStatusPB tablet_status = 1;
-    required SchemaPB schema = 2;
+
+    // 'schema' and 'partition_schema' will only be included if the original request
+    // set 'need_schema_info'.
+    optional SchemaPB schema = 2;
     optional PartitionSchemaPB partition_schema = 3;
   }