You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by al...@apache.org on 2019/12/11 19:55:09 UTC

[kudu] branch master updated: [master] KUDU-3017 clean-up SysCatalogTable::SetupTablet()

This is an automated email from the ASF dual-hosted git repository.

alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git


The following commit(s) were added to refs/heads/master by this push:
     new 46b4d6f  [master] KUDU-3017 clean-up SysCatalogTable::SetupTablet()
46b4d6f is described below

commit 46b4d6f24fdacb79aa574a016bf5a6cb51d5e3b8
Author: Alexey Serbin <al...@apache.org>
AuthorDate: Tue Dec 10 16:44:19 2019 -0800

    [master] KUDU-3017 clean-up SysCatalogTable::SetupTablet()
    
    If master fails to replay orphaned operations from the WAL during
    the bootstrap, it crashes at the system tablet's state check
    (the 'orphaned operations' are REPLICATE messages which are in the WAL
    with no accompanying COMMIT):
    
        F1206 01:32:15.488359 1324967 tablet_replica.cc:138] Check failed: state_ == SHUTDOWN || state_ == FAILED TabletReplica not fully shut down. State: BOOTSTRAPPING
    
    This patch addresses the issue, so master would not crash at the tablet
    state consistency CHECK() under such conditions.  Instead, now it
    reports corresponding error and crashes at the higher level
    in master_main.cc.  With this patch, it's easier to attribute
    a failure to the root cause by looking into the master's log.
    
    I didn't add any tests: the replay of the orphaned WAL transactions
    during bootstrap has pretty good coverage already.
    
    Change-Id: I6adfd7f74fdd2e05e04f6418cbf9bb86cad6465a
    Reviewed-on: http://gerrit.cloudera.org:8080/14881
    Tested-by: Kudu Jenkins
    Reviewed-by: Andrew Wong <aw...@cloudera.com>
---
 src/kudu/kserver/kserver.cc    |  4 +--
 src/kudu/master/master_main.cc |  2 --
 src/kudu/master/sys_catalog.cc | 82 +++++++++++++++++++++++-------------------
 3 files changed, 47 insertions(+), 41 deletions(-)

diff --git a/src/kudu/kserver/kserver.cc b/src/kudu/kserver/kserver.cc
index 6b0adf3..1a46c4f 100644
--- a/src/kudu/kserver/kserver.cc
+++ b/src/kudu/kserver/kserver.cc
@@ -19,6 +19,7 @@
 
 #include <algorithm>
 #include <cstdint>
+#include <initializer_list>
 #include <memory>
 #include <mutex>
 #include <ostream>
@@ -161,8 +162,7 @@ Status KuduServer::Init() {
 }
 
 Status KuduServer::Start() {
-  RETURN_NOT_OK(ServerBase::Start());
-  return Status::OK();
+  return ServerBase::Start();
 }
 
 void KuduServer::Shutdown() {
diff --git a/src/kudu/master/master_main.cc b/src/kudu/master/master_main.cc
index d2f784a..4128804 100644
--- a/src/kudu/master/master_main.cc
+++ b/src/kudu/master/master_main.cc
@@ -19,7 +19,6 @@
 #include <string>
 
 #include <gflags/gflags.h>
-#include <gflags/gflags_declare.h>
 #include <glog/logging.h>
 
 #include "kudu/gutil/strings/substitute.h"
@@ -103,7 +102,6 @@ static int MasterMain(int argc, char** argv) {
   MasterOptions opts;
   Master server(opts);
   CHECK_OK(server.Init());
-
   CHECK_OK(server.Start());
 
   while (true) {
diff --git a/src/kudu/master/sys_catalog.cc b/src/kudu/master/sys_catalog.cc
index 372294b..2cd36a9 100644
--- a/src/kudu/master/sys_catalog.cc
+++ b/src/kudu/master/sys_catalog.cc
@@ -228,8 +228,7 @@ Status SysCatalogTable::Load(FsManager *fs_manager) {
     }
   }
 
-  RETURN_NOT_OK(SetupTablet(metadata));
-  return Status::OK();
+  return SetupTablet(metadata);
 }
 
 Status SysCatalogTable::CreateNew(FsManager *fs_manager) {
@@ -352,11 +351,22 @@ void SysCatalogTable::SysCatalogStateChanged(const string& tablet_id, const stri
   }
 }
 
-Status SysCatalogTable::SetupTablet(const scoped_refptr<tablet::TabletMetadata>& metadata) {
-  shared_ptr<Tablet> tablet;
-  scoped_refptr<Log> log;
+Status SysCatalogTable::SetupTablet(
+    const scoped_refptr<tablet::TabletMetadata>& metadata) {
+
+#define RETURN_NOT_OK_SHUTDOWN(s, m) \
+  do { \
+    const auto& _s = (s);                 \
+    if (PREDICT_FALSE(!_s.ok())) {        \
+      tablet_replica_->SetError(_s);      \
+      tablet_replica_->Shutdown();        \
+      return _s.CloneAndPrepend((m));     \
+    }                                     \
+  } while (0)
 
   InitLocalRaftPeerPB();
+  scoped_refptr<ConsensusMetadata> cmeta;
+  RETURN_NOT_OK(cmeta_manager_->Load(metadata->tablet_id(), &cmeta));
 
   // TODO(matteo): handle crash mid-creation of tablet? do we ever end up with
   // a partially created tablet here?
@@ -365,49 +375,47 @@ Status SysCatalogTable::SetupTablet(const scoped_refptr<tablet::TabletMetadata>&
       cmeta_manager_,
       local_peer_pb_,
       master_->tablet_apply_pool(),
-      Bind(&SysCatalogTable::SysCatalogStateChanged, Unretained(this), metadata->tablet_id())));
-  Status s = tablet_replica_->Init(master_->raft_pool());
-  if (!s.ok()) {
-    tablet_replica_->SetError(s);
-    tablet_replica_->Shutdown();
-    return s;
-  }
-
-  scoped_refptr<ConsensusMetadata> cmeta;
-  RETURN_NOT_OK(cmeta_manager_->Load(metadata->tablet_id(), &cmeta));
+      Bind(&SysCatalogTable::SysCatalogStateChanged,
+           Unretained(this),
+           metadata->tablet_id())));
+  RETURN_NOT_OK_SHUTDOWN(tablet_replica_->Init(master_->raft_pool()),
+                         "failed to initialize system catalog replica");
 
+  shared_ptr<Tablet> tablet;
+  scoped_refptr<Log> log;
   consensus::ConsensusBootstrapInfo consensus_info;
   tablet_replica_->SetBootstrapping();
-  RETURN_NOT_OK(BootstrapTablet(metadata,
-                                cmeta->CommittedConfig(),
-                                scoped_refptr<clock::Clock>(master_->clock()),
-                                master_->mem_tracker(),
-                                scoped_refptr<rpc::ResultTracker>(),
-                                metric_registry_,
-                                tablet_replica_,
-                                &tablet,
-                                &log,
-                                tablet_replica_->log_anchor_registry(),
-                                &consensus_info));
+  RETURN_NOT_OK_SHUTDOWN(BootstrapTablet(
+      metadata,
+      cmeta->CommittedConfig(),
+      scoped_refptr<clock::Clock>(master_->clock()),
+      master_->mem_tracker(),
+      scoped_refptr<rpc::ResultTracker>(),
+      metric_registry_,
+      tablet_replica_,
+      &tablet,
+      &log,
+      tablet_replica_->log_anchor_registry(),
+      &consensus_info), "failed to bootstrap system catalog");
 
   // TODO(matteo): Do we have a setSplittable(false) or something from the
   // outside is handling split in the TS?
 
-  RETURN_NOT_OK_PREPEND(tablet_replica_->Start(consensus_info,
-                                               tablet,
-                                               scoped_refptr<clock::Clock>(master_->clock()),
-                                               master_->messenger(),
-                                               scoped_refptr<rpc::ResultTracker>(),
-                                               log,
-                                               master_->tablet_prepare_pool(),
-                                               master_->dns_resolver()),
-                        "Failed to Start() TabletReplica");
+  RETURN_NOT_OK_SHUTDOWN(tablet_replica_->Start(
+      consensus_info,
+      tablet,
+      scoped_refptr<clock::Clock>(master_->clock()),
+      master_->messenger(),
+      scoped_refptr<rpc::ResultTracker>(),
+      log,
+      master_->tablet_prepare_pool(),
+      master_->dns_resolver()), "failed to start system catalog replica");
 
   tablet_replica_->RegisterMaintenanceOps(master_->maintenance_manager());
 
-  const Schema* schema = tablet->schema();
-  schema_ = SchemaBuilder(*schema).BuildWithoutIds();
+  schema_ = SchemaBuilder(*tablet->schema()).BuildWithoutIds();
   key_schema_ = schema_.CreateKeyProjection();
+
   return Status::OK();
 }