You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by ad...@apache.org on 2016/09/01 22:44:26 UTC

kudu git commit: ts_tablet_manager: set status message after failed tablet copy

Repository: kudu
Updated Branches:
  refs/heads/master e1f54cbe7 -> 680cec6bb


ts_tablet_manager: set status message after failed tablet copy

I noticed in a test cluster that a tablet copy had failed, and ksck was
reporting the state as:

  b314b10216d74c07ab9da974671137db (ve0136.halxg.cloudera.com:7050): bad state
    State:       NOT_STARTED
    Data state:  TABLET_DATA_TOMBSTONED
    Last status: TabletCopy: Downloading block 4611685870251245460 (2342/33956)

This was rather confusing, because it claimed it was downloading a
block, and also claimed to be tombstoned at the same time (rather than
being in the COPYING state).

This fixes the failure case to set the status message properly.

Change-Id: I36ecd45dacec4ea329a0f0dd236e52bc603f9b93
Reviewed-on: http://gerrit.cloudera.org:8080/4194
Tested-by: Kudu Jenkins
Reviewed-by: Adar Dembo <ad...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/680cec6b
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/680cec6b
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/680cec6b

Branch: refs/heads/master
Commit: 680cec6bbf104dcba0d3d350e34af98756a5d974
Parents: e1f54cb
Author: Todd Lipcon <to...@apache.org>
Authored: Wed Aug 31 21:27:16 2016 -0700
Committer: Adar Dembo <ad...@cloudera.com>
Committed: Thu Sep 1 22:44:18 2016 +0000

----------------------------------------------------------------------
 src/kudu/integration-tests/delete_table-test.cc |  8 ++++++++
 src/kudu/tserver/ts_tablet_manager.cc           | 18 +++++++++---------
 src/kudu/tserver/ts_tablet_manager.h            |  2 +-
 3 files changed, 18 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/680cec6b/src/kudu/integration-tests/delete_table-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/delete_table-test.cc b/src/kudu/integration-tests/delete_table-test.cc
index 437730d..4c8de2a 100644
--- a/src/kudu/integration-tests/delete_table-test.cc
+++ b/src/kudu/integration-tests/delete_table-test.cc
@@ -550,6 +550,14 @@ TEST_F(DeleteTableTest, TestAutoTombstoneAfterTabletCopyRemoteFails) {
   cluster_->tablet_server(2)->Shutdown();
   NO_FATALS(WaitForTabletTombstonedOnTS(kTsIndex, tablet_id, CMETA_NOT_EXPECTED));
 
+  // Check the textual status message for the failed copy.
+  {
+    vector<ListTabletsResponsePB::StatusAndSchemaPB> status_pbs;
+    ASSERT_OK(WaitForNumTabletsOnTS(ts, 1, kTimeout, &status_pbs));
+    ASSERT_STR_CONTAINS(status_pbs[0].tablet_status().last_status(),
+                        "Tombstoned tablet: Tablet Copy: Unable to fetch data from remote peer");
+  }
+
   // Now bring the other replicas back, re-elect the previous leader (TS-1),
   // and wait for the leader to Tablet Copy the tombstoned replica. This
   // will have replaced a tablet with no consensus metadata.

http://git-wip-us.apache.org/repos/asf/kudu/blob/680cec6b/src/kudu/tserver/ts_tablet_manager.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/ts_tablet_manager.cc b/src/kudu/tserver/ts_tablet_manager.cc
index 0e8b52a..8b8eb85 100644
--- a/src/kudu/tserver/ts_tablet_manager.cc
+++ b/src/kudu/tserver/ts_tablet_manager.cc
@@ -311,11 +311,11 @@ Status TSTabletManager::CreateNewTablet(const string& table_id,
 
 // If 'expr' fails, log a message, tombstone the given tablet, and return the
 // error status.
-#define TOMBSTONE_NOT_OK(expr, meta, msg) \
+#define TOMBSTONE_NOT_OK(expr, peer, msg) \
   do { \
     Status _s = (expr); \
     if (PREDICT_FALSE(!_s.ok())) { \
-      LogAndTombstone((meta), (msg), _s); \
+      LogAndTombstone((peer), (msg), _s); \
       return _s; \
     } \
   } while (0)
@@ -436,7 +436,7 @@ Status TSTabletManager::StartTabletCopy(
 
   // Download all of the remote files.
   Status s = tc_client.FetchAll(implicit_cast<TabletStatusListener*>(tablet_peer.get()));
-  TOMBSTONE_NOT_OK(s, meta,
+  TOMBSTONE_NOT_OK(s, tablet_peer,
                    "Tablet Copy: Unable to fetch data from remote peer " +
                    copy_source_uuid + " (" + copy_source_addr.ToString() + ")");
 
@@ -444,7 +444,7 @@ Status TSTabletManager::StartTabletCopy(
 
   // Write out the last files to make the new replica visible and update the
   // TabletDataState in the superblock to TABLET_DATA_READY.
-  TOMBSTONE_NOT_OK(tc_client.Finish(), meta, "Tablet Copy: Failure calling Finish()");
+  TOMBSTONE_NOT_OK(tc_client.Finish(), tablet_peer, "Tablet Copy: Failure calling Finish()");
 
   // We run this asynchronously. We don't tombstone the tablet if this fails,
   // because if we were to fail to open the tablet, on next startup, it's in a
@@ -941,21 +941,21 @@ Status TSTabletManager::DeleteTabletData(const scoped_refptr<TabletMetadata>& me
   return meta->DeleteSuperBlock();
 }
 
-void TSTabletManager::LogAndTombstone(const scoped_refptr<TabletMetadata>& meta,
+void TSTabletManager::LogAndTombstone(const scoped_refptr<TabletPeer>& peer,
                                       const std::string& msg,
                                       const Status& s) {
-  const string& tablet_id = meta->tablet_id();
+  const string& tablet_id = peer->tablet_id();
   const string kLogPrefix = "T " + tablet_id + " P " + fs_manager_->uuid() + ": ";
   LOG(WARNING) << kLogPrefix << msg << ": " << s.ToString();
 
-  // Tombstone the tablet when tablet copy fails.
-  LOG(INFO) << kLogPrefix << "Tombstoning tablet after failed tablet copy";
-  Status delete_status = DeleteTabletData(meta, TABLET_DATA_TOMBSTONED, boost::optional<OpId>());
+  Status delete_status = DeleteTabletData(
+      peer->tablet_metadata(), TABLET_DATA_TOMBSTONED, boost::optional<OpId>());
   if (PREDICT_FALSE(!delete_status.ok())) {
     // This failure should only either indicate a bug or an IO error.
     LOG(FATAL) << kLogPrefix << "Failed to tombstone tablet after tablet copy: "
                << delete_status.ToString();
   }
+  peer->StatusMessage(Substitute("Tombstoned tablet: $0 ($1)", msg, s.ToString()));
 }
 
 TransitionInProgressDeleter::TransitionInProgressDeleter(

http://git-wip-us.apache.org/repos/asf/kudu/blob/680cec6b/src/kudu/tserver/ts_tablet_manager.h
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/ts_tablet_manager.h b/src/kudu/tserver/ts_tablet_manager.h
index 8005683..dcdc76e 100644
--- a/src/kudu/tserver/ts_tablet_manager.h
+++ b/src/kudu/tserver/ts_tablet_manager.h
@@ -267,7 +267,7 @@ class TSTabletManager : public tserver::TabletPeerLookupIf {
   // Print a log message using the given info and tombstone the specified
   // tablet. If tombstoning the tablet fails, a FATAL error is logged, resulting
   // in a crash.
-  void LogAndTombstone(const scoped_refptr<tablet::TabletMetadata>& meta,
+  void LogAndTombstone(const scoped_refptr<tablet::TabletPeer>& peer,
                        const std::string& msg,
                        const Status& s);