You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by jd...@apache.org on 2016/12/08 16:14:38 UTC

[6/6] kudu git commit: KUDU-1796: Use last REPLICATE OpId instead of last COMMIT OpId to tombstone a tablet replica

KUDU-1796: Use last REPLICATE OpId instead of last COMMIT OpId to tombstone a tablet replica

The kudu-tool-test has been flaky because of the following failure:

kudu-tool-test.cc:1203: Failure
Value of: tombstoned_opid.index()
  Actual: 205
  Expected: last_logged_opid.index()
  Which is: 206

Issue here is, offline cli tool "local_replica delete" stored
last COMMIT OpId on the tablet superblock instead of last REPLICATE
OpId while tombstoning the tablet replica. Hence kudu-tool-test
ToolTest.TestLocalReplicaTombstoneDelete observed that sometimes
tombstoned_opid (COMMIT) lagged behind last_logged_opid (REPLICATE).

This patch fixes the tool to use the REPLICATE OpId from the WAL.
Testing: Ran 5000 iterations of current failing test.

Change-Id: I02e95f05fab80e94b1afd078b23e5e03eca6e42a
Reviewed-on: http://gerrit.cloudera.org:8080/5416
Reviewed-by: Mike Percy <mp...@apache.org>
Tested-by: Kudu Jenkins


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/839bd6f9
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/839bd6f9
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/839bd6f9

Branch: refs/heads/master
Commit: 839bd6f9abe3dedf439628e80fd7111763c11b52
Parents: 4d8fe6c
Author: Dinesh Bhat <di...@cloudera.com>
Authored: Thu Dec 8 13:03:41 2016 +0530
Committer: Mike Percy <mp...@apache.org>
Committed: Thu Dec 8 15:49:41 2016 +0000

----------------------------------------------------------------------
 src/kudu/tools/tool_action_local_replica.cc | 30 ++++++++++++------------
 1 file changed, 15 insertions(+), 15 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/839bd6f9/src/kudu/tools/tool_action_local_replica.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/tool_action_local_replica.cc b/src/kudu/tools/tool_action_local_replica.cc
index b788748..7e0cd22 100644
--- a/src/kudu/tools/tool_action_local_replica.cc
+++ b/src/kudu/tools/tool_action_local_replica.cc
@@ -162,18 +162,18 @@ Status ParseHostPortString(const string& hostport_str, HostPort* hostport) {
   return Status::OK();
 }
 
-// Find the last committed OpId for the tablet_id from the WAL.
-Status FindLastCommittedOpId(FsManager* fs, const string& tablet_id,
-                             OpId* last_committed_opid) {
+// Find the last replicated OpId for the tablet_id from the WAL.
+Status FindLastLoggedOpId(FsManager* fs, const string& tablet_id,
+                          OpId* last_logged_opid) {
   shared_ptr<LogReader> reader;
   RETURN_NOT_OK(LogReader::Open(fs, scoped_refptr<log::LogIndex>(), tablet_id,
                                 scoped_refptr<MetricEntity>(), &reader));
   SegmentSequence segs;
   RETURN_NOT_OK(reader->GetSegmentsSnapshot(&segs));
-  // Reverse iterate segments to find the first 'last committed' entry.
+  // Reverse iterate the segments to find the 'last replicated' entry quickly.
   // Note that we still read the entries within a segment in sequential
-  // fashion, so the last entry within the 'found' segment will
-  // give us the last_committed_opid.
+  // fashion, so the last entry within the first 'found' segment will
+  // give us the last_logged_opid.
   vector<scoped_refptr<ReadableLogSegment>>::reverse_iterator seg;
   bool found = false;
   for (seg = segs.rbegin(); seg != segs.rend(); ++seg) {
@@ -183,13 +183,13 @@ Status FindLastCommittedOpId(FsManager* fs, const string& tablet_id,
       Status s = reader.ReadNextEntry(&entry);
       if (s.IsEndOfFile()) break;
       RETURN_NOT_OK_PREPEND(s, "Error in log segment");
-      if (entry.type() != log::COMMIT) continue;
-      *last_committed_opid = entry.commit().commited_op_id();
+      if (entry.type() != log::REPLICATE) continue;
+      *last_logged_opid = entry.replicate().id();
       found = true;
     }
     if (found) return Status::OK();
   }
-  return Status::NotFound("Committed OpId not found in the log");
+  return Status::NotFound("No entries found in the write-ahead log");
 }
 
 // Parses a colon-delimited string containing a uuid, hostname or IP address,
@@ -300,7 +300,7 @@ Status DeleteLocalReplica(const RunnerContext& context) {
   const string& tablet_id = FindOrDie(context.required_args, kTabletIdArg);
   FsManager fs_manager(Env::Default(), FsManagerOpts());
   RETURN_NOT_OK(fs_manager.Open());
-  boost::optional<OpId> last_committed_opid = boost::none;
+  boost::optional<OpId> last_logged_opid = boost::none;
   TabletDataState state = TabletDataState::TABLET_DATA_DELETED;
   if (!FLAGS_clean_unsafe) {
     state = TabletDataState::TABLET_DATA_TOMBSTONED;
@@ -308,14 +308,14 @@ Status DeleteLocalReplica(const RunnerContext& context) {
     // the log, it's not an error. But if we receive any other error,
     // indicate the user to delete with --clean_unsafe flag.
     OpId opid;
-    Status s = FindLastCommittedOpId(&fs_manager, tablet_id, &opid);
+    Status s = FindLastLoggedOpId(&fs_manager, tablet_id, &opid);
     if (s.ok()) {
-      last_committed_opid = opid;
+      last_logged_opid = opid;
     } else if (s.IsNotFound()) {
-      LOG(INFO) << "Could not find last committed OpId from WAL, "
+      LOG(INFO) << "Could not find any replicated OpId from WAL, "
                 << "but proceeding with tablet tombstone: " << s.ToString();
     } else {
-      LOG(ERROR) << "Error attempting to find last committed OpId in WAL: " << s.ToString();
+      LOG(ERROR) << "Error attempting to find last replicated OpId from WAL: " << s.ToString();
       LOG(ERROR) << "Cannot delete (tombstone) the tablet, use --clean_unsafe to delete"
                  << " the tablet permanently from this server.";
       return s;
@@ -325,7 +325,7 @@ Status DeleteLocalReplica(const RunnerContext& context) {
   // Force the specified tablet on this node to be in 'state'.
   scoped_refptr<TabletMetadata> meta;
   RETURN_NOT_OK(TabletMetadata::Load(&fs_manager, tablet_id, &meta));
-  RETURN_NOT_OK(TSTabletManager::DeleteTabletData(meta, state, last_committed_opid));
+  RETURN_NOT_OK(TSTabletManager::DeleteTabletData(meta, state, last_logged_opid));
   return Status::OK();
 }