You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by mp...@apache.org on 2017/09/06 23:36:44 UTC

[1/2] kudu git commit: KUDU-2119. Fix failure in encoding-test

Repository: kudu
Updated Branches:
  refs/heads/master a71ecfd4a -> 4f41a3cb6


KUDU-2119. Fix failure in encoding-test

In commit d1f53cc32 I introduced randomization for the format string
used for the generated string data in this test. The random format
string could sometimes incorporate '\0' bytes, which, in the worst case,
could result in a string of length 0 or 1. This would then cause a later
assertion to fail that was checking that the encoded data be at least
two bytes per string.

The fix switches from using a printf-style string to instead use a
std::function to generate the data. The implementation of the function
avoids using C strings and thus permits embedded null bytes.

Change-Id: Ic46d2a993235e560475d931c7b023eb5b4faf7c2
Reviewed-on: http://gerrit.cloudera.org:8080/7967
Tested-by: Kudu Jenkins
Reviewed-by: Hao Hao <ha...@cloudera.com>
Reviewed-by: Alexey Serbin <as...@cloudera.com>
Reviewed-by: Andrew Wong <aw...@cloudera.com>
Reviewed-by: Adar Dembo <ad...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/2211afcb
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/2211afcb
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/2211afcb

Branch: refs/heads/master
Commit: 2211afcb3c283209c71e26c845bd9ad6c2a13119
Parents: a71ecfd
Author: Todd Lipcon <to...@apache.org>
Authored: Tue Sep 5 17:34:52 2017 -0700
Committer: Todd Lipcon <to...@apache.org>
Committed: Wed Sep 6 22:20:03 2017 +0000

----------------------------------------------------------------------
 src/kudu/cfile/encoding-test.cc | 67 +++++++++++++++++-------------------
 src/kudu/util/random_util.cc    | 13 ++++++-
 src/kudu/util/random_util.h     |  5 +++
 3 files changed, 48 insertions(+), 37 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/2211afcb/src/kudu/cfile/encoding-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/cfile/encoding-test.cc b/src/kudu/cfile/encoding-test.cc
index ba10ed4..9dcdcb4 100644
--- a/src/kudu/cfile/encoding-test.cc
+++ b/src/kudu/cfile/encoding-test.cc
@@ -22,6 +22,7 @@
 #include <cstdint>
 #include <cstdio>
 #include <cstdlib>
+#include <functional>
 #include <limits>
 #include <memory>
 #include <ostream>
@@ -44,7 +45,6 @@
 #include "kudu/common/schema.h"
 #include "kudu/common/types.h"
 #include "kudu/gutil/gscoped_ptr.h"
-#include "kudu/gutil/macros.h"
 #include "kudu/gutil/port.h"
 #include "kudu/gutil/stringprintf.h"
 #include "kudu/gutil/strings/substitute.h"
@@ -89,15 +89,17 @@ class TestEncoding : public KuduTest {
   }
 
   // Insert a given number of strings into the provided BlockBuilder.
+  //
+  // The strings are generated using the provided 'formatter' function.
   template<class BuilderType>
   static Slice CreateBinaryBlock(BuilderType *sbb,
                                  int num_items,
-                                 const char *fmt_str) {
-    vector<unique_ptr<string>> to_insert;
+                                 const std::function<string(int)>& formatter) {
+    vector<string> to_insert(num_items);
     vector<Slice> slices;
-    for (uint i = 0; i < num_items; i++) {
-      to_insert.emplace_back(new string(StringPrintf(fmt_str, i)));
-      slices.emplace_back(to_insert.back()->data());
+    for (int i = 0; i < num_items; i++) {
+      to_insert[i] = formatter(i);
+      slices.emplace_back(to_insert[i]);
     }
 
     int rem = slices.size();
@@ -126,7 +128,8 @@ class TestEncoding : public KuduTest {
     BuilderType sbb(opts.get());
     // Insert "hello 0" through "hello 9"
     const uint kCount = 10;
-    Slice s = CreateBinaryBlock(&sbb, kCount, "hello %d");
+    Slice s = CreateBinaryBlock(
+        &sbb, kCount, std::bind(StringPrintf, "hello %d", std::placeholders::_1));
     DecoderType sbd(s);
     ASSERT_OK(sbd.ParseHeader());
 
@@ -180,7 +183,8 @@ class TestEncoding : public KuduTest {
     BinaryPrefixBlockBuilder sbb(opts.get());
     const uint kCount = 1000;
     // Insert 'hello 000' through 'hello 999'
-    Slice s = CreateBinaryBlock(&sbb, kCount, "hello %03d");
+    Slice s = CreateBinaryBlock(
+        &sbb, kCount, std::bind(StringPrintf, "hello %03d", std::placeholders::_1));
     BinaryPrefixBlockDecoder sbd(s);
     ASSERT_OK(sbd.ParseHeader());
 
@@ -251,44 +255,34 @@ class TestEncoding : public KuduTest {
     }
   }
 
-  // Create a printf-style format string of the form 'XXXXXXXX%d' where the 'X' characters
-  // are random bytes (not including '%', possibly including non-printable characters).
-  static string RandomFormatString(Random* r) {
-    char buf[8];
-    RandomString(buf, arraysize(buf), r);
-    string format_string(buf, arraysize(buf));
-    for (int i = 0; i < format_string.size(); i++) {
-      if (format_string[i] == '%') {
-        format_string[i] = 'x';
-      }
-    }
-    return format_string + "%d";
-  }
-
   template<class BuilderType, class DecoderType>
   void TestBinaryBlockRoundTrip() {
     gscoped_ptr<WriterOptions> opts(NewWriterOptions());
     BuilderType sbb(opts.get());
 
-    // Generate a random format string which uses some binary data.
+    auto seed = SeedRandom();
+    Random r(seed);
+
+    // For each row, generate random data based on this run's seed.
     // Using random data helps the ability to trigger bugs like KUDU-2085.
-    Random r(SeedRandom());
-    const string& format_string = RandomFormatString(&r);
-    const auto& GenTestString = [&](int x) {
-      return StringPrintf(format_string.c_str(), x);
+    const auto& GenTestString = [seed](int i) {
+      Random local_rng(seed + i);
+      int len = local_rng.Uniform(8);
+      return RandomString(len, &local_rng);
     };
 
-    // Use a random number of elements. This is necessary to trigger bugs
-    // like KUDU-2085 that only occur in specific cases such as when
-    // the number of elements is a multiple of the 'restart interval'
-    // in prefix-encoded blocks.
-    const uint kCount = r.Uniform(1000);
-    Slice s = CreateBinaryBlock(&sbb, kCount, format_string.c_str());
+    // Use a random number of elements (but at least 1).
+    //
+    // This is necessary to trigger bugs like KUDU-2085 that only occur in specific cases
+    // such as when the number of elements is a multiple of the 'restart interval' in
+    // prefix-encoded blocks.
+    const uint kCount = r.Uniform(1000) + 1;
+    Slice s = CreateBinaryBlock(&sbb, kCount, GenTestString);
 
     LOG(INFO) << "Block: " << HexDump(s);
 
-    // the slice should take at least a few bytes per entry
-    ASSERT_GT(s.size(), kCount * 2u);
+    // The encoded data should take at least 1 byte per entry.
+    ASSERT_GT(s.size(), kCount);
 
     // Check first/last keys
     Slice key;
@@ -480,7 +474,8 @@ class TestEncoding : public KuduTest {
     const uint kCount = 10;
     size_t sbsize;
 
-    Slice s = CreateBinaryBlock(&sbb, kCount, "hello %d");
+    Slice s = CreateBinaryBlock(
+        &sbb, kCount, std::bind(StringPrintf, "hello %d", std::placeholders::_1));
     do {
       sbsize = s.size();
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/2211afcb/src/kudu/util/random_util.cc
----------------------------------------------------------------------
diff --git a/src/kudu/util/random_util.cc b/src/kudu/util/random_util.cc
index a102a32..acd4084 100644
--- a/src/kudu/util/random_util.cc
+++ b/src/kudu/util/random_util.cc
@@ -21,10 +21,14 @@
 
 #include <cstdlib>
 #include <cstring>
+#include <string>
 
+#include "kudu/gutil/walltime.h"
 #include "kudu/util/env.h"
+#include "kudu/util/faststring.h"
 #include "kudu/util/random.h"
-#include "kudu/gutil/walltime.h"
+
+using std::string;
 
 namespace kudu {
 
@@ -42,6 +46,13 @@ void RandomString(void* dest, size_t n, Random* rng) {
   memcpy(cdest + i, &random, n - i);
 }
 
+string RandomString(size_t n, Random* rng) {
+  faststring s;
+  s.resize(n);
+  RandomString(s.data(), n, rng);
+  return s.ToString();
+}
+
 uint32_t GetRandomSeed32() {
   uint32_t seed = static_cast<uint32_t>(GetCurrentTimeMicros());
   seed *= getpid();

http://git-wip-us.apache.org/repos/asf/kudu/blob/2211afcb/src/kudu/util/random_util.h
----------------------------------------------------------------------
diff --git a/src/kudu/util/random_util.h b/src/kudu/util/random_util.h
index e286bbe..bda8c42 100644
--- a/src/kudu/util/random_util.h
+++ b/src/kudu/util/random_util.h
@@ -21,6 +21,8 @@
 #include <cstdlib>
 #include <stdint.h>
 
+#include <string>
+
 namespace kudu {
 
 class Random;
@@ -30,6 +32,9 @@ class Random;
 // be written to dest with the same probability as any other byte.
 void RandomString(void* dest, size_t n, Random* rng);
 
+// Same as the above, but returns the string.
+std::string RandomString(size_t n, Random* rng);
+
 // Generate a 32-bit random seed from several sources, including timestamp,
 // pid & tid.
 uint32_t GetRandomSeed32();


[2/2] kudu git commit: tablet copy: Allow voting from crashed initial tablet copies

Posted by mp...@apache.org.
tablet copy: Allow voting from crashed initial tablet copies

The change in c35e4f0093dcdec625d1f647924d854c7bc9f3de missed an
important case, which is when the target tablet server in a tablet copy
operation crashes while in the middle of a tablet copy. In that case,
the 'tombstone_last_logged_opid' of 1.0 was not persisted in the
superblock. This manifested as a very flaky
TabletCopyITest.TestTabletCopyNewReplicaFailureCanVote test, which would
fail about 15% of the time.

With the change in this patch, that test now passes reliably:

http://dist-test.cloudera.org/job?job_id=mpercy.1504654266.31446

Change-Id: Idbc9cecac701c24ebe05759adbbb1fbe2b140506
Reviewed-on: http://gerrit.cloudera.org:8080/7961
Reviewed-by: Todd Lipcon <to...@apache.org>
Tested-by: Kudu Jenkins
Reviewed-by: Alexey Serbin <as...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/4f41a3cb
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/4f41a3cb
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/4f41a3cb

Branch: refs/heads/master
Commit: 4f41a3cb68019f5a56764a126de5debffe5b58d6
Parents: 2211afc
Author: Mike Percy <mp...@apache.org>
Authored: Tue Sep 5 12:39:55 2017 -0700
Committer: Mike Percy <mp...@apache.org>
Committed: Wed Sep 6 23:35:51 2017 +0000

----------------------------------------------------------------------
 src/kudu/consensus/consensus-test-util.h        |   4 +-
 src/kudu/integration-tests/tablet_copy-itest.cc | 155 ++++++++++++++++---
 src/kudu/master/sys_catalog.cc                  |   3 +
 src/kudu/tablet/tablet-harness.h                |   1 +
 src/kudu/tablet/tablet_bootstrap-test.cc        |   2 +
 src/kudu/tablet/tablet_metadata.cc              |  17 +-
 src/kudu/tablet/tablet_metadata.h               |   9 +-
 src/kudu/tools/kudu-tool-test.cc                |   4 +-
 src/kudu/tserver/tablet_copy_client.cc          |  16 +-
 src/kudu/tserver/ts_tablet_manager.cc           |   1 +
 src/kudu/tserver/ts_tablet_manager.h            |   5 +
 11 files changed, 182 insertions(+), 35 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/4f41a3cb/src/kudu/consensus/consensus-test-util.h
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/consensus-test-util.h b/src/kudu/consensus/consensus-test-util.h
index c73e50b..409841b 100644
--- a/src/kudu/consensus/consensus-test-util.h
+++ b/src/kudu/consensus/consensus-test-util.h
@@ -48,8 +48,8 @@
 
 #define ASSERT_OPID_EQ(left, right) \
   do { \
-    const OpId& TOKENPASTE2(_left, __LINE__) = (left); \
-    const OpId& TOKENPASTE2(_right, __LINE__) = (right); \
+    const consensus::OpId& TOKENPASTE2(_left, __LINE__) = (left); \
+    const consensus::OpId& TOKENPASTE2(_right, __LINE__) = (right); \
     if (!consensus::OpIdEquals(TOKENPASTE2(_left, __LINE__), TOKENPASTE2(_right, __LINE__))) { \
       FAIL() << "Expected: " \
             << pb_util::SecureShortDebugString(TOKENPASTE2(_left, __LINE__)) << "\n" \

http://git-wip-us.apache.org/repos/asf/kudu/blob/4f41a3cb/src/kudu/integration-tests/tablet_copy-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/tablet_copy-itest.cc b/src/kudu/integration-tests/tablet_copy-itest.cc
index e8be313..b151962 100644
--- a/src/kudu/integration-tests/tablet_copy-itest.cc
+++ b/src/kudu/integration-tests/tablet_copy-itest.cc
@@ -40,10 +40,12 @@
 #include "kudu/common/wire_protocol-test-util.h"
 #include "kudu/common/wire_protocol.h"
 #include "kudu/common/wire_protocol.pb.h"
+#include "kudu/consensus/consensus-test-util.h"
 #include "kudu/consensus/consensus.pb.h"
 #include "kudu/consensus/consensus_meta_manager.h"
 #include "kudu/consensus/metadata.pb.h"
 #include "kudu/consensus/opid.pb.h"
+#include "kudu/consensus/opid_util.h"
 #include "kudu/fs/fs_manager.h"
 #include "kudu/gutil/basictypes.h"
 #include "kudu/gutil/gscoped_ptr.h"
@@ -72,6 +74,7 @@
 #include "kudu/util/net/net_util.h"
 #include "kudu/util/net/sockaddr.h"
 #include "kudu/util/path_util.h"
+#include "kudu/util/pb_util.h"
 #include "kudu/util/status.h"
 #include "kudu/util/test_macros.h"
 #include "kudu/util/test_util.h"
@@ -93,12 +96,17 @@ using kudu::client::KuduTableCreator;
 using kudu::consensus::COMMITTED_OPID;
 using kudu::consensus::ConsensusMetadataManager;
 using kudu::consensus::ConsensusMetadataPB;
+using kudu::consensus::MakeOpId;
 using kudu::consensus::RaftPeerPB;
 using kudu::itest::TServerDetails;
 using kudu::itest::WaitForNumTabletServers;
+using kudu::pb_util::SecureShortDebugString;
 using kudu::tablet::TABLET_DATA_COPYING;
 using kudu::tablet::TABLET_DATA_DELETED;
+using kudu::tablet::TABLET_DATA_READY;
 using kudu::tablet::TABLET_DATA_TOMBSTONED;
+using kudu::tablet::TabletDataState;
+using kudu::tablet::TabletSuperBlockPB;
 using kudu::tserver::ListTabletsResponsePB;
 using kudu::tserver::ListTabletsResponsePB_StatusAndSchemaPB;
 using kudu::tserver::TabletCopyClient;
@@ -967,16 +975,16 @@ TEST_F(TabletCopyITest, TestTabletCopyClearsLastLoggedOpId) {
   workload.StopAndJoin();
   ASSERT_OK(WaitForServersToAgree(kTimeout, ts_map_, tablet_id, 1));
 
-  // No last-logged opid should initially be stored in the superblock.
+  // No last-logged opid should initially be stored in the superblock on a brand-new tablet.
   tablet::TabletSuperBlockPB superblock_pb;
-  inspect_->ReadTabletSuperBlockOnTS(leader_index, tablet_id, &superblock_pb);
+  ASSERT_OK(inspect_->ReadTabletSuperBlockOnTS(leader_index, tablet_id, &superblock_pb));
   ASSERT_FALSE(superblock_pb.has_tombstone_last_logged_opid());
 
   // Now tombstone the leader.
   ASSERT_OK(itest::DeleteTablet(leader, tablet_id, TABLET_DATA_TOMBSTONED, boost::none, kTimeout));
 
   // We should end up with a last-logged opid in the superblock.
-  inspect_->ReadTabletSuperBlockOnTS(leader_index, tablet_id, &superblock_pb);
+  ASSERT_OK(inspect_->ReadTabletSuperBlockOnTS(leader_index, tablet_id, &superblock_pb));
   ASSERT_TRUE(superblock_pb.has_tombstone_last_logged_opid());
   consensus::OpId last_logged_opid = superblock_pb.tombstone_last_logged_opid();
   ASSERT_EQ(1, last_logged_opid.term());
@@ -988,11 +996,15 @@ TEST_F(TabletCopyITest, TestTabletCopyClearsLastLoggedOpId) {
                                    cluster_->tablet_server(follower_index)->bound_rpc_hostport(),
                                    1, // We are in term 1.
                                    kTimeout));
-  ASSERT_OK(itest::WaitUntilTabletRunning(leader, tablet_id, kTimeout));
 
-  // Ensure that the last-logged opid has been cleared from the superblock.
-  inspect_->ReadTabletSuperBlockOnTS(leader_index, tablet_id, &superblock_pb);
-  ASSERT_FALSE(superblock_pb.has_tombstone_last_logged_opid());
+  ASSERT_EVENTUALLY([&] {
+    // Ensure that the last-logged opid has been cleared from the superblock
+    // once it persists the TABLET_DATA_READY state to disk.
+    ASSERT_OK(inspect_->ReadTabletSuperBlockOnTS(leader_index, tablet_id, &superblock_pb));
+    ASSERT_EQ(TABLET_DATA_READY, superblock_pb.tablet_data_state());
+    ASSERT_FALSE(superblock_pb.has_tombstone_last_logged_opid())
+        << SecureShortDebugString(superblock_pb);
+  });
 }
 
 // Test that the tablet copy thread pool being full results in throttling and
@@ -1097,13 +1109,24 @@ TEST_F(TabletCopyITest, TestTabletCopyThrottling) {
   LOG(INFO) << "Number of in progress responses: " << num_inprogress;
 }
 
+class TabletCopyFailureITest : public TabletCopyITest,
+                               public ::testing::WithParamInterface<const char*> {
+};
+// Trigger tablet copy failures a couple different ways: session timeout and
+// crash. We want all tablet copy attempts to fail after initial setup.
+const char* kTabletCopyFailureITestFlags[] = {
+  "--tablet_copy_early_session_timeout_prob=1.0",
+  "--tablet_copy_fault_crash_on_fetch_all=1.0"
+};
+INSTANTIATE_TEST_CASE_P(FailureCause, TabletCopyFailureITest,
+                        ::testing::ValuesIn(kTabletCopyFailureITestFlags));
+
 // Test that a failed tablet copy of a brand-new replica results in still being
 // able to vote while tombstoned.
-TEST_F(TabletCopyITest, TestTabletCopyNewReplicaFailureCanVote) {
+TEST_P(TabletCopyFailureITest, TestTabletCopyNewReplicaFailureCanVote) {
   const MonoDelta kTimeout = MonoDelta::FromSeconds(30);
-  // We want all tablet copy attempts to fail after initial setup.
-  NO_FATALS(StartCluster({"--tablet_copy_early_session_timeout_prob=1.0"}, {},
-                         /*num_tablet_servers=*/ 4));
+  const string& failure_flag = GetParam();
+  NO_FATALS(StartCluster({failure_flag}, {}, /*num_tablet_servers=*/ 4));
   TestWorkload workload(cluster_.get());
   workload.Setup();
 
@@ -1118,10 +1141,6 @@ TEST_F(TabletCopyITest, TestTabletCopyNewReplicaFailureCanVote) {
     replica_uuids.insert(replica.ts_info().permanent_uuid());
   }
 
-  TServerDetails* leader_ts;
-  ASSERT_OK(itest::FindTabletLeader(ts_map_, tablet_id, kTimeout, &leader_ts));
-  ASSERT_OK(itest::WaitForOpFromCurrentTerm(leader_ts, tablet_id, COMMITTED_OPID, kTimeout));
-
   string new_replica_uuid;
   for (int i = 0; i < cluster_->num_tablet_servers(); i++) {
     if (!ContainsKey(replica_uuids, cluster_->tablet_server(i)->uuid())) {
@@ -1132,9 +1151,16 @@ TEST_F(TabletCopyITest, TestTabletCopyNewReplicaFailureCanVote) {
   ASSERT_FALSE(new_replica_uuid.empty());
   auto* new_replica_ts = ts_map_[new_replica_uuid];
 
-  // Adding a server will trigger a tablet copy.
-  ASSERT_OK(itest::AddServer(leader_ts, tablet_id, new_replica_ts, RaftPeerPB::VOTER,
-                             boost::none, kTimeout));
+  // Allow retries in case the tablet leadership is unstable.
+  ASSERT_EVENTUALLY([&] {
+    TServerDetails* leader_ts;
+    ASSERT_OK(itest::FindTabletLeader(ts_map_, tablet_id, kTimeout, &leader_ts));
+    ASSERT_OK(itest::WaitForOpFromCurrentTerm(leader_ts, tablet_id, COMMITTED_OPID, kTimeout));
+
+    // Adding a server will trigger a tablet copy.
+    ASSERT_OK(itest::AddServer(leader_ts, tablet_id, new_replica_ts, RaftPeerPB::VOTER,
+                              boost::none, kTimeout));
+  });
 
   // Wait until the new replica has done its initial download, and has either
   // failed or is about to fail (the check is nondeterministic) plus has
@@ -1155,6 +1181,16 @@ TEST_F(TabletCopyITest, TestTabletCopyNewReplicaFailureCanVote) {
     // Remove the "early timeout" flag.
     cluster_->tablet_server(i)->mutable_flags()->pop_back();
   }
+
+  TabletSuperBlockPB superblock;
+  ASSERT_OK(inspect_->ReadTabletSuperBlockOnTS(new_replica_idx, tablet_id, &superblock));
+  LOG(INFO) << "Restarting tablet servers. New replica TS UUID: " << new_replica_uuid
+            << ", tablet data state: "
+            << tablet::TabletDataState_Name(superblock.tablet_data_state())
+            << ", last-logged opid: " << superblock.tombstone_last_logged_opid();
+  ASSERT_TRUE(superblock.has_tombstone_last_logged_opid());
+  ASSERT_OPID_EQ(MakeOpId(1, 0), superblock.tombstone_last_logged_opid());
+
   ASSERT_OK(cluster_->master()->Restart());
   ASSERT_OK(cluster_->tablet_server_by_uuid(new_replica_uuid)->Restart());
   auto iter = replica_uuids.cbegin();
@@ -1169,6 +1205,85 @@ TEST_F(TabletCopyITest, TestTabletCopyNewReplicaFailureCanVote) {
   workload.StopAndJoin();
 }
 
+// Test that a failed tablet copy over a tombstoned replica retains the
+// last-logged OpId from the tombstoned replica.
+TEST_P(TabletCopyFailureITest, TestFailedTabletCopyRetainsLastLoggedOpId) {
+  MonoDelta kTimeout = MonoDelta::FromSeconds(30);
+  const string& tablet_copy_failure_flag = GetParam();
+  NO_FATALS(StartCluster({"--enable_leader_failure_detection=false",
+                          tablet_copy_failure_flag},
+                         {"--catalog_manager_wait_for_new_tablets_to_elect_leader=false"}));
+
+  TestWorkload workload(cluster_.get());
+  workload.Setup();
+
+  int first_leader_index = 0;
+  TServerDetails* first_leader = ts_map_[cluster_->tablet_server(first_leader_index)->uuid()];
+
+  // Figure out the tablet id of the created tablet.
+  vector<ListTabletsResponsePB::StatusAndSchemaPB> tablets;
+  ASSERT_OK(WaitForNumTabletsOnTS(first_leader, 1, kTimeout, &tablets));
+  string tablet_id = tablets[0].tablet_status().tablet_id();
+
+  // Wait until all replicas are up and running.
+  for (int i = 0; i < cluster_->num_tablet_servers(); i++) {
+    ASSERT_OK(itest::WaitUntilTabletRunning(ts_map_[cluster_->tablet_server(i)->uuid()],
+                                            tablet_id, kTimeout));
+  }
+
+  // Elect a leader for term 1, then generate some data to copy.
+  ASSERT_OK(itest::StartElection(first_leader, tablet_id, kTimeout));
+  workload.Start();
+  const int kMinBatches = 10;
+  ASSERT_EVENTUALLY([&] {
+    ASSERT_GE(workload.batches_completed(), kMinBatches);
+  });
+  workload.StopAndJoin();
+  ASSERT_OK(WaitForServersToAgree(kTimeout, ts_map_, tablet_id, 1));
+
+  // Tombstone the first leader.
+  ASSERT_OK(itest::DeleteTablet(first_leader, tablet_id, TABLET_DATA_TOMBSTONED, boost::none,
+                                kTimeout));
+
+  // We should end up with a last-logged opid in the superblock of the first leader.
+  tablet::TabletSuperBlockPB superblock_pb;
+  ASSERT_OK(inspect_->ReadTabletSuperBlockOnTS(first_leader_index, tablet_id, &superblock_pb));
+  ASSERT_TRUE(superblock_pb.has_tombstone_last_logged_opid());
+  consensus::OpId last_logged_opid = superblock_pb.tombstone_last_logged_opid();
+  ASSERT_EQ(1, last_logged_opid.term());
+  ASSERT_GT(last_logged_opid.index(), kMinBatches);
+  int64_t initial_mtime = inspect_->GetTabletSuperBlockMTimeOrDie(first_leader_index, tablet_id);
+
+  // Elect a new leader.
+  int second_leader_index = 1;
+  TServerDetails* second_leader = ts_map_[cluster_->tablet_server(second_leader_index)->uuid()];
+  ASSERT_OK(itest::StartElection(second_leader, tablet_id, kTimeout));
+
+  // The second leader will initiate a tablet copy on the first leader. Wait
+  // for it to fail (via crash or abort).
+  ASSERT_EVENTUALLY([&] {
+    // Superblock must have been modified.
+    int64_t cur_mtime = inspect_->GetTabletSuperBlockMTimeOrDie(first_leader_index, tablet_id);
+    ASSERT_GT(cur_mtime, initial_mtime);
+    ASSERT_OK(inspect_->CheckTabletDataStateOnTS(first_leader_index, tablet_id,
+                                                 { TABLET_DATA_COPYING, TABLET_DATA_TOMBSTONED }));
+  });
+
+  // Bounce the cluster to get rid of leaders and reach a steady state.
+  cluster_->Shutdown();
+  ASSERT_OK(cluster_->Restart());
+
+  // After failing the tablet copy, we should retain the old tombstoned
+  // tablet's last-logged opid.
+  ASSERT_EVENTUALLY([&] {
+    ASSERT_OK(inspect_->ReadTabletSuperBlockOnTS(first_leader_index, tablet_id, &superblock_pb));
+    ASSERT_EQ(TABLET_DATA_TOMBSTONED, superblock_pb.tablet_data_state());
+    ASSERT_TRUE(superblock_pb.has_tombstone_last_logged_opid())
+        << SecureShortDebugString(superblock_pb);
+    ASSERT_OPID_EQ(last_logged_opid, superblock_pb.tombstone_last_logged_opid());
+  });
+}
+
 // This test uses CountBlocksUnderManagement() which only works with the
 // LogBlockManager.
 #ifndef __APPLE__
@@ -1185,9 +1300,9 @@ class BadTabletCopyITest : public TabletCopyITest,
 // server.
 const char* kFlagFaultOnFetch = "fault_crash_on_handle_tc_fetch_data";
 const char* kFlagEarlyTimeout = "tablet_copy_early_session_timeout_prob";
-const char* tablet_copy_failure_flags[] = { kFlagFaultOnFetch, kFlagEarlyTimeout };
+const char* kBadTabletCopyITestFlags[] = { kFlagFaultOnFetch, kFlagEarlyTimeout };
 INSTANTIATE_TEST_CASE_P(FaultFlags, BadTabletCopyITest,
-                        ::testing::ValuesIn(tablet_copy_failure_flags));
+                        ::testing::ValuesIn(kBadTabletCopyITestFlags));
 
 void BadTabletCopyITest::LoadTable(TestWorkload* workload, int min_rows, int min_blocks) {
   const MonoDelta kTimeout = MonoDelta::FromSeconds(30);

http://git-wip-us.apache.org/repos/asf/kudu/blob/4f41a3cb/src/kudu/master/sys_catalog.cc
----------------------------------------------------------------------
diff --git a/src/kudu/master/sys_catalog.cc b/src/kudu/master/sys_catalog.cc
index b6593b8..eeb95e0 100644
--- a/src/kudu/master/sys_catalog.cc
+++ b/src/kudu/master/sys_catalog.cc
@@ -28,6 +28,7 @@
 #include <utility>
 #include <vector>
 
+#include <boost/optional/optional.hpp>
 #include <gflags/gflags.h>
 #include <glog/logging.h>
 #include <google/protobuf/util/message_differencer.h>
@@ -49,6 +50,7 @@
 #include "kudu/consensus/consensus_meta_manager.h"
 #include "kudu/consensus/consensus_peers.h"
 #include "kudu/consensus/log.h"
+#include "kudu/consensus/opid.pb.h"
 #include "kudu/consensus/opid_util.h"
 #include "kudu/consensus/quorum_util.h"
 #include "kudu/consensus/raft_consensus.h"
@@ -238,6 +240,7 @@ Status SysCatalogTable::CreateNew(FsManager *fs_manager) {
                                                   schema, partition_schema,
                                                   partitions[0],
                                                   tablet::TABLET_DATA_READY,
+                                                  /*tombstone_last_logged_opid=*/ boost::none,
                                                   &metadata));
 
   RaftConfigPB config;

http://git-wip-us.apache.org/repos/asf/kudu/blob/4f41a3cb/src/kudu/tablet/tablet-harness.h
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/tablet-harness.h b/src/kudu/tablet/tablet-harness.h
index 5fc547c..eb62691 100644
--- a/src/kudu/tablet/tablet-harness.h
+++ b/src/kudu/tablet/tablet-harness.h
@@ -100,6 +100,7 @@ class TabletHarness {
                                                partition.first,
                                                partition.second,
                                                TABLET_DATA_READY,
+                                               /*tombstone_last_logged_opid=*/ boost::none,
                                                &metadata));
     if (options_.enable_metrics) {
       metrics_registry_.reset(new MetricRegistry());

http://git-wip-us.apache.org/repos/asf/kudu/blob/4f41a3cb/src/kudu/tablet/tablet_bootstrap-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/tablet_bootstrap-test.cc b/src/kudu/tablet/tablet_bootstrap-test.cc
index e31574f..4af600a 100644
--- a/src/kudu/tablet/tablet_bootstrap-test.cc
+++ b/src/kudu/tablet/tablet_bootstrap-test.cc
@@ -27,6 +27,7 @@
 #include <utility>
 #include <vector>
 
+#include <boost/optional/optional.hpp>
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 
@@ -125,6 +126,7 @@ class BootstrapTest : public LogTestBase {
                                                partition.first,
                                                partition.second,
                                                TABLET_DATA_READY,
+                                               /*tombstone_last_logged_opid=*/ boost::none,
                                                meta));
     (*meta)->SetLastDurableMrsIdForTests(mrs_id);
     if ((*meta)->GetRowSetForTests(0) != nullptr) {

http://git-wip-us.apache.org/repos/asf/kudu/blob/4f41a3cb/src/kudu/tablet/tablet_metadata.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/tablet_metadata.cc b/src/kudu/tablet/tablet_metadata.cc
index d5a271f..f00de4c 100644
--- a/src/kudu/tablet/tablet_metadata.cc
+++ b/src/kudu/tablet/tablet_metadata.cc
@@ -86,6 +86,7 @@ Status TabletMetadata::CreateNew(FsManager* fs_manager,
                                  const PartitionSchema& partition_schema,
                                  const Partition& partition,
                                  const TabletDataState& initial_tablet_data_state,
+                                 boost::optional<OpId> tombstone_last_logged_opid,
                                  scoped_refptr<TabletMetadata>* metadata) {
 
   // Verify that no existing tablet exists with the same ID.
@@ -105,7 +106,8 @@ Status TabletMetadata::CreateNew(FsManager* fs_manager,
                                                        schema,
                                                        partition_schema,
                                                        partition,
-                                                       initial_tablet_data_state));
+                                                       initial_tablet_data_state,
+                                                       std::move(tombstone_last_logged_opid)));
   RETURN_NOT_OK(ret->Flush());
   dir_group_cleanup.cancel();
 
@@ -130,6 +132,7 @@ Status TabletMetadata::LoadOrCreate(FsManager* fs_manager,
                                     const PartitionSchema& partition_schema,
                                     const Partition& partition,
                                     const TabletDataState& initial_tablet_data_state,
+                                    boost::optional<OpId> tombstone_last_logged_opid,
                                     scoped_refptr<TabletMetadata>* metadata) {
   Status s = Load(fs_manager, tablet_id, metadata);
   if (s.ok()) {
@@ -139,13 +142,13 @@ Status TabletMetadata::LoadOrCreate(FsManager* fs_manager,
         schema.ToString()));
     }
     return Status::OK();
-  } else if (s.IsNotFound()) {
+  }
+  if (s.IsNotFound()) {
     return CreateNew(fs_manager, tablet_id, table_name, table_id, schema,
                      partition_schema, partition, initial_tablet_data_state,
-                     metadata);
-  } else {
-    return s;
+                     std::move(tombstone_last_logged_opid), metadata);
   }
+  return s;
 }
 
 vector<BlockIdPB> TabletMetadata::CollectBlockIdPBs(const TabletSuperBlockPB& superblock) {
@@ -266,7 +269,8 @@ TabletMetadata::TabletMetadata(FsManager* fs_manager, string tablet_id,
                                string table_name, string table_id,
                                const Schema& schema, PartitionSchema partition_schema,
                                Partition partition,
-                               const TabletDataState& tablet_data_state)
+                               const TabletDataState& tablet_data_state,
+                               boost::optional<OpId> tombstone_last_logged_opid)
     : state_(kNotWrittenYet),
       tablet_id_(std::move(tablet_id)),
       table_id_(std::move(table_id)),
@@ -279,6 +283,7 @@ TabletMetadata::TabletMetadata(FsManager* fs_manager, string tablet_id,
       table_name_(std::move(table_name)),
       partition_schema_(std::move(partition_schema)),
       tablet_data_state_(tablet_data_state),
+      tombstone_last_logged_opid_(std::move(tombstone_last_logged_opid)),
       num_flush_pins_(0),
       needs_flush_(false),
       pre_flush_callback_(Bind(DoNothingStatusClosure)) {

http://git-wip-us.apache.org/repos/asf/kudu/blob/4f41a3cb/src/kudu/tablet/tablet_metadata.h
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/tablet_metadata.h b/src/kudu/tablet/tablet_metadata.h
index 9c34638..e9cde54 100644
--- a/src/kudu/tablet/tablet_metadata.h
+++ b/src/kudu/tablet/tablet_metadata.h
@@ -82,6 +82,7 @@ class TabletMetadata : public RefCountedThreadSafe<TabletMetadata> {
                           const PartitionSchema& partition_schema,
                           const Partition& partition,
                           const TabletDataState& initial_tablet_data_state,
+                          boost::optional<consensus::OpId> tombstone_last_logged_opid,
                           scoped_refptr<TabletMetadata>* metadata);
 
   // Load existing metadata from disk.
@@ -102,6 +103,7 @@ class TabletMetadata : public RefCountedThreadSafe<TabletMetadata> {
                              const PartitionSchema& partition_schema,
                              const Partition& partition,
                              const TabletDataState& initial_tablet_data_state,
+                             boost::optional<consensus::OpId> tombstone_last_logged_opid,
                              scoped_refptr<TabletMetadata>* metadata);
 
   static std::vector<BlockIdPB> CollectBlockIdPBs(
@@ -262,13 +264,14 @@ class TabletMetadata : public RefCountedThreadSafe<TabletMetadata> {
 
   // Constructor for creating a new tablet.
   //
-  // TODO: get rid of this many-arg constructor in favor of just passing in a
-  // SuperBlock, which already contains all of these fields.
+  // TODO(todd): get rid of this many-arg constructor in favor of just passing
+  // in a SuperBlock, which already contains all of these fields.
   TabletMetadata(FsManager* fs_manager, std::string tablet_id,
                  std::string table_name, std::string table_id,
                  const Schema& schema, PartitionSchema partition_schema,
                  Partition partition,
-                 const TabletDataState& tablet_data_state);
+                 const TabletDataState& tablet_data_state,
+                 boost::optional<consensus::OpId> tombstone_last_logged_opid);
 
   // Constructor for loading an existing tablet.
   TabletMetadata(FsManager* fs_manager, std::string tablet_id);

http://git-wip-us.apache.org/repos/asf/kudu/blob/4f41a3cb/src/kudu/tools/kudu-tool-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/kudu-tool-test.cc b/src/kudu/tools/kudu-tool-test.cc
index b4171b0..62e7b6e 100644
--- a/src/kudu/tools/kudu-tool-test.cc
+++ b/src/kudu/tools/kudu-tool-test.cc
@@ -1021,7 +1021,9 @@ TEST_F(ToolTest, TestLocalReplicaDumpMeta) {
   scoped_refptr<TabletMetadata> meta;
   TabletMetadata::CreateNew(&fs, kTestTablet, kTestTableName, kTestTableId,
                   kSchemaWithIds, partition.first, partition.second,
-                  tablet::TABLET_DATA_READY, &meta);
+                  tablet::TABLET_DATA_READY,
+                  /*tombstone_last_logged_opid=*/ boost::none,
+                  &meta);
   string stdout;
   NO_FATALS(RunActionStdoutString(Substitute("local_replica dump meta $0 "
                                              "--fs_wal_dir=$1 "

http://git-wip-us.apache.org/repos/asf/kudu/blob/4f41a3cb/src/kudu/tserver/tablet_copy_client.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/tablet_copy_client.cc b/src/kudu/tserver/tablet_copy_client.cc
index af5cce7..506a840 100644
--- a/src/kudu/tserver/tablet_copy_client.cc
+++ b/src/kudu/tserver/tablet_copy_client.cc
@@ -301,11 +301,18 @@ Status TabletCopyClient::Start(const HostPort& copy_source_addr,
                      copy_peer_uuid));
     }
 
+    // Retain the last-logged OpId from the previous tombstone in case this
+    // tablet copy is aborted.
+    if (last_logged_opid) {
+      *superblock_->mutable_tombstone_last_logged_opid() = *last_logged_opid;
+    }
+
     // Remove any existing orphaned blocks and WALs from the tablet, and
     // set the data state to 'COPYING'.
     RETURN_NOT_OK_PREPEND(
         TSTabletManager::DeleteTabletData(meta_, cmeta_manager_,
-                                          tablet::TABLET_DATA_COPYING, boost::none),
+                                          tablet::TABLET_DATA_COPYING,
+                                          /*last_logged_opid=*/ boost::none),
         "Could not replace superblock with COPYING data state");
     CHECK_OK(fs_manager_->dd_manager()->CreateDataDirGroup(tablet_id_));
   } else {
@@ -327,7 +334,8 @@ Status TabletCopyClient::Start(const HostPort& copy_source_addr,
                                             schema,
                                             partition_schema,
                                             partition,
-                                            tablet::TABLET_DATA_COPYING,
+                                            superblock_->tablet_data_state(),
+                                            superblock_->tombstone_last_logged_opid(),
                                             &meta_));
   }
   CHECK(fs_manager_->dd_manager()->GetDataDirGroupPB(tablet_id_,
@@ -369,6 +377,7 @@ Status TabletCopyClient::Finish() {
   LOG_WITH_PREFIX(INFO) << "Tablet Copy complete. Replacing tablet superblock.";
   SetStatusMessage("Replacing tablet superblock");
   superblock_->set_tablet_data_state(tablet::TABLET_DATA_READY);
+  superblock_->clear_tombstone_last_logged_opid();
   RETURN_NOT_OK(meta_->ReplaceSuperBlock(*superblock_));
 
   if (FLAGS_tablet_copy_save_downloaded_metadata) {
@@ -397,7 +406,8 @@ Status TabletCopyClient::Abort() {
   // Delete all of the tablet data, including blocks and WALs.
   RETURN_NOT_OK_PREPEND(
       TSTabletManager::DeleteTabletData(meta_, cmeta_manager_,
-                                        tablet::TABLET_DATA_TOMBSTONED, boost::none),
+                                        tablet::TABLET_DATA_TOMBSTONED,
+                                        /*last_logged_opid=*/ boost::none),
       LogPrefix() + "Failed to tombstone tablet after aborting tablet copy");
 
   SetStatusMessage(Substitute("Tombstoned tablet $0: Tablet copy aborted", tablet_id_));

http://git-wip-us.apache.org/repos/asf/kudu/blob/4f41a3cb/src/kudu/tserver/ts_tablet_manager.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/ts_tablet_manager.cc b/src/kudu/tserver/ts_tablet_manager.cc
index 85055fd..8e23e61 100644
--- a/src/kudu/tserver/ts_tablet_manager.cc
+++ b/src/kudu/tserver/ts_tablet_manager.cc
@@ -290,6 +290,7 @@ Status TSTabletManager::CreateNewTablet(const string& table_id,
                               partition_schema,
                               partition,
                               TABLET_DATA_READY,
+                              boost::none,
                               &meta),
     "Couldn't create tablet metadata");
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/4f41a3cb/src/kudu/tserver/ts_tablet_manager.h
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/ts_tablet_manager.h b/src/kudu/tserver/ts_tablet_manager.h
index 64897dc..f2a0795 100644
--- a/src/kudu/tserver/ts_tablet_manager.h
+++ b/src/kudu/tserver/ts_tablet_manager.h
@@ -185,6 +185,11 @@ class TSTabletManager : public tserver::TabletReplicaLookupIf {
 
   // Delete the tablet using the specified delete_type as the final metadata
   // state. Deletes the on-disk data, metadata, as well as all WAL segments.
+  //
+  // If set, 'last_logged_opid' will be persisted in the
+  // 'tombstone_last_logged_opid' field in the tablet metadata. Otherwise, if
+  // 'last_logged_opid' is equal to boost::none, the tablet metadata will
+  // retain its previous value of 'tombstone_last_logged_opid', if any.
   static Status DeleteTabletData(
       const scoped_refptr<tablet::TabletMetadata>& meta,
       const scoped_refptr<consensus::ConsensusMetadataManager>& cmeta_manager,