You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by mp...@apache.org on 2016/09/09 21:25:58 UTC

[2/2] kudu git commit: KUDU-1500: Data race in RaftConsensusITest.TestCorruptReplicaMetadata

KUDU-1500: Data race in RaftConsensusITest.TestCorruptReplicaMetadata

The test intends to corrupt the metadata of one of the tserver tablets.
While the cluster is in the process of resurrecting the failed tserver,
the Partition and PartitionSchema on that tserver is accessed using
TabletPeer in an unguarded manner via ListTablets RPCs.

The proposed fix here keeps the Partition and PartitionSchema immutable
after it is loaded for the very first time. These fields are never
overwritten again during tablet-copy or any other workflow.
Also a unit test is added to exercise this data race window. i.e,
the unit test aims to issue ListTablets RPCs during a follower's
tablet copy stage and provides an optimistic coverage for the fix.
DCHECKs are added to make sure the oncoming protobuf fields
match the immutable fields not updated during tablet-copy.

Testing: Spun a test TabletCopyITest.TestDeleteTabletDuringTabletCopy
which exercises this data race window, also ran original test which
caught the race in raft_consensus-itest.TestCorruptReplicaMetadata.

Change-Id: If57617e22b41296b8d4e8ad131220f1ebb235019
Reviewed-on: http://gerrit.cloudera.org:8080/3823
Tested-by: Kudu Jenkins
Reviewed-by: Mike Percy <mp...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/c386f73a
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/c386f73a
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/c386f73a

Branch: refs/heads/master
Commit: c386f73a979c06eea0a9944ce8544c7d1dcb566a
Parents: 7eaeb6d
Author: Dinesh Bhat <di...@cloudera.com>
Authored: Fri Aug 12 01:40:25 2016 -0700
Committer: Mike Percy <mp...@apache.org>
Committed: Fri Sep 9 21:25:29 2016 +0000

----------------------------------------------------------------------
 src/kudu/common/partition.cc                    |  8 ++
 src/kudu/common/partition.h                     |  3 +
 src/kudu/integration-tests/tablet_copy-itest.cc | 94 ++++++++++++++++++++
 src/kudu/tablet/tablet_metadata.cc              | 40 +++++----
 4 files changed, 129 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/c386f73a/src/kudu/common/partition.cc
----------------------------------------------------------------------
diff --git a/src/kudu/common/partition.cc b/src/kudu/common/partition.cc
index 6a9cf81..5b47a36 100644
--- a/src/kudu/common/partition.cc
+++ b/src/kudu/common/partition.cc
@@ -64,6 +64,14 @@ Slice Partition::range_key(const string& partition_key) const {
   }
 }
 
+bool Partition::Equals(const Partition& other) const {
+  if (this == &other) return true;
+  if (partition_key_start().compare(other.partition_key_start()) != 0) return false;
+  if (partition_key_end().compare(other.partition_key_end()) != 0) return false;
+  if (hash_buckets_ != other.hash_buckets_) return false;
+  return true;
+}
+
 void Partition::ToPB(PartitionPB* pb) const {
   pb->Clear();
   pb->mutable_hash_buckets()->Reserve(hash_buckets_.size());

http://git-wip-us.apache.org/repos/asf/kudu/blob/c386f73a/src/kudu/common/partition.h
----------------------------------------------------------------------
diff --git a/src/kudu/common/partition.h b/src/kudu/common/partition.h
index 41a6c24..bd88978 100644
--- a/src/kudu/common/partition.h
+++ b/src/kudu/common/partition.h
@@ -66,6 +66,9 @@ class Partition {
     return partition_key_end_;
   }
 
+  // Returns true if the other partition is equivalent to this one.
+  bool Equals(const Partition& other) const;
+
   // Serializes a partition into a protobuf message.
   void ToPB(PartitionPB* pb) const;
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/c386f73a/src/kudu/integration-tests/tablet_copy-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/tablet_copy-itest.cc b/src/kudu/integration-tests/tablet_copy-itest.cc
index 3d85aa6..2a76d10 100644
--- a/src/kudu/integration-tests/tablet_copy-itest.cc
+++ b/src/kudu/integration-tests/tablet_copy-itest.cc
@@ -15,10 +15,12 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include <atomic>
 #include <boost/optional.hpp>
 #include <gflags/gflags.h>
 #include <gtest/gtest.h>
 #include <string>
+#include <thread>
 #include <unordered_map>
 
 #include "kudu/client/client-test-util.h"
@@ -35,10 +37,13 @@
 #include "kudu/tablet/tablet_bootstrap.h"
 #include "kudu/tablet/tablet_metadata.h"
 #include "kudu/tserver/tablet_copy_client.h"
+#include "kudu/util/barrier.h"
 #include "kudu/util/metrics.h"
 #include "kudu/util/pstack_watcher.h"
 #include "kudu/util/test_util.h"
 
+DEFINE_int32(test_num_threads, 16,
+             "Number of test threads to launch");
 DEFINE_int32(test_delete_leader_num_iters, 3,
              "Number of iterations to run in TestDeleteLeaderDuringTabletCopyStressTest.");
 DEFINE_int32(test_delete_leader_min_rows_per_iter, 20,
@@ -247,6 +252,95 @@ TEST_F(TabletCopyITest, TestRejectRogueLeader) {
   }
 }
 
+// Perform ListTablets on a peer while peer is going through tablet copy.
+// This serves as a unit test for a tsan data race fix (KUDU-1500).
+// This test is only optimistic in that, we hope to catch the
+// state transition on the follower during the tablet-copy and thereby
+// exercising the data races observed by tsan earlier.
+//
+// Step 1: Start the cluster and pump some workload.
+// Step 2: Tombstone the tablet on a selected follower.
+// Step 3: Create some threads which issue ListTablets to that follower
+//         in a loop until the follower finishes tablet copy.
+TEST_F(TabletCopyITest, TestListTabletsDuringTabletCopy) {
+  MonoDelta kTimeout = MonoDelta::FromSeconds(30);
+  const int kTsIndex = 1; // Pick a random TS.
+  NO_FATALS(StartCluster());
+
+  // Populate a tablet with some data.
+  TestWorkload workload(cluster_.get());
+  workload.Setup();
+  workload.Start();
+  while (workload.rows_inserted() < 1000) {
+    SleepFor(MonoDelta::FromMilliseconds(10));
+  }
+
+  TServerDetails* ts = ts_map_[cluster_->tablet_server(kTsIndex)->uuid()];
+  vector<ListTabletsResponsePB::StatusAndSchemaPB> tablets;
+  ASSERT_OK(WaitForNumTabletsOnTS(ts, 1, kTimeout, &tablets));
+  string tablet_id = tablets[0].tablet_status().tablet_id();
+
+  // Ensure all the servers agree before we proceed.
+  workload.StopAndJoin();
+  ASSERT_OK(WaitForServersToAgree(kTimeout, ts_map_, tablet_id,
+                                  workload.batches_completed()));
+
+  int leader_index;
+  int follower_index;
+  TServerDetails* leader_ts;
+  TServerDetails* follower_ts;
+
+  // Find out who is leader so that we can tombstone a follower
+  // and let the tablet copy be kicked off by the leader.
+  // We could have tombstoned the leader too and let the election ensue
+  // followed by tablet copy of the tombstoned replica. But, we can keep
+  // the test more focused this way by not triggering any election thereby
+  // reducing overall test time as well.
+  ASSERT_OK(FindTabletLeader(ts_map_, tablet_id, kTimeout, &leader_ts));
+  leader_index = cluster_->tablet_server_index_by_uuid(leader_ts->uuid());
+  follower_index = (leader_index + 1) % cluster_->num_tablet_servers();
+  follower_ts = ts_map_[cluster_->tablet_server(follower_index)->uuid()];
+
+  vector<std::thread> threads;
+  std::atomic<bool> finish(false);
+  Barrier barrier(FLAGS_test_num_threads + 1); // include main thread
+
+  // Threads to issue ListTablets RPCs to follower.
+  for (int i = 0; i < FLAGS_test_num_threads; i++) {
+    threads.emplace_back([&]() {
+      barrier.Wait();
+      do {
+        vector<ListTabletsResponsePB::StatusAndSchemaPB> tablets;
+        CHECK_OK(ListTablets(ts, MonoDelta::FromSeconds(30), &tablets));
+      } while (!finish.load(std::memory_order_relaxed));
+    });
+  }
+
+  // Wait until all ListTablets RPC threads are fired up.
+  barrier.Wait();
+
+  // Tombstone the tablet on the follower.
+  LOG(INFO) << "Tombstoning follower tablet " << tablet_id
+            << " on TS " << follower_ts->uuid();
+  ASSERT_OK(itest::DeleteTablet(follower_ts, tablet_id,
+                                TABLET_DATA_TOMBSTONED,
+                                boost::none, kTimeout));
+
+  // A new good copy should automatically get created via tablet copy.
+  ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(
+      follower_index, tablet_id,
+      { tablet::TABLET_DATA_READY },
+      kTimeout));
+
+  // Wait for all threads to finish.
+  finish.store(true, std::memory_order_relaxed);
+  for (auto& t : threads) {
+    t.join();
+  }
+
+  NO_FATALS(cluster_->AssertNoCrashes());
+}
+
 // Start tablet copy session and delete the tablet in the middle.
 // It should actually be possible to complete copying in such a case, because
 // when a Tablet Copy session is started on the "source" server, all of

http://git-wip-us.apache.org/repos/asf/kudu/blob/c386f73a/src/kudu/tablet/tablet_metadata.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/tablet_metadata.cc b/src/kudu/tablet/tablet_metadata.cc
index 71ec4a3..d3599df 100644
--- a/src/kudu/tablet/tablet_metadata.cc
+++ b/src/kudu/tablet/tablet_metadata.cc
@@ -280,7 +280,6 @@ Status TabletMetadata::LoadFromSuperBlock(const TabletSuperBlockPB& superblock)
                                 superblock.DebugString());
     }
 
-    table_id_ = superblock.table_id();
     last_durable_mrs_id_ = superblock.last_durable_mrs_id();
 
     table_name_ = superblock.table_name();
@@ -292,25 +291,34 @@ Status TabletMetadata::LoadFromSuperBlock(const TabletSuperBlockPB& superblock)
                           superblock.ShortDebugString());
     SetSchemaUnlocked(std::move(schema), schema_version);
 
-    // This check provides backwards compatibility with the
-    // flexible-partitioning changes introduced in KUDU-818.
-    if (superblock.has_partition()) {
+    if (!superblock.has_partition()) {
+      // KUDU-818: Possible backward compatibility issue with tables created
+      // with version <= 0.5, throw warning.
+      LOG_WITH_PREFIX(WARNING) << "Upgrading from Kudu 0.5.0 directly to this"
+          << " version is not supported. Please upgrade to 0.6.0 before"
+          << " moving to a higher version.";
+      return Status::NotFound("Missing partition in superblock "+
+                              superblock.DebugString());
+    }
+
+    // Some metadata fields are assumed to be immutable and thus are
+    // only read from the protobuf when the tablet metadata is loaded
+    // for the very first time. See KUDU-1500 for more details.
+    if (state_ == kNotLoadedYet) {
+      table_id_ = superblock.table_id();
       RETURN_NOT_OK(PartitionSchema::FromPB(superblock.partition_schema(),
                                             *schema_, &partition_schema_));
       Partition::FromPB(superblock.partition(), &partition_);
     } else {
-      // This clause may be removed after compatibility with tables created
-      // before KUDU-818 is not needed.
-      RETURN_NOT_OK(PartitionSchema::FromPB(PartitionSchemaPB(), *schema_, &partition_schema_));
-      PartitionPB partition;
-      if (!superblock.has_start_key() || !superblock.has_end_key()) {
-        return Status::Corruption(
-            "tablet superblock must contain either a partition or start and end primary keys",
-            superblock.ShortDebugString());
-      }
-      partition.set_partition_key_start(superblock.start_key());
-      partition.set_partition_key_end(superblock.end_key());
-      Partition::FromPB(partition, &partition_);
+      CHECK_EQ(table_id_, superblock.table_id());
+      PartitionSchema partition_schema;
+      RETURN_NOT_OK(PartitionSchema::FromPB(superblock.partition_schema(),
+                                            *schema_, &partition_schema));
+      CHECK(partition_schema_.Equals(partition_schema));
+
+      Partition partition;
+      Partition::FromPB(superblock.partition(), &partition);
+      CHECK(partition_.Equals(partition));
     }
 
     tablet_data_state_ = superblock.tablet_data_state();