You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by al...@apache.org on 2017/11/17 23:44:19 UTC

[1/2] kudu git commit: dist_test: enable sharding of a few more tests

Repository: kudu
Updated Branches:
  refs/heads/master 64eb9f37b -> 8e6bfa9fb


dist_test: enable sharding of a few more tests

Adds sharding for rowset_tree-test, tablet_copy-itest, and
delete_table-itest, which are the longest-running non-sharded tests in
recent builds such as [1].

[1] http://dist-test.cloudera.org/job?job_id=jenkins-slave.1510801703.18407

Change-Id: Iff1e0ac39f1834e8ac22283e1dc0c336d328ae35
Reviewed-on: http://gerrit.cloudera.org:8080/8565
Reviewed-by: Dan Burkert <da...@apache.org>
Tested-by: Todd Lipcon <to...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/c4006ae0
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/c4006ae0
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/c4006ae0

Branch: refs/heads/master
Commit: c4006ae028b44e9b7f8f006d64ed7a3a3f34fe9a
Parents: 64eb9f3
Author: Todd Lipcon <to...@apache.org>
Authored: Wed Nov 15 20:58:02 2017 -0800
Committer: Todd Lipcon <to...@apache.org>
Committed: Fri Nov 17 21:26:12 2017 +0000

----------------------------------------------------------------------
 build-support/dist_test.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/c4006ae0/build-support/dist_test.py
----------------------------------------------------------------------
diff --git a/build-support/dist_test.py b/build-support/dist_test.py
index a6f97d4..78fe5cb 100755
--- a/build-support/dist_test.py
+++ b/build-support/dist_test.py
@@ -90,10 +90,13 @@ DEPS_FOR_ALL = \
 NUM_SHARDS_BY_TEST = {
   'cfile-test': 4,
   'client-test': 8,
+  'delete_table-itest': 4,
   'delete_table-test': 8,
   'flex_partitioning-itest': 8,
   'mt-tablet-test': 4,
-  'raft_consensus-itest': 6
+  'raft_consensus-itest': 6,
+  'rowset_tree-test': 6,
+  'tablet_copy-itest': 6
 }
 
 


[2/2] kudu git commit: catalog_manager_tsk-itest: ensure that test eventually makes progress

Posted by al...@apache.org.
catalog_manager_tsk-itest: ensure that test eventually makes progress

This test previously tried to introduce a lot of master leader elections
by setting a very low heartbeat and failure interval. This worked, but
sometimes worked so well that the test never made progress and couldn't
obtain a stable leader long enough to create a table.

This patch changes the test to instead use a separate thread which
manually triggers an election on each of the masters in turn. The
elections start off very frequent and then back off as the test progresses
to ensure that by the end, an elected leader does actually make progress.

I verified that this still covers the case of a failed write when
writing TSKs by changing the RETURN_NOT_OK to a CHECK_OK when storing
the TSK. With the CHECK_OK, the test failed nearly immediately.

Change-Id: I3ecda0c269225e7674bc384fee652576b110ae7b
Reviewed-on: http://gerrit.cloudera.org:8080/8567
Tested-by: Kudu Jenkins
Reviewed-by: Alexey Serbin <as...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/8e6bfa9f
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/8e6bfa9f
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/8e6bfa9f

Branch: refs/heads/master
Commit: 8e6bfa9fb8aa7292e75a9fd3dcf9c50c8ea5191e
Parents: c4006ae
Author: Todd Lipcon <to...@apache.org>
Authored: Wed Nov 15 21:49:17 2017 -0800
Committer: Alexey Serbin <as...@cloudera.com>
Committed: Fri Nov 17 23:43:13 2017 +0000

----------------------------------------------------------------------
 .../catalog_manager_tsk-itest.cc                | 66 ++++++++++++++------
 1 file changed, 46 insertions(+), 20 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/8e6bfa9f/src/kudu/integration-tests/catalog_manager_tsk-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/catalog_manager_tsk-itest.cc b/src/kudu/integration-tests/catalog_manager_tsk-itest.cc
index 0a0a0e4..9812102 100644
--- a/src/kudu/integration-tests/catalog_manager_tsk-itest.cc
+++ b/src/kudu/integration-tests/catalog_manager_tsk-itest.cc
@@ -16,12 +16,17 @@
 // under the License.
 
 #include <algorithm>
+#include <atomic>
+#include <cstdlib>
 #include <cstdint>
 #include <iterator>
 #include <memory>
+#include <ostream>
 #include <string>
+#include <thread>
 #include <vector>
 
+#include <glog/logging.h>
 #include <gtest/gtest.h>
 
 #include "kudu/client/client-test-util.h"
@@ -30,11 +35,18 @@
 #include "kudu/client/shared_ptr.h"
 #include "kudu/client/write_op.h"
 #include "kudu/common/partial_row.h"
+#include "kudu/consensus/consensus.pb.h"
+#include "kudu/consensus/consensus.proxy.h"
 #include "kudu/gutil/gscoped_ptr.h"
 #include "kudu/gutil/strings/substitute.h"
+#include "kudu/master/sys_catalog.h"
 #include "kudu/mini-cluster/external_mini_cluster.h"
+#include "kudu/rpc/rpc_controller.h"
 #include "kudu/tablet/key_value_test_schema.h"
 #include "kudu/util/monotime.h"
+#include "kudu/util/net/sockaddr.h"
+#include "kudu/util/scoped_cleanup.h"
+#include "kudu/util/status.h"
 #include "kudu/util/test_macros.h"
 #include "kudu/util/test_util.h"
 
@@ -47,6 +59,7 @@ using kudu::client::KuduTable;
 using kudu::client::KuduTableCreator;
 using kudu::cluster::ExternalMiniCluster;
 using kudu::cluster::ExternalMiniClusterOptions;
+using std::atomic;
 using std::back_inserter;
 using std::copy;
 using std::string;
@@ -74,22 +87,10 @@ class CatalogManagerTskITest : public KuduTest {
     cluster_opts_.master_rpc_ports = { 11030, 11031, 11032 };
     cluster_opts_.num_tablet_servers = num_tservers_;
 
-    // Add common flags for both masters and tservers.
-    const vector<string> common_flags = {
-      Substitute("--raft_heartbeat_interval_ms=$0", hb_interval_ms_),
-    };
-    copy(common_flags.begin(), common_flags.end(),
-        back_inserter(cluster_opts_.extra_master_flags));
-    copy(common_flags.begin(), common_flags.end(),
-        back_inserter(cluster_opts_.extra_tserver_flags));
-
     // Add master-only flags.
     const vector<string> master_flags = {
       "--catalog_manager_inject_latency_prior_tsk_write_ms=1000",
       "--raft_enable_pre_election=false",
-      Substitute("--leader_failure_exp_backoff_max_delta_ms=$0",
-          hb_interval_ms_ * 4),
-      "--leader_failure_max_missed_heartbeat_periods=1.0",
       "--master_non_leader_masters_propagate_tsk",
       "--tsk_rotation_seconds=2",
     };
@@ -113,7 +114,7 @@ class CatalogManagerTskITest : public KuduTest {
     using ::kudu::client::sp::shared_ptr;
     static const char* kTableName = "test-table";
     // Using the setting for both RPC and admin operation timeout.
-    const MonoDelta timeout = MonoDelta::FromSeconds(600);
+    const MonoDelta timeout = MonoDelta::FromSeconds(120);
     KuduClientBuilder builder;
     builder.default_admin_operation_timeout(timeout).default_rpc_timeout(timeout);
     shared_ptr<KuduClient> client;
@@ -157,21 +158,46 @@ class CatalogManagerTskITest : public KuduTest {
 
 // Check that master servers do not crash on change of leadership while
 // writing newly generated TSKs. The leadership changes are provoked
-// by the injected latency just after generating a TSK but prior to writing it
-// into the system table: setting --leader_failure_max_missed_heartbeat_periods
-// flag to just one heartbeat period and unsetting --raft_enable_pre_election
-// gives high chances of re-election to happen while current leader has blocked
-// its leadership-related activity.
+// by a separate thread which just forces each leader to call elections
+// in turn, separated by random sleeps.
 TEST_F(CatalogManagerTskITest, LeadershipChangeOnTskGeneration) {
   NO_FATALS(StartCluster());
 
+  std::atomic<bool> done { false };
+  std::thread t([&]() {
+      // At the start of the test, cause leader elections rapidly,
+      // but then space them out further and further as the test goes
+      // to ensure that we eventually do get a successful run.
+      double max_sleep_ms = 5;
+      while (!done) {
+        for (int i = 0; i < cluster_->num_masters() && !done; i++) {
+          LOG(INFO) << "Attempting to promote master " << i << " to leader";
+          consensus::ConsensusServiceProxy proxy(
+              cluster_->messenger(), cluster_->master(i)->bound_rpc_addr(), "master");
+          consensus::RunLeaderElectionRequestPB req;
+          consensus::RunLeaderElectionResponsePB resp;
+          rpc::RpcController rpc;
+          req.set_tablet_id(master::SysCatalogTable::kSysCatalogTabletId);
+          req.set_dest_uuid(cluster_->master(i)->uuid());
+          rpc.set_timeout(MonoDelta::FromSeconds(10));
+          WARN_NOT_OK(proxy.RunLeaderElection(req, &resp, &rpc),
+                      "couldn't promote new leader");
+          int s = rand() % static_cast<int>(max_sleep_ms);
+          LOG(INFO) << "Sleeping for " << s;
+          SleepFor(MonoDelta::FromMilliseconds(s));
+          max_sleep_ms = std::min(max_sleep_ms * 1.1, 3000.0);
+        }
+      }
+    });
+  SCOPED_CLEANUP({ done = true; t.join(); });
+
   const MonoTime t_stop = MonoTime::Now() +
       MonoDelta::FromSeconds(run_time_seconds_);
   while (MonoTime::Now() < t_stop) {
     NO_FATALS(SmokeTestCluster());
+    NO_FATALS(cluster_->AssertNoCrashes());
   }
-
-  NO_FATALS(cluster_->AssertNoCrashes());
+  LOG(INFO) << "Done. Waiting on elector thread.";
 }
 
 } // namespace master