You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by to...@apache.org on 2016/01/16 03:44:37 UTC

incubator-kudu git commit: Fix some TSAN failures seen on GCE

Repository: incubator-kudu
Updated Branches:
  refs/heads/master 79130fc0a -> 56f97d557


Fix some TSAN failures seen on GCE

This fixes a few TSAN issues that I've seen while testing on GCE:

- Add suppression for epoll_ctl

libev is missing TSAN annotations and thus its synchronization doesn't
get registered as "happens-before" relationships. This can cause spurious
TSAN errors relating to epoll file descriptors. Before this suppression,
I saw 8/100 runs of DeleteTableTest TSAN fail with such a race.

- Make churny election test a little less churny on TSAN

The TestChurnyElections test sets timeouts to 1ms. On TSAN on GCE instances
this is often failing because it doesn't make any progress at all. This
patch sets the election timeout to 5ms instead of 1ms and also disables
fsync, which is a bit slower on GCE due to storage being remote.

- Increase timeout waiting for WALs in some integration tests

This fixes some flakiness in delete_table-test TSAN builds where writes
were going slowly enough that we didn't reach the desired number of WALs
within 30 seconds. After doubling to 60 seconds, I didn't see failures.

- Bump test workload write timeout to 50ms in memory limit test

With a 10ms timeout, the client was often unable to even connect to the
server to start any transactions in TSAN builds. Bumping to 50ms makes
the test much more reliable.

- Add TSAN annotations around iterator stats

We assume that iterator stats can be fuzzy and don't want to add barriers
on the write side. This patch adds an IGNORE annotation around reading
the stats.

Change-Id: Idb7447b9b88c37a45be3840d5b24edafc237ad51
Reviewed-on: http://gerrit.cloudera.org:8080/1793
Reviewed-by: Adar Dembo <ad...@cloudera.com>
Tested-by: Internal Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-kudu/commit/56f97d55
Tree: http://git-wip-us.apache.org/repos/asf/incubator-kudu/tree/56f97d55
Diff: http://git-wip-us.apache.org/repos/asf/incubator-kudu/diff/56f97d55

Branch: refs/heads/master
Commit: 56f97d5577c2d7f7c6b9715aff250a0ca8d4fb1e
Parents: 79130fc
Author: Todd Lipcon <to...@apache.org>
Authored: Thu Jan 14 17:21:55 2016 -0800
Committer: Todd Lipcon <to...@apache.org>
Committed: Sat Jan 16 02:44:19 2016 +0000

----------------------------------------------------------------------
 build-support/tsan-suppressions.txt                         | 1 +
 .../integration-tests/external_mini_cluster_fs_inspector.h  | 2 +-
 src/kudu/integration-tests/raft_consensus-itest.cc          | 9 ++++++++-
 src/kudu/tablet/cfile_set.cc                                | 3 +++
 4 files changed, 13 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/56f97d55/build-support/tsan-suppressions.txt
----------------------------------------------------------------------
diff --git a/build-support/tsan-suppressions.txt b/build-support/tsan-suppressions.txt
index a833cd9..a39c9a8 100644
--- a/build-support/tsan-suppressions.txt
+++ b/build-support/tsan-suppressions.txt
@@ -47,6 +47,7 @@ race:_ULx86_64_init
 # for examples.
 race:evpipe_write
 race:evpipe_init
+race:epoll_ctl
 
 # concurrent btree uses optimistic concurrency, needs to be annotated a bunch
 # more before it would pass. Relatively confident that it is correct based on

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/56f97d55/src/kudu/integration-tests/external_mini_cluster_fs_inspector.h
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/external_mini_cluster_fs_inspector.h b/src/kudu/integration-tests/external_mini_cluster_fs_inspector.h
index 1ccfc5f..2bf3023 100644
--- a/src/kudu/integration-tests/external_mini_cluster_fs_inspector.h
+++ b/src/kudu/integration-tests/external_mini_cluster_fs_inspector.h
@@ -73,7 +73,7 @@ class ExternalMiniClusterFsInspector {
   Status WaitForMinFilesInTabletWalDirOnTS(int index,
                                            const std::string& tablet_id,
                                            int count,
-                                           const MonoDelta& timeout = MonoDelta::FromSeconds(30));
+                                           const MonoDelta& timeout = MonoDelta::FromSeconds(60));
   Status WaitForReplicaCount(int expected, const MonoDelta& timeout = MonoDelta::FromSeconds(30));
   Status WaitForTabletDataStateOnTS(int index,
                                     const std::string& tablet_id,

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/56f97d55/src/kudu/integration-tests/raft_consensus-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/raft_consensus-itest.cc b/src/kudu/integration-tests/raft_consensus-itest.cc
index 8496ddb..b7ed3b3 100644
--- a/src/kudu/integration-tests/raft_consensus-itest.cc
+++ b/src/kudu/integration-tests/raft_consensus-itest.cc
@@ -913,9 +913,16 @@ TEST_F(RaftConsensusITest, DISABLED_TestChurnyElections_WithNotificationLatency)
 void RaftConsensusITest::DoTestChurnyElections(bool with_latency) {
   vector<string> ts_flags, master_flags;
 
+#ifdef THREAD_SANITIZER
+  // On TSAN builds, we need to be a little bit less churny in order to make
+  // any progress at all.
+  ts_flags.push_back("--raft_heartbeat_interval_ms=5");
+#else
   ts_flags.push_back("--raft_heartbeat_interval_ms=1");
+#endif
   ts_flags.push_back("--leader_failure_monitor_check_mean_ms=1");
   ts_flags.push_back("--leader_failure_monitor_check_stddev_ms=1");
+  ts_flags.push_back("--never_fsync");
   if (with_latency) {
     ts_flags.push_back("--consensus_inject_latency_ms_in_notifications=50");
   }
@@ -2221,7 +2228,7 @@ TEST_F(RaftConsensusITest, TestMemoryRemainsConstantDespiteTwoDeadFollowers) {
   TestWorkload workload(cluster_.get());
   workload.set_table_name(kTableId);
   workload.set_timeout_allowed(true);
-  workload.set_write_timeout_millis(10);
+  workload.set_write_timeout_millis(50);
   workload.Setup();
   workload.Start();
 

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/56f97d55/src/kudu/tablet/cfile_set.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/cfile_set.cc b/src/kudu/tablet/cfile_set.cc
index 121dd70..91dffc0 100644
--- a/src/kudu/tablet/cfile_set.cc
+++ b/src/kudu/tablet/cfile_set.cc
@@ -24,6 +24,7 @@
 #include "kudu/cfile/cfile_writer.h"
 #include "kudu/common/scan_spec.h"
 #include "kudu/gutil/algorithm.h"
+#include "kudu/gutil/dynamic_annotations.h"
 #include "kudu/gutil/map-util.h"
 #include "kudu/gutil/stl_util.h"
 #include "kudu/gutil/strings/substitute.h"
@@ -476,7 +477,9 @@ void CFileSet::Iterator::GetIteratorStats(vector<IteratorStats>* stats) const {
   stats->clear();
   stats->reserve(col_iters_.size());
   for (const ColumnIterator* iter : col_iters_) {
+    ANNOTATE_IGNORE_READS_BEGIN();
     stats->push_back(iter->io_statistics());
+    ANNOTATE_IGNORE_READS_END();
   }
 }