You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@kudu.apache.org by da...@apache.org on 2017/04/11 21:28:11 UTC

[1/2] kudu git commit: fuzz-itest: fix a test failure with scan-at-snapshot

Repository: kudu
Updated Branches:
  refs/heads/master da75b66d2 -> 87154f4a3


fuzz-itest: fix a test failure with scan-at-snapshot

Some of the logic of when to record snapshots was broken, which caused
an occasional test failure when using SCAN_AT_SNAPSHOT.

We didn't see this fail often due to a separate bug, in which we were
only scheduling SCAN_AT_SNAPSHOT at timestamps between 0 and
<num_flushes> where in fact the clock on the server was getting bumped
by every operation due to the GetRow() call invoking a server-side Scan.

This fixes both issues and adds a new test which was failing prior to
the fix.

Change-Id: Ia5a7d8ae74e2286e0e9696f79c85348965ef80ed
Reviewed-on: http://gerrit.cloudera.org:8080/6494
Tested-by: Kudu Jenkins
Reviewed-by: David Ribeiro Alves <dr...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/d9174000
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/d9174000
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/d9174000

Branch: refs/heads/master
Commit: d91740006fb37eecc61d855b77f7314bdcc1025b
Parents: da75b66
Author: Todd Lipcon <to...@apache.org>
Authored: Mon Mar 27 11:03:50 2017 -0700
Committer: Todd Lipcon <to...@apache.org>
Committed: Tue Apr 11 21:26:07 2017 +0000

----------------------------------------------------------------------
 src/kudu/integration-tests/fuzz-itest.cc | 66 +++++++++++++++------------
 1 file changed, 36 insertions(+), 30 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/d9174000/src/kudu/integration-tests/fuzz-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/fuzz-itest.cc b/src/kudu/integration-tests/fuzz-itest.cc
index f28a29b..47b4095 100644
--- a/src/kudu/integration-tests/fuzz-itest.cc
+++ b/src/kudu/integration-tests/fuzz-itest.cc
@@ -453,6 +453,20 @@ TestOpType PickOpAtRandom(TestOpSets sets) {
   }
 }
 
+bool IsMutation(const TestOpType& op) {
+  switch (op) {
+    case TEST_INSERT:
+    case TEST_INSERT_PK_ONLY:
+    case TEST_UPSERT:
+    case TEST_UPSERT_PK_ONLY:
+    case TEST_UPDATE:
+    case TEST_DELETE:
+      return true;
+    default:
+      return false;
+  }
+}
+
 // Generate a random valid sequence of operations for use as a
 // fuzz test.
 void GenerateTestCase(vector<TestOp>* ops, int len, TestOpSets sets = ALL) {
@@ -466,6 +480,13 @@ void GenerateTestCase(vector<TestOp>* ops, int len, TestOpSets sets = ALL) {
   while (ops->size() < len) {
     TestOpType r = PickOpAtRandom(sets);
     int row_key = rand() % FLAGS_keyspace_size;
+
+    // When we perform a test mutation, we also call GetRow() which does a scan
+    // and thus increases the server's timestamp.
+    if (IsMutation(r)) {
+      op_timestamps++;
+    }
+
     switch (r) {
       case TEST_INSERT:
       case TEST_INSERT_PK_ONLY:
@@ -610,35 +631,13 @@ void FuzzTest::RunFuzzCase(const vector<TestOp>& test_ops,
   // into a test method in order to reproduce a failure.
   LOG(INFO) << "test case:\n" << DumpTestCase(test_ops);
 
-  // Keep the vector of timestamps we'll scan at so that we save the expected state at those times.
-  vector<int> timestamps_to_scan;
-  for (const TestOp& test_op : test_ops) {
-    if (test_op.type == TEST_SCAN_AT_TIMESTAMP) {
-      timestamps_to_scan.push_back(test_op.val);
-    }
-  }
-  // Sort the scan timestamps in reverse order so that we can keep popping from the back and remove
-  // duplicates.
-  sort(timestamps_to_scan.begin(), timestamps_to_scan.end(), std::greater<int>());
-  timestamps_to_scan.erase(unique(timestamps_to_scan.begin(),
-                                  timestamps_to_scan.end()),
-                           timestamps_to_scan.end() );
-
   vector<optional<ExpectedKeyValueRow>> cur_val(FLAGS_keyspace_size);
   vector<optional<ExpectedKeyValueRow>> pending_val(FLAGS_keyspace_size);
 
   int i = 0;
   for (const TestOp& test_op : test_ops) {
-    switch (test_op.type) {
-      case TEST_INSERT:
-      case TEST_INSERT_PK_ONLY:
-      case TEST_UPSERT:
-      case TEST_UPSERT_PK_ONLY:
-      case TEST_UPDATE:
-      case TEST_DELETE:
-        EXPECT_EQ(cur_val[test_op.val], GetRow(test_op.val));
-        break;
-      default: break;
+    if (IsMutation(test_op.type)) {
+      EXPECT_EQ(cur_val[test_op.val], GetRow(test_op.val));
     }
 
     LOG(INFO) << test_op.ToString();
@@ -664,12 +663,7 @@ void FuzzTest::RunFuzzCase(const vector<TestOp>& test_ops,
         cur_val = pending_val;
         int current_time = down_cast<kudu::server::LogicalClock*>(
             tablet()->clock().get())->GetCurrentTime();
-        // Check if the next snapshot scan has a time that is higher than the current time.
-        // If it is, then store the state so that we can match it later to the scanned state.
-        if (!timestamps_to_scan.empty() && current_time >= timestamps_to_scan.back()) {
-          saved_values_[current_time] = cur_val;
-          timestamps_to_scan.pop_back();
-        }
+        saved_values_[current_time] = cur_val;
         break;
       }
       case TEST_FLUSH_TABLET:
@@ -864,6 +858,18 @@ TEST_F(FuzzTest, TestFuzz4) {
   RunFuzzCase(test_ops);
 }
 
+
+TEST_F(FuzzTest, TestFuzz5) {
+  CreateTabletAndStartClusterWithSchema(CreateKeyValueTestSchema());
+  vector<TestOp> test_ops = {
+    {TEST_UPSERT_PK_ONLY, 1},
+    {TEST_FLUSH_OPS, 0},
+    {TEST_INSERT, 0},
+    {TEST_SCAN_AT_TIMESTAMP, 5},
+  };
+  RunFuzzCase(test_ops);
+}
+
 // Previously caused incorrect data being read after restart.
 // Failure:
 //  Value of: val_in_table

[2/2] kudu git commit: [docs] Add admin workflow for recovering from disk failure

Posted by da...@apache.org.

[docs] Add admin workflow for recovering from disk failure

I didn't document how to rebalance tablets onto the repaired tserver if
necessary, since the process is complicated and error prone, and we hope
to have a rebalancing tool in the future. These docs will quickly become
outdated when KUDU-616 is fixed, but I think it's worth it to document
since we frequently receive questions on the topic.

Change-Id: I6541bffc5e9546c523df610fd8c025dd05e403bf
Reviewed-on: http://gerrit.cloudera.org:8080/6606
Tested-by: Kudu Jenkins
Reviewed-by: Adar Dembo <ad...@cloudera.com>
Reviewed-by: Andrew Wong <aw...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/87154f4a
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/87154f4a
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/87154f4a

Branch: refs/heads/master
Commit: 87154f4a39c77ab92d80f3effa58de3000921127
Parents: d917400
Author: Dan Burkert <da...@apache.org>
Authored: Mon Apr 10 17:46:36 2017 -0700
Committer: Dan Burkert <da...@apache.org>
Committed: Tue Apr 11 21:27:43 2017 +0000

----------------------------------------------------------------------
 docs/administration.adoc | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/87154f4a/docs/administration.adoc
----------------------------------------------------------------------
diff --git a/docs/administration.adoc b/docs/administration.adoc
index 7003160..813d097 100644
--- a/docs/administration.adoc
+++ b/docs/administration.adoc
@@ -585,3 +585,38 @@ be done with the following command:
 ----
 $ kudu cluster ksck --checksum_scan --tables IntegrationTestBigLinkedList master-01.example.com,master-02.example.com,master-03.example.com
 ----
+
+[[disk_failure_recovery]]
+=== Recovering from Disk Failure
+
+// TODO(dan): revise this once KUDU-616 is fixed.
+Kudu tablet servers are not resistant to disk failure. When a disk containing a
+data directory or the write-ahead log (WAL) dies, the entire tablet server must
+be rebuilt. Kudu will automatically re-replicate tablets on other servers after
+a tablet server fails, but manual intervention is needed in order to restore the
+failed tablet server to a running state.
+
+The first step to restoring a tablet server after a disk failure is to replace
+the failed disk, or remove the failed disk from the data-directory and/or WAL
+configuration. Next, the existing data directories and WAL directory must be
+removed. For example, if the tablet server is configured with
+`--fs_wal_dir=/data/0/kudu-tserver-wal` and
+`--fs_data_dirs=/data/1/kudu-tserver,/data/2/kudu-tserver`, the following
+commands will remove the existing data directories and WAL directory:
+
+[source,bash]
+----
+$ rm -rf /data/0/kudu-tserver-wal /data/1/kudu-tserver /data/2/kudu-tserver
+----
+
+After the WAL and data directories are removed, the tablet server process can be
+started. When Kudu is installed using system packages, `service` is typically
+used:
+
+[source,bash]
+----
+$ sudo service kudu-tserver start
+----
+
+Once the tablet server is running again, new tablet replicas will be created on
+it as necessary.