You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by to...@apache.org on 2016/12/01 01:15:04 UTC

[1/9] kudu git commit: KUDU-1545. Fix crash when visiting tablet page for tombstoned tablet

Repository: kudu
Updated Branches:
  refs/heads/master c553a9a59 -> b39491de6


KUDU-1545. Fix crash when visiting tablet page for tombstoned tablet

This page was previously accessing peer->consensus() which could be NULL
in a tombstoned tablet.

Change-Id: I7d53d2080c476fe02e2c4ab5482fe80387199377
Reviewed-on: http://gerrit.cloudera.org:8080/5280
Reviewed-by: Dan Burkert <da...@apache.org>
Reviewed-by: Will Berkeley <wd...@gmail.com>
Tested-by: Kudu Jenkins


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/885d2e90
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/885d2e90
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/885d2e90

Branch: refs/heads/master
Commit: 885d2e90a8c9e5ceb8ff330478a8b08c7ba3b2a0
Parents: c553a9a
Author: Todd Lipcon <to...@apache.org>
Authored: Wed Nov 30 12:04:09 2016 -0800
Committer: Todd Lipcon <to...@apache.org>
Committed: Wed Nov 30 20:50:26 2016 +0000

----------------------------------------------------------------------
 src/kudu/integration-tests/delete_table-test.cc | 45 +++++++++++++++++++-
 src/kudu/tserver/tserver-path-handlers.cc       |  9 +++-
 2 files changed, 50 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/885d2e90/src/kudu/integration-tests/delete_table-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/delete_table-test.cc b/src/kudu/integration-tests/delete_table-test.cc
index 08b48c3..33d2e51 100644
--- a/src/kudu/integration-tests/delete_table-test.cc
+++ b/src/kudu/integration-tests/delete_table-test.cc
@@ -269,8 +269,8 @@ TEST_F(DeleteTableTest, TestDeleteEmptyTable) {
   ASSERT_EQ("{\"tables\":[],\"tablets\":[]}", entities_buf.ToString());
 }
 
-// Test that a DeleteTable RPC is rejected without a matching destination UUID.
-TEST_F(DeleteTableTest, TestDeleteTableDestUuidValidation) {
+// Test that a DeleteTablet RPC is rejected without a matching destination UUID.
+TEST_F(DeleteTableTest, TestDeleteTabletDestUuidValidation) {
   NO_FATALS(StartCluster());
   // Create a table on the cluster. We're just using TestWorkload
   // as a convenient way to create it.
@@ -970,6 +970,47 @@ TEST_F(DeleteTableTest, TestFDsNotLeakedOnTabletTombstone) {
   ASSERT_EQ(0, PrintOpenTabletFiles(ets->pid(), tablet_id));
 }
 
+// Regression test for KUDU-1545: crash when visiting the tablet page for a
+// tombstoned tablet.
+TEST_F(DeleteTableTest, TestWebPageForTombstonedTablet) {
+  const MonoDelta timeout = MonoDelta::FromSeconds(30);
+
+  NO_FATALS(StartCluster({}, {}, 1));
+
+  // Create the table.
+  TestWorkload workload(cluster_.get());
+  workload.set_num_replicas(1);
+  workload.Setup();
+
+  // Figure out the tablet id of the created tablet.
+  vector<ListTabletsResponsePB::StatusAndSchemaPB> tablets;
+  ASSERT_OK(WaitForNumTabletsOnTS(ts_map_.begin()->second, 1, timeout, &tablets));
+  const string& tablet_id = tablets[0].tablet_status().tablet_id();
+
+  // Tombstone the tablet.
+  ExternalTabletServer* ets = cluster_->tablet_server(0);
+  ASSERT_OK(itest::DeleteTablet(ts_map_[ets->uuid()],
+                                tablet_id, TABLET_DATA_TOMBSTONED, boost::none, timeout));
+
+  // Check the various web pages associated with the tablet, ensuring
+  // they don't crash and at least have the tablet ID within them.
+  EasyCurl c;
+  const auto& pages = { "tablet",
+                        "tablet-rowsetlayout-svg",
+                        "tablet-consensus-status",
+                        "log-anchors" };
+  for (const auto& page : pages) {
+    faststring buf;
+    ASSERT_OK(c.FetchURL(Substitute(
+        "http://$0/$1?id=$2",
+        cluster_->tablet_server(0)->bound_http_hostport().ToString(),
+        page,
+        tablet_id), &buf));
+    ASSERT_STR_CONTAINS(buf.ToString(), tablet_id);
+  }
+}
+
+
 TEST_F(DeleteTableTest, TestUnknownTabletsAreNotDeleted) {
   // Speed up heartbeating so that the unknown tablet is detected faster.
   vector<string> extra_ts_flags = { "--heartbeat_interval_ms=10" };

http://git-wip-us.apache.org/repos/asf/kudu/blob/885d2e90/src/kudu/tserver/tserver-path-handlers.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/tserver-path-handlers.cc b/src/kudu/tserver/tserver-path-handlers.cc
index dc671cc..95e42c8 100644
--- a/src/kudu/tserver/tserver-path-handlers.cc
+++ b/src/kudu/tserver/tserver-path-handlers.cc
@@ -343,10 +343,15 @@ void TabletServerPathHandlers::HandleTabletPage(const Webserver::WebRequest& req
   if (!LoadTablet(tserver_, req, &tablet_id, &peer, output)) return;
 
   string table_name = peer->tablet_metadata()->table_name();
-  RaftPeerPB::Role role = peer->consensus()->role();
+  RaftPeerPB::Role role = RaftPeerPB::UNKNOWN_ROLE;
+  auto consensus = peer->consensus();
+  if (consensus) {
+    role = consensus->role();
+  }
 
   *output << "<h1>Tablet " << EscapeForHtmlToString(tablet_id)
-          << " (" << RaftPeerPB::Role_Name(role) << ")</h1>\n";
+          << " (" << peer->HumanReadableState()
+          << "/" << RaftPeerPB::Role_Name(role) << ")</h1>\n";
   *output << "<h3>Table " << EscapeForHtmlToString(table_name) << "</h3>";
 
   // Output schema in tabular format.


[9/9] kudu git commit: KUDU-1775 (part 1). Reject CREATE TABLE with negative or too-high replication factors

Posted by to...@apache.org.
KUDU-1775 (part 1). Reject CREATE TABLE with negative or too-high replication factors

This adds more master-side checking of the replication factor passed
from the client:

* must be non-negative. Previously passing a negative replication factor
  would result in infinite retries trying to assign replicas.

* put an upper bound of 7, configurable with a new unsafe flag. We
  haven't tested higher replication factors at all, and a
  well-intentioned user may try to set replication factor to a very high
  number to get a table replicated onto every server in their cluster.
  This is common practice in MPP RDBMS, but since we haven't tested it,
  we shouldn't allow it without making the user turn on
  --unlock_unsafe_flags.

Change-Id: I83fedc7cb9722c049c20eb7766605fbb2f966347
Reviewed-on: http://gerrit.cloudera.org:8080/5289
Reviewed-by: Adar Dembo <ad...@cloudera.com>
Tested-by: Kudu Jenkins


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/b39491de
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/b39491de
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/b39491de

Branch: refs/heads/master
Commit: b39491de611eb7a36be46816c66fa3a85590a656
Parents: e4f7e92
Author: Todd Lipcon <to...@apache.org>
Authored: Wed Nov 30 14:23:20 2016 -0800
Committer: Todd Lipcon <to...@apache.org>
Committed: Thu Dec 1 01:14:03 2016 +0000

----------------------------------------------------------------------
 src/kudu/client/client-test.cc            | 50 +++++++++++---------------
 src/kudu/client/client.cc                 |  5 +--
 src/kudu/client/table_creator-internal.cc |  1 -
 src/kudu/client/table_creator-internal.h  |  3 +-
 src/kudu/master/catalog_manager.cc        | 23 +++++++++++-
 src/kudu/master/master.proto              |  5 ++-
 6 files changed, 52 insertions(+), 35 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/b39491de/src/kudu/client/client-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/client/client-test.cc b/src/kudu/client/client-test.cc
index c52cd76..77aa1a1 100644
--- a/src/kudu/client/client-test.cc
+++ b/src/kudu/client/client-test.cc
@@ -3760,24 +3760,27 @@ TEST_F(ClientTest, TestCreateTableWithTooManyTablets) {
                       "The requested number of tablets is over the permitted maximum (1)");
 }
 
-TEST_F(ClientTest, TestCreateTableWithTooManyReplicas) {
-  KuduPartialRow* split1 = schema_.NewRow();
-  ASSERT_OK(split1->SetInt32("key", 1));
-
-  KuduPartialRow* split2 = schema_.NewRow();
-  ASSERT_OK(split2->SetInt32("key", 2));
+// Tests for too many replicas, too few replicas, even replica count, etc.
+TEST_F(ClientTest, TestCreateTableWithBadNumReplicas) {
+  const vector<pair<int, string>> cases = {
+    {3, "Not enough live tablet servers to create a table with the requested "
+     "replication factor 3. 1 tablet servers are alive"},
+    {2, "illegal replication factor 2 (replication factor must be odd)"},
+    {-1, "illegal replication factor -1 (replication factor must be positive)"},
+    {11, "illegal replication factor 11 (max replication factor is 7)"}
+  };
 
-  gscoped_ptr<KuduTableCreator> table_creator(client_->NewTableCreator());
-  Status s = table_creator->table_name("foobar")
-      .schema(&schema_)
-      .set_range_partition_columns({ "key" })
-      .split_rows({ split1, split2 })
-      .num_replicas(3)
-      .Create();
-  ASSERT_TRUE(s.IsInvalidArgument());
-  ASSERT_STR_CONTAINS(s.ToString(),
-                      "Not enough live tablet servers to create a table with the requested "
-                      "replication factor 3. 1 tablet servers are alive");
+  for (const auto& c : cases) {
+    SCOPED_TRACE(Substitute("num_replicas=$0", c.first));
+    gscoped_ptr<KuduTableCreator> table_creator(client_->NewTableCreator());
+    Status s = table_creator->table_name("foobar")
+        .schema(&schema_)
+        .set_range_partition_columns({ "key" })
+        .num_replicas(c.first)
+        .Create();
+    EXPECT_TRUE(s.IsInvalidArgument());
+    ASSERT_STR_CONTAINS(s.ToString(), c.second);
+  }
 }
 
 TEST_F(ClientTest, TestCreateTableWithInvalidEncodings) {
@@ -4182,7 +4185,7 @@ TEST_F(ClientTest, TestBatchScanConstIterator) {
 }
 
 TEST_F(ClientTest, TestTableNumReplicas) {
-  for (int i : { 1, 3, 5, 7, 9 }) {
+  for (int i : { 1, 3, 5, 7 }) {
     shared_ptr<KuduTable> table;
     NO_FATALS(CreateTable(Substitute("table_with_$0_replicas", i),
                           i, {}, {}, &table));
@@ -4227,16 +4230,5 @@ TEST_F(ClientTest, TestGetTablet) {
   }
 }
 
-// Test create table with even replicas factor should fail.
-TEST_F(ClientTest, TestTableWithEvenReplicas) {
-  gscoped_ptr<KuduTableCreator> table_creator(client_->NewTableCreator());
-  Status s = table_creator->table_name("table_with_even_replicas")
-                          .schema(&schema_)
-                          .num_replicas(2)
-                          .set_range_partition_columns({ "key" })
-                          .Create();
-  ASSERT_TRUE(s.IsInvalidArgument());
-  ASSERT_STR_CONTAINS(s.ToString(), "Illegal replication factor");
-}
 } // namespace client
 } // namespace kudu

http://git-wip-us.apache.org/repos/asf/kudu/blob/b39491de/src/kudu/client/client.cc
----------------------------------------------------------------------
diff --git a/src/kudu/client/client.cc b/src/kudu/client/client.cc
index f700efc..974b4be 100644
--- a/src/kudu/client/client.cc
+++ b/src/kudu/client/client.cc
@@ -26,6 +26,7 @@
 #include <vector>
 
 #include <boost/bind.hpp>
+#include <boost/optional.hpp>
 
 #include "kudu/client/batcher.h"
 #include "kudu/client/callbacks.h"
@@ -588,8 +589,8 @@ Status KuduTableCreator::Create() {
   // Build request.
   CreateTableRequestPB req;
   req.set_name(data_->table_name_);
-  if (data_->num_replicas_ >= 1) {
-    req.set_num_replicas(data_->num_replicas_);
+  if (data_->num_replicas_ != boost::none) {
+    req.set_num_replicas(data_->num_replicas_.get());
   }
   RETURN_NOT_OK_PREPEND(SchemaToPB(*data_->schema_->schema_, req.mutable_schema(),
                                    SCHEMA_PB_WITHOUT_WRITE_DEFAULT),

http://git-wip-us.apache.org/repos/asf/kudu/blob/b39491de/src/kudu/client/table_creator-internal.cc
----------------------------------------------------------------------
diff --git a/src/kudu/client/table_creator-internal.cc b/src/kudu/client/table_creator-internal.cc
index 36311a9..118919c 100644
--- a/src/kudu/client/table_creator-internal.cc
+++ b/src/kudu/client/table_creator-internal.cc
@@ -27,7 +27,6 @@ namespace client {
 KuduTableCreator::Data::Data(KuduClient* client)
   : client_(client),
     schema_(nullptr),
-    num_replicas_(0),
     wait_(true) {
 }
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/b39491de/src/kudu/client/table_creator-internal.h
----------------------------------------------------------------------
diff --git a/src/kudu/client/table_creator-internal.h b/src/kudu/client/table_creator-internal.h
index 7a6753d..57f46dc 100644
--- a/src/kudu/client/table_creator-internal.h
+++ b/src/kudu/client/table_creator-internal.h
@@ -17,6 +17,7 @@
 #ifndef KUDU_CLIENT_TABLE_CREATOR_INTERNAL_H
 #define KUDU_CLIENT_TABLE_CREATOR_INTERNAL_H
 
+#include <boost/optional.hpp>
 #include <memory>
 #include <string>
 #include <utility>
@@ -53,7 +54,7 @@ class KuduTableCreator::Data {
 
   PartitionSchemaPB partition_schema_;
 
-  int num_replicas_;
+  boost::optional<int> num_replicas_;
 
   MonoDelta timeout_;
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/b39491de/src/kudu/master/catalog_manager.cc
----------------------------------------------------------------------
diff --git a/src/kudu/master/catalog_manager.cc b/src/kudu/master/catalog_manager.cc
index 67be524..d5af69c 100644
--- a/src/kudu/master/catalog_manager.cc
+++ b/src/kudu/master/catalog_manager.cc
@@ -116,6 +116,11 @@ DEFINE_int32(default_num_replicas, 3,
              "Default number of replicas for tables that do not have the num_replicas set.");
 TAG_FLAG(default_num_replicas, advanced);
 
+DEFINE_int32(max_num_replicas, 7,
+             "Maximum number of replicas that may be specified for a table.");
+// Tag as unsafe since we have done very limited testing of higher than 5 replicas.
+TAG_FLAG(max_num_replicas, unsafe);
+
 DEFINE_bool(allow_unsafe_replication_factor, false,
             "Allow creating tables with even replication factor.");
 TAG_FLAG(allow_unsafe_replication_factor, unsafe);
@@ -933,11 +938,27 @@ Status CatalogManager::CreateTable(const CreateTableRequestPB* orig_req,
   // Reject create table with even replication factors, unless master flag
   // allow_unsafe_replication_factor is on.
   if (req.num_replicas() % 2 == 0 && !FLAGS_allow_unsafe_replication_factor) {
-    s = Status::InvalidArgument(Substitute("Illegal replication factor $0 (replication "
+    s = Status::InvalidArgument(Substitute("illegal replication factor $0 (replication "
                                            "factor must be odd)", req.num_replicas()));
     return SetError(MasterErrorPB::EVEN_REPLICATION_FACTOR, s);
   }
 
+  if (req.num_replicas() > FLAGS_max_num_replicas) {
+    s = Status::InvalidArgument(Substitute("illegal replication factor $0 (max replication "
+                                           "factor is $1)",
+                                           req.num_replicas(),
+                                           FLAGS_max_num_replicas));
+    return SetError(MasterErrorPB::REPLICATION_FACTOR_TOO_HIGH, s);
+
+  }
+  if (req.num_replicas() <= 0) {
+    s = Status::InvalidArgument(Substitute("illegal replication factor $0 (replication factor "
+                                           "must be positive)",
+                                           req.num_replicas(),
+                                           FLAGS_max_num_replicas));
+    return SetError(MasterErrorPB::ILLEGAL_REPLICATION_FACTOR, s);
+  }
+
   // Verify that the total number of tablets is reasonable, relative to the number
   // of live tablet servers.
   TSDescriptorVector ts_descs;

http://git-wip-us.apache.org/repos/asf/kudu/blob/b39491de/src/kudu/master/master.proto
----------------------------------------------------------------------
diff --git a/src/kudu/master/master.proto b/src/kudu/master/master.proto
index 17a263d..fb02dc1 100644
--- a/src/kudu/master/master.proto
+++ b/src/kudu/master/master.proto
@@ -58,7 +58,7 @@ message MasterErrorPB {
     NOT_THE_LEADER = 7;
 
     // The number of replicas requested is greater than the number of live servers
-    // in the cluster.
+    // in the cluster or the configured maximum.
     REPLICATION_FACTOR_TOO_HIGH = 8;
 
     // The request or response involved a tablet which is not yet running.
@@ -66,6 +66,9 @@ message MasterErrorPB {
 
     // The number of replicas requested is even.
     EVEN_REPLICATION_FACTOR = 10;
+
+    // The number of replicas requested is illegal (eg non-positive).
+    ILLEGAL_REPLICATION_FACTOR = 11;
   }
 
   // The error code.


[8/9] kudu git commit: KUDU-1770 [c++ client] propagate timestamp for write operations

Posted by to...@apache.org.
KUDU-1770 [c++ client] propagate timestamp for write operations

Updated the Kudu C++ client library to propagate timestamp for write
operations.

This is a fix for
KUDU-1770 C++ client: propagate timestamp for write operations

This patch also enables the integration test which was failing prior
to this fix: ConsistencyITest.TestTimestampPropagationForWriteOps

Change-Id: I17feb6b2dacd0ba7c8b57464f1a50de99de2f772
Reviewed-on: http://gerrit.cloudera.org:8080/5269
Reviewed-by: David Ribeiro Alves <dr...@apache.org>
Tested-by: Kudu Jenkins


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/e4f7e926
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/e4f7e926
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/e4f7e926

Branch: refs/heads/master
Commit: e4f7e926c4205ffec04b651afc178bd0b447a9ae
Parents: eb1f454
Author: Alexey Serbin <as...@cloudera.com>
Authored: Tue Nov 29 14:58:31 2016 -0800
Committer: David Ribeiro Alves <dr...@apache.org>
Committed: Wed Nov 30 22:53:27 2016 +0000

----------------------------------------------------------------------
 src/kudu/client/batcher.cc                      | 13 ++++++++++---
 src/kudu/integration-tests/consistency-itest.cc |  2 +-
 2 files changed, 11 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/e4f7e926/src/kudu/client/batcher.cc
----------------------------------------------------------------------
diff --git a/src/kudu/client/batcher.cc b/src/kudu/client/batcher.cc
index 61ff087..b1e9159 100644
--- a/src/kudu/client/batcher.cc
+++ b/src/kudu/client/batcher.cc
@@ -194,7 +194,8 @@ class WriteRpc : public RetriableRpc<RemoteTabletServer, WriteRequestPB, WriteRe
            vector<InFlightOp*> ops,
            const MonoTime& deadline,
            const shared_ptr<Messenger>& messenger,
-           const string& tablet_id);
+           const string& tablet_id,
+           uint64_t propagated_timestamp);
   virtual ~WriteRpc();
   string ToString() const override;
 
@@ -231,7 +232,8 @@ WriteRpc::WriteRpc(const scoped_refptr<Batcher>& batcher,
                    vector<InFlightOp*> ops,
                    const MonoTime& deadline,
                    const shared_ptr<Messenger>& messenger,
-                   const string& tablet_id)
+                   const string& tablet_id,
+                   uint64_t propagated_timestamp)
     : RetriableRpc(replica_picker, request_tracker, deadline, messenger),
       batcher_(batcher),
       ops_(std::move(ops)),
@@ -250,6 +252,10 @@ WriteRpc::WriteRpc(const scoped_refptr<Batcher>& batcher,
       LOG(FATAL) << "Unsupported consistency mode: " << batcher->external_consistency_mode();
 
   }
+  // If set, propagate the latest observed timestamp.
+  if (PREDICT_TRUE(propagated_timestamp != KuduClient::kNoTimestamp)) {
+    req_.set_propagated_timestamp(propagated_timestamp);
+  }
 
   // Set up schema
   CHECK_OK(SchemaToPB(*schema, req_.mutable_schema(),
@@ -695,7 +701,8 @@ void Batcher::FlushBuffer(RemoteTablet* tablet, const vector<InFlightOp*>& ops)
                                ops,
                                deadline_,
                                client_->data_->messenger_,
-                               tablet->tablet_id());
+                               tablet->tablet_id(),
+                               client_->data_->GetLatestObservedTimestamp());
   rpc->SendRpc();
 }
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/e4f7e926/src/kudu/integration-tests/consistency-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/consistency-itest.cc b/src/kudu/integration-tests/consistency-itest.cc
index 2d763c4..10c0574 100644
--- a/src/kudu/integration-tests/consistency-itest.cc
+++ b/src/kudu/integration-tests/consistency-itest.cc
@@ -382,7 +382,7 @@ TEST_F(ConsistencyITest, TestTimestampPropagationFromScans) {
 // write operation. Since a write operation should always advance the server
 // clock, the resulting timestamp returned to the client should be strictly
 // greater than the propagated one.
-TEST_F(ConsistencyITest, DISABLED_TestTimestampPropagationForWriteOps) {
+TEST_F(ConsistencyITest, TestTimestampPropagationForWriteOps) {
   const int32_t offset_usec = FLAGS_max_clock_sync_error_usec;
   // Assuming the offset is specified as a positive number.
   ASSERT_GT(offset_usec, 0);


[4/9] kudu git commit: KUDU-1551. Ignore log segments which were preallocated but have no header.

Posted by to...@apache.org.
KUDU-1551. Ignore log segments which were preallocated but have no header.

This fixes a TS startup crash in the case that it finds a log segment
which is preallocated but has no valid magic or header at the start.

The new test injects such a crash and verifies that the server can
restart and replay its logs.

Change-Id: I843cf483b93823cbcc5506958f62cbb076569ca8
Reviewed-on: http://gerrit.cloudera.org:8080/5276
Reviewed-by: Adar Dembo <ad...@cloudera.com>
Tested-by: Kudu Jenkins


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/b4764f98
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/b4764f98
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/b4764f98

Branch: refs/heads/master
Commit: b4764f98cc99a8b4f37e0be2c1ad90dc3c645903
Parents: 4aacaf6
Author: Todd Lipcon <to...@apache.org>
Authored: Wed Nov 30 11:15:52 2016 -0800
Committer: Todd Lipcon <to...@apache.org>
Committed: Wed Nov 30 21:31:18 2016 +0000

----------------------------------------------------------------------
 src/kudu/consensus/log-test.cc                  |  3 +-
 src/kudu/consensus/log_reader.cc                | 13 ++++++--
 src/kudu/consensus/log_util.cc                  | 18 +++++------
 src/kudu/consensus/log_util.h                   | 13 ++++++++
 src/kudu/integration-tests/ts_recovery-itest.cc | 33 ++++++++++++++++++++
 5 files changed, 66 insertions(+), 14 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/b4764f98/src/kudu/consensus/log-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/log-test.cc b/src/kudu/consensus/log-test.cc
index f6ee85d..e819909 100644
--- a/src/kudu/consensus/log-test.cc
+++ b/src/kudu/consensus/log-test.cc
@@ -257,8 +257,7 @@ TEST_F(LogTest, TestLogNotTrimmed) {
 
 // Test that the reader will not fail if a log file is completely blank.
 // This happens when it's opened but nothing has been written.
-// The reader should gracefully handle this situation, but somehow expose that
-// the segment is uninitialized. See KUDU-140.
+// The reader should gracefully handle this situation. See KUDU-140.
 TEST_F(LogTest, TestBlankLogFile) {
   ASSERT_OK(BuildLog());
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/b4764f98/src/kudu/consensus/log_reader.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/log_reader.cc b/src/kudu/consensus/log_reader.cc
index 4a86505..14b63c7 100644
--- a/src/kudu/consensus/log_reader.cc
+++ b/src/kudu/consensus/log_reader.cc
@@ -142,8 +142,17 @@ Status LogReader::Init(const string& tablet_wal_path) {
     if (HasPrefixString(log_file, FsManager::kWalFileNamePrefix)) {
       string fqp = JoinPathSegments(tablet_wal_path, log_file);
       scoped_refptr<ReadableLogSegment> segment;
-      RETURN_NOT_OK_PREPEND(ReadableLogSegment::Open(env, fqp, &segment),
-                            "Unable to open readable log segment");
+      Status s = ReadableLogSegment::Open(env, fqp, &segment);
+      if (s.IsUninitialized()) {
+        // This indicates that the segment was created but the writer
+        // crashed before the header was successfully written. In this
+        // case, we should skip it.
+        LOG(WARNING) << "Ignoring log segment " << log_file << " since it was uninitialized "
+                     << "(probably left after a prior tablet server crash)";
+        continue;
+      }
+
+      RETURN_NOT_OK_PREPEND(s, "Unable to open readable log segment");
       DCHECK(segment);
       CHECK(segment->IsInitialized()) << "Uninitialized segment at: " << segment->path();
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/b4764f98/src/kudu/consensus/log_util.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/log_util.cc b/src/kudu/consensus/log_util.cc
index 780b517..a7cd07d 100644
--- a/src/kudu/consensus/log_util.cc
+++ b/src/kudu/consensus/log_util.cc
@@ -38,6 +38,7 @@
 #include "kudu/util/crc.h"
 #include "kudu/util/debug/trace_event.h"
 #include "kudu/util/env_util.h"
+#include "kudu/util/fault_injection.h"
 #include "kudu/util/flag_tags.h"
 #include "kudu/util/pb_util.h"
 #include "kudu/util/scoped_cleanup.h"
@@ -58,6 +59,10 @@ DEFINE_bool(log_async_preallocate_segments, true,
             "Whether the WAL segments preallocation should happen asynchronously");
 TAG_FLAG(log_async_preallocate_segments, advanced);
 
+DEFINE_double(fault_crash_before_write_log_segment_header, 0.0,
+              "Fraction of the time we will crash just before writing the log segment header");
+TAG_FLAG(fault_crash_before_write_log_segment_header, unsafe);
+
 namespace kudu {
 namespace log {
 
@@ -382,14 +387,6 @@ Status ReadableLogSegment::ReadFileSize() {
 Status ReadableLogSegment::ReadHeader() {
   uint32_t header_size;
   RETURN_NOT_OK(ReadHeaderMagicAndHeaderLength(&header_size));
-  if (header_size == 0) {
-    // If a log file has been pre-allocated but not initialized, then
-    // 'header_size' will be 0 even the file size is > 0; in this
-    // case, 'is_initialized_' remains set to false and return
-    // Status::OK() early. LogReader ignores segments where
-    // IsInitialized() returns false.
-    return Status::OK();
-  }
 
   if (header_size > kLogSegmentMaxHeaderOrFooterSize) {
     return Status::Corruption(
@@ -473,8 +470,7 @@ Status ReadableLogSegment::ParseHeaderMagicAndHeaderLength(const Slice &data,
       LOG(WARNING) << "Log segment file " << path() << " has 12 initial NULL bytes instead of "
                    << "magic and header length: " << data.ToDebugString()
                    << " and will be treated as a blank segment.";
-      *parsed_len = 0;
-      return Status::OK();
+      return Status::Uninitialized("log magic and header length are all NULL bytes");
     }
     // If no magic and not uninitialized, the file is considered corrupt.
     return Status::Corruption(Substitute("Invalid log segment file $0: Bad magic. $1",
@@ -713,6 +709,8 @@ WritableLogSegment::WritableLogSegment(string path,
       written_offset_(0) {}
 
 Status WritableLogSegment::WriteHeaderAndOpen(const LogSegmentHeaderPB& new_header) {
+  MAYBE_FAULT(FLAGS_fault_crash_before_write_log_segment_header);
+
   DCHECK(!IsHeaderWritten()) << "Can only call WriteHeader() once";
   DCHECK(new_header.IsInitialized())
       << "Log segment header must be initialized" << new_header.InitializationErrorString();

http://git-wip-us.apache.org/repos/asf/kudu/blob/b4764f98/src/kudu/consensus/log_util.h
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/log_util.h b/src/kudu/consensus/log_util.h
index 8fd61d6..59b4826 100644
--- a/src/kudu/consensus/log_util.h
+++ b/src/kudu/consensus/log_util.h
@@ -265,10 +265,23 @@ class ReadableLogSegment : public RefCountedThreadSafe<ReadableLogSegment> {
 
   Status ReadFileSize();
 
+  // Read the log file magic and header protobuf into 'header_'. Sets 'first_entry_offset_'
+  // to indicate the start of the actual log data.
+  //
+  // Returns Uninitialized() if the file appears to be preallocated but never
+  // written.
   Status ReadHeader();
 
+  // Read the magic and header length from the top of the file, returning
+  // the header length in 'len'.
+  //
+  // Returns Uninitialized() if the file appears to be preallocated but never
+  // written.
   Status ReadHeaderMagicAndHeaderLength(uint32_t *len);
 
+  // Parse the magic and the PB-header length prefix from 'data'.
+  // In the case that 'data' is all '\0' bytes, indicating a preallocated
+  // but never-written segment, returns Status::Uninitialized().
   Status ParseHeaderMagicAndHeaderLength(const Slice &data, uint32_t *parsed_len);
 
   Status ReadFooter();

http://git-wip-us.apache.org/repos/asf/kudu/blob/b4764f98/src/kudu/integration-tests/ts_recovery-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/ts_recovery-itest.cc b/src/kudu/integration-tests/ts_recovery-itest.cc
index a3cdebf..3480cec 100644
--- a/src/kudu/integration-tests/ts_recovery-itest.cc
+++ b/src/kudu/integration-tests/ts_recovery-itest.cc
@@ -191,6 +191,39 @@ TEST_F(TsRecoveryITest, TestCrashDuringLogReplay) {
                                        MonoDelta::FromSeconds(30)));
 }
 
+// Regression test for KUDU-1551: if the tserver crashes after preallocating a segment
+// but before writing its header, the TS would previously crash on restart.
+// Instead, it should ignore the uninitialized segment.
+TEST_F(TsRecoveryITest, TestCrashBeforeWriteLogSegmentHeader) {
+  NO_FATALS(StartCluster({ "--log_segment_size_mb=1" }));
+
+  TestWorkload work(cluster_.get());
+  work.set_num_replicas(1);
+  work.set_write_timeout_millis(100);
+  work.set_timeout_allowed(true);
+  work.Setup();
+
+  // Enable the fault point after creating the table, but before writing any data.
+  // Otherwise, we'd crash during creation of the tablet.
+  ASSERT_OK(cluster_->SetFlag(cluster_->tablet_server(0),
+                              "fault_crash_before_write_log_segment_header", "0.5"));
+  work.Start();
+
+  // Wait for the process to crash during log roll.
+  ASSERT_OK(cluster_->tablet_server(0)->WaitForCrash(MonoDelta::FromSeconds(60)));
+  work.StopAndJoin();
+
+  cluster_->tablet_server(0)->Shutdown();
+  ignore_result(cluster_->tablet_server(0)->Restart());
+
+  ClusterVerifier v(cluster_.get());
+  NO_FATALS(v.CheckRowCountWithRetries(work.table_name(),
+                                       ClusterVerifier::AT_LEAST,
+                                       work.rows_inserted(),
+                                       MonoDelta::FromSeconds(60)));
+}
+
+
 // A set of threads which pick rows which are known to exist in the table
 // and issue random updates against them.
 class UpdaterThreads {


[5/9] kudu git commit: KUDU-420 [i-tests] scan token timestamp propagation test

Posted by to...@apache.org.
KUDU-420 [i-tests] scan token timestamp propagation test

Added an integration test to verify timestamp propagation via scan
tokens for C++ client.

The test is disabled as it currently fails. A follow up patch will fix
the bug and enable the test.

This is in the context of the following JIRA item:
KUDU-420 c++ client: implement HT timestamp propagation via scan tokens

Change-Id: I47cd067248f4a26c4605f075ec5ee30da71f6f30
Reviewed-on: http://gerrit.cloudera.org:8080/5219
Reviewed-by: David Ribeiro Alves <dr...@apache.org>
Tested-by: Kudu Jenkins


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/68dc1ffe
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/68dc1ffe
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/68dc1ffe

Branch: refs/heads/master
Commit: 68dc1ffeff2f6ad7ad8654da0df66162cb286cca
Parents: b4764f9
Author: Alexey Serbin <as...@cloudera.com>
Authored: Thu Nov 24 14:24:57 2016 -0800
Committer: Alexey Serbin <as...@cloudera.com>
Committed: Wed Nov 30 21:33:27 2016 +0000

----------------------------------------------------------------------
 src/kudu/integration-tests/consistency-itest.cc | 111 ++++++++++++++++++-
 1 file changed, 107 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/68dc1ffe/src/kudu/integration-tests/consistency-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/consistency-itest.cc b/src/kudu/integration-tests/consistency-itest.cc
index 45efeb9..9d2a870 100644
--- a/src/kudu/integration-tests/consistency-itest.cc
+++ b/src/kudu/integration-tests/consistency-itest.cc
@@ -295,7 +295,8 @@ TEST_F(ConsistencyITest, TestTimestampPropagationFromScans) {
         0, MonoDelta::FromMilliseconds(100)));
 
     // Insert data into the first tablet (a.k.a. Ta).
-    ASSERT_OK(InsertTestRows(client.get(), table.get(), key_split_value_, 0));
+    const int rows_num = key_split_value_;  // fill in the partition completely
+    ASSERT_OK(InsertTestRows(client.get(), table.get(), rows_num, 0));
     size_t row_count;
     ASSERT_OK(GetRowCount(table.get(), KuduScanner::READ_LATEST, 0,
                           &row_count));
@@ -324,8 +325,9 @@ TEST_F(ConsistencyITest, TestTimestampPropagationFromScans) {
 
     // Inserting data into the second tablet (a.k.a. Tb): using the second
     // key range partition.
+    const int rows_num = key_split_value_;
     ASSERT_OK(InsertTestRows(client.get(), table.get(),
-                             key_split_value_, key_split_value_));
+                             rows_num, key_split_value_));
     // Retrieve the latest observed timestamp.
     ts_b = client->GetLatestObservedTimestamp();
   }
@@ -358,8 +360,8 @@ TEST_F(ConsistencyITest, TestTimestampPropagationFromScans) {
 // when scanning following tablets.
 //
 // The idea of the test is simple: have a scan spanned across two tablets
-// where the clocks of the corresponding tablet servers are skewed. The scenario
-// is as following:
+// where the clocks of the corresponding tablet servers are skewed. The sequence
+// of actions is as following:
 //
 //   1. Create a table which spans across two tablets.
 //
@@ -516,5 +518,106 @@ TEST_F(ConsistencyITest, TestSnapshotScanTimestampReuse) {
   }
 }
 
+// Verify that the propagated timestamp from a serialized scan token
+// makes its way into corresponding tablet servers while performing a scan
+// operation built from the token.
+//
+// The real-world use-cases behind this test assume a Kudu client (C++/Java)
+// can get scan tokens for a scan operation, serialize those and pass them
+// to the other Kudu client (C++/Java). Since de-serializing a scan token
+// propagates the latest observed timestamp, the latter client will have
+// the latest observed timestamp set accordingly if it de-serializes those
+// scan tokens into corresponding scan operations.
+//
+// The test scenario uses a table split into two tablets, each hosted by a
+// tablet server. The clock of the first tablet server is shifted into the
+// future. The first client inserts a row into the first tablet. Then it creates
+// a scan token to retrieve some "related" data from the second
+// tablet hosted by the second server. Now, another client receives the
+// serialized scan token and runs corresponding READ_AT_SNAPSHOT scan
+// with the specified timestamp to retrieve the data: it should observe
+// a timestamp which is not less than the propagated timestamp
+// encoded in the token.
+TEST_F(ConsistencyITest, DISABLED_TestScanTokenTimestampPropagation) {
+  const int32_t offset_usec = FLAGS_max_clock_sync_error_usec;
+
+  // Need to have at least one row in the first partition with
+  // values starting at 0.
+  ASSERT_GE(key_split_value_, 1);
+
+  {
+    // Prepare the setup: create a proper disposition for tablet servers' clocks
+    // and populate the table with initial data.
+    shared_ptr<KuduClient> client;
+    ASSERT_OK(cluster_->CreateClient(nullptr, &client));
+    ASSERT_OK(CreateTable(client.get(), table_name_));
+    shared_ptr<KuduTable> table;
+    ASSERT_OK(client->OpenTable(table_name_, &table));
+
+    // Insert a single row into the second tablet: it's necessary to get
+    // non-empty scan in the verification phase of the test.
+    ASSERT_OK(InsertTestRows(client.get(), table.get(), 1, key_split_value_));
+  }
+
+  uint64_t ts_ref;
+  string scan_token;
+  {
+    shared_ptr<KuduClient> client;
+    ASSERT_OK(cluster_->CreateClient(nullptr, &client));
+
+    // Advance the clock of the server hosting the first partition tablet.
+    const int32_t row_key = 0;
+    ASSERT_OK(UpdateClockForTabletHostingKey(
+        row_key, MonoDelta::FromMicroseconds(offset_usec)));
+    shared_ptr<KuduTable> table;
+    ASSERT_OK(client->OpenTable(table_name_, &table));
+    // Insert just a single row into the first tablet.
+    ASSERT_OK(InsertTestRows(client.get(), table.get(), 1, row_key));
+    ts_ref = client->GetLatestObservedTimestamp();
+
+    // Create and serialize a scan token: the scan selects a row by its key
+    // from the other tablet at the timestamp at which the first row was
+    // inserted.
+    vector<KuduScanToken*> tokens;
+    ElementDeleter deleter(&tokens);
+    KuduScanTokenBuilder builder(table.get());
+    ASSERT_OK(builder.SetReadMode(KuduScanner::READ_AT_SNAPSHOT));
+    unique_ptr<KuduPredicate> predicate(table->NewComparisonPredicate(
+        key_column_name_,
+        KuduPredicate::EQUAL,
+        KuduValue::FromInt(key_split_value_)));
+    ASSERT_OK(builder.AddConjunctPredicate(predicate.release()));
+    ASSERT_OK(builder.Build(&tokens));
+    ASSERT_EQ(1, tokens.size());
+    ASSERT_OK(tokens[0]->Serialize(&scan_token));
+  }
+
+  // The other client: scan the second tablet using a scanner built from
+  // the serialized scanner token. If the client propagates timestamp from the
+  // de-serialized scan token, upon fetching a batch of rows the client
+  // should observe timestamp not less than the reference propagated timestamp
+  // encoded in the token.
+  {
+    shared_ptr<KuduClient> client;
+    ASSERT_OK(cluster_->CreateClient(nullptr, &client));
+    shared_ptr<KuduTable> table;
+    ASSERT_OK(client->OpenTable(table_name_, &table));
+    KuduScanner* scanner_raw;
+    ASSERT_OK(KuduScanToken::DeserializeIntoScanner(client.get(), scan_token,
+                                                    &scanner_raw));
+    unique_ptr<KuduScanner> scanner(scanner_raw);
+    ASSERT_OK(scanner->Open());
+    ASSERT_TRUE(scanner->HasMoreRows());
+    size_t row_count = 0;
+    while (scanner->HasMoreRows()) {
+      KuduScanBatch batch;
+      ASSERT_OK(scanner->NextBatch(&batch));
+      row_count += batch.NumRows();
+      ASSERT_LE(ts_ref, client->GetLatestObservedTimestamp());
+    }
+    EXPECT_EQ(1, row_count);
+  }
+}
+
 } // namespace client
 } // namespace kudu


[6/9] kudu git commit: KUDU-420 [c++ client] timestamp propagation via scan tokens

Posted by to...@apache.org.
KUDU-420 [c++ client] timestamp propagation via scan tokens

Implemented server timestamp propagation via scan tokens for the
Kudu C++ client library. Added corresponding unit test as well.

Besides, this changelist also enables the
ConsistencyITest.TestScanTokenTimestampPropagation test: this patch
brings the necessary fix to make it pass.

This is in the context of the following JIRA item:
KUDU-420 c++ client: implement HT timestamp propagation via scan tokens

Change-Id: I5c76c20b62cb91695c69f7dc4b98f4dc98bf02cc
Reviewed-on: http://gerrit.cloudera.org:8080/5220
Reviewed-by: David Ribeiro Alves <dr...@apache.org>
Tested-by: Kudu Jenkins


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/d81aa80a
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/d81aa80a
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/d81aa80a

Branch: refs/heads/master
Commit: d81aa80a98bcec4b928e4a50c3c0a95f09bb57af
Parents: 68dc1ff
Author: Alexey Serbin <as...@cloudera.com>
Authored: Thu Nov 24 14:33:01 2016 -0800
Committer: Alexey Serbin <as...@cloudera.com>
Committed: Wed Nov 30 21:33:34 2016 +0000

----------------------------------------------------------------------
 src/kudu/client/client.h                        |  1 +
 src/kudu/client/scan_token-internal.cc          | 10 +--
 src/kudu/client/scan_token-internal.h           |  1 -
 src/kudu/client/scan_token-test.cc              | 80 ++++++++++++++++++++
 src/kudu/integration-tests/consistency-itest.cc |  2 +-
 5 files changed, 86 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/d81aa80a/src/kudu/client/client.h
----------------------------------------------------------------------
diff --git a/src/kudu/client/client.h b/src/kudu/client/client.h
index 9b72f97..506a19f 100644
--- a/src/kudu/client/client.h
+++ b/src/kudu/client/client.h
@@ -464,6 +464,7 @@ class KUDU_EXPORT KuduClient : public sp::enable_shared_from_this<KuduClient> {
   friend class ClientTest;
   friend class KuduClientBuilder;
   friend class KuduScanner;
+  friend class KuduScanToken;
   friend class KuduScanTokenBuilder;
   friend class KuduSession;
   friend class KuduTable;

http://git-wip-us.apache.org/repos/asf/kudu/blob/d81aa80a/src/kudu/client/scan_token-internal.cc
----------------------------------------------------------------------
diff --git a/src/kudu/client/scan_token-internal.cc b/src/kudu/client/scan_token-internal.cc
index c0d327d..678b915 100644
--- a/src/kudu/client/scan_token-internal.cc
+++ b/src/kudu/client/scan_token-internal.cc
@@ -51,9 +51,6 @@ KuduScanToken::Data::Data(KuduTable* table,
       tablet_(std::move(tablet)) {
 }
 
-KuduScanToken::Data::~Data() {
-}
-
 Status KuduScanToken::Data::IntoKuduScanner(KuduScanner** scanner) const {
   return PBIntoScanner(table_->client(), message_, scanner);
 }
@@ -154,12 +151,12 @@ Status KuduScanToken::Data::PBIntoScanner(KuduClient* client,
     RETURN_NOT_OK(scan_builder->SetSnapshotRaw(message.snap_timestamp()));
   }
 
+  RETURN_NOT_OK(scan_builder->SetCacheBlocks(message.cache_blocks()));
+
   if (message.has_propagated_timestamp()) {
-    // TODO(KUDU-420)
+    client->data_->UpdateLatestObservedTimestamp(message.propagated_timestamp());
   }
 
-  RETURN_NOT_OK(scan_builder->SetCacheBlocks(message.cache_blocks()));
-
   *scanner = scan_builder.release();
   return Status::OK();
 }
@@ -219,6 +216,7 @@ Status KuduScanTokenBuilder::Data::Build(vector<KuduScanToken*>* tokens) {
 
   pb.set_cache_blocks(configuration_.spec().cache_blocks());
   pb.set_fault_tolerant(configuration_.is_fault_tolerant());
+  pb.set_propagated_timestamp(client->GetLatestObservedTimestamp());
 
   MonoTime deadline = MonoTime::Now() + client->default_admin_operation_timeout();
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/d81aa80a/src/kudu/client/scan_token-internal.h
----------------------------------------------------------------------
diff --git a/src/kudu/client/scan_token-internal.h b/src/kudu/client/scan_token-internal.h
index 65bef42..dd26517 100644
--- a/src/kudu/client/scan_token-internal.h
+++ b/src/kudu/client/scan_token-internal.h
@@ -33,7 +33,6 @@ class KuduScanToken::Data {
   explicit Data(KuduTable* table,
                 ScanTokenPB message,
                 std::unique_ptr<KuduTablet> tablet);
-  ~Data();
 
   Status IntoKuduScanner(KuduScanner** scanner) const;
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/d81aa80a/src/kudu/client/scan_token-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/client/scan_token-test.cc b/src/kudu/client/scan_token-test.cc
index 2a26b69..8875315 100644
--- a/src/kudu/client/scan_token-test.cc
+++ b/src/kudu/client/scan_token-test.cc
@@ -23,6 +23,7 @@
 #include <vector>
 
 #include "kudu/client/client.h"
+#include "kudu/client/client.pb.h"
 #include "kudu/gutil/stl_util.h"
 #include "kudu/integration-tests/mini_cluster.h"
 #include "kudu/tserver/mini_tablet_server.h"
@@ -322,5 +323,84 @@ TEST_F(ScanTokenTest, TestScanTokensWithNonCoveringRange) {
   }
 }
 
+// When building a scanner from a serialized scan token,
+// verify that the propagated timestamp from the token makes its way into the
+// latest observed timestamp of the client object.
+TEST_F(ScanTokenTest, TestTimestampPropagation) {
+  static const string kTableName = "p_ts_table";
+
+  // Create a table to work with:
+  //   * Deserializing a scan token into a scanner requires the table to exist.
+  //   * Creating a scan token requires the table to exist.
+  shared_ptr<KuduTable> table;
+  {
+    static const string kKeyColumnName = "c_key";
+    KuduSchema schema;
+    {
+      KuduSchemaBuilder builder;
+      builder.AddColumn(kKeyColumnName)->NotNull()->
+          Type(KuduColumnSchema::INT64)->PrimaryKey();
+      ASSERT_OK(builder.Build(&schema));
+    }
+
+    {
+      unique_ptr<KuduPartialRow> split(schema.NewRow());
+      ASSERT_OK(split->SetInt64(kKeyColumnName, 0));
+      unique_ptr<client::KuduTableCreator> creator(client_->NewTableCreator());
+      ASSERT_OK(creator->table_name(kTableName)
+                .schema(&schema)
+                .add_hash_partitions({ kKeyColumnName }, 2)
+                .split_rows({ split.release() })
+                .num_replicas(1)
+                .Create());
+    }
+  }
+
+  // Deserialize a scan token and make sure the client's last observed timestamp
+  // is updated accordingly.
+  {
+    const uint64_t ts_prev = client_->GetLatestObservedTimestamp();
+    const uint64_t ts_propagated = ts_prev + 1000000;
+
+    ScanTokenPB pb;
+    pb.set_table_name(kTableName);
+    pb.set_read_mode(::kudu::READ_AT_SNAPSHOT);
+    pb.set_propagated_timestamp(ts_propagated);
+    const string serialized_token = pb.SerializeAsString();
+    EXPECT_EQ(ts_prev, client_->GetLatestObservedTimestamp());
+
+    KuduScanner* scanner_raw;
+    ASSERT_OK(KuduScanToken::DeserializeIntoScanner(client_.get(),
+                                                    serialized_token,
+                                                    &scanner_raw));
+    // The caller of the DeserializeIntoScanner() is responsible for
+    // de-allocating the result scanner object.
+    unique_ptr<KuduScanner> scanner(scanner_raw);
+    EXPECT_EQ(ts_propagated, client_->GetLatestObservedTimestamp());
+  }
+
+  // Build the set of scan tokens for the table, serialize them and
+  // make sure the serialized tokens contain the propagated timestamp.
+  {
+    ASSERT_OK(client_->OpenTable(kTableName, &table));
+    const uint64_t ts_prev = client_->GetLatestObservedTimestamp();
+    const uint64_t ts_propagated = ts_prev + 1000000;
+
+    client_->SetLatestObservedTimestamp(ts_propagated);
+    vector<KuduScanToken*> tokens;
+    ElementDeleter deleter(&tokens);
+    ASSERT_OK(KuduScanTokenBuilder(table.get()).Build(&tokens));
+    for (const auto* t : tokens) {
+      string serialized_token;
+      ASSERT_OK(t->Serialize(&serialized_token));
+
+      ScanTokenPB pb;
+      ASSERT_TRUE(pb.ParseFromString(serialized_token));
+      ASSERT_TRUE(pb.has_propagated_timestamp());
+      EXPECT_EQ(ts_propagated, pb.propagated_timestamp());
+    }
+  }
+}
+
 } // namespace client
 } // namespace kudu

http://git-wip-us.apache.org/repos/asf/kudu/blob/d81aa80a/src/kudu/integration-tests/consistency-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/consistency-itest.cc b/src/kudu/integration-tests/consistency-itest.cc
index 9d2a870..a69f128 100644
--- a/src/kudu/integration-tests/consistency-itest.cc
+++ b/src/kudu/integration-tests/consistency-itest.cc
@@ -538,7 +538,7 @@ TEST_F(ConsistencyITest, TestSnapshotScanTimestampReuse) {
 // with the specified timestamp to retrieve the data: it should observe
 // a timestamp which is not less than the propagated timestamp
 // encoded in the token.
-TEST_F(ConsistencyITest, DISABLED_TestScanTokenTimestampPropagation) {
+TEST_F(ConsistencyITest, TestScanTokenTimestampPropagation) {
   const int32_t offset_usec = FLAGS_max_clock_sync_error_usec;
 
   // Need to have at least one row in the first partition with


[2/9] kudu git commit: KUDU-1754. Add a warning to the Java API docs that columns default NOT NULL

Posted by to...@apache.org.
KUDU-1754. Add a warning to the Java API docs that columns default NOT NULL

Change-Id: Ia5589746f8b618874326fce69532578468e8ca4a
Reviewed-on: http://gerrit.cloudera.org:8080/5283
Reviewed-by: Dan Burkert <da...@apache.org>
Tested-by: Kudu Jenkins


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/85245160
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/85245160
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/85245160

Branch: refs/heads/master
Commit: 85245160b73055b548f8a0b79be1786212cc6afb
Parents: 885d2e9
Author: Todd Lipcon <to...@apache.org>
Authored: Wed Nov 30 12:37:21 2016 -0800
Committer: Todd Lipcon <to...@apache.org>
Committed: Wed Nov 30 21:21:46 2016 +0000

----------------------------------------------------------------------
 .../src/main/java/org/apache/kudu/ColumnSchema.java           | 7 +++++++
 1 file changed, 7 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/85245160/java/kudu-client/src/main/java/org/apache/kudu/ColumnSchema.java
----------------------------------------------------------------------
diff --git a/java/kudu-client/src/main/java/org/apache/kudu/ColumnSchema.java b/java/kudu-client/src/main/java/org/apache/kudu/ColumnSchema.java
index e4d93e4..250ab56 100644
--- a/java/kudu-client/src/main/java/org/apache/kudu/ColumnSchema.java
+++ b/java/kudu-client/src/main/java/org/apache/kudu/ColumnSchema.java
@@ -239,6 +239,13 @@ public class ColumnSchema {
 
     /**
      * Marks the column as allowing null values. False by default.
+     * <p>
+     * <strong>NOTE:</strong> the "not-nullable-by-default" behavior here differs from
+     * the equivalent API in the Python and C++ clients. It also differs from the
+     * standard behavior of SQL <code>CREATE TABLE</code> statements. It is
+     * recommended to always specify nullability explicitly using this API
+     * in order to avoid confusion.
+     *
      * @param nullable a boolean that indicates if the column allows null values
      * @return this instance
      */


[3/9] kudu git commit: KUDU-1764: truncate preallocated space off of full lbm containers

Posted by to...@apache.org.
KUDU-1764: truncate preallocated space off of full lbm containers

The LBM maintains a preallocation window at the end of each container, but
it erroneously leaves the window intact even as a container becomes full.

This patch addresses that in two ways:
1. The window is truncated when a container becomes full, and
2. It is truncated at startup after a full container is loaded.

The first trims the window in real time, while the second handles both
existing deployments and the edge case where a tserver has crashed mid-trim.

The implementation uses total_bytes_written_ as the truncation offset, which
means it still leaves last_block_size % filesystem_block_size bytes behind.
This is equivalent to the internal fragmentation caused by aligning block
offsets to the nearest filesystem block size, so I don't think it matters.

I snuck in a change to remove the "filesystem does not support
preallocation" behavior from block-manager-test; it's not possible to create
an LBM on a filesystem sans hole punching; such a filesystem should support
preallocation by definition.

Change-Id: Ib9173f955e53d096bfd9d3ddacf4294846cff11a
Reviewed-on: http://gerrit.cloudera.org:8080/5254
Tested-by: Kudu Jenkins
Reviewed-by: Todd Lipcon <to...@apache.org>
Tested-by: Todd Lipcon <to...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/4aacaf6f
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/4aacaf6f
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/4aacaf6f

Branch: refs/heads/master
Commit: 4aacaf6fc57b26195634c09c0433a24f5d4a778e
Parents: 8524516
Author: Adar Dembo <ad...@cloudera.com>
Authored: Mon Nov 28 17:31:29 2016 -0800
Committer: Todd Lipcon <to...@apache.org>
Committed: Wed Nov 30 21:24:01 2016 +0000

----------------------------------------------------------------------
 src/kudu/fs/block_manager-test.cc | 65 +++++++++++++++++++++++++++-------
 src/kudu/fs/log_block_manager.cc  | 39 ++++++++++++++++++++
 2 files changed, 92 insertions(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/4aacaf6f/src/kudu/fs/block_manager-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/fs/block_manager-test.cc b/src/kudu/fs/block_manager-test.cc
index b484a4b..a854717 100644
--- a/src/kudu/fs/block_manager-test.cc
+++ b/src/kudu/fs/block_manager-test.cc
@@ -326,26 +326,22 @@ void BlockManagerTest<LogBlockManager>::RunLogContainerPreallocationTest() {
   ASSERT_OK(written_block->Append(kTestData));
   ASSERT_OK(written_block->Close());
 
-  // We expect the container size to either be equal to the test data size (if
-  // preallocation isn't supported) or equal to the preallocation amount, which
-  // we know is greater than the test data size.
+  // We expect the container size to be equal to the preallocation amount,
+  // which we know is greater than the test data size.
   string container_data_filename;
   NO_FATALS(GetOnlyContainerDataFile(&container_data_filename));
   uint64_t size;
   ASSERT_OK(env_->GetFileSizeOnDisk(container_data_filename, &size));
-  ASSERT_TRUE(size == kTestData.size() ||
-              size == FLAGS_log_container_preallocate_bytes);
+  ASSERT_EQ(FLAGS_log_container_preallocate_bytes, size);
 
-  // Upon writing a second block, we'd expect the container to either double in
-  // size (without preallocation) or remain the same size (with preallocation).
+  // Upon writing a second block, we'd expect the container to remain the same
+  // size.
   ASSERT_OK(bm_->CreateBlock(&written_block));
   ASSERT_OK(written_block->Append(kTestData));
   ASSERT_OK(written_block->Close());
   NO_FATALS(GetOnlyContainerDataFile(&container_data_filename));
   ASSERT_OK(env_->GetFileSizeOnDisk(container_data_filename, &size));
-  ASSERT_TRUE(size == kTestData.size() * 2 ||
-              size == FLAGS_log_container_preallocate_bytes);
-
+  ASSERT_EQ(FLAGS_log_container_preallocate_bytes, size);
 
   // Now reopen the block manager and create another block. The block manager
   // should be smart enough to reuse the previously preallocated amount.
@@ -358,8 +354,7 @@ void BlockManagerTest<LogBlockManager>::RunLogContainerPreallocationTest() {
   ASSERT_OK(written_block->Close());
   NO_FATALS(GetOnlyContainerDataFile(&container_data_filename));
   ASSERT_OK(env_->GetFileSizeOnDisk(container_data_filename, &size));
-  ASSERT_TRUE(size == kTestData.size() * 3 ||
-              size == FLAGS_log_container_preallocate_bytes);
+  ASSERT_EQ(FLAGS_log_container_preallocate_bytes, size);
 }
 
 template <>
@@ -1027,6 +1022,52 @@ TEST_F(LogBlockManagerTest, TestAppendExceedsPreallocation) {
   ASSERT_OK(writer->Append("hello world"));
 }
 
+TEST_F(LogBlockManagerTest, TestPreallocationAndTruncation) {
+  RETURN_NOT_LOG_BLOCK_MANAGER();
+
+  // Ensure preallocation window is greater than the container size itself.
+  FLAGS_log_container_max_size = 1024 * 1024;
+  FLAGS_log_container_preallocate_bytes = 32 * 1024 * 1024;
+
+  // Fill up one container.
+  gscoped_ptr<WritableBlock> writer;
+  ASSERT_OK(bm_->CreateBlock(&writer));
+  unique_ptr<uint8_t[]> data(new uint8_t[FLAGS_log_container_max_size]);
+  memset(data.get(), 0, FLAGS_log_container_max_size);
+  ASSERT_OK(writer->Append({ data.get(), FLAGS_log_container_max_size } ));
+  string fname;
+  NO_FATALS(GetOnlyContainerDataFile(&fname));
+  uint64_t size_after_append;
+  ASSERT_OK(env_->GetFileSize(fname, &size_after_append));
+  ASSERT_EQ(FLAGS_log_container_preallocate_bytes, size_after_append);
+
+  // Close it. The extra preallocated space should be truncated off the file.
+  ASSERT_OK(writer->Close());
+  uint64_t size_after_close;
+  ASSERT_OK(env_->GetFileSize(fname, &size_after_close));
+  ASSERT_EQ(FLAGS_log_container_max_size, size_after_close);
+
+  // For the sake of testing, artificially double the file's size.
+  unique_ptr<RWFile> data_file;
+  RWFileOptions opts;
+  opts.mode = Env::OPEN_EXISTING;
+  ASSERT_OK(env_->NewRWFile(opts, fname, &data_file));
+  ASSERT_OK(data_file->PreAllocate(size_after_close, size_after_close));
+  uint64_t size_after_preallocate;
+  ASSERT_OK(env_->GetFileSize(fname, &size_after_preallocate));
+  ASSERT_EQ(size_after_close * 2, size_after_preallocate);
+
+  // Now reopen the block manager. It should notice that the container grew
+  // and truncate the extra preallocated space off again.
+  ASSERT_OK(ReopenBlockManager(scoped_refptr<MetricEntity>(),
+                               shared_ptr<MemTracker>(),
+                               { this->test_dir_ },
+                               false));
+  uint64_t size_after_reopen;
+  ASSERT_OK(env_->GetFileSize(fname, &size_after_reopen));
+  ASSERT_EQ(FLAGS_log_container_max_size, size_after_reopen);
+}
+
 TYPED_TEST(BlockManagerTest, TestDiskSpaceCheck) {
   // Reopen the block manager with metrics enabled.
   MetricRegistry registry;

http://git-wip-us.apache.org/repos/asf/kudu/blob/4aacaf6f/src/kudu/fs/log_block_manager.cc
----------------------------------------------------------------------
diff --git a/src/kudu/fs/log_block_manager.cc b/src/kudu/fs/log_block_manager.cc
index 874a282..cce6d84 100644
--- a/src/kudu/fs/log_block_manager.cc
+++ b/src/kudu/fs/log_block_manager.cc
@@ -234,6 +234,16 @@ class LogBlockContainer {
   // TODO(unknown): Add support to synchronize just a range.
   Status SyncMetadata();
 
+  // Truncates this container's data file to 'total_bytes_written_' if it is
+  // full. This effectively removes any preallocated but unused space.
+  //
+  // Should be called only when 'total_bytes_written_' is up-to-date with
+  // respect to the data on disk (i.e. after the container's records have
+  // been loaded), otherwise data may be lost!
+  //
+  // This function is thread unsafe.
+  Status TruncateDataToTotalBytesWritten();
+
   // Reads the container's metadata from disk, sanity checking and
   // returning the records.
   Status ReadContainerRecords(deque<BlockRecordPB>* records) const;
@@ -438,6 +448,15 @@ Status LogBlockContainer::Open(LogBlockManager* block_manager,
   return Status::OK();
 }
 
+Status LogBlockContainer::TruncateDataToTotalBytesWritten() {
+  if (full() && preallocated_offset_ > total_bytes_written_) {
+    VLOG(2) << Substitute("Truncating container $0 to offset $1",
+                          ToString(), total_bytes_written_);
+    RETURN_NOT_OK(data_file_->Truncate(total_bytes_written_));
+  }
+  return Status::OK();
+}
+
 Status LogBlockContainer::ReadContainerRecords(deque<BlockRecordPB>* records) const {
   string metadata_path = metadata_file_->filename();
   unique_ptr<RandomAccessFile> metadata_reader;
@@ -539,6 +558,14 @@ Status LogBlockContainer::FinishBlock(const Status& s, WritableBlock* block) {
                                      block->BytesAppended()));
   UpdateBytesWritten(0);
 
+  // Truncate the container if it's now full; any left over preallocated space
+  // is no longer needed.
+  //
+  // Note that this take places _after_ the container has been synced to disk.
+  // That's OK; truncation isn't needed for correctness, and in the event of a
+  // crash, it will be retried at startup.
+  RETURN_NOT_OK(TruncateDataToTotalBytesWritten());
+
   if (full() && block_manager()->metrics()) {
     block_manager()->metrics()->full_containers->Increment();
   }
@@ -1479,6 +1506,18 @@ void LogBlockManager::OpenDataDir(DataDir* dir,
       ProcessBlockRecord(r, container.get(), &blocks_in_container);
       max_block_id = std::max(max_block_id, r.block_id().id());
     }
+
+    // Having processed the block records, it is now safe to truncate the
+    // preallocated space off of the end of the container. This is a no-op for
+    // non-full containers, where excess preallocated space is expected to be
+    // (eventually) used.
+    s = container->TruncateDataToTotalBytesWritten();
+    if (!s.ok()) {
+      *result_status = s.CloneAndPrepend(Substitute(
+          "Could not truncate container $0", container->ToString()));
+      return;
+    }
+
     next_block_id_.StoreMax(max_block_id + 1);
 
     // Under the lock, merge this map into the main block map and add


[7/9] kudu git commit: KUDU-1770 [i-tests] test for timestamp propagation with write ops

Posted by to...@apache.org.
KUDU-1770 [i-tests] test for timestamp propagation with write ops

Added an integration test to verify that C++ client propagates
timestamp with write operations.

The test is disabled as it currently fails. A follow up patch will fix
the bug and enable the test.

This is in the context of the following JIRA item:
KUDU-1770 C++ client: propagate timestamp for write operations

Change-Id: I1be285fd860f0608f5d4ce3be57c4e4b04c63455
Reviewed-on: http://gerrit.cloudera.org:8080/5268
Reviewed-by: David Ribeiro Alves <dr...@apache.org>
Tested-by: Kudu Jenkins


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/eb1f4540
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/eb1f4540
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/eb1f4540

Branch: refs/heads/master
Commit: eb1f45404745d30a5de62eda407862b701c7fb26
Parents: d81aa80
Author: Alexey Serbin <as...@cloudera.com>
Authored: Tue Nov 29 15:49:34 2016 -0800
Committer: Alexey Serbin <as...@cloudera.com>
Committed: Wed Nov 30 22:34:17 2016 +0000

----------------------------------------------------------------------
 src/kudu/integration-tests/consistency-itest.cc | 80 ++++++++++++++++++++
 1 file changed, 80 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/eb1f4540/src/kudu/integration-tests/consistency-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/consistency-itest.cc b/src/kudu/integration-tests/consistency-itest.cc
index a69f128..2d763c4 100644
--- a/src/kudu/integration-tests/consistency-itest.cc
+++ b/src/kudu/integration-tests/consistency-itest.cc
@@ -353,6 +353,86 @@ TEST_F(ConsistencyITest, TestTimestampPropagationFromScans) {
   }
 }
 
+// Make sure the client propagates the timestamp for write operations.
+//
+// The idea of verification is simple:
+//
+//   * Let's get two tablet servers, where the clock of the first server
+//     is ahead of the second one.
+//
+//   * Create a client object.
+//
+//   * Using the newly created client object, insert some data into the tablet
+//     hosted by the first server.
+//
+//   * Record the client's latest observed timestamp.
+//
+//   * Using the same client object, insert some data into the tablet
+//     hosted by the second server.
+//
+//   * Get the client's latest observed timestamp: it should be strictly greater
+//     than the recorded timestamp.
+//
+//   * Make a full table scan at in READ_AT_TIMESTAMP mode at 'ts_ref'
+//     timestamp: the scan should retrieve only the first row.
+//
+// If the client propates the timestamps, the second server should receive
+// the recorded timestamp value in write request in the 'propagated_timestamp'
+// field and adjust its clock first. After that it should perform the requested
+// write operation. Since a write operation should always advance the server
+// clock, the resulting timestamp returned to the client should be strictly
+// greater than the propagated one.
+TEST_F(ConsistencyITest, DISABLED_TestTimestampPropagationForWriteOps) {
+  const int32_t offset_usec = FLAGS_max_clock_sync_error_usec;
+  // Assuming the offset is specified as a positive number.
+  ASSERT_GT(offset_usec, 0);
+  // Need to have at least one row in the first partition starting with key 0.
+  ASSERT_LE(1, key_split_value_);
+
+  uint64_t ts_ref;
+  {
+    shared_ptr<KuduClient> client;
+    ASSERT_OK(cluster_->CreateClient(nullptr, &client));
+    ASSERT_OK(CreateTable(client.get(), table_name_));
+    shared_ptr<KuduTable> table;
+    ASSERT_OK(client->OpenTable(table_name_, &table));
+
+    // Advance tablet server's clock hosting the first key range
+    // (i.e. for the row which is about to be inserted below).
+    ASSERT_OK(UpdateClockForTabletHostingKey(
+        0, MonoDelta::FromMicroseconds(offset_usec)));
+
+    // Insert 1 row into the first tablet.
+    ASSERT_OK(InsertTestRows(client.get(), table.get(), 1, 0));
+    // Retrieve the latest observed timestamp.
+    ts_ref = client->GetLatestObservedTimestamp();
+
+    // Insert 1 row into the second tablet.
+    ASSERT_OK(InsertTestRows(client.get(), table.get(), 1, key_split_value_));
+    // Retrieve the latest observed timestamp.
+    const uint64_t ts = client->GetLatestObservedTimestamp();
+
+    // If the client propagates the timestamp with write operations,
+    // the timestamp received from the second server should be greater
+    // than the timestamp received from the first server.
+    EXPECT_GT(ts, ts_ref);
+  }
+
+  // An additional check: scan the table at the 'ts_ref' timestamp and
+  // make sure only the first row is visible.
+  {
+    shared_ptr<KuduClient> client;
+    ASSERT_OK(cluster_->CreateClient(nullptr, &client));
+    shared_ptr<KuduTable> table;
+    ASSERT_OK(client->OpenTable(table_name_, &table));
+
+    size_t row_count;
+    ASSERT_OK(GetRowCount(table.get(), KuduScanner::READ_AT_SNAPSHOT,
+                          ts_ref, &row_count));
+    EXPECT_EQ(1, row_count);
+  }
+}
+
 // This is a test for KUDU-1189. It verifies that in case of a READ_AT_SNAPSHOT
 // scan with unspecified snapshot timestamp, the scanner picks timestamp from
 // the first server that the data is read from. If the scan spans multiple