Posted to commits@kudu.apache.org by al...@apache.org on 2022/06/16 15:22:20 UTC

[kudu] 01/02: KUDU-2671 more robust convention on specifying range bounds

This is an automated email from the ASF dual-hosted git repository.

alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit 295b4903bc69fabb3cb36f618022d465c91954c7
Author: Alexey Serbin <al...@apache.org>
AuthorDate: Fri Jun 3 14:29:58 2022 -0700

    KUDU-2671 more robust convention on specifying range bounds
    
    This patch updates the catalog manager code to adhere to a more robust
    convention for specifying range partition boundaries when creating
    a table with custom hash schemas per range.
    
    Prior to this patch, the catalog manager required both the
    CreateTableRequestPB::split_rows_range_bounds and the
    CreateTableRequestPB::partition_schema::custom_hash_schema_ranges fields
    to have the same number of elements, assuming the ranges in the former
    corresponded exactly to the ranges in the latter, with the latter also
    carrying the hash schema for each corresponding range.  In addition to
    duplicating information unnecessarily, that approach was also brittle
    from the standpoint of good API design.
    
    This patch updates the code to use a new convention: if at least one
    range partition with a custom hash schema is present in a CreateTable
    RPC, all the information on range boundaries and hash schemas must be
    supplied via a single field,
    CreateTableRequestPB::partition_schema::custom_hash_schema_ranges
    (see the sketch below).  That's better than the previous convention
    because:
      * it's more robust, as explained above
      * it naturally follows from the restriction of not allowing split
        rows along with range partitions that have custom hash schemas
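
    For example, under the new convention a client (or a test helper, as in
    the updated tests below) builds a CreateTable request with one
    custom-hash-schema range roughly as follows.  This is a minimal
    illustrative sketch, not code from the patch: the table name, the
    'lower'/'upper' KuduPartialRow bounds, the 'schema' object, and the
    3-bucket hash on the "key" column are placeholders.

      // Inside a Status-returning helper; 'schema', 'lower', and 'upper'
      // are assumed to be set up already.
      CreateTableRequestPB req;
      req.set_name("example_table");
      RETURN_NOT_OK(SchemaToPB(schema, req.mutable_schema()));

      // All range-boundary and hash-schema information goes into
      // partition_schema.custom_hash_schema_ranges; the
      // split_rows_range_bounds field stays empty.
      auto* ps_pb = req.mutable_partition_schema();
      auto* range = ps_pb->add_custom_hash_schema_ranges();
      RowOperationsPBEncoder encoder(range->mutable_range_bounds());
      encoder.Add(RowOperationsPB::RANGE_LOWER_BOUND, lower);
      encoder.Add(RowOperationsPB::RANGE_UPPER_BOUND, upper);

      // The hash schema for this particular range: 3 buckets on "key".
      auto* dimension = range->add_hash_schema();
      dimension->add_columns()->set_name("key");
      dimension->set_num_buckets(3);
      dimension->set_seed(0);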
    
    Also, I updated the existing tests and added extra test scenarios
    to cover the updated functionality.
    
    Change-Id: I14073e72178e6bb85bae719ad377c5bb05f8dd55
    Reviewed-on: http://gerrit.cloudera.org:8080/18590
    Tested-by: Alexey Serbin <al...@apache.org>
    Reviewed-by: Mahesh Reddy <mr...@cloudera.com>
    Reviewed-by: Attila Bukor <ab...@apache.org>
---
 src/kudu/client/client.cc                          | 14 ++--
 .../integration-tests/table_locations-itest.cc     | 72 +++++++++----------
 src/kudu/master/catalog_manager.cc                 | 21 +++---
 src/kudu/master/master-test.cc                     | 84 ++++++++++++++--------
 4 files changed, 101 insertions(+), 90 deletions(-)

diff --git a/src/kudu/client/client.cc b/src/kudu/client/client.cc
index 61349527a..7d11315f4 100644
--- a/src/kudu/client/client.cc
+++ b/src/kudu/client/client.cc
@@ -971,24 +971,26 @@ Status KuduTableCreator::Create() {
       return Status::InvalidArgument("range bounds must not be null");
     }
 
-    RowOperationsPB_Type lower_bound_type =
+    const RowOperationsPB_Type lower_bound_type =
         range->lower_bound_type_ == KuduTableCreator::INCLUSIVE_BOUND
         ? RowOperationsPB::RANGE_LOWER_BOUND
         : RowOperationsPB::EXCLUSIVE_RANGE_LOWER_BOUND;
 
-    RowOperationsPB_Type upper_bound_type =
+    const RowOperationsPB_Type upper_bound_type =
         range->upper_bound_type_ == KuduTableCreator::EXCLUSIVE_BOUND
         ? RowOperationsPB::RANGE_UPPER_BOUND
         : RowOperationsPB::INCLUSIVE_RANGE_UPPER_BOUND;
 
-    splits_encoder.Add(lower_bound_type, *range->lower_bound_);
-    splits_encoder.Add(upper_bound_type, *range->upper_bound_);
-
-    if (has_range_with_custom_hash_schema) {
+    if (!has_range_with_custom_hash_schema) {
+      splits_encoder.Add(lower_bound_type, *range->lower_bound_);
+      splits_encoder.Add(upper_bound_type, *range->upper_bound_);
+    } else {
       auto* range_pb = partition_schema->add_custom_hash_schema_ranges();
       RowOperationsPBEncoder encoder(range_pb->mutable_range_bounds());
       encoder.Add(lower_bound_type, *range->lower_bound_);
       encoder.Add(upper_bound_type, *range->upper_bound_);
+      // Now that the range bounds have been added, add the information
+      // on the hash schema for the range.
       if (range->is_table_wide_hash_schema_) {
         // With the presence of a range with custom hash schema when the
         // table-wide hash schema is used for this particular range, also add an
diff --git a/src/kudu/integration-tests/table_locations-itest.cc b/src/kudu/integration-tests/table_locations-itest.cc
index a274cdeb7..babf660cf 100644
--- a/src/kudu/integration-tests/table_locations-itest.cc
+++ b/src/kudu/integration-tests/table_locations-itest.cc
@@ -162,12 +162,18 @@ class TableLocationsTest : public KuduTest {
   };
   typedef vector<HashDimension> HashSchema;
 
+  struct RangeWithHashSchema {
+    KuduPartialRow lower;
+    KuduPartialRow upper;
+    HashSchema hash_schema;
+  };
+
   Status CreateTable(
       const string& table_name,
       const Schema& schema,
       const vector<KuduPartialRow>& split_rows = {},
       const vector<pair<KuduPartialRow, KuduPartialRow>>& bounds = {},
-      const vector<HashSchema>& range_hash_schemas = {},
+      const vector<RangeWithHashSchema>& ranges_with_hash_schemas = {},
       const HashSchema& table_hash_schema = {});
 
   void CreateTable(const string& table_name, int num_splits);
@@ -185,14 +191,8 @@ Status TableLocationsTest::CreateTable(
     const Schema& schema,
     const vector<KuduPartialRow>& split_rows,
     const vector<pair<KuduPartialRow, KuduPartialRow>>& bounds,
-    const vector<HashSchema>& range_hash_schemas,
+    const vector<RangeWithHashSchema>& ranges_with_hash_schemas,
     const HashSchema& table_hash_schema) {
-
-  if (!range_hash_schemas.empty() && range_hash_schemas.size() != bounds.size()) {
-    return Status::InvalidArgument(
-        "'bounds' and 'range_hash_schemas' must be of the same size");
-  }
-
   CreateTableRequestPB req;
   req.set_name(table_name);
   RETURN_NOT_OK(SchemaToPB(schema, req.mutable_schema()));
@@ -206,14 +206,12 @@ Status TableLocationsTest::CreateTable(
   }
 
   auto* ps_pb = req.mutable_partition_schema();
-  for (size_t i = 0; i < range_hash_schemas.size(); ++i) {
-    const auto& bound = bounds[i];
-    const auto& hash_schema = range_hash_schemas[i];
+  for (const auto& range_and_hash_schema : ranges_with_hash_schemas) {
     auto* range = ps_pb->add_custom_hash_schema_ranges();
     RowOperationsPBEncoder encoder(range->mutable_range_bounds());
-    encoder.Add(RowOperationsPB::RANGE_LOWER_BOUND, bound.first);
-    encoder.Add(RowOperationsPB::RANGE_UPPER_BOUND, bound.second);
-    for (const auto& hash_dimension : hash_schema) {
+    encoder.Add(RowOperationsPB::RANGE_LOWER_BOUND, range_and_hash_schema.lower);
+    encoder.Add(RowOperationsPB::RANGE_UPPER_BOUND, range_and_hash_schema.upper);
+    for (const auto& hash_dimension : range_and_hash_schema.hash_schema) {
       auto* hash_dimension_pb = range->add_hash_schema();
       for (const string& col_name : hash_dimension.columns) {
         hash_dimension_pb->add_columns()->set_name(col_name);
@@ -492,23 +490,18 @@ TEST_F(TableLocationsTest, RangeSpecificHashingSameDimensions) {
   ASSERT_OK(bounds[4].first.SetStringNoCopy(0, "g"));
   ASSERT_OK(bounds[4].second.SetStringNoCopy(0, "h"));
 
-  vector<HashSchema> range_hash_schemas;
-  HashSchema hash_schema_5 = { { { "str_0", "int_1" }, 5, 0 } };
-  range_hash_schemas.emplace_back(hash_schema_5);
-  HashSchema hash_schema_4 = { { { "str_0" }, 4, 1 } };
-  range_hash_schemas.emplace_back(hash_schema_4);
-  HashSchema hash_schema_3 = { { { "int_1" }, 3, 2 } };
-  range_hash_schemas.emplace_back(hash_schema_3);
-  HashSchema hash_schema_6 = { { { "str_2", "str_0" }, 6, 3 } };
-  range_hash_schemas.emplace_back(hash_schema_6);
-
-  // Use 2 bucket hash schema as the table-wide one.
-  HashSchema table_hash_schema_2 = { { { "str_2" }, 2, 4 } };
-  // Apply table-wide hash schema applied to this range.
-  range_hash_schemas.emplace_back(table_hash_schema_2);
-
-  ASSERT_OK(CreateTable(
-      table_name, schema, {}, bounds, range_hash_schemas, table_hash_schema_2));
+  const HashSchema table_wide_hash_schema{ { { "str_2" }, 2, 4 } };
+
+  const vector<RangeWithHashSchema> ranges_with_hash_schemas{
+    { bounds[0].first, bounds[0].second, { { { "str_0", "int_1" }, 5, 0 } } },
+    { bounds[1].first, bounds[1].second, { { { "str_0" }, 4, 1 } } },
+    { bounds[2].first, bounds[2].second, { { { "int_1" }, 3, 2 } } },
+    { bounds[3].first, bounds[3].second, { { { "str_2", "str_0" }, 6, 3 } } },
+    { bounds[4].first, bounds[4].second, table_wide_hash_schema },
+  };
+
+  ASSERT_OK(CreateTable(table_name, schema, {}, {},
+                        ranges_with_hash_schemas, table_wide_hash_schema));
   NO_FATALS(CheckMasterTableCreation(table_name, 20));
 
   // The default setting for GetTableLocationsRequestPB::max_returned_locations
@@ -573,17 +566,16 @@ TEST_F(TableLocationsTest, RangeSpecificHashingVaryingDimensions) {
   ASSERT_OK(bounds[1].first.SetStringNoCopy(0, "c"));
   ASSERT_OK(bounds[1].second.SetStringNoCopy(0, "d"));
 
-  vector<HashSchema> range_hash_schemas;
-  HashSchema hash_schema_3_by_2 = { { { "key" }, 3, 0 }, { { "val" }, 2, 1 } };
-  range_hash_schemas.emplace_back(hash_schema_3_by_2);
+  const HashSchema table_wide_hash_schema{ { { "val" }, 4, 2 } };
 
-  // Use 4 bucket hash schema as the table-wide one.
-  HashSchema table_hash_schema_4 = { { { "val" }, 4, 2 } };
-  // Apply table-wide hash schema applied to this range.
-  range_hash_schemas.emplace_back(table_hash_schema_4);
+  const vector<RangeWithHashSchema> ranges_with_hash_schemas{
+    { bounds[0].first, bounds[0].second,
+          { { { { "key" }, 3, 0 }, { { "val" }, 2, 1 } } } },
+    { bounds[1].first, bounds[1].second, table_wide_hash_schema },
+  };
 
-  const auto s = CreateTable(
-      table_name, schema, {}, bounds, range_hash_schemas, table_hash_schema_4);
+  const auto s = CreateTable(table_name, schema, {}, {},
+                             ranges_with_hash_schemas, table_wide_hash_schema);
   ASSERT_TRUE(s.IsNotSupported()) << s.ToString();
   ASSERT_STR_CONTAINS(s.ToString(),
       "varying number of hash dimensions per range is not yet supported");
diff --git a/src/kudu/master/catalog_manager.cc b/src/kudu/master/catalog_manager.cc
index 374587be6..726499d56 100644
--- a/src/kudu/master/catalog_manager.cc
+++ b/src/kudu/master/catalog_manager.cc
@@ -1874,7 +1874,7 @@ Status CatalogManager::CreateTable(const CreateTableRequestPB* orig_req,
       PartitionSchema::FromPB(req.partition_schema(), schema, &partition_schema),
       resp, MasterErrorPB::INVALID_SCHEMA));
 
-  // Decode split rows.
+  // Decode split rows and range bounds.
   vector<KuduPartialRow> split_rows;
   vector<pair<KuduPartialRow, KuduPartialRow>> range_bounds;
 
@@ -1883,7 +1883,7 @@ Status CatalogManager::CreateTable(const CreateTableRequestPB* orig_req,
   vector<DecodedRowOperation> ops;
   RETURN_NOT_OK(decoder.DecodeOperations<DecoderMode::SPLIT_ROWS>(&ops));
 
-  for (int i = 0; i < ops.size(); i++) {
+  for (size_t i = 0; i < ops.size(); ++i) {
     const DecodedRowOperation& op = ops[i];
     switch (op.type) {
       case RowOperationsPB::SPLIT_ROW: {
@@ -1920,21 +1920,15 @@ Status CatalogManager::CreateTable(const CreateTableRequestPB* orig_req,
   vector<Partition> partitions;
   if (const auto& ps = req.partition_schema();
       FLAGS_enable_per_range_hash_schemas && !ps.custom_hash_schema_ranges().empty()) {
-    // TODO(aserbin): in addition, should switch to specifying range information
-    //                only via 'PartitionSchemaPB::custom_hash_schema_ranges' or
-    //                'CreateTableRequestPB::split_rows_range_bounds', don't mix
     if (!split_rows.empty()) {
       return Status::InvalidArgument(
-          "both split rows and custom hash schema ranges cannot be "
+          "both split rows and custom hash schema ranges must not be "
           "populated at the same time");
     }
-    if (const auto ranges_with_hash_schemas_size =
-            partition_schema.ranges_with_hash_schemas().size();
-        range_bounds.size() != ranges_with_hash_schemas_size) {
+    if (!range_bounds.empty()) {
       return Status::InvalidArgument(
-          Substitute("$0 vs $1: per range hash schemas and range bounds "
-                     "must have the same size",
-                     ranges_with_hash_schemas_size, range_bounds.size()));
+          "both range bounds and custom hash schema ranges must not be "
+          "populated at the same time");
     }
     // Create partitions based on specified ranges with custom hash schemas.
     RETURN_NOT_OK(partition_schema.CreatePartitions(schema, &partitions));
@@ -1949,7 +1943,8 @@ Status CatalogManager::CreateTable(const CreateTableRequestPB* orig_req,
   // of hash dimensions as all the partitions with custom hash schemas.
   //
   // TODO(aserbin): remove the restriction once the rest of the code is ready
-  //                to handle range partitions with arbitrary hash schemas
+  //                to handle range partitions with an arbitrary number of
+  //                hash dimensions in their hash schemas
   CHECK(!partitions.empty());
   const auto hash_dimensions_num = partition_schema.hash_schema().size();
   for (const auto& p : partitions) {
diff --git a/src/kudu/master/master-test.cc b/src/kudu/master/master-test.cc
index d91dbb653..98ab68973 100644
--- a/src/kudu/master/master-test.cc
+++ b/src/kudu/master/master-test.cc
@@ -183,6 +183,12 @@ class MasterTest : public KuduTest {
   };
   typedef vector<HashDimension> HashSchema;
 
+  struct RangeWithHashSchema {
+    KuduPartialRow lower;
+    KuduPartialRow upper;
+    HashSchema hash_schema;
+  };
+
   void DoListTables(const ListTablesRequestPB& req, ListTablesResponsePB* resp);
   void DoListAllTables(ListTablesResponsePB* resp);
 
@@ -196,7 +202,7 @@ class MasterTest : public KuduTest {
                      const Schema& schema,
                      const vector<KuduPartialRow>& split_rows,
                      const vector<pair<KuduPartialRow, KuduPartialRow>>& bounds = {},
-                     const vector<HashSchema>& range_hash_schemas = {});
+                     const vector<RangeWithHashSchema>& ranges_with_hash_schemas = {});
 
 
   Status CreateTable(const string& name,
@@ -206,7 +212,7 @@ class MasterTest : public KuduTest {
                      const optional<string>& comment,
                      const vector<KuduPartialRow>& split_rows,
                      const vector<pair<KuduPartialRow, KuduPartialRow>>& bounds,
-                     const vector<HashSchema>& range_hash_schemas,
+                     const vector<RangeWithHashSchema>& ranges_with_hash_schemas,
                      const HashSchema& table_wide_hash_schema,
                      CreateTableResponsePB* resp);
 
@@ -237,10 +243,10 @@ Status MasterTest::CreateTable(
     const Schema& schema,
     const vector<KuduPartialRow>& split_rows,
     const vector<pair<KuduPartialRow, KuduPartialRow>>& bounds,
-    const vector<HashSchema>& range_hash_schemas) {
+    const vector<RangeWithHashSchema>& ranges_with_hash_schemas) {
   CreateTableResponsePB resp;
-  return CreateTable(
-        name, schema, none, none, none, split_rows, bounds, range_hash_schemas, {}, &resp);
+  return CreateTable(name, schema, none, none, none, split_rows, bounds,
+                     ranges_with_hash_schemas, {}, &resp);
 }
 
 Status MasterTest::CreateTable(
@@ -251,15 +257,9 @@ Status MasterTest::CreateTable(
     const optional<string>& comment,
     const vector<KuduPartialRow>& split_rows,
     const vector<pair<KuduPartialRow, KuduPartialRow>>& bounds,
-    const vector<HashSchema>& range_hash_schemas,
+    const vector<RangeWithHashSchema>& ranges_with_hash_schemas,
     const HashSchema& table_wide_hash_schema,
     CreateTableResponsePB* resp) {
-
-  if (!range_hash_schemas.empty() && range_hash_schemas.size() != bounds.size()) {
-    return Status::InvalidArgument(
-        "'bounds' and 'range_hash_schemas' must be of the same size");
-  }
-
   CreateTableRequestPB req;
   req.set_name(name);
   if (type) {
@@ -276,7 +276,6 @@ Status MasterTest::CreateTable(
   }
 
   auto* ps_pb = req.mutable_partition_schema();
-
   for (const auto& hash_dimension : table_wide_hash_schema) {
     auto* hash_schema = ps_pb->add_hash_schema();
     for (const string& col_name : hash_dimension.columns) {
@@ -286,14 +285,12 @@ Status MasterTest::CreateTable(
     hash_schema->set_seed(hash_dimension.num_buckets);
   }
 
-  for (size_t i = 0; i < range_hash_schemas.size(); ++i) {
+  for (const auto& range_and_hs : ranges_with_hash_schemas) {
     auto* range = ps_pb->add_custom_hash_schema_ranges();
     RowOperationsPBEncoder encoder(range->mutable_range_bounds());
-    const auto& bound = bounds[i];
-    encoder.Add(RowOperationsPB::RANGE_LOWER_BOUND, bound.first);
-    encoder.Add(RowOperationsPB::RANGE_UPPER_BOUND, bound.second);
-    const auto& range_hash_schema = range_hash_schemas[i];
-    for (const auto& hash_dimension : range_hash_schema) {
+    encoder.Add(RowOperationsPB::RANGE_LOWER_BOUND, range_and_hs.lower);
+    encoder.Add(RowOperationsPB::RANGE_UPPER_BOUND, range_and_hs.upper);
+    for (const auto& hash_dimension : range_and_hs.hash_schema) {
       auto* hash_dimension_pb = range->add_hash_schema();
       for (const string& col_name : hash_dimension.columns) {
         hash_dimension_pb->add_columns()->set_name(col_name);
@@ -1031,26 +1028,49 @@ TEST_F(MasterTest, TestCreateTableCheckRangeInvariants) {
                         "least one range partition column");
   }
 
-  // No split rows and range specific hashing concurrently.
+  // In case of custom hash schemas per range, don't supply the information
+  // on split rows.
   {
     google::FlagSaver flag_saver;
-    FLAGS_enable_per_range_hash_schemas = true; // enable for testing.
+    FLAGS_enable_per_range_hash_schemas = true;
     KuduPartialRow split1(&kTableSchema);
     ASSERT_OK(split1.SetInt32("key", 1));
     KuduPartialRow a_lower(&kTableSchema);
     KuduPartialRow a_upper(&kTableSchema);
     ASSERT_OK(a_lower.SetInt32("key", 0));
     ASSERT_OK(a_upper.SetInt32("key", 100));
-    vector<HashSchema> range_hash_schemas = {{}};
-    Status s = CreateTable(kTableName,
-                           kTableSchema,
-                           { split1 },
-                           { { a_lower, a_upper } },
-                           range_hash_schemas);
+    const auto s = CreateTable(kTableName,
+                               kTableSchema,
+                               {split1},
+                               {},
+                               {{ a_lower, a_upper, {}}});
     ASSERT_TRUE(s.IsInvalidArgument()) << s.ToString();
     ASSERT_STR_CONTAINS(s.ToString(),
                         "both split rows and custom hash schema ranges "
-                        "cannot be populated at the same time");
+                        "must not be populated at the same time");
+  }
+
+  // In case of custom hash schemas per range, don't supply the information
+  // on the range partition boundaries via both the
+  // CreateTableRequestPB::split_rows_range_bounds and the
+  // CreateTableRequestPB::partition_schema::custom_hash_schema_ranges fields.
+  {
+    google::FlagSaver flag_saver;
+    FLAGS_enable_per_range_hash_schemas = true;
+    KuduPartialRow a_lower(&kTableSchema);
+    KuduPartialRow a_upper(&kTableSchema);
+    ASSERT_OK(a_lower.SetInt32("key", 0));
+    ASSERT_OK(a_upper.SetInt32("key", 100));
+    vector<HashSchema> range_hash_schemas = {{}};
+    const auto s = CreateTable(kTableName,
+                               kTableSchema,
+                               vector<KuduPartialRow>(),
+                               {{ a_lower, a_upper }},
+                               {{ a_lower, a_upper, {}}});
+    ASSERT_TRUE(s.IsInvalidArgument()) << s.ToString();
+    ASSERT_STR_CONTAINS(s.ToString(),
+                        "both range bounds and custom hash schema ranges "
+                        "must not be populated at the same time");
   }
 
   // No non-range columns.
@@ -1295,9 +1315,11 @@ TEST_F(MasterTest, NonPrimaryKeyColumnsForPerRangeCustomHashSchema) {
   KuduPartialRow upper(&kTableSchema);
   ASSERT_OK(lower.SetInt32("key", 0));
   ASSERT_OK(upper.SetInt32("key", 100));
-  vector<HashSchema> range_hash_schemas{{{{"int32_val"}, 2, 0}}};
-  const auto s = CreateTable(
-      kTableName, kTableSchema, {}, { { lower, upper } }, range_hash_schemas);
+  const auto s = CreateTable(kTableName,
+                             kTableSchema,
+                             {},
+                             {},
+                             {{ lower, upper, {{{"int32_val"}, 2, 0}}}});
   ASSERT_TRUE(s.IsInvalidArgument()) << s.ToString();
   ASSERT_STR_CONTAINS(s.ToString(),
                       "must specify only primary key columns for "