You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by al...@apache.org on 2022/08/03 19:12:05 UTC

[kudu] 02/02: KUDU-2671 range-specific hash schemas in 'kudu table create' CLI

This is an automated email from the ASF dual-hosted git repository.

alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit 18d40679a441068072856995a3fb6d4cabfe6d68
Author: Alexey Serbin <al...@apache.org>
AuthorDate: Fri Jul 29 14:07:37 2022 -0700

    KUDU-2671 range-specific hash schemas in 'kudu table create' CLI
    
    This patch adds support for range-specific hash schemas to the
    'kudu table create' CLI tool.  This patch also contains a test scenario
    to cover the newly introduced functionality.
    
    Change-Id: I94aab482792ef93754b6475e1390b8f0c4a05678
    Reviewed-on: http://gerrit.cloudera.org:8080/18809
    Reviewed-by: Mahesh Reddy <mr...@cloudera.com>
    Tested-by: Alexey Serbin <al...@apache.org>
    Reviewed-by: Yingchun Lai <ac...@gmail.com>
    Reviewed-by: Abhishek Chennaka <ac...@cloudera.com>
    Reviewed-by: Attila Bukor <ab...@apache.org>
---
 src/kudu/tools/create-table-tool-test.cc | 130 +++++++++++++++++++++++++++----
 src/kudu/tools/tool.proto                |  40 ++++++----
 src/kudu/tools/tool_action_table.cc      |  64 +++++++++++++--
 3 files changed, 198 insertions(+), 36 deletions(-)

diff --git a/src/kudu/tools/create-table-tool-test.cc b/src/kudu/tools/create-table-tool-test.cc
index d592638d7..9ce59a98d 100644
--- a/src/kudu/tools/create-table-tool-test.cc
+++ b/src/kudu/tools/create-table-tool-test.cc
@@ -15,13 +15,14 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include <algorithm>
+#include <cstdint>
 #include <cstdio>
 #include <functional>
 #include <map>
 #include <memory>
 #include <string>
 #include <unordered_map>
-#include <utility>
 #include <vector>
 
 #include <gtest/gtest.h>
@@ -99,28 +100,32 @@ TEST_F(CreateTableToolTest, TestCreateTable) {
 
   // Test a few good cases.
   const auto check_good_input = [&](const string& json_str,
-                                    const string& master,
+                                    const string& master_rpc_addr,
                                     const string& table_name,
-                                    const string& schema,
-                                    const string& partition,
+                                    const string& schema_str,
+                                    const string& partition_str,
                                     const map<string, string>& extra_configs,
-                                    KuduClient* client) {
+                                    KuduClient* client,
+                                    shared_ptr<KuduTable>* table_out = nullptr) {
     const vector<string> table_args = {
-        "table", "create", master, json_str
+        "table", "create", master_rpc_addr, json_str
     };
     bool table_exists = false;
     ASSERT_OK(RunKuduTool(table_args));
     ASSERT_EVENTUALLY([&] {
-        ASSERT_OK(client->TableExists(table_name, &table_exists));
-        ASSERT_TRUE(table_exists);
+      ASSERT_OK(client->TableExists(table_name, &table_exists));
+      ASSERT_TRUE(table_exists);
     });
     shared_ptr<KuduTable> table;
     ASSERT_OK(client->OpenTable(table_name, &table));
-    ASSERT_EQ(table->name(), table_name);
-    ASSERT_EQ(table->schema().ToString(), schema);
-    ASSERT_EQ(table->partition_schema().DebugString(KuduSchema::ToSchema(
-        table->schema())), partition);
-    ASSERT_EQ(table->extra_configs(), extra_configs);
+    ASSERT_EQ(table_name, table->name());
+    ASSERT_EQ(schema_str, table->schema().ToString());
+    ASSERT_EQ(partition_str, table->partition_schema().DebugString(
+                KuduSchema::ToSchema(table->schema())));
+    ASSERT_EQ(extra_configs, table->extra_configs());
+    if (table_out) {
+      *table_out = std::move(table);
+    }
   };
 
   // Create a simple table.
@@ -613,6 +618,105 @@ TEST_F(CreateTableToolTest, TestCreateTable) {
   NO_FATALS(check_good_input(encoding_type_unknown, master_addr,
       "encoding_type_unknown", schema, partition, {}, client.get()));
 
+  // Create a table with a range having custom hash schema.
+  const string range_with_custom_hash_schema = R"(
+      {
+          "table_name": "range_with_custom_hash_schema",
+          "schema": {
+              "columns": [
+                  {
+                      "column_name": "id",
+                      "column_type": "INT32",
+                      "is_nullable": false,
+                  },
+                  {
+                      "column_name": "name",
+                      "column_type": "STRING",
+                      "is_nullable": true,
+                  }
+              ],
+              "key_column_names": [
+                  "id"
+              ]
+          },
+          "partition": {
+              "hash_partitions": [
+                  {
+                      "columns": ["id"],
+                      "num_buckets": 2,
+                      "seed": 1
+                  }
+              ],
+              "range_partition": {
+                  "columns": ["id"],
+                  "range_bounds": [
+                      {
+                          "upper_bound": {
+                              "bound_values": ["-100"],
+                              "bound_type": "EXCLUSIVE"
+                          }
+                      },
+                      {
+                          "lower_bound": {
+                              "bound_values": ["100"],
+                              "bound_type": "INCLUSIVE"
+                          }
+                      }
+                  ],
+                  "custom_hash_schema_ranges": [
+                      {
+                          "range_bounds": {
+                              "lower_bound": {
+                                  "bound_values": ["-100"],
+                                  "bound_type": "INCLUSIVE"
+                              },
+                              "upper_bound": {
+                                  "bound_values": ["100"],
+                                  "bound_type": "EXCLUSIVE"
+                              }
+                          },
+                          "hash_schema": {
+                              "columns": ["id"],
+                              "num_buckets": 5,
+                              "seed": 8
+                          }
+                      }
+                  ]
+              }
+          }
+      }
+  )";
+  {
+    constexpr const char* const kRefSchema =
+        "(\n"
+        "    id INT32 NOT NULL,\n"
+        "    name STRING NULLABLE,\n"
+        "    PRIMARY KEY (id)\n)";
+    constexpr const char* const kRefPartitionInfo =
+        "HASH (id) PARTITIONS 2 SEED 1, RANGE (id)";
+    shared_ptr<KuduTable> table;
+    NO_FATALS(check_good_input(range_with_custom_hash_schema,
+                               master_addr,
+                               "range_with_custom_hash_schema",
+                               kRefSchema,
+                               kRefPartitionInfo,
+                               {},
+                               client.get(),
+                               &table));
+    vector<Partition> partitions;
+    ASSERT_OK(table->ListPartitions(&partitions));
+    ASSERT_EQ(9, partitions.size());
+    vector<int32_t> bucket_nums;
+    for (const auto& p : partitions) {
+      // All hash schemas in this table are one-dimensional.
+      ASSERT_EQ(1, p.hash_buckets().size());
+      bucket_nums.emplace_back(p.hash_buckets().front());
+    }
+    std::sort(bucket_nums.begin(), bucket_nums.end());
+    const vector<int32_t> ref_bucket_nums{0, 0, 0, 1, 1, 1, 2, 3, 4};
+    ASSERT_EQ(ref_bucket_nums, bucket_nums);
+  }
+
   // Test a few error cases.
   const auto check_bad_input = [&](const string& json_str,
                                    const string& master,
diff --git a/src/kudu/tools/tool.proto b/src/kudu/tools/tool.proto
index dc735e137..f98d8cd84 100644
--- a/src/kudu/tools/tool.proto
+++ b/src/kudu/tools/tool.proto
@@ -398,6 +398,20 @@ message ColumnPB {
 }
 
 message PartitionPB {
+  message HashPartitionPB {
+    // Column names of columns included in the hash. Every column must be
+    // a component of the primary key.
+    repeated string columns = 1;
+    // Number of buckets into which columns will be hashed. Must be at least 2.
+    optional int32 num_buckets = 2;
+    // Seed value for hash calculation. Administrators may set a seed value
+    // on a per-table basis in order to randomize the mapping of rows to
+    // buckets. Setting a seed provides some amount of protection against denial
+    // of service attacks when the hash bucket columns contain user provided
+    // input.
+    optional uint32 seed = 3;
+  }
+
   message RangePartitionPB {
     message BoundPB {
       enum Type {
@@ -422,6 +436,12 @@ message PartitionPB {
       // exact string value for the bound.
       repeated string split_values = 1;
     }
+    message RangeWithHashSchemaPB {
+      // The bounds of this range.
+      optional RangeBoundPB range_bounds = 1;
+      // Hash schema for this range.
+      repeated HashPartitionPB hash_schema = 2;
+    }
 
     // Column names of columns included in the range. All columns must be
     // a component of the primary key.
@@ -430,25 +450,13 @@ message PartitionPB {
     repeated RangeBoundPB range_bounds = 2;
     // Range splits.
     repeated SplitValuePB range_splits = 3;
+    // Ranges with custom hash schemas.
+    repeated RangeWithHashSchemaPB custom_hash_schema_ranges = 4;
   }
 
-  message HashPartitionPB {
-    // Column names of columns included in the hash. Every column must be
-    // a component of the primary key.
-    repeated string columns = 1;
-    // Number of buckets into which columns will be hashed. Must be at least 2.
-    optional int32 num_buckets = 2;
-    // Seed value for hash calculation. Administrators may set a seed value
-    // on a per-table basis in order to randomize the mapping of rows to
-    // buckets. Setting a seed provides some amount of protection against denial
-    // of service attacks when the hash bucket columns contain user provided
-    // input.
-    optional uint32 seed = 3;
-  }
-
-  // Hash partition message. Support zero or more hash partition levels .
+  // Table-wide hash schema.
   repeated HashPartitionPB hash_partitions = 1;
-  // range partition message.
+  // Range partitioning information.
   optional RangePartitionPB range_partition = 2;
 }
 
diff --git a/src/kudu/tools/tool_action_table.cc b/src/kudu/tools/tool_action_table.cc
index a03c23574..b764781de 100644
--- a/src/kudu/tools/tool_action_table.cc
+++ b/src/kudu/tools/tool_action_table.cc
@@ -72,6 +72,7 @@ using kudu::client::KuduColumnSchema;
 using kudu::client::KuduColumnSpec;
 using kudu::client::KuduColumnStorageAttributes;
 using kudu::client::KuduPredicate;
+using kudu::client::KuduRangePartition;
 using kudu::client::KuduScanToken;
 using kudu::client::KuduScanTokenBuilder;
 using kudu::client::KuduScanner;
@@ -1491,11 +1492,8 @@ Status ParseTablePartition(const PartitionPB& partition,
   string bound_partial_row_json;
   for (const auto& bound : partition.range_partition().range_bounds()) {
     unique_ptr<KuduPartialRow> lower_bound(kudu_schema.NewRow());
-    unique_ptr<KuduPartialRow> upper_bound(kudu_schema.NewRow());
     KuduTableCreator::RangePartitionBound lower_bound_type =
         KuduTableCreator::INCLUSIVE_BOUND;
-    KuduTableCreator::RangePartitionBound upper_bound_type =
-        KuduTableCreator::EXCLUSIVE_BOUND;
     if (bound.has_lower_bound()) {
       RETURN_NOT_OK(ToJsonPartialRow(bound.lower_bound().bound_values(),
                                      range_col_names_and_types,
@@ -1503,9 +1501,13 @@ Status ParseTablePartition(const PartitionPB& partition,
       RETURN_NOT_OK(ConvertToKuduPartialRow(range_col_names_and_types,
                                             bound_partial_row_json,
                                             lower_bound.get()));
-      RETURN_NOT_OK(ToClientRangePartitionBound(bound.lower_bound().bound_type(),
-                                                &lower_bound_type));
+      RETURN_NOT_OK(ToClientRangePartitionBound(
+          bound.lower_bound().bound_type(), &lower_bound_type));
     }
+
+    unique_ptr<KuduPartialRow> upper_bound(kudu_schema.NewRow());
+    KuduTableCreator::RangePartitionBound upper_bound_type =
+        KuduTableCreator::EXCLUSIVE_BOUND;
     if (bound.has_upper_bound()) {
       RETURN_NOT_OK(ToJsonPartialRow(bound.upper_bound().bound_values(),
                                      range_col_names_and_types,
@@ -1513,12 +1515,60 @@ Status ParseTablePartition(const PartitionPB& partition,
       RETURN_NOT_OK(ConvertToKuduPartialRow(range_col_names_and_types,
                                             bound_partial_row_json,
                                             upper_bound.get()));
-      RETURN_NOT_OK(ToClientRangePartitionBound(bound.upper_bound().bound_type(),
-                                                &upper_bound_type));
+      RETURN_NOT_OK(ToClientRangePartitionBound(
+          bound.upper_bound().bound_type(), &upper_bound_type));
     }
+
     table_creator->add_range_partition(lower_bound.release(), upper_bound.release(),
         lower_bound_type, upper_bound_type);
   }
+
+  for (const auto& range : partition.range_partition().custom_hash_schema_ranges()) {
+    const auto& bounds = range.range_bounds();
+
+    unique_ptr<KuduPartialRow> lower_bound(kudu_schema.NewRow());
+    KuduTableCreator::RangePartitionBound lower_bound_type =
+        KuduTableCreator::INCLUSIVE_BOUND;
+    if (bounds.has_lower_bound()) {
+      RETURN_NOT_OK(ToJsonPartialRow(bounds.lower_bound().bound_values(),
+                                     range_col_names_and_types,
+                                     &bound_partial_row_json));
+      RETURN_NOT_OK(ConvertToKuduPartialRow(range_col_names_and_types,
+                                            bound_partial_row_json,
+                                            lower_bound.get()));
+      RETURN_NOT_OK(ToClientRangePartitionBound(
+          bounds.lower_bound().bound_type(), &lower_bound_type));
+    }
+
+    unique_ptr<KuduPartialRow> upper_bound(kudu_schema.NewRow());
+    KuduTableCreator::RangePartitionBound upper_bound_type =
+        KuduTableCreator::EXCLUSIVE_BOUND;
+    if (bounds.has_upper_bound()) {
+      RETURN_NOT_OK(ToJsonPartialRow(bounds.upper_bound().bound_values(),
+                                     range_col_names_and_types,
+                                     &bound_partial_row_json));
+      RETURN_NOT_OK(ConvertToKuduPartialRow(range_col_names_and_types,
+                                            bound_partial_row_json,
+                                            upper_bound.get()));
+      RETURN_NOT_OK(ToClientRangePartitionBound(
+          bounds.upper_bound().bound_type(), &upper_bound_type));
+    }
+
+    unique_ptr<KuduRangePartition> partition(
+        new KuduRangePartition(lower_bound.release(), upper_bound.release(),
+                               lower_bound_type, upper_bound_type));
+    for (const auto& hash_dimension : range.hash_schema()) {
+      vector<string> hash_columns;
+      for (const auto& c : hash_dimension.columns()) {
+        hash_columns.emplace_back(c);
+      }
+      const int32_t seed = hash_dimension.has_seed() ? hash_dimension.seed() : 0;
+      partition->add_hash_partitions(
+          hash_columns, hash_dimension.num_buckets(), seed);
+    }
+
+    table_creator->add_custom_range_partition(partition.release());
+  }
   for (const auto& split_pb : partition.range_partition().range_splits()) {
     RETURN_NOT_OK(ToJsonPartialRow(split_pb.split_values(),
                                    range_col_names_and_types,