You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/01/24 10:10:41 UTC

[GitHub] [arrow] pitrou commented on a change in pull request #12032: ARROW-15126: [C++] Support Null type as group keys

pitrou commented on a change in pull request #12032:
URL: https://github.com/apache/arrow/pull/12032#discussion_r790574210



##########
File path: cpp/src/arrow/compute/exec/key_hash.cc
##########
@@ -286,34 +286,40 @@ void Hashing::HashMultiColumn(const std::vector<KeyEncoder::KeyColumnArray>& col
   bool is_first = true;
 
   for (size_t icol = 0; icol < cols.size(); ++icol) {
-    if (cols[icol].metadata().is_fixed_length) {
-      uint32_t col_width = cols[icol].metadata().fixed_length;
-      if (col_width == 0) {
-        util::bit_util::bits_to_bytes(ctx->hardware_flags, num_rows, cols[icol].data(1),
-                                      byte_temp_buf.mutable_data(),
-                                      cols[icol].bit_offset(1));
-      }
-      Hashing::hash_fixed(
-          ctx->hardware_flags, num_rows, col_width == 0 ? 1 : col_width,
-          col_width == 0 ? byte_temp_buf.mutable_data() : cols[icol].data(1),
-          is_first ? out_hash : hash_temp_buf.mutable_data());
+    // Set the hash value as zero for a null type col
+    if (cols[icol].metadata().is_null_type) {
+      uint32_t* dst_hash = is_first ? out_hash : hash_temp_buf.mutable_data();

Review comment:
       Can you factor out the `dst_hash` calculation for clarity?

##########
File path: cpp/src/arrow/compute/kernels/hash_aggregate.cc
##########
@@ -322,14 +333,20 @@ struct GrouperFastImpl : Grouper {
         group_ids, AllocateBuffer(sizeof(uint32_t) * num_rows, ctx_->memory_pool()));
 
     for (int icol = 0; icol < num_columns; ++icol) {
-      const uint8_t* non_nulls = nullptr;
-      if (batch[icol].array()->buffers[0] != NULLPTR) {
-        non_nulls = batch[icol].array()->buffers[0]->data();
-      }
-      const uint8_t* fixedlen = batch[icol].array()->buffers[1]->data();
-      const uint8_t* varlen = nullptr;
-      if (!col_metadata_[icol].is_fixed_length) {
-        varlen = batch[icol].array()->buffers[2]->data();
+      const uint8_t* non_nulls = NULLPTR;
+      const uint8_t* fixedlen = NULLPTR;
+      const uint8_t* varlen = NULLPTR;
+
+      // Skip if the key's type is NULL
+      if (key_types_[icol]->id() != Type::NA) {
+        if (batch[icol].array()->buffers[0] != NULLPTR) {
+          non_nulls = batch[icol].array()->buffers[0]->data();
+        }
+        auto array = batch[icol].array();

Review comment:
       What is this variable for? Did you mean to use it?

##########
File path: cpp/src/arrow/compute/kernels/hash_aggregate.cc
##########
@@ -322,14 +333,20 @@ struct GrouperFastImpl : Grouper {
         group_ids, AllocateBuffer(sizeof(uint32_t) * num_rows, ctx_->memory_pool()));
 
     for (int icol = 0; icol < num_columns; ++icol) {
-      const uint8_t* non_nulls = nullptr;
-      if (batch[icol].array()->buffers[0] != NULLPTR) {
-        non_nulls = batch[icol].array()->buffers[0]->data();
-      }
-      const uint8_t* fixedlen = batch[icol].array()->buffers[1]->data();
-      const uint8_t* varlen = nullptr;
-      if (!col_metadata_[icol].is_fixed_length) {
-        varlen = batch[icol].array()->buffers[2]->data();
+      const uint8_t* non_nulls = NULLPTR;
+      const uint8_t* fixedlen = NULLPTR;
+      const uint8_t* varlen = NULLPTR;
+
+      // Skip if the key's type is NULL
+      if (key_types_[icol]->id() != Type::NA) {
+        if (batch[icol].array()->buffers[0] != NULLPTR) {
+          non_nulls = batch[icol].array()->buffers[0]->data();
+        }
+        auto array = batch[icol].array();

Review comment:
       What is this variable for? Did you mean to use it?

##########
File path: cpp/src/arrow/compute/exec/key_compare.cc
##########
@@ -334,6 +334,14 @@ void KeyCompare::CompareColumnsToRows(uint32_t num_rows_to_compare,
   bool is_first_column = true;
   for (size_t icol = 0; icol < cols.size(); ++icol) {
     const KeyEncoder::KeyColumnArray& col = cols[icol];
+    if (col.metadata().is_null_type) {
+      // If this null type col is the first column, the match_bytevector_A needs to be
+      // initialized with 0xFF. Otherwise, the calculation can be skipped

Review comment:
       Why 0xFF? Can you explain?

##########
File path: cpp/src/arrow/compute/kernels/hash_aggregate.cc
##########
@@ -237,6 +245,9 @@ struct GrouperFastImpl : Grouper {
       } else if (is_binary_like(key->id())) {
         impl->col_metadata_[icol] =
             arrow::compute::KeyEncoder::KeyColumnMetadata(false, sizeof(uint32_t));
+      } else if (key->id() == Type::NA) {
+        impl->col_metadata_[icol] =
+            arrow::compute::KeyEncoder::KeyColumnMetadata(true, 0, true);

Review comment:
       Can you please denominate arguments for clarity? Such as:
   ```c++
           impl->col_metadata_[icol] =
               arrow::compute::KeyEncoder::KeyColumnMetadata(/*arg1=*/true, 0, /*arg2=*/true);
   ```




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org