You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/04/14 21:48:57 UTC

[GitHub] [arrow] lidavidm commented on a diff in pull request #12872: ARROW-16166: [C++][Compute] Utilities for assembling join output

lidavidm commented on code in PR #12872:
URL: https://github.com/apache/arrow/pull/12872#discussion_r850827924


##########
cpp/src/arrow/compute/light_array.h:
##########
@@ -0,0 +1,384 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+#include "arrow/array.h"
+#include "arrow/compute/exec.h"
+#include "arrow/type.h"
+#include "arrow/util/logging.h"
+
+/// This file contains lightweight containers for Arrow buffers.  These containers
+/// makes compromises in terms of strong ownership and the range of data types supported
+/// in order to gain performance and reduced overhead.

Review Comment:
   How do these compare to a theoretical ExecBatch refactor?



##########
cpp/src/arrow/compute/light_array.h:
##########
@@ -0,0 +1,384 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+#include "arrow/array.h"
+#include "arrow/compute/exec.h"
+#include "arrow/type.h"
+#include "arrow/util/logging.h"
+
+/// This file contains lightweight containers for Arrow buffers.  These containers
+/// makes compromises in terms of strong ownership and the range of data types supported
+/// in order to gain performance and reduced overhead.
+
+namespace arrow {
+namespace compute {
+
+/// \brief Description of the layout of a "key" column
+///
+/// A "key" column is a non-nested, non-union column.
+/// Every key column has either 0 (null), 2 (e.g. int32) or 3 (e.g. string) buffers
+/// and no children.
+///
+/// This metadata object is a zero-allocation analogue of arrow::DataType
+struct KeyColumnMetadata {
+  KeyColumnMetadata() = default;
+  KeyColumnMetadata(bool is_fixed_length_in, uint32_t fixed_length_in,
+                    bool is_null_type_in = false)
+      : is_fixed_length(is_fixed_length_in),
+        is_null_type(is_null_type_in),
+        fixed_length(fixed_length_in) {}
+  /// \brief True if the column is not a varying-length binary type
+  ///
+  /// If this is true the column will have a validity buffer and
+  /// a data buffer and the third buffer will be unused.
+  bool is_fixed_length;
+  /// \brief True if this column is the null type
+  bool is_null_type;
+  /// \brief The number of bytes for each item
+  ///
+  /// Zero has a special meaning, indicating a bit vector with one bit per value if it
+  /// isn't a null type column.
+  ///
+  /// For a varying-length binary column this represents the number of bytes per offset.
+  uint32_t fixed_length;
+};
+
+/// \brief A lightweight view into a "key" array
+///
+/// A "key" column is a non-nested, non-union column \see KeyColumnMetadata
+///
+/// This metadata object is a zero-allocation analogue of arrow::ArrayData
+class KeyColumnArray {
+ public:
+  /// \brief Create an uninitialized KeyColumnArray
+  KeyColumnArray() = default;
+  /// \brief Create a read-only view from buffers
+  ///
+  /// This is a view only and does not take ownership of the buffers.  The lifetime
+  /// of the buffers must exceed the lifetime of this view
+  KeyColumnArray(const KeyColumnMetadata& metadata, int64_t length,
+                 const uint8_t* buffer0, const uint8_t* buffer1, const uint8_t* buffer2,
+                 int bit_offset0 = 0, int bit_offset1 = 0);
+  /// \brief Create a mutable view from buffers
+  ///
+  /// This is a view only and does not take ownership of the buffers.  The lifetime
+  /// of the buffers must exceed the lifetime of this view
+  KeyColumnArray(const KeyColumnMetadata& metadata, int64_t length, uint8_t* buffer0,
+                 uint8_t* buffer1, uint8_t* buffer2, int bit_offset0 = 0,
+                 int bit_offset1 = 0);
+  /// \brief Create a sliced view of `this`
+  ///
+  /// The number of rows used in offset must be divisible by 8
+  /// in order to not split bit vectors within a single byte.
+  KeyColumnArray Slice(int64_t offset, int64_t length) const;
+  /// \brief Create a copy of `this` with a buffer from `other`
+  ///
+  /// The copy will be identical to `this` except the buffer at buffer_id_to_replace
+  /// will be replaced by the corresponding buffer in `other`.
+  KeyColumnArray WithBufferFrom(const KeyColumnArray& other,
+                                int buffer_id_to_replace) const;
+
+  /// \brief Create a copy of `this` with new metadata
+  KeyColumnArray WithMetadata(const KeyColumnMetadata& metadata) const;
+  /// \brief Return one of the underlying mutable buffers
+  uint8_t* mutable_data(int i) {
+    ARROW_DCHECK(i >= 0 && i <= max_buffers_);
+    return mutable_buffers_[i];
+  }
+  /// \brief Return one of the underlying read-only buffers
+  const uint8_t* data(int i) const {

Review Comment:
   Given there's already `offsets` maybe it would be better to just have 3/6 accessors and not restrict the accesors based on is_fixed_length



##########
cpp/src/arrow/compute/light_array.h:
##########
@@ -0,0 +1,384 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+#include "arrow/array.h"
+#include "arrow/compute/exec.h"
+#include "arrow/type.h"
+#include "arrow/util/logging.h"
+
+/// This file contains lightweight containers for Arrow buffers.  These containers
+/// makes compromises in terms of strong ownership and the range of data types supported
+/// in order to gain performance and reduced overhead.

Review Comment:
   I suppose these are far too limited in terms of data type support



##########
cpp/src/arrow/compute/light_array.h:
##########
@@ -0,0 +1,384 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+#include "arrow/array.h"
+#include "arrow/compute/exec.h"
+#include "arrow/type.h"
+#include "arrow/util/logging.h"
+
+/// This file contains lightweight containers for Arrow buffers.  These containers
+/// makes compromises in terms of strong ownership and the range of data types supported
+/// in order to gain performance and reduced overhead.
+
+namespace arrow {
+namespace compute {
+
+/// \brief Description of the layout of a "key" column
+///
+/// A "key" column is a non-nested, non-union column.
+/// Every key column has either 0 (null), 2 (e.g. int32) or 3 (e.g. string) buffers
+/// and no children.
+///
+/// This metadata object is a zero-allocation analogue of arrow::DataType
+struct KeyColumnMetadata {
+  KeyColumnMetadata() = default;
+  KeyColumnMetadata(bool is_fixed_length_in, uint32_t fixed_length_in,
+                    bool is_null_type_in = false)
+      : is_fixed_length(is_fixed_length_in),
+        is_null_type(is_null_type_in),
+        fixed_length(fixed_length_in) {}
+  /// \brief True if the column is not a varying-length binary type
+  ///
+  /// If this is true the column will have a validity buffer and
+  /// a data buffer and the third buffer will be unused.
+  bool is_fixed_length;
+  /// \brief True if this column is the null type
+  bool is_null_type;
+  /// \brief The number of bytes for each item
+  ///
+  /// Zero has a special meaning, indicating a bit vector with one bit per value if it
+  /// isn't a null type column.
+  ///
+  /// For a varying-length binary column this represents the number of bytes per offset.
+  uint32_t fixed_length;
+};
+
+/// \brief A lightweight view into a "key" array
+///
+/// A "key" column is a non-nested, non-union column \see KeyColumnMetadata
+///
+/// This metadata object is a zero-allocation analogue of arrow::ArrayData
+class KeyColumnArray {
+ public:
+  /// \brief Create an uninitialized KeyColumnArray
+  KeyColumnArray() = default;
+  /// \brief Create a read-only view from buffers
+  ///
+  /// This is a view only and does not take ownership of the buffers.  The lifetime
+  /// of the buffers must exceed the lifetime of this view
+  KeyColumnArray(const KeyColumnMetadata& metadata, int64_t length,
+                 const uint8_t* buffer0, const uint8_t* buffer1, const uint8_t* buffer2,
+                 int bit_offset0 = 0, int bit_offset1 = 0);
+  /// \brief Create a mutable view from buffers
+  ///
+  /// This is a view only and does not take ownership of the buffers.  The lifetime
+  /// of the buffers must exceed the lifetime of this view
+  KeyColumnArray(const KeyColumnMetadata& metadata, int64_t length, uint8_t* buffer0,
+                 uint8_t* buffer1, uint8_t* buffer2, int bit_offset0 = 0,
+                 int bit_offset1 = 0);
+  /// \brief Create a sliced view of `this`
+  ///
+  /// The number of rows used in offset must be divisible by 8
+  /// in order to not split bit vectors within a single byte.
+  KeyColumnArray Slice(int64_t offset, int64_t length) const;
+  /// \brief Create a copy of `this` with a buffer from `other`
+  ///
+  /// The copy will be identical to `this` except the buffer at buffer_id_to_replace
+  /// will be replaced by the corresponding buffer in `other`.
+  KeyColumnArray WithBufferFrom(const KeyColumnArray& other,
+                                int buffer_id_to_replace) const;
+
+  /// \brief Create a copy of `this` with new metadata
+  KeyColumnArray WithMetadata(const KeyColumnMetadata& metadata) const;
+  /// \brief Return one of the underlying mutable buffers
+  uint8_t* mutable_data(int i) {
+    ARROW_DCHECK(i >= 0 && i <= max_buffers_);
+    return mutable_buffers_[i];
+  }
+  /// \brief Return one of the underlying read-only buffers
+  const uint8_t* data(int i) const {
+    ARROW_DCHECK(i >= 0 && i <= max_buffers_);
+    return buffers_[i];
+  }
+  /// \brief Return a mutable version of the offsets buffer
+  ///
+  /// Only valid if this is a view into a varbinary type
+  uint32_t* mutable_offsets() {
+    DCHECK(!metadata_.is_fixed_length);
+    return reinterpret_cast<uint32_t*>(mutable_data(1));
+  }
+  /// \brief Return a read-only version of the offsets buffer
+  ///
+  /// Only valid if this is a view into a varbinary type
+  const uint32_t* offsets() const {
+    DCHECK(!metadata_.is_fixed_length);
+    return reinterpret_cast<const uint32_t*>(data(1));
+  }
+  /// \brief Return the type metadata
+  const KeyColumnMetadata& metadata() const { return metadata_; }
+  /// \brief Return the length (in rows) of the array
+  int64_t length() const { return length_; }
+  /// \brief Return the bit offset into the corresponding vector
+  ///
+  /// if i == 1 then this must be a bool array
+  int bit_offset(int i) const {
+    ARROW_DCHECK(i >= 0 && i < max_buffers_);
+    return bit_offset_[i];
+  }
+
+ private:
+  static constexpr int max_buffers_ = 3;
+  const uint8_t* buffers_[max_buffers_];
+  uint8_t* mutable_buffers_[max_buffers_];
+  KeyColumnMetadata metadata_;
+  int64_t length_;
+  // Starting bit offset within the first byte (between 0 and 7)
+  // to be used when accessing buffers that store bit vectors.
+  int bit_offset_[max_buffers_ - 1];
+};
+
+/// \brief Create KeyColumnMetadata from a DataType
+///
+/// If `type` is a dictionary type then this will return the KeyColumnMetadata for
+/// the indices type
+///
+/// The caller should ensure this is only called on "key" columns.  Calling this with
+/// a non-key column will return a meaningless value (or abort on a debug build)

Review Comment:
   Is this worth it compared to the usual Result<> pattern?



##########
cpp/src/arrow/compute/light_array_test.cc:
##########
@@ -0,0 +1,504 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/light_array.h"
+
+#include <gtest/gtest.h>
+
+#include "arrow/compute/exec/test_util.h"
+#include "arrow/testing/generator.h"
+#include "arrow/testing/gtest_util.h"
+#include "arrow/type.h"
+
+namespace arrow {
+namespace compute {
+
+const std::vector<std::shared_ptr<DataType>> kSampleFixedDataTypes = {
+    int8(),   int16(),  int32(),  int64(),           uint8(),
+    uint16(), uint32(), uint64(), decimal128(38, 6), decimal256(76, 6)};
+const std::vector<std::shared_ptr<DataType>> kSampleBinaryTypes = {
+    utf8(), binary() /*, large_utf8(), large_binary()*/};

Review Comment:
   Why are these commented out?



##########
cpp/src/arrow/compute/light_array_test.cc:
##########
@@ -0,0 +1,504 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/light_array.h"
+
+#include <gtest/gtest.h>
+
+#include "arrow/compute/exec/test_util.h"
+#include "arrow/testing/generator.h"
+#include "arrow/testing/gtest_util.h"
+#include "arrow/type.h"
+
+namespace arrow {
+namespace compute {
+
+const std::vector<std::shared_ptr<DataType>> kSampleFixedDataTypes = {
+    int8(),   int16(),  int32(),  int64(),           uint8(),
+    uint16(), uint32(), uint64(), decimal128(38, 6), decimal256(76, 6)};
+const std::vector<std::shared_ptr<DataType>> kSampleBinaryTypes = {
+    utf8(), binary() /*, large_utf8(), large_binary()*/};
+
+TEST(KeyColumnMetadata, FromDataType) {
+  KeyColumnMetadata metadata = ColumnMetadataFromDataType(boolean());
+  ASSERT_EQ(0, metadata.fixed_length);
+  ASSERT_EQ(true, metadata.is_fixed_length);
+  ASSERT_EQ(false, metadata.is_null_type);
+
+  metadata = ColumnMetadataFromDataType(null());
+  ASSERT_EQ(true, metadata.is_null_type);
+
+  for (const auto& type : kSampleFixedDataTypes) {
+    int byte_width =
+        internal::checked_pointer_cast<FixedWidthType>(type)->bit_width() / 8;
+    metadata = ColumnMetadataFromDataType(type);
+    ASSERT_EQ(byte_width, metadata.fixed_length);
+    ASSERT_EQ(true, metadata.is_fixed_length);
+    ASSERT_EQ(false, metadata.is_null_type);
+  }
+
+  for (const auto& type : {binary(), utf8()}) {
+    metadata = ColumnMetadataFromDataType(type);
+    ASSERT_EQ(4, metadata.fixed_length);
+    ASSERT_EQ(false, metadata.is_fixed_length);
+    ASSERT_EQ(false, metadata.is_null_type);
+  }
+
+  for (const auto& type : {large_binary(), large_utf8()}) {
+    metadata = ColumnMetadataFromDataType(type);
+    ASSERT_EQ(8, metadata.fixed_length);
+    ASSERT_EQ(false, metadata.is_fixed_length);
+    ASSERT_EQ(false, metadata.is_null_type);
+  }
+}
+
+TEST(KeyColumnArray, FromArrayData) {
+  for (const auto& type : kSampleFixedDataTypes) {
+    ARROW_SCOPED_TRACE("Type: ", type->ToString());
+    // `array_offset` is the offset of the source array (e.g. if we are given a sliced
+    // source array) while `offset` is the offset we pass when constructing the
+    // KeyColumnArray
+    for (auto array_offset : {0, 1}) {
+      ARROW_SCOPED_TRACE("Array offset: ", array_offset);
+      for (auto offset : {0, 1}) {
+        ARROW_SCOPED_TRACE("Constructor offset: ", offset);
+        std::shared_ptr<Array> array;
+        int byte_width =
+            internal::checked_pointer_cast<FixedWidthType>(type)->bit_width() / 8;
+        if (is_decimal(type->id())) {
+          array = ArrayFromJSON(type, R"(["1.123123", "2.123123", null])");
+        } else {
+          array = ArrayFromJSON(type, "[1, 2, null]");
+        }
+        array = array->Slice(array_offset);
+        int length = static_cast<int32_t>(array->length()) - offset - array_offset;
+        int buffer_offset_bytes = (offset + array_offset) * byte_width;
+        KeyColumnArray kc_array = ColumnArrayFromArrayData(array->data(), offset, length);
+        // Maximum tested offset is < 8 so validity is just bit offset
+        ASSERT_EQ(offset + array_offset, kc_array.bit_offset(0));
+        ASSERT_EQ(0, kc_array.bit_offset(1));
+        ASSERT_EQ(array->data()->buffers[0]->data(), kc_array.data(0));
+        ASSERT_EQ(array->data()->buffers[1]->data() + buffer_offset_bytes,
+                  kc_array.data(1));
+        ASSERT_EQ(nullptr, kc_array.data(2));
+        ASSERT_EQ(length, kc_array.length());
+        // When creating from ArrayData always create read-only
+        ASSERT_EQ(nullptr, kc_array.mutable_data(0));
+        ASSERT_EQ(nullptr, kc_array.mutable_data(1));
+        ASSERT_EQ(nullptr, kc_array.mutable_data(2));
+      }
+    }
+  }
+}
+
+TEST(KeyColumnArray, FromArrayDataBinary) {
+  for (const auto& type : kSampleBinaryTypes) {
+    ARROW_SCOPED_TRACE("Type: ", type->ToString());
+    for (auto array_offset : {0, 1}) {
+      ARROW_SCOPED_TRACE("Array offset: ", array_offset);
+      for (auto offset : {0, 1}) {
+        ARROW_SCOPED_TRACE("Constructor offset: ", offset);
+        std::shared_ptr<Array> array = ArrayFromJSON(type, R"(["xyz", "abcabc", null])");
+        int offsets_width =
+            static_cast<int>(internal::checked_pointer_cast<BaseBinaryType>(type)
+                                 ->layout()
+                                 .buffers[1]
+                                 .byte_width);
+        array = array->Slice(array_offset);
+        int length = static_cast<int32_t>(array->length()) - offset - array_offset;
+        int buffer_offset_bytes = (offset + array_offset) * offsets_width;
+        KeyColumnArray kc_array = ColumnArrayFromArrayData(array->data(), offset, length);
+        ASSERT_EQ(offset + array_offset, kc_array.bit_offset(0));
+        ASSERT_EQ(0, kc_array.bit_offset(1));
+        ASSERT_EQ(array->data()->buffers[0]->data(), kc_array.data(0));
+        ASSERT_EQ(array->data()->buffers[1]->data() + buffer_offset_bytes,
+                  kc_array.data(1));
+        ASSERT_EQ(array->data()->buffers[2]->data(), kc_array.data(2));
+        ASSERT_EQ(length, kc_array.length());
+        // When creating from ArrayData always create read-only
+        ASSERT_EQ(nullptr, kc_array.mutable_data(0));
+        ASSERT_EQ(nullptr, kc_array.mutable_data(1));
+        ASSERT_EQ(nullptr, kc_array.mutable_data(2));
+      }
+    }
+  }
+}
+
+TEST(KeyColumnArray, FromArrayDataBool) {
+  for (auto array_offset : {0, 1}) {
+    ARROW_SCOPED_TRACE("Array offset: ", array_offset);
+    for (auto offset : {0, 1}) {
+      ARROW_SCOPED_TRACE("Constructor offset: ", offset);
+      std::shared_ptr<Array> array = ArrayFromJSON(boolean(), "[true, false, null]");
+      array = array->Slice(array_offset);
+      int length = static_cast<int32_t>(array->length()) - offset - array_offset;
+      KeyColumnArray kc_array = ColumnArrayFromArrayData(array->data(), offset, length);
+      ASSERT_EQ(offset + array_offset, kc_array.bit_offset(0));
+      ASSERT_EQ(offset + array_offset, kc_array.bit_offset(1));
+      ASSERT_EQ(array->data()->buffers[0]->data(), kc_array.data(0));
+      ASSERT_EQ(array->data()->buffers[1]->data(), kc_array.data(1));
+      ASSERT_EQ(length, kc_array.length());
+      ASSERT_EQ(nullptr, kc_array.mutable_data(0));
+      ASSERT_EQ(nullptr, kc_array.mutable_data(1));
+    }
+  }
+}
+
+// TEST(KeyColumnArray, FromArrayDataNull) {

Review Comment:
   I also see above that dictionary type may be handled?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org