You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by bk...@apache.org on 2022/11/30 14:50:56 UTC

[arrow] 12/15: Adding comparison and concatenation

This is an automated email from the ASF dual-hosted git repository.

bkietz pushed a commit to branch feature/format-string-view
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 04893f65e92f57bba7b8ff0bbc201dfd17ff3aa0
Author: Benjamin Kietzman <be...@gmail.com>
AuthorDate: Fri Nov 18 17:04:47 2022 -0500

    Adding comparison and concatenation
---
 cpp/src/arrow/array/builder_binary.h    | 12 ++++++++----
 cpp/src/arrow/array/concatenate.cc      | 24 +++++++++++++++++++++++-
 cpp/src/arrow/array/concatenate_test.cc |  8 ++++++++
 cpp/src/arrow/compare.cc                |  8 +++++++-
 cpp/src/arrow/testing/random.cc         | 14 +++++++++++---
 cpp/src/arrow/testing/random.h          | 16 ++++++++++++++++
 6 files changed, 73 insertions(+), 9 deletions(-)

diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h
index 30ab4b9d4a..ccfcb8b2b2 100644
--- a/cpp/src/arrow/array/builder_binary.h
+++ b/cpp/src/arrow/array/builder_binary.h
@@ -542,9 +542,16 @@ class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder {
  public:
   using TypeClass = BinaryViewType;
 
-  BinaryViewBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
+  // Provided for MakeBuilder compatibility; the DataType argument is ignored.
+  BinaryViewBuilder(const std::shared_ptr<DataType>&, MemoryPool* pool)
       : BinaryViewBuilder(pool) {}
 
+  explicit BinaryViewBuilder(MemoryPool* pool = default_memory_pool(),
+                             int64_t alignment = kDefaultBufferAlignment)
+      : ArrayBuilder(pool, alignment),
+        data_builder_(pool, alignment),
+        data_heap_builder_(pool) {}
+
   int64_t current_block_bytes_remaining() const {
     return data_heap_builder_.current_remaining_bytes();
   }
@@ -683,9 +690,6 @@ class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder {
   std::shared_ptr<DataType> type() const override { return binary_view(); }
 
  protected:
-  explicit BinaryViewBuilder(MemoryPool* pool = default_memory_pool())
-      : ArrayBuilder(pool), data_builder_(pool), data_heap_builder_(pool) {}
-
   static constexpr int64_t ValueSizeLimit() {
     return std::numeric_limits<uint32_t>::max();
   }
diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc
index 3dd0ccea93..6f7d61283e 100644
--- a/cpp/src/arrow/array/concatenate.cc
+++ b/cpp/src/arrow/array/concatenate.cc
@@ -228,7 +228,29 @@ class ConcatenateImpl {
   }
 
   Status Visit(const BinaryViewType&) {
-    return Status::NotImplemented("binary / string view");
+    bool any_opted_out_of_view_validation = false;
+    out_->buffers.resize(2);
+
+    for (const auto& in_data : in_) {
+      auto begin = in_data->buffers.begin() + 2;
+      auto end = in_data->buffers.end();
+
+      if (BinaryViewArray::OptedOutOfViewValidation(*in_data)) {
+        any_opted_out_of_view_validation = true;
+        --end;
+      }
+
+      for (auto it = begin; it != end; ++it) {
+        out_->buffers.push_back(*it);
+      }
+    }
+
+    if (any_opted_out_of_view_validation) {
+      out_->buffers = BinaryViewArray::DoNotValidateViews(std::move(out_->buffers));
+    }
+
+    ARROW_ASSIGN_OR_RAISE(auto header_buffers, Buffers(1, sizeof(StringHeader)));
+    return ConcatenateBuffers(header_buffers, pool_).Value(&out_->buffers[1]);
   }
 
   Status Visit(const ListType&) {
diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc
index bff5d7eec1..1bc0c65bec 100644
--- a/cpp/src/arrow/array/concatenate_test.cc
+++ b/cpp/src/arrow/array/concatenate_test.cc
@@ -91,6 +91,7 @@ class ConcatenateTest : public ::testing::Test {
       for (auto null_probability : this->null_probabilities_) {
         std::shared_ptr<Array> array;
         factory(size, null_probability, &array);
+        ASSERT_OK(array->ValidateFull());
         auto expected = array->Slice(offsets.front(), offsets.back() - offsets.front());
         auto slices = this->Slices(array, offsets);
         ASSERT_OK_AND_ASSIGN(auto actual, Concatenate(slices));
@@ -154,6 +155,13 @@ TEST_F(ConcatenateTest, StringType) {
   });
 }
 
+TEST_F(ConcatenateTest, StringViewType) {
+  Check([this](int32_t size, double null_probability, std::shared_ptr<Array>* out) {
+    *out = rng_.StringView(size, /*min_length =*/0, /*max_length =*/15, null_probability);
+    ASSERT_OK((**out).ValidateFull());
+  });
+}
+
 TEST_F(ConcatenateTest, LargeStringType) {
   Check([this](int32_t size, double null_probability, std::shared_ptr<Array>* out) {
     *out =
diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc
index 68250f0288..5d1c3294c0 100644
--- a/cpp/src/arrow/compare.cc
+++ b/cpp/src/arrow/compare.cc
@@ -261,7 +261,13 @@ class RangeDataEqualsImpl {
 
   // Also matches StringViewType
   Status Visit(const BinaryViewType& type) {
-    return Status::NotImplemented("Binary / string view");
+    auto* left_values = left_.GetValues<StringHeader>(1) + left_start_idx_;
+    auto* right_values = right_.GetValues<StringHeader>(1) + right_start_idx_;
+    VisitValidRuns([&](int64_t i, int64_t length) {
+      return std::equal(left_values + i, left_values + i + length,
+                        right_values + i, right_values + i + length);
+    });
+    return Status::OK();
   }
 
   // Also matches LargeStringType
diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc
index 3213273474..e45e296ff6 100644
--- a/cpp/src/arrow/testing/random.cc
+++ b/cpp/src/arrow/testing/random.cc
@@ -362,13 +362,12 @@ std::shared_ptr<Array> RandomArrayGenerator::Decimal256(std::shared_ptr<DataType
   return gen.MakeRandomArray(size, null_probability, alignment, memory_pool);
 }
 
-template <typename TypeClass>
+template <typename TypeClass, typename offset_type = typename TypeClass::offset_type>
 static std::shared_ptr<Array> GenerateBinaryArray(RandomArrayGenerator* gen, int64_t size,
                                                   int32_t min_length, int32_t max_length,
                                                   double null_probability,
                                                   int64_t alignment,
                                                   MemoryPool* memory_pool) {
-  using offset_type = typename TypeClass::offset_type;
   using BuilderType = typename TypeTraits<TypeClass>::BuilderType;
   using OffsetArrowType = typename CTypeTraits<offset_type>::ArrowType;
   using OffsetArrayType = typename TypeTraits<OffsetArrowType>::ArrayType;
@@ -386,7 +385,7 @@ static std::shared_ptr<Array> GenerateBinaryArray(RandomArrayGenerator* gen, int
                  /*null_probability=*/0);
 
   std::vector<uint8_t> str_buffer(max_length);
-  BuilderType builder(memory_pool, alignment);
+  BuilderType builder{memory_pool, alignment};
 
   for (int64_t i = 0; i < size; ++i) {
     if (lengths->IsValid(i)) {
@@ -429,6 +428,15 @@ std::shared_ptr<Array> RandomArrayGenerator::BinaryWithRepeats(
   return *strings->View(binary());
 }
 
+std::shared_ptr<Array> RandomArrayGenerator::StringView(int64_t size, int32_t min_length,
+                                                        int32_t max_length,
+                                                        double null_probability,
+                                                        int64_t alignment,
+                                                        MemoryPool* memory_pool) {
+  return GenerateBinaryArray<StringViewType, uint32_t>(this, size, min_length, max_length,
+                                                       null_probability, alignment, memory_pool);
+}
+
 std::shared_ptr<Array> RandomArrayGenerator::StringWithRepeats(
     int64_t size, int64_t unique, int32_t min_length, int32_t max_length,
     double null_probability, int64_t alignment, MemoryPool* memory_pool) {
diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h
index b2e3a609a2..5b905896f2 100644
--- a/cpp/src/arrow/testing/random.h
+++ b/cpp/src/arrow/testing/random.h
@@ -367,6 +367,22 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
                                 int64_t alignment = kDefaultBufferAlignment,
                                 MemoryPool* memory_pool = default_memory_pool());
 
+  /// \brief Generate a random StringViewArray
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] min_length the lower bound of the string length
+  ///            determined by the uniform distribution
+  /// \param[in] max_length the upper bound of the string length
+  ///            determined by the uniform distribution
+  /// \param[in] null_probability the probability of a value being null
+  /// \param[in] alignment alignment for memory allocations (in bytes)
+  ///
+  /// \return a generated Array
+  std::shared_ptr<Array> StringView(int64_t size, int32_t min_length, int32_t max_length,
+                                    double null_probability = 0,
+                                    int64_t alignment = kDefaultBufferAlignment,
+                                    MemoryPool* memory_pool = default_memory_pool());
+
   /// \brief Generate a random LargeStringArray
   ///
   /// \param[in] size the size of the array to generate