You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by bk...@apache.org on 2022/11/18 22:05:22 UTC

[arrow] branch feature/format-string-view updated: Adding comparison and concatenation

This is an automated email from the ASF dual-hosted git repository.

bkietz pushed a commit to branch feature/format-string-view
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/feature/format-string-view by this push:
     new 7435aa0821 Adding comparison and concatenation
7435aa0821 is described below

commit 7435aa0821706a0f2613b87433b1b91c94669cbc
Author: Benjamin Kietzman <be...@gmail.com>
AuthorDate: Fri Nov 18 17:04:47 2022 -0500

    Adding comparison and concatenation
---
 cpp/src/arrow/array/concatenate.cc      | 24 +++++++++++++++++++++++-
 cpp/src/arrow/array/concatenate_test.cc |  8 ++++++++
 cpp/src/arrow/compare.cc                |  8 +++++++-
 cpp/src/arrow/testing/random.cc         | 10 ++++++++--
 cpp/src/arrow/testing/random.h          | 13 +++++++++++++
 5 files changed, 59 insertions(+), 4 deletions(-)

diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc
index 3dd0ccea93..6f7d61283e 100644
--- a/cpp/src/arrow/array/concatenate.cc
+++ b/cpp/src/arrow/array/concatenate.cc
@@ -228,7 +228,29 @@ class ConcatenateImpl {
   }
 
   Status Visit(const BinaryViewType&) {
-    return Status::NotImplemented("binary / string view");
+    bool any_opted_out_of_view_validation = false;
+    out_->buffers.resize(2);
+
+    for (const auto& in_data : in_) {
+      auto begin = in_data->buffers.begin() + 2;
+      auto end = in_data->buffers.end();
+
+      if (BinaryViewArray::OptedOutOfViewValidation(*in_data)) {
+        any_opted_out_of_view_validation = true;
+        --end;
+      }
+
+      for (auto it = begin; it != end; ++it) {
+        out_->buffers.push_back(*it);
+      }
+    }
+
+    if (any_opted_out_of_view_validation) {
+      out_->buffers = BinaryViewArray::DoNotValidateViews(std::move(out_->buffers));
+    }
+
+    ARROW_ASSIGN_OR_RAISE(auto header_buffers, Buffers(1, sizeof(StringHeader)));
+    return ConcatenateBuffers(header_buffers, pool_).Value(&out_->buffers[1]);
   }
 
   Status Visit(const ListType&) {
diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc
index bff5d7eec1..1bc0c65bec 100644
--- a/cpp/src/arrow/array/concatenate_test.cc
+++ b/cpp/src/arrow/array/concatenate_test.cc
@@ -91,6 +91,7 @@ class ConcatenateTest : public ::testing::Test {
       for (auto null_probability : this->null_probabilities_) {
         std::shared_ptr<Array> array;
         factory(size, null_probability, &array);
+          ASSERT_OK(array->ValidateFull());
         auto expected = array->Slice(offsets.front(), offsets.back() - offsets.front());
         auto slices = this->Slices(array, offsets);
         ASSERT_OK_AND_ASSIGN(auto actual, Concatenate(slices));
@@ -154,6 +155,13 @@ TEST_F(ConcatenateTest, StringType) {
   });
 }
 
+TEST_F(ConcatenateTest, StringViewType) {
+  Check([this](int32_t size, double null_probability, std::shared_ptr<Array>* out) {
+    *out = rng_.StringView(size, /*min_length =*/0, /*max_length =*/15, null_probability);
+    ASSERT_OK((**out).ValidateFull());
+  });
+}
+
 TEST_F(ConcatenateTest, LargeStringType) {
   Check([this](int32_t size, double null_probability, std::shared_ptr<Array>* out) {
     *out =
diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc
index 68250f0288..5d1c3294c0 100644
--- a/cpp/src/arrow/compare.cc
+++ b/cpp/src/arrow/compare.cc
@@ -261,7 +261,13 @@ class RangeDataEqualsImpl {
 
   // Also matches StringViewType
   Status Visit(const BinaryViewType& type) {
-    return Status::NotImplemented("Binary / string view");
+    auto* left_values = left_.GetValues<StringHeader>(1) + left_start_idx_;
+    auto* right_values = right_.GetValues<StringHeader>(1) + right_start_idx_;
+    VisitValidRuns([&](int64_t i, int64_t length) {
+      return std::equal(left_values + i, left_values + i + length,
+                        right_values + i, right_values + i + length);
+    });
+    return Status::OK();
   }
 
   // Also matches LargeStringType
diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc
index f42909a8e2..e0b83fbdbc 100644
--- a/cpp/src/arrow/testing/random.cc
+++ b/cpp/src/arrow/testing/random.cc
@@ -340,11 +340,10 @@ std::shared_ptr<Array> RandomArrayGenerator::Decimal256(std::shared_ptr<DataType
   return gen.MakeRandomArray(size, null_probability);
 }
 
-template <typename TypeClass>
+template <typename TypeClass, typename offset_type = typename TypeClass::offset_type>
 static std::shared_ptr<Array> GenerateBinaryArray(RandomArrayGenerator* gen, int64_t size,
                                                   int32_t min_length, int32_t max_length,
                                                   double null_probability) {
-  using offset_type = typename TypeClass::offset_type;
   using BuilderType = typename TypeTraits<TypeClass>::BuilderType;
   using OffsetArrowType = typename CTypeTraits<offset_type>::ArrowType;
   using OffsetArrayType = typename TypeTraits<OffsetArrowType>::ArrayType;
@@ -392,6 +391,13 @@ std::shared_ptr<Array> RandomArrayGenerator::LargeString(int64_t size, int32_t m
                                               null_probability);
 }
 
+std::shared_ptr<Array> RandomArrayGenerator::StringView(int64_t size, int32_t min_length,
+                                                        int32_t max_length,
+                                                        double null_probability) {
+  return GenerateBinaryArray<StringViewType, uint32_t>(this, size, min_length, max_length,
+                                                       null_probability);
+}
+
 std::shared_ptr<Array> RandomArrayGenerator::BinaryWithRepeats(int64_t size,
                                                                int64_t unique,
                                                                int32_t min_length,
diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h
index 00d50f9bc0..20774f309a 100644
--- a/cpp/src/arrow/testing/random.h
+++ b/cpp/src/arrow/testing/random.h
@@ -292,6 +292,19 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
   std::shared_ptr<Array> String(int64_t size, int32_t min_length, int32_t max_length,
                                 double null_probability = 0);
 
+  /// \brief Generate a random StringViewArray
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] min_length the lower bound of the string length
+  ///            determined by the uniform distribution
+  /// \param[in] max_length the upper bound of the string length
+  ///            determined by the uniform distribution
+  /// \param[in] null_probability the probability of a value being null
+  ///
+  /// \return a generated Array
+  std::shared_ptr<Array> StringView(int64_t size, int32_t min_length, int32_t max_length,
+                                    double null_probability = 0);
+
   /// \brief Generate a random LargeStringArray
   ///
   /// \param[in] size the size of the array to generate