You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by bk...@apache.org on 2022/11/29 21:03:06 UTC

[arrow] 01/02: Extract visitation of views owning buffers

This is an automated email from the ASF dual-hosted git repository.

bkietz pushed a commit to branch feature/format-string-view
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 10a07d0e669f3941b473a7824a5015b9d18ca78a
Author: Benjamin Kietzman <be...@gmail.com>
AuthorDate: Mon Nov 28 14:09:59 2022 -0500

    Extract visitation of views owning buffers
---
 cpp/src/arrow/array/array_binary_test.cc |   8 +
 cpp/src/arrow/array/array_test.cc        |  52 +++----
 cpp/src/arrow/array/validate.cc          | 242 ++++++++++++++++++-------------
 cpp/src/arrow/buffer.h                   |   8 +
 cpp/src/arrow/testing/random.cc          |  10 ++
 5 files changed, 190 insertions(+), 130 deletions(-)

diff --git a/cpp/src/arrow/array/array_binary_test.cc b/cpp/src/arrow/array/array_binary_test.cc
index 92fc16f775..f21abf681f 100644
--- a/cpp/src/arrow/array/array_binary_test.cc
+++ b/cpp/src/arrow/array/array_binary_test.cc
@@ -389,6 +389,14 @@ TEST(StringViewArray, Validate) {
                   .ValidateFull(),
               Ok());
 
+  // overlapping views and buffers are allowed
+  EXPECT_THAT(MakeArray({StringHeader(std::string_view{*buffer_s}),
+                         StringHeader(std::string_view{*buffer_s}.substr(5, 5)),
+                         StringHeader(std::string_view{*buffer_s}.substr(9, 4))},
+                        {buffer_s, SliceBuffer(buffer_s, 1, 1), SliceBuffer(buffer_s, 3, 6)})
+                  .ValidateFull(),
+              Ok());
+
   EXPECT_THAT(MakeArray({StringHeader(std::string_view{*buffer_s}),
                          // if a view points outside the buffers, that is invalid
                          StringHeader("from a galaxy far, far away"),
diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc
index c14d4f21ac..36b274a99a 100644
--- a/cpp/src/arrow/array/array_test.cc
+++ b/cpp/src/arrow/array/array_test.cc
@@ -663,32 +663,32 @@ TEST_F(TestArray, TestMakeEmptyArray) {
   FieldVector union_fields2({field("a", null()), field("b", list(large_utf8()))});
   std::vector<int8_t> union_type_codes{7, 42};
 
-  std::shared_ptr<DataType> types[] = {null(),
-                                       boolean(),
-                                       int8(),
-                                       uint16(),
-                                       int32(),
-                                       uint64(),
-                                       float64(),
-                                       binary(),
-                                       large_binary(),
-                                       fixed_size_binary(3),
-                                       decimal(16, 4),
-                                       utf8(),
-                                       large_utf8(),
-                                       list(utf8()),
-                                       list(int64()),
-                                       large_list(large_utf8()),
-                                       fixed_size_list(utf8(), 3),
-                                       fixed_size_list(int64(), 4),
-                                       dictionary(int32(), utf8()),
-                                       struct_({field("a", utf8()), field("b", int32())}),
-                                       sparse_union(union_fields1, union_type_codes),
-                                       sparse_union(union_fields2, union_type_codes),
-                                       dense_union(union_fields1, union_type_codes),
-                                       dense_union(union_fields2, union_type_codes)};
-
-  for (auto type : types) {
+  for (auto type : {null(),
+                    boolean(),
+                    int8(),
+                    uint16(),
+                    int32(),
+                    uint64(),
+                    float64(),
+                    binary(),
+                    binary_view(),
+                    large_binary(),
+                    fixed_size_binary(3),
+                    decimal(16, 4),
+                    utf8(),
+                    utf8_view(),
+                    large_utf8(),
+                    list(utf8()),
+                    list(int64()),
+                    large_list(large_utf8()),
+                    fixed_size_list(utf8(), 3),
+                    fixed_size_list(int64(), 4),
+                    dictionary(int32(), utf8()),
+                    struct_({field("a", utf8()), field("b", int32())}),
+                    sparse_union(union_fields1, union_type_codes),
+                    sparse_union(union_fields2, union_type_codes),
+                    dense_union(union_fields1, union_type_codes),
+                    dense_union(union_fields2, union_type_codes)}) {
     ARROW_SCOPED_TRACE("type = ", type->ToString());
     ASSERT_OK_AND_ASSIGN(auto array, MakeEmptyArray(type));
     ASSERT_OK(array->ValidateFull());
diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc
index 53836efd97..2b83c84b28 100644
--- a/cpp/src/arrow/array/validate.cc
+++ b/cpp/src/arrow/array/validate.cc
@@ -30,13 +30,141 @@
 #include "arrow/util/decimal.h"
 #include "arrow/util/int_util_overflow.h"
 #include "arrow/util/logging.h"
+#include "arrow/util/sort.h"
+#include "arrow/util/string.h"
 #include "arrow/util/unreachable.h"
 #include "arrow/util/utf8.h"
 #include "arrow/visit_data_inline.h"
 #include "arrow/visit_type_inline.h"
 
-namespace arrow {
-namespace internal {
+namespace arrow::internal {
+
+/// visitor will be called once for each non-inlined StringHeader.
+/// It will be passed the index of each non-inlined StringHeader,
+/// as well as a `const shared_ptr<Buffer>&` of the buffer
+/// wherein the viewed memory resides, or nullptr if the viewed memory
+/// is not in a buffer managed by this array.
+template <typename Visitor>
+Status VisitNonInlinedViewsAndOwningBuffers(const ArrayData& data,
+                                            const Visitor& visitor) {
+  auto* headers = data.buffers[1]->data_as<StringHeader>();
+
+  static const std::shared_ptr<Buffer> kNullBuffer = nullptr;
+
+  if (data.buffers.size() == 2 ||
+      (data.buffers.size() == 3 && data.buffers.back() == nullptr)) {
+    // there are no character buffers, just visit a null buffer
+    for (int64_t i = 0; i < data.length; ++i) {
+      if (headers[i].IsInline()) continue;
+      RETURN_NOT_OK(visitor(i, kNullBuffer));
+    }
+    return Status::OK();
+  }
+
+  auto IsSubrangeOf = [](std::string_view super, StringHeader sub) {
+    return super.data() <= sub.data() &&
+           super.data() + super.size() >= sub.data() + sub.size();
+  };
+
+  std::vector<std::string_view> buffers;
+  std::vector<const std::shared_ptr<Buffer>*> owning_buffers;
+  for (auto it = data.buffers.begin() + 2; it != data.buffers.end(); ++it) {
+    if (*it != nullptr) {
+      buffers.emplace_back(**it);
+      owning_buffers.push_back(&*it);
+    }
+  }
+
+  const int not_found = static_cast<int>(buffers.size());
+
+  auto DoVisit = [&](auto get_buffer) {
+    DCHECK(!buffers.empty());
+
+    // owning_buffers[not_found] points to the null placeholder
+    owning_buffers.push_back(&kNullBuffer);
+
+    std::string_view buffer_containing_previous_view = buffers.front();
+    int buffer_i = 0;
+
+    for (int64_t i = 0; i < data.length; ++i) {
+      if (headers[i].IsInline()) continue;
+
+      if (ARROW_PREDICT_TRUE(IsSubrangeOf(buffer_containing_previous_view, headers[i]))) {
+        // Fast path: for most string view arrays, we'll have runs
+        // of views into the same buffer.
+      } else {
+        buffer_i = get_buffer(headers[i]);
+        if (buffer_i != not_found) {
+          // if we didn't find a buffer which owns headers[i], we can hope
+          // that there was just one out of line string and check
+          // buffer_containing_previous_view next iteration
+          buffer_containing_previous_view = buffers[buffer_i];
+        }
+      }
+
+      RETURN_NOT_OK(visitor(i, *owning_buffers[buffer_i]));
+    }
+    return Status::OK();
+  };
+
+  // Simplest check for view-in-buffer: loop through buffers and check each one.
+  auto Linear = [&](StringHeader view) {
+    int i = 0;
+    for (std::string_view buffer : buffers) {
+      if (IsSubrangeOf(buffer, view)) return i;
+      ++i;
+    }
+    return not_found;
+  };
+
+  if (buffers.size() <= 32) {
+    // If there are few buffers to search through, sorting/binary search is not
+    // worthwhile. TODO(bkietz) benchmark this and get a less magic number here.
+    return DoVisit(Linear);
+  }
+
+  auto DataPtrLess = [](std::string_view l, std::string_view r) {
+    return l.data() < r.data();
+  };
+
+  {
+    auto sort_indices = ArgSort(buffers, DataPtrLess);
+    Permute(sort_indices, &buffers);
+    Permute(sort_indices, &owning_buffers);
+  }
+
+  bool non_overlapping =
+      buffers.end() !=
+      std::adjacent_find(buffers.begin(), buffers.end(),
+                         [](std::string_view before, std::string_view after) {
+                           return before.data() + before.size() <= after.data();
+                         });
+  if (ARROW_PREDICT_FALSE(!non_overlapping)) {
+    // Using a binary search with overlapping buffers would not *uniquely* identify
+    // a potentially-containing buffer. Moreover this should be a fairly rare case
+    // so optimizing for it seems premature.
+    return DoVisit(Linear);
+  }
+
+  // More sophisticated check for view-in-buffer: binary search through the buffers.
+  return DoVisit([&](StringHeader view) {
+    // Find the first buffer whose data starts after the data in view-
+    // only buffers *before* this could contain view. Since we've additionally
+    // checked that the buffers do not overlap, only the buffer *immediately before*
+    // this could contain view.
+    int one_past_potential_super =
+        static_cast<int>(std::upper_bound(buffers.begin(), buffers.end(),
+                                          std::string_view{view}, DataPtrLess) -
+                         buffers.begin());
+
+    if (one_past_potential_super == 0) return not_found;
+
+    int i = one_past_potential_super - 1;
+    if (IsSubrangeOf(buffers[i], view)) return i;
+
+    return not_found;
+  });
+}
 
 namespace {
 
@@ -610,109 +738,16 @@ struct ValidateArrayImpl {
       return Status::OK();
     }
 
-    auto* headers = data.GetValues<StringHeader>(1);
-    std::string_view buffer_containing_previous_view;
-
-    auto IsSubrangeOf = [](std::string_view super, std::string_view sub) {
-      return super.data() <= sub.data() &&
-             super.data() + super.size() >= sub.data() + sub.size();
-    };
-
-    std::vector<std::string_view> buffers;
-    for (auto it = data.buffers.begin() + 2; it != data.buffers.end(); ++it) {
-      buffers.emplace_back(**it);
-    }
-
-    auto CheckViews = [&](auto in_a_buffer, auto check_previous_buffer) {
-      if constexpr (check_previous_buffer) {
-        buffer_containing_previous_view = buffers.front();
-      }
-
-      for (int64_t i = 0; i < data.length; ++i) {
-        if (headers[i].IsInline()) continue;
-
-        std::string_view view{headers[i]};
-
-        if constexpr (check_previous_buffer) {
-          if (ARROW_PREDICT_TRUE(IsSubrangeOf(buffer_containing_previous_view, view))) {
-            // Fast path: for most string view arrays, we'll have runs
-            // of views into the same buffer.
-            continue;
-          }
-        }
+    return VisitNonInlinedViewsAndOwningBuffers(
+        data, [&](int64_t i, const std::shared_ptr<Buffer>& owner) {
+          if (ARROW_PREDICT_TRUE(owner != nullptr)) return Status::OK();
 
-        if (!in_a_buffer(view)) {
+          auto* ptr = data.buffers[1]->data_as<StringHeader>()[i].data();
           return Status::Invalid(
-              "String view at slot ", i, " @", (std::uintptr_t)view.data(),
+              "String view at slot ", i, " @",
+              arrow::HexEncode(reinterpret_cast<uint8_t*>(&ptr), sizeof(ptr)),
               " views memory not resident in any buffer managed by the array");
-        }
-      }
-      return Status::OK();
-    };
-
-    if (buffers.empty()) {
-      // there are no character buffers; the only way this array
-      // can be valid is if all views are inline
-      return CheckViews([](std::string_view) { return std::false_type{}; },
-                        /*check_previous_buffer=*/std::false_type{});
-    }
-
-    // Simplest check for view-in-buffer: loop through buffers and check each one.
-    auto Linear = [&](std::string_view view) {
-      for (std::string_view buffer : buffers) {
-        if (IsSubrangeOf(buffer, view)) {
-          buffer_containing_previous_view = buffer;
-          return true;
-        }
-      }
-      return false;
-    };
-
-    if (buffers.size() <= 32) {
-      // If there are few buffers to search through, sorting/binary search is not
-      // worthwhile. TODO(bkietz) benchmark this and get a less magic number here.
-      return CheckViews(Linear,
-                        /*check_previous_buffer=*/std::true_type{});
-    }
-
-    auto DataPtrLess = [](std::string_view l, std::string_view r) {
-      return l.data() < r.data();
-    };
-
-    std::sort(buffers.begin(), buffers.end(), DataPtrLess);
-    bool non_overlapping =
-        buffers.end() !=
-        std::adjacent_find(buffers.begin(), buffers.end(),
-                           [](std::string_view before, std::string_view after) {
-                             return before.data() + before.size() <= after.data();
-                           });
-    if (ARROW_PREDICT_FALSE(!non_overlapping)) {
-      // Using a binary search with overlapping buffers would not *uniquely* identify
-      // a potentially-containing buffer. Moreover this should be a fairly rare case
-      // so optimizing for it seems premature.
-      return CheckViews(Linear,
-                        /*check_previous_buffer=*/std::true_type{});
-    }
-
-    // More sophisticated check for view-in-buffer: binary search through the buffers.
-    return CheckViews(
-        [&](std::string_view view) {
-          // Find the first buffer whose data starts after the data in view-
-          // only buffers *before* this could contain view. Since we've additionally
-          // checked that the buffers do not overlap, only the buffer *immediately before*
-          // this could contain view.
-          auto one_past_potential_super =
-              std::upper_bound(buffers.begin(), buffers.end(), view, DataPtrLess);
-
-          if (one_past_potential_super == buffers.begin()) return false;
-
-          auto potential_super = *(one_past_potential_super - 1);
-          if (!IsSubrangeOf(potential_super, view)) return false;
-
-          buffer_containing_previous_view = potential_super;
-          return true;
-        },
-        /*check_previous_buffer=*/std::true_type{});
+        });
   }
 
   template <typename ListType>
@@ -863,5 +898,4 @@ Status ValidateUTF8(const ArrayData& data) {
 ARROW_EXPORT
 Status ValidateUTF8(const Array& array) { return ValidateUTF8(*array.data()); }
 
-}  // namespace internal
-}  // namespace arrow
+}  // namespace arrow::internal
diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h
index 584a33fbde..73a07755b1 100644
--- a/cpp/src/arrow/buffer.h
+++ b/cpp/src/arrow/buffer.h
@@ -182,6 +182,10 @@ class ARROW_EXPORT Buffer {
 #endif
     return ARROW_PREDICT_TRUE(is_cpu_) ? data_ : NULLPTR;
   }
+  template <typename T>
+  const T* data_as() const {
+    return reinterpret_cast<const T*>(data());
+  }
 
   /// \brief Return a writable pointer to the buffer's data
   ///
@@ -199,6 +203,10 @@ class ARROW_EXPORT Buffer {
     return ARROW_PREDICT_TRUE(is_cpu_ && is_mutable_) ? const_cast<uint8_t*>(data_)
                                                       : NULLPTR;
   }
+  template <typename T>
+  T* mutable_data_as() {
+    return reinterpret_cast<T*>(mutable_data());
+  }
 
   /// \brief Return the device address of the buffer's data
   uintptr_t address() const { return reinterpret_cast<uintptr_t>(data_); }
diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc
index e0b83fbdbc..a1889884e3 100644
--- a/cpp/src/arrow/testing/random.cc
+++ b/cpp/src/arrow/testing/random.cc
@@ -782,6 +782,16 @@ std::shared_ptr<Array> RandomArrayGenerator::ArrayOf(const Field& field, int64_t
                   ->View(field.type());
     }
 
+    case Type::type::STRING_VIEW:
+    case Type::type::BINARY_VIEW: {
+      const auto min_length =
+          GetMetadata<int32_t>(field.metadata().get(), "min_length", 0);
+      const auto max_length =
+          GetMetadata<int32_t>(field.metadata().get(), "max_length", 20);
+      return *StringView(length, min_length, max_length, null_probability)
+                  ->View(field.type());
+    }
+
     case Type::type::DECIMAL128:
       return Decimal128(field.type(), length, null_probability);