You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by bk...@apache.org on 2022/11/29 21:03:06 UTC
[arrow] 01/02: Extract visitation of views owning buffers
This is an automated email from the ASF dual-hosted git repository.
bkietz pushed a commit to branch feature/format-string-view
in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 10a07d0e669f3941b473a7824a5015b9d18ca78a
Author: Benjamin Kietzman <be...@gmail.com>
AuthorDate: Mon Nov 28 14:09:59 2022 -0500
Extract visitation of views owning buffers
---
cpp/src/arrow/array/array_binary_test.cc | 8 +
cpp/src/arrow/array/array_test.cc | 52 +++----
cpp/src/arrow/array/validate.cc | 242 ++++++++++++++++++-------------
cpp/src/arrow/buffer.h | 8 +
cpp/src/arrow/testing/random.cc | 10 ++
5 files changed, 190 insertions(+), 130 deletions(-)
diff --git a/cpp/src/arrow/array/array_binary_test.cc b/cpp/src/arrow/array/array_binary_test.cc
index 92fc16f775..f21abf681f 100644
--- a/cpp/src/arrow/array/array_binary_test.cc
+++ b/cpp/src/arrow/array/array_binary_test.cc
@@ -389,6 +389,14 @@ TEST(StringViewArray, Validate) {
.ValidateFull(),
Ok());
+ // overlapping views and buffers are allowed
+ EXPECT_THAT(MakeArray({StringHeader(std::string_view{*buffer_s}),
+ StringHeader(std::string_view{*buffer_s}.substr(5, 5)),
+ StringHeader(std::string_view{*buffer_s}.substr(9, 4))},
+ {buffer_s, SliceBuffer(buffer_s, 1, 1), SliceBuffer(buffer_s, 3, 6)})
+ .ValidateFull(),
+ Ok());
+
EXPECT_THAT(MakeArray({StringHeader(std::string_view{*buffer_s}),
// if a view points outside the buffers, that is invalid
StringHeader("from a galaxy far, far away"),
diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc
index c14d4f21ac..36b274a99a 100644
--- a/cpp/src/arrow/array/array_test.cc
+++ b/cpp/src/arrow/array/array_test.cc
@@ -663,32 +663,32 @@ TEST_F(TestArray, TestMakeEmptyArray) {
FieldVector union_fields2({field("a", null()), field("b", list(large_utf8()))});
std::vector<int8_t> union_type_codes{7, 42};
- std::shared_ptr<DataType> types[] = {null(),
- boolean(),
- int8(),
- uint16(),
- int32(),
- uint64(),
- float64(),
- binary(),
- large_binary(),
- fixed_size_binary(3),
- decimal(16, 4),
- utf8(),
- large_utf8(),
- list(utf8()),
- list(int64()),
- large_list(large_utf8()),
- fixed_size_list(utf8(), 3),
- fixed_size_list(int64(), 4),
- dictionary(int32(), utf8()),
- struct_({field("a", utf8()), field("b", int32())}),
- sparse_union(union_fields1, union_type_codes),
- sparse_union(union_fields2, union_type_codes),
- dense_union(union_fields1, union_type_codes),
- dense_union(union_fields2, union_type_codes)};
-
- for (auto type : types) {
+ for (auto type : {null(),
+ boolean(),
+ int8(),
+ uint16(),
+ int32(),
+ uint64(),
+ float64(),
+ binary(),
+ binary_view(),
+ large_binary(),
+ fixed_size_binary(3),
+ decimal(16, 4),
+ utf8(),
+ utf8_view(),
+ large_utf8(),
+ list(utf8()),
+ list(int64()),
+ large_list(large_utf8()),
+ fixed_size_list(utf8(), 3),
+ fixed_size_list(int64(), 4),
+ dictionary(int32(), utf8()),
+ struct_({field("a", utf8()), field("b", int32())}),
+ sparse_union(union_fields1, union_type_codes),
+ sparse_union(union_fields2, union_type_codes),
+ dense_union(union_fields1, union_type_codes),
+ dense_union(union_fields2, union_type_codes)}) {
ARROW_SCOPED_TRACE("type = ", type->ToString());
ASSERT_OK_AND_ASSIGN(auto array, MakeEmptyArray(type));
ASSERT_OK(array->ValidateFull());
diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc
index 53836efd97..2b83c84b28 100644
--- a/cpp/src/arrow/array/validate.cc
+++ b/cpp/src/arrow/array/validate.cc
@@ -30,13 +30,141 @@
#include "arrow/util/decimal.h"
#include "arrow/util/int_util_overflow.h"
#include "arrow/util/logging.h"
+#include "arrow/util/sort.h"
+#include "arrow/util/string.h"
#include "arrow/util/unreachable.h"
#include "arrow/util/utf8.h"
#include "arrow/visit_data_inline.h"
#include "arrow/visit_type_inline.h"
-namespace arrow {
-namespace internal {
+namespace arrow::internal {
+
+/// visitor will be called once for each non-inlined StringHeader.
+/// It will be passed the index of each non-inlined StringHeader,
+/// as well as a `const shared_ptr<Buffer>&` of the buffer
+/// wherein the viewed memory resides, or nullptr if the viewed memory
+/// is not in a buffer managed by this array.
+template <typename Visitor>
+Status VisitNonInlinedViewsAndOwningBuffers(const ArrayData& data,
+ const Visitor& visitor) {
+ auto* headers = data.buffers[1]->data_as<StringHeader>();
+
+ static const std::shared_ptr<Buffer> kNullBuffer = nullptr;
+
+ if (data.buffers.size() == 2 ||
+ (data.buffers.size() == 3 && data.buffers.back() == nullptr)) {
+ // there are no character buffers, just visit a null buffer
+ for (int64_t i = 0; i < data.length; ++i) {
+ if (headers[i].IsInline()) continue;
+ RETURN_NOT_OK(visitor(i, kNullBuffer));
+ }
+ return Status::OK();
+ }
+
+ auto IsSubrangeOf = [](std::string_view super, StringHeader sub) {
+ return super.data() <= sub.data() &&
+ super.data() + super.size() >= sub.data() + sub.size();
+ };
+
+ std::vector<std::string_view> buffers;
+ std::vector<const std::shared_ptr<Buffer>*> owning_buffers;
+ for (auto it = data.buffers.begin() + 2; it != data.buffers.end(); ++it) {
+ if (*it != nullptr) {
+ buffers.emplace_back(**it);
+ owning_buffers.push_back(&*it);
+ }
+ }
+
+ const int not_found = static_cast<int>(buffers.size());
+
+ auto DoVisit = [&](auto get_buffer) {
+ DCHECK(!buffers.empty());
+
+ // owning_buffers[not_found] points to the null placeholder
+ owning_buffers.push_back(&kNullBuffer);
+
+ std::string_view buffer_containing_previous_view = buffers.front();
+ int buffer_i = 0;
+
+ for (int64_t i = 0; i < data.length; ++i) {
+ if (headers[i].IsInline()) continue;
+
+ if (ARROW_PREDICT_TRUE(IsSubrangeOf(buffer_containing_previous_view, headers[i]))) {
+ // Fast path: for most string view arrays, we'll have runs
+ // of views into the same buffer.
+ } else {
+ buffer_i = get_buffer(headers[i]);
+ if (buffer_i != not_found) {
+ // if we didn't find a buffer which owns headers[i], we can hope
+ // that there was just one out of line string and check
+ // buffer_containing_previous_view next iteration
+ buffer_containing_previous_view = buffers[buffer_i];
+ }
+ }
+
+ RETURN_NOT_OK(visitor(i, *owning_buffers[buffer_i]));
+ }
+ return Status::OK();
+ };
+
+ // Simplest check for view-in-buffer: loop through buffers and check each one.
+ auto Linear = [&](StringHeader view) {
+ int i = 0;
+ for (std::string_view buffer : buffers) {
+ if (IsSubrangeOf(buffer, view)) return i;
+ ++i;
+ }
+ return not_found;
+ };
+
+ if (buffers.size() <= 32) {
+ // If there are few buffers to search through, sorting/binary search is not
+ // worthwhile. TODO(bkietz) benchmark this and get a less magic number here.
+ return DoVisit(Linear);
+ }
+
+ auto DataPtrLess = [](std::string_view l, std::string_view r) {
+ return l.data() < r.data();
+ };
+
+ {
+ auto sort_indices = ArgSort(buffers, DataPtrLess);
+ Permute(sort_indices, &buffers);
+ Permute(sort_indices, &owning_buffers);
+ }
+
+ bool non_overlapping =
+ buffers.end() !=
+ std::adjacent_find(buffers.begin(), buffers.end(),
+ [](std::string_view before, std::string_view after) {
+ return before.data() + before.size() <= after.data();
+ });
+ if (ARROW_PREDICT_FALSE(!non_overlapping)) {
+ // Using a binary search with overlapping buffers would not *uniquely* identify
+ // a potentially-containing buffer. Moreover this should be a fairly rare case
+ // so optimizing for it seems premature.
+ return DoVisit(Linear);
+ }
+
+ // More sophisticated check for view-in-buffer: binary search through the buffers.
+ return DoVisit([&](StringHeader view) {
+ // Find the first buffer whose data starts after the data in view-
+ // only buffers *before* this could contain view. Since we've additionally
+ // checked that the buffers do not overlap, only the buffer *immediately before*
+ // this could contain view.
+ int one_past_potential_super =
+ static_cast<int>(std::upper_bound(buffers.begin(), buffers.end(),
+ std::string_view{view}, DataPtrLess) -
+ buffers.begin());
+
+ if (one_past_potential_super == 0) return not_found;
+
+ int i = one_past_potential_super - 1;
+ if (IsSubrangeOf(buffers[i], view)) return i;
+
+ return not_found;
+ });
+}
namespace {
@@ -610,109 +738,16 @@ struct ValidateArrayImpl {
return Status::OK();
}
- auto* headers = data.GetValues<StringHeader>(1);
- std::string_view buffer_containing_previous_view;
-
- auto IsSubrangeOf = [](std::string_view super, std::string_view sub) {
- return super.data() <= sub.data() &&
- super.data() + super.size() >= sub.data() + sub.size();
- };
-
- std::vector<std::string_view> buffers;
- for (auto it = data.buffers.begin() + 2; it != data.buffers.end(); ++it) {
- buffers.emplace_back(**it);
- }
-
- auto CheckViews = [&](auto in_a_buffer, auto check_previous_buffer) {
- if constexpr (check_previous_buffer) {
- buffer_containing_previous_view = buffers.front();
- }
-
- for (int64_t i = 0; i < data.length; ++i) {
- if (headers[i].IsInline()) continue;
-
- std::string_view view{headers[i]};
-
- if constexpr (check_previous_buffer) {
- if (ARROW_PREDICT_TRUE(IsSubrangeOf(buffer_containing_previous_view, view))) {
- // Fast path: for most string view arrays, we'll have runs
- // of views into the same buffer.
- continue;
- }
- }
+ return VisitNonInlinedViewsAndOwningBuffers(
+ data, [&](int64_t i, const std::shared_ptr<Buffer>& owner) {
+ if (ARROW_PREDICT_TRUE(owner != nullptr)) return Status::OK();
- if (!in_a_buffer(view)) {
+ auto* ptr = data.buffers[1]->data_as<StringHeader>()[i].data();
return Status::Invalid(
- "String view at slot ", i, " @", (std::uintptr_t)view.data(),
+ "String view at slot ", i, " @",
+ arrow::HexEncode(reinterpret_cast<uint8_t*>(&ptr), sizeof(ptr)),
" views memory not resident in any buffer managed by the array");
- }
- }
- return Status::OK();
- };
-
- if (buffers.empty()) {
- // there are no character buffers; the only way this array
- // can be valid is if all views are inline
- return CheckViews([](std::string_view) { return std::false_type{}; },
- /*check_previous_buffer=*/std::false_type{});
- }
-
- // Simplest check for view-in-buffer: loop through buffers and check each one.
- auto Linear = [&](std::string_view view) {
- for (std::string_view buffer : buffers) {
- if (IsSubrangeOf(buffer, view)) {
- buffer_containing_previous_view = buffer;
- return true;
- }
- }
- return false;
- };
-
- if (buffers.size() <= 32) {
- // If there are few buffers to search through, sorting/binary search is not
- // worthwhile. TODO(bkietz) benchmark this and get a less magic number here.
- return CheckViews(Linear,
- /*check_previous_buffer=*/std::true_type{});
- }
-
- auto DataPtrLess = [](std::string_view l, std::string_view r) {
- return l.data() < r.data();
- };
-
- std::sort(buffers.begin(), buffers.end(), DataPtrLess);
- bool non_overlapping =
- buffers.end() !=
- std::adjacent_find(buffers.begin(), buffers.end(),
- [](std::string_view before, std::string_view after) {
- return before.data() + before.size() <= after.data();
- });
- if (ARROW_PREDICT_FALSE(!non_overlapping)) {
- // Using a binary search with overlapping buffers would not *uniquely* identify
- // a potentially-containing buffer. Moreover this should be a fairly rare case
- // so optimizing for it seems premature.
- return CheckViews(Linear,
- /*check_previous_buffer=*/std::true_type{});
- }
-
- // More sophisticated check for view-in-buffer: binary search through the buffers.
- return CheckViews(
- [&](std::string_view view) {
- // Find the first buffer whose data starts after the data in view-
- // only buffers *before* this could contain view. Since we've additionally
- // checked that the buffers do not overlap, only the buffer *immediately before*
- // this could contain view.
- auto one_past_potential_super =
- std::upper_bound(buffers.begin(), buffers.end(), view, DataPtrLess);
-
- if (one_past_potential_super == buffers.begin()) return false;
-
- auto potential_super = *(one_past_potential_super - 1);
- if (!IsSubrangeOf(potential_super, view)) return false;
-
- buffer_containing_previous_view = potential_super;
- return true;
- },
- /*check_previous_buffer=*/std::true_type{});
+ });
}
template <typename ListType>
@@ -863,5 +898,4 @@ Status ValidateUTF8(const ArrayData& data) {
ARROW_EXPORT
Status ValidateUTF8(const Array& array) { return ValidateUTF8(*array.data()); }
-} // namespace internal
-} // namespace arrow
+} // namespace arrow::internal
diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h
index 584a33fbde..73a07755b1 100644
--- a/cpp/src/arrow/buffer.h
+++ b/cpp/src/arrow/buffer.h
@@ -182,6 +182,10 @@ class ARROW_EXPORT Buffer {
#endif
return ARROW_PREDICT_TRUE(is_cpu_) ? data_ : NULLPTR;
}
+ template <typename T>
+ const T* data_as() const {
+ return reinterpret_cast<const T*>(data());
+ }
/// \brief Return a writable pointer to the buffer's data
///
@@ -199,6 +203,10 @@ class ARROW_EXPORT Buffer {
return ARROW_PREDICT_TRUE(is_cpu_ && is_mutable_) ? const_cast<uint8_t*>(data_)
: NULLPTR;
}
+ template <typename T>
+ T* mutable_data_as() {
+ return reinterpret_cast<T*>(mutable_data());
+ }
/// \brief Return the device address of the buffer's data
uintptr_t address() const { return reinterpret_cast<uintptr_t>(data_); }
diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc
index e0b83fbdbc..a1889884e3 100644
--- a/cpp/src/arrow/testing/random.cc
+++ b/cpp/src/arrow/testing/random.cc
@@ -782,6 +782,16 @@ std::shared_ptr<Array> RandomArrayGenerator::ArrayOf(const Field& field, int64_t
->View(field.type());
}
+ case Type::type::STRING_VIEW:
+ case Type::type::BINARY_VIEW: {
+ const auto min_length =
+ GetMetadata<int32_t>(field.metadata().get(), "min_length", 0);
+ const auto max_length =
+ GetMetadata<int32_t>(field.metadata().get(), "max_length", 20);
+ return *StringView(length, min_length, max_length, null_probability)
+ ->View(field.type());
+ }
+
case Type::type::DECIMAL128:
return Decimal128(field.type(), length, null_probability);