You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by bk...@apache.org on 2022/11/30 14:50:55 UTC
[arrow] 11/15: Added validation for StringView arrays
This is an automated email from the ASF dual-hosted git repository.
bkietz pushed a commit to branch feature/format-string-view
in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 7474342cf2214d88778dc33526013ec82537636a
Author: Benjamin Kietzman <be...@gmail.com>
AuthorDate: Fri Nov 18 13:07:57 2022 -0500
Added validation for StringView arrays
---
cpp/src/arrow/array/array_base.cc | 4 +-
cpp/src/arrow/array/array_binary.h | 38 +++++-
cpp/src/arrow/array/array_binary_test.cc | 67 +++++++---
cpp/src/arrow/array/array_test.cc | 4 +-
cpp/src/arrow/array/builder_base.cc | 17 ++-
cpp/src/arrow/array/builder_binary.h | 4 +-
cpp/src/arrow/array/util.cc | 28 +++-
cpp/src/arrow/array/validate.cc | 147 +++++++++++++++++++--
cpp/src/arrow/compare.cc | 8 +-
.../arrow/compute/kernels/scalar_nested_test.cc | 3 +
.../arrow/compute/kernels/scalar_string_test.cc | 10 +-
cpp/src/arrow/compute/kernels/vector_hash.cc | 94 ++++---------
cpp/src/arrow/scalar.cc | 20 +--
cpp/src/arrow/scalar.h | 18 ++-
cpp/src/arrow/testing/gtest_util.h | 6 +-
cpp/src/arrow/type.h | 11 +-
16 files changed, 331 insertions(+), 148 deletions(-)
diff --git a/cpp/src/arrow/array/array_base.cc b/cpp/src/arrow/array/array_base.cc
index de9ab2e985..f4f860ca95 100644
--- a/cpp/src/arrow/array/array_base.cc
+++ b/cpp/src/arrow/array/array_base.cc
@@ -83,7 +83,9 @@ struct ScalarFromArraySlotImpl {
}
Status Visit(const BinaryViewArray& a) {
- return Status::NotImplemented("ScalarFromArraySlot -> BinaryView");
+ StringHeader header = a.Value(index_);
+ std::string_view view{header};
+ return Finish(std::string{view});
}
Status Visit(const FixedSizeBinaryArray& a) { return Finish(a.GetString(index_)); }
diff --git a/cpp/src/arrow/array/array_binary.h b/cpp/src/arrow/array/array_binary.h
index 03ee77fab8..1c8947dde3 100644
--- a/cpp/src/arrow/array/array_binary.h
+++ b/cpp/src/arrow/array/array_binary.h
@@ -230,16 +230,37 @@ class ARROW_EXPORT BinaryViewArray : public PrimitiveArray {
explicit BinaryViewArray(const std::shared_ptr<ArrayData>& data);
- BinaryViewArray(int64_t length, const std::shared_ptr<Buffer>& data,
- const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+ /// By default, ValidateFull() will check each view in a BinaryViewArray or
+ /// StringViewArray to ensure it references a memory range owned by one of the array's
+ /// buffers.
+ ///
+ /// If the last character buffer is null, ValidateFull will skip this step. Use this
+ /// for arrays which view memory elsewhere.
+ static BufferVector DoNotValidateViews(BufferVector char_buffers) {
+ char_buffers.push_back(NULLPTR);
+ return char_buffers;
+ }
+
+ static bool OptedOutOfViewValidation(const ArrayData& data) {
+ return data.buffers.back() == NULLPTR;
+ }
+ bool OptedOutOfViewValidation() const { return OptedOutOfViewValidation(*data_); }
+
+ BinaryViewArray(int64_t length, std::shared_ptr<Buffer> data, BufferVector char_buffers,
+ std::shared_ptr<Buffer> null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0)
- : PrimitiveArray(binary_view(), length, data, null_bitmap, null_count, offset) {}
+ : PrimitiveArray(binary_view(), length, std::move(data), std::move(null_bitmap),
+ null_count, offset) {
+ for (auto& char_buffer : char_buffers) {
+ data_->buffers.push_back(std::move(char_buffer));
+ }
+ }
const StringHeader* raw_values() const {
return reinterpret_cast<const StringHeader*>(raw_values_) + data_->offset;
}
- StringHeader Value(int64_t i) const { return raw_values()[i]; }
+ const StringHeader& Value(int64_t i) const { return raw_values()[i]; }
// For API compatibility with BinaryArray etc.
std::string_view GetView(int64_t i) const { return std::string_view(Value(i)); }
@@ -264,10 +285,13 @@ class ARROW_EXPORT StringViewArray : public BinaryViewArray {
explicit StringViewArray(const std::shared_ptr<ArrayData>& data);
- StringViewArray(int64_t length, const std::shared_ptr<Buffer>& data,
- const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+ StringViewArray(int64_t length, std::shared_ptr<Buffer> data, BufferVector char_buffers,
+ std::shared_ptr<Buffer> null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0)
- : BinaryViewArray(utf8_view(), length, data, null_bitmap, null_count, offset) {}
+ : BinaryViewArray(length, std::move(data), std::move(char_buffers),
+ std::move(null_bitmap), null_count, offset) {
+ data_->type = utf8_view();
+ }
/// \brief Validate that this array contains only valid UTF8 entries
///
diff --git a/cpp/src/arrow/array/array_binary_test.cc b/cpp/src/arrow/array/array_binary_test.cc
index c9f1b1cfab..92fc16f775 100644
--- a/cpp/src/arrow/array/array_binary_test.cc
+++ b/cpp/src/arrow/array/array_binary_test.cc
@@ -32,6 +32,7 @@
#include "arrow/status.h"
#include "arrow/testing/builder.h"
#include "arrow/testing/gtest_util.h"
+#include "arrow/testing/matchers.h"
#include "arrow/testing/util.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
@@ -365,38 +366,73 @@ TYPED_TEST(TestStringArray, TestValidateOffsets) { this->TestValidateOffsets();
TYPED_TEST(TestStringArray, TestValidateData) { this->TestValidateData(); }
+TEST(StringViewArray, Validate) {
+ auto MakeArray = [](std::vector<StringHeader> headers, BufferVector char_buffers) {
+ auto length = static_cast<int64_t>(headers.size());
+ return StringViewArray(length, Buffer::Wrap(std::move(headers)),
+ std::move(char_buffers));
+ };
+
+ // empty array is valid
+ EXPECT_THAT(MakeArray({}, {}).ValidateFull(), Ok());
+
+ // inline views need not have a corresponding buffer
+ EXPECT_THAT(MakeArray({"hello", "world", "inline me"}, {}).ValidateFull(), Ok());
+
+ auto buffer_s = Buffer::FromString("supercalifragilistic(sp?)");
+ auto buffer_y = Buffer::FromString("yyyyyyyyyyyyyyyyyyyyyyyyy");
+
+ // non-inline views are expected to reside in a buffer managed by the array
+ EXPECT_THAT(MakeArray({StringHeader(std::string_view{*buffer_s}),
+ StringHeader(std::string_view{*buffer_y})},
+ {buffer_s, buffer_y})
+ .ValidateFull(),
+ Ok());
+
+ EXPECT_THAT(MakeArray({StringHeader(std::string_view{*buffer_s}),
+ // if a view points outside the buffers, that is invalid
+ StringHeader("from a galaxy far, far away"),
+ StringHeader(std::string_view{*buffer_y})},
+ {buffer_s, buffer_y})
+ .ValidateFull(),
+ Raises(StatusCode::Invalid));
+
+ // ... unless specifically overridden
+ EXPECT_THAT(
+ MakeArray({"from a galaxy far, far away"}, StringViewArray::DoNotValidateViews({}))
+ .ValidateFull(),
+ Ok());
+}
+
template <typename T>
class TestUTF8Array : public ::testing::Test {
public:
using TypeClass = T;
- using offset_type = typename TypeClass::offset_type;
using ArrayType = typename TypeTraits<TypeClass>::ArrayType;
- Status ValidateUTF8(int64_t length, std::vector<offset_type> offsets,
- std::string_view data, int64_t offset = 0) {
- ArrayType arr(length, Buffer::Wrap(offsets), std::make_shared<Buffer>(data),
- /*null_bitmap=*/nullptr, /*null_count=*/0, offset);
- return arr.ValidateUTF8();
+ Status ValidateUTF8(const Array& arr) {
+ return checked_cast<const ArrayType&>(arr).ValidateUTF8();
}
- Status ValidateUTF8(const std::string& json) {
- auto ty = TypeTraits<T>::type_singleton();
- auto arr = ArrayFromJSON(ty, json);
- return checked_cast<const ArrayType&>(*arr).ValidateUTF8();
+ Status ValidateUTF8(std::vector<std::string> values) {
+ std::shared_ptr<Array> arr;
+ ArrayFromVector<T, std::string>(values, &arr);
+ return ValidateUTF8(*arr);
}
void TestValidateUTF8() {
- ASSERT_OK(ValidateUTF8(R"(["Voix", "ambiguë", "d’un", "cœur"])"));
- ASSERT_OK(ValidateUTF8(1, {0, 4}, "\xf4\x8f\xbf\xbf")); // \U0010ffff
+ ASSERT_OK(ValidateUTF8(*ArrayFromJSON(TypeTraits<T>::type_singleton(),
+ R"(["Voix", "ambiguë", "d’un", "cœur"])")));
+ ASSERT_OK(ValidateUTF8({"\xf4\x8f\xbf\xbf"})); // \U0010ffff
- ASSERT_RAISES(Invalid, ValidateUTF8(1, {0, 1}, "\xf4"));
+ ASSERT_RAISES(Invalid, ValidateUTF8({"\xf4"}));
// More tests in TestValidateData() above
// (ValidateFull() calls ValidateUTF8() internally)
}
};
-TYPED_TEST_SUITE(TestUTF8Array, StringArrowTypes);
+TYPED_TEST_SUITE(TestUTF8Array, StringOrStringViewArrowTypes);
TYPED_TEST(TestUTF8Array, TestValidateUTF8) { this->TestValidateUTF8(); }
@@ -908,9 +944,6 @@ class TestBaseBinaryDataVisitor : public ::testing::Test {
std::shared_ptr<DataType> type_;
};
-using BinaryAndBin = ::testing::Types<BinaryType, LargeBinaryType, StringType,
- LargeStringType, BinaryViewType, StringViewType>;
-
TYPED_TEST_SUITE(TestBaseBinaryDataVisitor, BaseBinaryOrBinaryViewLikeArrowTypes);
TYPED_TEST(TestBaseBinaryDataVisitor, Basics) { this->TestBasics(); }
diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc
index d4ad1578b7..c14d4f21ac 100644
--- a/cpp/src/arrow/array/array_test.cc
+++ b/cpp/src/arrow/array/array_test.cc
@@ -544,12 +544,14 @@ static ScalarVector GetScalars() {
std::make_shared<DurationScalar>(60, duration(TimeUnit::SECOND)),
std::make_shared<BinaryScalar>(hello),
std::make_shared<LargeBinaryScalar>(hello),
+ std::make_shared<BinaryViewScalar>(hello),
std::make_shared<FixedSizeBinaryScalar>(
hello, fixed_size_binary(static_cast<int32_t>(hello->size()))),
std::make_shared<Decimal128Scalar>(Decimal128(10), decimal(16, 4)),
std::make_shared<Decimal256Scalar>(Decimal256(10), decimal(76, 38)),
std::make_shared<StringScalar>(hello),
std::make_shared<LargeStringScalar>(hello),
+ std::make_shared<StringViewScalar>(hello),
std::make_shared<ListScalar>(ArrayFromJSON(int8(), "[1, 2, 3]")),
ScalarFromJSON(map(int8(), utf8()), R"([[1, "foo"], [2, "bar"]])"),
std::make_shared<LargeListScalar>(ArrayFromJSON(int8(), "[1, 1, 2, 2, 3, 3]")),
@@ -594,7 +596,7 @@ TEST_F(TestArray, TestMakeArrayFromScalar) {
ASSERT_EQ(array->null_count(), 0);
// test case for ARROW-13321
- for (int64_t i : std::vector<int64_t>{0, length / 2, length - 1}) {
+ for (int64_t i : {int64_t{0}, length / 2, length - 1}) {
ASSERT_OK_AND_ASSIGN(auto s, array->GetScalar(i));
AssertScalarsEqual(*s, *scalar, /*verbose=*/true);
}
diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc
index e9d5fb44ac..3b2ee570f9 100644
--- a/cpp/src/arrow/array/builder_base.cc
+++ b/cpp/src/arrow/array/builder_base.cc
@@ -103,10 +103,7 @@ namespace {
struct AppendScalarImpl {
template <typename T>
- enable_if_t<has_c_type<T>::value || is_decimal_type<T>::value ||
- is_fixed_size_binary_type<T>::value,
- Status>
- Visit(const T&) {
+ Status HandleFixedWidth(const T&) {
auto builder = checked_cast<typename TypeTraits<T>::BuilderType*>(builder_);
RETURN_NOT_OK(builder->Reserve(n_repeats_ * (scalars_end_ - scalars_begin_)));
@@ -125,7 +122,17 @@ struct AppendScalarImpl {
}
template <typename T>
- enable_if_base_binary<T, Status> Visit(const T&) {
+ enable_if_t<has_c_type<T>::value, Status> Visit(const T& t) {
+ return HandleFixedWidth(t);
+ }
+
+ Status Visit(const FixedSizeBinaryType& t) { return HandleFixedWidth(t); }
+ Status Visit(const Decimal128Type& t) { return HandleFixedWidth(t); }
+ Status Visit(const Decimal256Type& t) { return HandleFixedWidth(t); }
+
+ template <typename T>
+ enable_if_t<is_binary_like_type<T>::value || is_string_like_type<T>::value, Status>
+ Visit(const T&) {
int64_t data_size = 0;
for (const std::shared_ptr<Scalar>* raw = scalars_begin_; raw != scalars_end_;
raw++) {
diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h
index b9d926cb16..30ab4b9d4a 100644
--- a/cpp/src/arrow/array/builder_binary.h
+++ b/cpp/src/arrow/array/builder_binary.h
@@ -576,7 +576,6 @@ class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder {
Status Append(StringHeader value) {
ARROW_RETURN_NOT_OK(Reserve(1));
UnsafeAppend(value);
- UnsafeAppendToBitmap(true);
return Status::OK();
}
@@ -591,7 +590,6 @@ class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder {
value = data_heap_builder_.UnsafeAppend(value, length);
}
UnsafeAppend(StringHeader(value, length));
- UnsafeAppendToBitmap(true);
}
void UnsafeAppend(const char* value, int64_t length) {
@@ -653,7 +651,7 @@ class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder {
}
void UnsafeAppendEmptyValue() {
- data_builder_.UnsafeAppend(StringHeader(""));
+ data_builder_.UnsafeAppend(StringHeader());
UnsafeAppendToBitmap(true);
}
diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc
index ac9d76d469..fe5a0dd575 100644
--- a/cpp/src/arrow/array/util.cc
+++ b/cpp/src/arrow/array/util.cc
@@ -355,6 +355,10 @@ class NullArrayFactory {
return MaxOf(sizeof(typename T::offset_type) * (length_ + 1));
}
+ Status Visit(const BinaryViewType& type) {
+ return MaxOf(sizeof(StringHeader) * length_);
+ }
+
Status Visit(const FixedSizeListType& type) {
return MaxOf(GetBufferLength(type.value_type(), type.list_size() * length_));
}
@@ -463,6 +467,11 @@ class NullArrayFactory {
return Status::OK();
}
+ Status Visit(const BinaryViewType&) {
+ out_->buffers.resize(2, buffer_);
+ return Status::OK();
+ }
+
template <typename T>
enable_if_var_size_list<T, Status> Visit(const T& type) {
out_->buffers.resize(2, buffer_);
@@ -599,14 +608,27 @@ class RepeatedArrayFactory {
RETURN_NOT_OK(CreateBufferOf(value->data(), value->size(), &values_buffer));
auto size = static_cast<typename T::offset_type>(value->size());
RETURN_NOT_OK(CreateOffsetsBuffer(size, &offsets_buffer));
- out_ = std::make_shared<typename TypeTraits<T>::ArrayType>(length_, offsets_buffer,
- values_buffer);
+ out_ = std::make_shared<typename TypeTraits<T>::ArrayType>(
+ length_, std::move(offsets_buffer), std::move(values_buffer));
return Status::OK();
}
template <typename T>
enable_if_binary_view_like<T, Status> Visit(const T&) {
- return Status::NotImplemented("binary / string view");
+ const std::shared_ptr<Buffer>& value =
+ checked_cast<const typename TypeTraits<T>::ScalarType&>(scalar_).value;
+
+ StringHeader header{std::string_view{*value}};
+ std::shared_ptr<Buffer> header_buffer;
+ RETURN_NOT_OK(CreateBufferOf(&header, sizeof(header), &header_buffer));
+
+ BufferVector char_buffers;
+ if (!header.IsInline()) {
+ char_buffers.push_back(value);
+ }
+ out_ = std::make_shared<typename TypeTraits<T>::ArrayType>(
+ length_, std::move(header_buffer), std::move(char_buffers));
+ return Status::OK();
}
template <typename T>
diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc
index cddb086005..53d74ba148 100644
--- a/cpp/src/arrow/array/validate.cc
+++ b/cpp/src/arrow/array/validate.cc
@@ -30,6 +30,7 @@
#include "arrow/util/decimal.h"
#include "arrow/util/int_util_overflow.h"
#include "arrow/util/logging.h"
+#include "arrow/util/unreachable.h"
#include "arrow/util/utf8.h"
#include "arrow/visit_data_inline.h"
#include "arrow/visit_type_inline.h"
@@ -42,10 +43,7 @@ namespace {
struct UTF8DataValidator {
const ArrayData& data;
- Status Visit(const DataType&) {
- // Default, should be unreachable
- return Status::NotImplemented("");
- }
+ Status Visit(const DataType&) { Unreachable("utf-8 validation of non string type"); }
Status Visit(const StringViewType&) {
util::InitializeUTF8();
@@ -86,10 +84,7 @@ struct BoundsChecker {
int64_t min_value;
int64_t max_value;
- Status Visit(const DataType&) {
- // Default, should be unreachable
- return Status::NotImplemented("");
- }
+ Status Visit(const DataType&) { Unreachable("bounds checking of non integer type"); }
template <typename IntegerType>
enable_if_integer<IntegerType, Status> Visit(const IntegerType&) {
@@ -260,9 +255,7 @@ struct ValidateArrayImpl {
Status Visit(const LargeBinaryType& type) { return ValidateBinaryLike(type); }
- Status Visit(const BinaryViewType& type) {
- return Status::NotImplemented("binary / string view");
- }
+ Status Visit(const BinaryViewType& type) { return ValidateBinaryView(type); }
Status Visit(const ListType& type) { return ValidateListLike(type); }
@@ -455,7 +448,14 @@ struct ValidateArrayImpl {
return Status::Invalid("Array length is negative");
}
- if (data.buffers.size() != layout.buffers.size()) {
+ if (layout.variadic_spec) {
+ if (data.buffers.size() < layout.buffers.size()) {
+ return Status::Invalid("Expected at least ", layout.buffers.size(),
+ " buffers in array "
+ "of type ",
+ type.ToString(), ", got ", data.buffers.size());
+ }
+ } else if (data.buffers.size() != layout.buffers.size()) {
return Status::Invalid("Expected ", layout.buffers.size(),
" buffers in array "
"of type ",
@@ -471,7 +471,9 @@ struct ValidateArrayImpl {
for (int i = 0; i < static_cast<int>(data.buffers.size()); ++i) {
const auto& buffer = data.buffers[i];
- const auto& spec = layout.buffers[i];
+ const auto& spec = i < static_cast<int>(layout.buffers.size())
+ ? layout.buffers[i]
+ : *layout.variadic_spec;
if (buffer == nullptr) {
continue;
@@ -594,6 +596,125 @@ struct ValidateArrayImpl {
return Status::OK();
}
+ Status ValidateBinaryView(const BinaryViewType& type) {
+ int64_t headers_byte_size = data.buffers[1]->size();
+ int64_t required_headers = data.length + data.offset;
+ if (static_cast<int64_t>(headers_byte_size / sizeof(StringHeader)) <
+ required_headers) {
+ return Status::Invalid("Header buffer size (bytes): ", headers_byte_size,
+ " isn't large enough for length: ", data.length,
+ " and offset: ", data.offset);
+ }
+
+ if (!full_validation || BinaryViewArray::OptedOutOfViewValidation(data)) {
+ return Status::OK();
+ }
+
+ auto* headers = data.GetValues<StringHeader>(1);
+ std::string_view buffer_containing_previous_view;
+
+ auto IsSubrangeOf = [](std::string_view super, std::string_view sub) {
+ return super.data() <= sub.data() &&
+ super.data() + super.size() <= sub.data() + sub.size();
+ };
+
+ std::vector<std::string_view> buffers;
+ for (auto it = data.buffers.begin() + 2; it != data.buffers.end(); ++it) {
+ buffers.emplace_back(**it);
+ }
+
+ auto CheckViews = [&](auto in_a_buffer, auto check_previous_buffer) {
+ if constexpr (check_previous_buffer) {
+ buffer_containing_previous_view = buffers.front();
+ }
+
+ for (int64_t i = 0; i < data.length; ++i) {
+ if (headers[i].IsInline()) continue;
+
+ std::string_view view{headers[i]};
+
+ if constexpr (check_previous_buffer) {
+ if (ARROW_PREDICT_TRUE(IsSubrangeOf(buffer_containing_previous_view, view))) {
+ // Fast path: for most string view arrays, we'll have runs
+ // of views into the same buffer.
+ continue;
+ }
+ }
+
+ if (!in_a_buffer(view)) {
+ return Status::Invalid(
+ "String view at slot ", i,
+ " views memory not resident in any buffer managed by the array");
+ }
+ }
+ return Status::OK();
+ };
+
+ if (buffers.empty()) {
+ // there are no character buffers; the only way this array
+ // can be valid is if all views are inline
+ return CheckViews([](std::string_view) { return std::false_type{}; },
+ /*check_previous_buffer=*/std::false_type{});
+ }
+
+ // Simplest check for view-in-buffer: loop through buffers and check each one.
+ auto Linear = [&](std::string_view view) {
+ for (std::string_view buffer : buffers) {
+ if (IsSubrangeOf(buffer, view)) {
+ buffer_containing_previous_view = buffer;
+ return true;
+ }
+ }
+ return false;
+ };
+
+ if (buffers.size() <= 32) {
+ // If there are few buffers to search through, sorting/binary search is not
+ // worthwhile. TODO(bkietz) benchmark this and get a less magic number here.
+ return CheckViews(Linear,
+ /*check_previous_buffer=*/std::true_type{});
+ }
+
+ auto DataPtrLess = [](std::string_view l, std::string_view r) {
+ return l.data() < r.data();
+ };
+
+ std::sort(buffers.begin(), buffers.end(), DataPtrLess);
+ bool non_overlapping =
+ buffers.end() !=
+ std::adjacent_find(buffers.begin(), buffers.end(),
+ [](std::string_view before, std::string_view after) {
+ return before.data() + before.size() <= after.data();
+ });
+ if (ARROW_PREDICT_FALSE(!non_overlapping)) {
+ // Using a binary search with overlapping buffers would not *uniquely* identify
+ // a potentially-containing buffer. Moreover this should be a fairly rare case
+ // so optimizing for it seems premature.
+ return CheckViews(Linear,
+ /*check_previous_buffer=*/std::true_type{});
+ }
+
+ // More sophisticated check for view-in-buffer: binary search through the buffers.
+ return CheckViews(
+ [&](std::string_view view) {
+ // Find the first buffer whose data starts after the data in view-
+ // only buffers *before* this could contain view. Since we've additionally
+ // checked that the buffers do not overlap, only the buffer *immediately before*
+ // this could contain view.
+ auto one_past_potential_super =
+ std::upper_bound(buffers.begin(), buffers.end(), view, DataPtrLess);
+
+ if (one_past_potential_super == buffers.begin()) return false;
+
+ auto potential_super = *(one_past_potential_super - 1);
+ if (!IsSubrangeOf(potential_super, view)) return false;
+
+ buffer_containing_previous_view = potential_super;
+ return true;
+ },
+ /*check_previous_buffer=*/std::true_type{});
+ }
+
template <typename ListType>
Status ValidateListLike(const ListType& type) {
const ArrayData& values = *data.child_data[0];
diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc
index 8ccc645046..68250f0288 100644
--- a/cpp/src/arrow/compare.cc
+++ b/cpp/src/arrow/compare.cc
@@ -727,19 +727,13 @@ class ScalarEqualsVisitor {
Status Visit(const DoubleScalar& left) { return CompareFloating(left); }
template <typename T>
- typename std::enable_if<std::is_base_of<BaseBinaryScalar, T>::value, Status>::type
+ enable_if_t<std::is_base_of<BaseBinaryScalar, T>::value, Status>
Visit(const T& left) {
const auto& right = checked_cast<const BaseBinaryScalar&>(right_);
result_ = internal::SharedPtrEquals(left.value, right.value);
return Status::OK();
}
- Status Visit(const BinaryViewScalar& left) {
- const auto& right = checked_cast<const BinaryViewScalar&>(right_);
- result_ = left.value == right.value;
- return Status::OK();
- }
-
Status Visit(const Decimal128Scalar& left) {
const auto& right = checked_cast<const Decimal128Scalar&>(right_);
result_ = left.value == right.value;
diff --git a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc
index 744f188908..523e20c4a7 100644
--- a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc
@@ -796,6 +796,9 @@ TEST(MakeStruct, Array) {
EXPECT_THAT(MakeStructor({i32, str}, {"i", "s"}),
ResultWith(Datum(*StructArray::Make({i32, str}, field_names))));
+ EXPECT_THAT(*MakeScalar("aa"), testing::Eq(StringScalar("aa")));
+ EXPECT_EQ(*MakeStructor({i32, MakeScalar("aa")}, {"i", "s"})->type(),
+ StructType({field("i", i32->type()), field("s", str->type())}));
// Scalars are broadcast to the length of the arrays
EXPECT_THAT(MakeStructor({i32, MakeScalar("aa")}, {"i", "s"}),
ResultWith(Datum(*StructArray::Make({i32, str}, field_names))));
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index 2498e7f562..b390a36b4c 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -47,7 +47,6 @@ namespace compute {
template <typename TestType>
class BaseTestStringKernels : public ::testing::Test {
protected:
- using OffsetType = typename TypeTraits<TestType>::OffsetType;
using ScalarType = typename TypeTraits<TestType>::ScalarType;
void CheckUnary(std::string func_name, std::string json_input,
@@ -97,7 +96,14 @@ class BaseTestStringKernels : public ::testing::Test {
}
std::shared_ptr<DataType> offset_type() {
- return TypeTraits<OffsetType>::type_singleton();
+ if constexpr (is_binary_view_like_type<TestType>::value) {
+ // Views do not have offsets, but Functions like binary_length
+ // will return the length as uint32
+ return uint32();
+ } else {
+ using OffsetType = typename TypeTraits<TestType>::OffsetType;
+ return TypeTraits<OffsetType>::type_singleton();
+ }
}
template <typename CType = const char*>
diff --git a/cpp/src/arrow/compute/kernels/vector_hash.cc b/cpp/src/arrow/compute/kernels/vector_hash.cc
index f2d4c29f0e..f9637a2f71 100644
--- a/cpp/src/arrow/compute/kernels/vector_hash.cc
+++ b/cpp/src/arrow/compute/kernels/vector_hash.cc
@@ -30,6 +30,7 @@
#include "arrow/compute/kernels/common.h"
#include "arrow/result.h"
#include "arrow/util/hashing.h"
+#include "arrow/util/unreachable.h"
namespace arrow {
@@ -261,7 +262,7 @@ class HashKernel : public KernelState {
// Base class for all "regular" hash kernel implementations
// (NullType has a separate implementation)
-template <typename Type, typename Scalar, typename Action,
+template <typename Type, typename Action, typename Scalar = typename Type::c_type,
bool with_error_status = Action::with_error_status>
class RegularHashKernel : public HashKernel {
public:
@@ -501,39 +502,13 @@ class DictionaryHashKernel : public HashKernel {
};
// ----------------------------------------------------------------------
-
-template <typename Type, typename Action, typename Enable = void>
-struct HashKernelTraits {};
-
-template <typename Type, typename Action>
-struct HashKernelTraits<Type, Action, enable_if_null<Type>> {
- using HashKernel = NullHashKernel<Action>;
-};
-
-template <typename Type, typename Action>
-struct HashKernelTraits<Type, Action, enable_if_has_c_type<Type>> {
- using HashKernel = RegularHashKernel<Type, typename Type::c_type, Action>;
-};
-
-template <typename Type, typename Action>
-struct HashKernelTraits<Type, Action, enable_if_has_string_view<Type>> {
- using HashKernel = RegularHashKernel<Type, std::string_view, Action>;
-};
-
-template <typename Type, typename Action>
-Result<std::unique_ptr<HashKernel>> HashInitImpl(KernelContext* ctx,
- const KernelInitArgs& args) {
- using HashKernelType = typename HashKernelTraits<Type, Action>::HashKernel;
- auto result = std::make_unique<HashKernelType>(args.inputs[0].GetSharedPtr(),
- args.options, ctx->memory_pool());
- RETURN_NOT_OK(result->Reset());
- return std::move(result);
-}
-
-template <typename Type, typename Action>
+template <typename HashKernel>
Result<std::unique_ptr<KernelState>> HashInit(KernelContext* ctx,
const KernelInitArgs& args) {
- return HashInitImpl<Type, Action>(ctx, args);
+ auto result = std::make_unique<HashKernel>(args.inputs[0].GetSharedPtr(), args.options,
+ ctx->memory_pool());
+ RETURN_NOT_OK(result->Reset());
+ return std::move(result);
}
template <typename Action>
@@ -542,22 +517,22 @@ KernelInit GetHashInit(Type::type type_id) {
// representation
switch (type_id) {
case Type::NA:
- return HashInit<NullType, Action>;
+ return HashInit<NullHashKernel<Action>>;
case Type::BOOL:
- return HashInit<BooleanType, Action>;
+ return HashInit<RegularHashKernel<BooleanType, Action>>;
case Type::INT8:
case Type::UINT8:
- return HashInit<UInt8Type, Action>;
+ return HashInit<RegularHashKernel<UInt8Type, Action>>;
case Type::INT16:
case Type::UINT16:
- return HashInit<UInt16Type, Action>;
+ return HashInit<RegularHashKernel<UInt16Type, Action>>;
case Type::INT32:
case Type::UINT32:
case Type::FLOAT:
case Type::DATE32:
case Type::TIME32:
case Type::INTERVAL_MONTHS:
- return HashInit<UInt32Type, Action>;
+ return HashInit<RegularHashKernel<UInt32Type, Action>>;
case Type::INT64:
case Type::UINT64:
case Type::DOUBLE:
@@ -566,22 +541,23 @@ KernelInit GetHashInit(Type::type type_id) {
case Type::TIMESTAMP:
case Type::DURATION:
case Type::INTERVAL_DAY_TIME:
- return HashInit<UInt64Type, Action>;
+ return HashInit<RegularHashKernel<UInt64Type, Action>>;
case Type::BINARY:
case Type::STRING:
- return HashInit<BinaryType, Action>;
+ case Type::BINARY_VIEW:
+ case Type::STRING_VIEW:
+ return HashInit<RegularHashKernel<BinaryType, Action, std::string_view>>;
case Type::LARGE_BINARY:
case Type::LARGE_STRING:
- return HashInit<LargeBinaryType, Action>;
+ return HashInit<RegularHashKernel<LargeBinaryType, Action, std::string_view>>;
case Type::FIXED_SIZE_BINARY:
case Type::DECIMAL128:
case Type::DECIMAL256:
- return HashInit<FixedSizeBinaryType, Action>;
+ return HashInit<RegularHashKernel<FixedSizeBinaryType, Action, std::string_view>>;
case Type::INTERVAL_MONTH_DAY_NANO:
- return HashInit<MonthDayNanoIntervalType, Action>;
+ return HashInit<RegularHashKernel<MonthDayNanoIntervalType, Action>>;
default:
- DCHECK(false);
- return nullptr;
+ Unreachable("non hashable type");
}
}
@@ -591,31 +567,11 @@ template <typename Action>
Result<std::unique_ptr<KernelState>> DictionaryHashInit(KernelContext* ctx,
const KernelInitArgs& args) {
const auto& dict_type = checked_cast<const DictionaryType&>(*args.inputs[0].type);
- Result<std::unique_ptr<HashKernel>> indices_hasher;
- switch (dict_type.index_type()->id()) {
- case Type::INT8:
- case Type::UINT8:
- indices_hasher = HashInitImpl<UInt8Type, Action>(ctx, args);
- break;
- case Type::INT16:
- case Type::UINT16:
- indices_hasher = HashInitImpl<UInt16Type, Action>(ctx, args);
- break;
- case Type::INT32:
- case Type::UINT32:
- indices_hasher = HashInitImpl<UInt32Type, Action>(ctx, args);
- break;
- case Type::INT64:
- case Type::UINT64:
- indices_hasher = HashInitImpl<UInt64Type, Action>(ctx, args);
- break;
- default:
- DCHECK(false) << "Unsupported dictionary index type";
- break;
- }
- RETURN_NOT_OK(indices_hasher);
- return std::make_unique<DictionaryHashKernel>(std::move(indices_hasher.ValueOrDie()),
- dict_type.value_type());
+ ARROW_ASSIGN_OR_RAISE(auto indices_hasher,
+ GetHashInit<Action>(dict_type.index_type()->id())(ctx, args));
+ return std::make_unique<DictionaryHashKernel>(
+ checked_pointer_cast<HashKernel>(std::move(indices_hasher)),
+ dict_type.value_type());
}
Status HashExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc
index bfe8a49a9e..aca767907c 100644
--- a/cpp/src/arrow/scalar.cc
+++ b/cpp/src/arrow/scalar.cc
@@ -226,13 +226,11 @@ struct ScalarValidateImpl {
Status Visit(const StringScalar& s) { return ValidateStringScalar(s); }
- Status Visit(const BinaryViewScalar& s) {
- return Status::NotImplemented("Binary view");
- }
+ Status Visit(const BinaryViewScalar& s) { return ValidateBinaryScalar(s); }
- Status Visit(const StringViewScalar& s) {
- return Status::NotImplemented("String view");
- }
+ Status Visit(const StringViewScalar& s) { return ValidateStringScalar(s); }
+
+ Status Visit(const LargeBinaryScalar& s) { return ValidateBinaryScalar(s); }
Status Visit(const LargeStringScalar& s) { return ValidateStringScalar(s); }
@@ -499,14 +497,8 @@ Status Scalar::ValidateFull() const {
return ScalarValidateImpl(/*full_validation=*/true).Validate(*this);
}
-BinaryScalar::BinaryScalar(std::string s)
- : BinaryScalar(Buffer::FromString(std::move(s))) {}
-
-LargeBinaryScalar::LargeBinaryScalar(std::string s)
- : LargeBinaryScalar(Buffer::FromString(std::move(s))) {}
-
-LargeStringScalar::LargeStringScalar(std::string s)
- : LargeStringScalar(Buffer::FromString(std::move(s))) {}
+BaseBinaryScalar::BaseBinaryScalar(std::string s, std::shared_ptr<DataType> type)
+ : BaseBinaryScalar(Buffer::FromString(std::move(s)), std::move(type)) {}
FixedSizeBinaryScalar::FixedSizeBinaryScalar(std::shared_ptr<Buffer> value,
std::shared_ptr<DataType> type,
diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h
index 9f41ad0975..6042f0b434 100644
--- a/cpp/src/arrow/scalar.h
+++ b/cpp/src/arrow/scalar.h
@@ -253,6 +253,8 @@ struct ARROW_EXPORT BaseBinaryScalar : public internal::PrimitiveScalarBase {
BaseBinaryScalar(std::shared_ptr<Buffer> value, std::shared_ptr<DataType> type)
: internal::PrimitiveScalarBase{std::move(type), true}, value(std::move(value)) {}
+
+ BaseBinaryScalar(std::string s, std::shared_ptr<DataType> type);
};
struct ARROW_EXPORT BinaryScalar : public BaseBinaryScalar {
@@ -262,7 +264,7 @@ struct ARROW_EXPORT BinaryScalar : public BaseBinaryScalar {
explicit BinaryScalar(std::shared_ptr<Buffer> value)
: BinaryScalar(std::move(value), binary()) {}
- explicit BinaryScalar(std::string s);
+ explicit BinaryScalar(std::string s) : BaseBinaryScalar(std::move(s), binary()) {}
BinaryScalar() : BinaryScalar(binary()) {}
};
@@ -274,6 +276,8 @@ struct ARROW_EXPORT StringScalar : public BinaryScalar {
explicit StringScalar(std::shared_ptr<Buffer> value)
: StringScalar(std::move(value), utf8()) {}
+ explicit StringScalar(std::string s) : BinaryScalar(std::move(s), utf8()) {}
+
StringScalar() : StringScalar(utf8()) {}
};
@@ -284,6 +288,9 @@ struct ARROW_EXPORT BinaryViewScalar : public BaseBinaryScalar {
explicit BinaryViewScalar(std::shared_ptr<Buffer> value)
: BinaryViewScalar(std::move(value), binary_view()) {}
+ explicit BinaryViewScalar(std::string s)
+ : BaseBinaryScalar(std::move(s), binary_view()) {}
+
BinaryViewScalar() : BinaryViewScalar(binary_view()) {}
std::string_view view() const override { return std::string_view(*this->value); }
@@ -296,6 +303,9 @@ struct ARROW_EXPORT StringViewScalar : public BinaryViewScalar {
explicit StringViewScalar(std::shared_ptr<Buffer> value)
: StringViewScalar(std::move(value), utf8_view()) {}
+ explicit StringViewScalar(std::string s)
+ : BinaryViewScalar(std::move(s), utf8_view()) {}
+
StringViewScalar() : StringViewScalar(utf8_view()) {}
};
@@ -309,7 +319,8 @@ struct ARROW_EXPORT LargeBinaryScalar : public BaseBinaryScalar {
explicit LargeBinaryScalar(std::shared_ptr<Buffer> value)
: LargeBinaryScalar(std::move(value), large_binary()) {}
- explicit LargeBinaryScalar(std::string s);
+ explicit LargeBinaryScalar(std::string s)
+ : BaseBinaryScalar(std::move(s), large_binary()) {}
LargeBinaryScalar() : LargeBinaryScalar(large_binary()) {}
};
@@ -321,7 +332,8 @@ struct ARROW_EXPORT LargeStringScalar : public LargeBinaryScalar {
explicit LargeStringScalar(std::shared_ptr<Buffer> value)
: LargeStringScalar(std::move(value), large_utf8()) {}
- explicit LargeStringScalar(std::string s);
+ explicit LargeStringScalar(std::string s)
+ : LargeBinaryScalar(std::move(s), large_utf8()) {}
LargeStringScalar() : LargeStringScalar(large_utf8()) {}
};
diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h
index fc319a6d10..4d29706829 100644
--- a/cpp/src/arrow/testing/gtest_util.h
+++ b/cpp/src/arrow/testing/gtest_util.h
@@ -177,12 +177,16 @@ using BaseBinaryArrowTypes =
::testing::Types<BinaryType, LargeBinaryType, StringType, LargeStringType>;
using BaseBinaryOrBinaryViewLikeArrowTypes =
- ::testing::Types<BinaryType, LargeBinaryType, StringType, LargeStringType>;
+ ::testing::Types<BinaryType, LargeBinaryType, BinaryViewType, StringType,
+ LargeStringType, StringViewType>;
using BinaryArrowTypes = ::testing::Types<BinaryType, LargeBinaryType>;
using StringArrowTypes = ::testing::Types<StringType, LargeStringType>;
+using StringOrStringViewArrowTypes =
+ ::testing::Types<StringType, LargeStringType, StringViewType>;
+
using ListArrowTypes = ::testing::Types<ListType, LargeListType>;
using UnionArrowTypes = ::testing::Types<SparseUnionType, DenseUnionType>;
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index f4e082b3f6..faa2eb2af0 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -114,8 +114,14 @@ struct ARROW_EXPORT DataTypeLayout {
std::vector<BufferSpec> buffers;
/// Whether this type expects an associated dictionary array.
bool has_dictionary = false;
+ /// If this is provided, the number of buffers expected is only lower-bounded by
+ /// buffers.size(). Buffers beyond this lower bound are expected to conform to
+ /// variadic_spec.
+ std::optional<BufferSpec> variadic_spec;
- explicit DataTypeLayout(std::vector<BufferSpec> v) : buffers(std::move(v)) {}
+ explicit DataTypeLayout(std::vector<BufferSpec> buffers,
+ std::optional<BufferSpec> variadic_spec = {})
+ : buffers(std::move(buffers)), variadic_spec(variadic_spec) {}
};
/// \brief Base class for all data types
@@ -701,7 +707,8 @@ class ARROW_EXPORT BinaryViewType : public DataType {
DataTypeLayout layout() const override {
return DataTypeLayout(
- {DataTypeLayout::Bitmap(), DataTypeLayout::FixedWidth(sizeof(StringHeader))});
+ {DataTypeLayout::Bitmap(), DataTypeLayout::FixedWidth(sizeof(StringHeader))},
+ DataTypeLayout::VariableWidth());
}
std::string ToString() const override;