You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by bk...@apache.org on 2022/11/29 21:03:07 UTC
[arrow] 02/02: add cast to/from string_view
This is an automated email from the ASF dual-hosted git repository.
bkietz pushed a commit to branch feature/format-string-view
in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 0eb21efb37458ba14427e3eef99c4d70848542d8
Author: Benjamin Kietzman <be...@gmail.com>
AuthorDate: Tue Nov 29 16:02:29 2022 -0500
add cast to/from string_view
---
cpp/src/arrow/array/data.cc | 13 +-
cpp/src/arrow/array/data.h | 21 +-
cpp/src/arrow/array/validate.cc | 62 +++--
cpp/src/arrow/compute/exec.cc | 3 +
.../arrow/compute/kernels/scalar_cast_internal.cc | 12 +-
.../arrow/compute/kernels/scalar_cast_internal.h | 3 -
.../arrow/compute/kernels/scalar_cast_numeric.cc | 4 +-
.../arrow/compute/kernels/scalar_cast_string.cc | 277 ++++++++++++++-------
cpp/src/arrow/compute/kernels/scalar_cast_test.cc | 66 +++--
9 files changed, 300 insertions(+), 161 deletions(-)
diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc
index 0cfa9fcd2e..eb8249077d 100644
--- a/cpp/src/arrow/array/data.cc
+++ b/cpp/src/arrow/array/data.cc
@@ -145,7 +145,7 @@ void ArraySpan::SetMembers(const ArrayData& data) {
}
this->offset = data.offset;
- for (int i = 0; i < static_cast<int>(data.buffers.size()); ++i) {
+ for (int i = 0; i < std::min(static_cast<int>(data.buffers.size()), 3); ++i) {
const std::shared_ptr<Buffer>& buffer = data.buffers[i];
// It is the invoker-of-kernels's responsibility to ensure that
// const buffers are not written to accidentally.
@@ -291,6 +291,17 @@ void ArraySpan::FillFromScalar(const Scalar& value) {
}
this->buffers[2].data = const_cast<uint8_t*>(data_buffer);
this->buffers[2].size = data_size;
+ } else if (type_id == Type::BINARY_VIEW || type_id == Type::STRING_VIEW) {
+ const auto& scalar = checked_cast<const BaseBinaryScalar&>(value);
+ this->buffers[1].data = reinterpret_cast<uint8_t*>(this->scratch_space);
+ if (scalar.is_valid) {
+ *reinterpret_cast<StringHeader*>(this->buffers[1].data) = {scalar.value->data(),
+ scalar.value->size()};
+ this->buffers[2].data = const_cast<uint8_t*>(scalar.value->data());
+ this->buffers[2].size = scalar.value->size();
+ } else {
+ *reinterpret_cast<StringHeader*>(this->buffers[1].data) = {};
+ }
} else if (type_id == Type::FIXED_SIZE_BINARY) {
const auto& scalar = checked_cast<const BaseBinaryScalar&>(value);
this->buffers[1].data = const_cast<uint8_t*>(scalar.value->data());
diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h
index e024483f66..b9991bd959 100644
--- a/cpp/src/arrow/array/data.h
+++ b/cpp/src/arrow/array/data.h
@@ -356,10 +356,10 @@ struct ARROW_EXPORT ArraySpan {
void SetSlice(int64_t offset, int64_t length) {
this->offset = offset;
this->length = length;
- if (this->type->id() != Type::NA) {
- this->null_count = kUnknownNullCount;
- } else {
+ if (this->type->id() == Type::NA) {
this->null_count = this->length;
+ } else if (this->MayHaveNulls()) {
+ this->null_count = kUnknownNullCount;
}
}
@@ -375,6 +375,21 @@ struct ARROW_EXPORT ArraySpan {
namespace internal {
+template <typename F>
+Status VisitSlices(ArraySpan input, int64_t slice_size, const F& f) {
+ int64_t num_slices = input.length / slice_size;
+ int64_t trailing_slice_size = input.length % slice_size;
+ int64_t offset = input.offset;
+
+ for (int64_t i = 0; i < num_slices; ++i) {
+ input.SetSlice(offset, slice_size);
+ ARROW_RETURN_NOT_OK(f(input));
+ offset += slice_size;
+ }
+ input.SetSlice(offset, trailing_slice_size);
+ return f(input);
+}
+
void FillZeroLengthArray(const DataType* type, ArraySpan* span);
/// Construct a zero-copy view of this ArrayData with the given type.
diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc
index 2b83c84b28..4b10951724 100644
--- a/cpp/src/arrow/array/validate.cc
+++ b/cpp/src/arrow/array/validate.cc
@@ -171,39 +171,29 @@ namespace {
struct UTF8DataValidator {
const ArrayData& data;
- Status Visit(const DataType&) { Unreachable("utf-8 validation of non string type"); }
-
- Status Visit(const StringViewType&) {
- util::InitializeUTF8();
-
- const auto* values = data.GetValues<StringHeader>(1);
- for (int64_t i = 0; i < data.length; ++i) {
- if (ARROW_PREDICT_FALSE(!util::ValidateUTF8(
- reinterpret_cast<const uint8_t*>(values[i].data()), values[i].size()))) {
- return Status::Invalid("Invalid UTF8 sequence at string index ", i);
- }
+ template <typename T>
+ Status Visit(const T&) {
+ if constexpr (std::is_same_v<T, StringType> || std::is_same_v<T, LargeStringType> ||
+ std::is_same_v<T, StringViewType>) {
+ util::InitializeUTF8();
+
+ int64_t i = 0;
+ return VisitArraySpanInline<T>(
+ data,
+ [&](std::string_view v) {
+ if (ARROW_PREDICT_FALSE(!util::ValidateUTF8(v))) {
+ return Status::Invalid("Invalid UTF8 sequence at string index ", i);
+ }
+ ++i;
+ return Status::OK();
+ },
+ [&]() {
+ ++i;
+ return Status::OK();
+ });
+ } else {
+ Unreachable("utf-8 validation of non string type");
}
- return Status::OK();
- }
-
- template <typename StringType>
- enable_if_string<StringType, Status> Visit(const StringType&) {
- util::InitializeUTF8();
-
- int64_t i = 0;
- return VisitArraySpanInline<StringType>(
- data,
- [&](std::string_view v) {
- if (ARROW_PREDICT_FALSE(!util::ValidateUTF8(v))) {
- return Status::Invalid("Invalid UTF8 sequence at string index ", i);
- }
- ++i;
- return Status::OK();
- },
- [&]() {
- ++i;
- return Status::OK();
- });
}
};
@@ -304,6 +294,14 @@ struct ValidateArrayImpl {
return Status::OK();
}
+ Status Visit(const StringViewType& type) {
+ RETURN_NOT_OK(ValidateBinaryView(type));
+ if (full_validation) {
+ RETURN_NOT_OK(ValidateUTF8(data));
+ }
+ return Status::OK();
+ }
+
Status Visit(const Date64Type& type) {
RETURN_NOT_OK(ValidateFixedWidthBuffers());
diff --git a/cpp/src/arrow/compute/exec.cc b/cpp/src/arrow/compute/exec.cc
index 8d3dcf0f2c..3b868cc716 100644
--- a/cpp/src/arrow/compute/exec.cc
+++ b/cpp/src/arrow/compute/exec.cc
@@ -232,6 +232,9 @@ void ComputeDataPreallocate(const DataType& type,
case Type::LARGE_LIST:
widths->emplace_back(64, /*added_length=*/1);
return;
+ case Type::BINARY_VIEW:
+ case Type::STRING_VIEW:
+ widths->emplace_back(8 * sizeof(StringHeader));
default:
break;
}
diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc b/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc
index 27a86135a6..8f5467ee84 100644
--- a/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc
@@ -170,12 +170,6 @@ Status CastFromNull(KernelContext* ctx, const ExecSpan& batch, ExecResult* out)
return Status::OK();
}
-Result<TypeHolder> ResolveOutputFromOptions(KernelContext* ctx,
- const std::vector<TypeHolder>&) {
- const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options;
- return options.to_type;
-}
-
/// You will see some of kernels with
///
/// kOutputTargetType
@@ -184,8 +178,10 @@ Result<TypeHolder> ResolveOutputFromOptions(KernelContext* ctx,
/// easiest initial way to get the requested cast type including the TimeUnit
/// to the kernel (which is needed to compute the output) was through
/// CastOptions
-
-OutputType kOutputTargetType(ResolveOutputFromOptions);
+OutputType kOutputTargetType([](KernelContext* ctx,
+ const std::vector<TypeHolder>&) -> Result<TypeHolder> {
+ return CastState::Get(ctx).to_type;
+});
Status ZeroCopyCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
// TODO(wesm): alternative strategy for zero copy casts after ARROW-16576
diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_internal.h b/cpp/src/arrow/compute/kernels/scalar_cast_internal.h
index 4d9afab199..bd8f41ea9f 100644
--- a/cpp/src/arrow/compute/kernels/scalar_cast_internal.h
+++ b/cpp/src/arrow/compute/kernels/scalar_cast_internal.h
@@ -71,9 +71,6 @@ void AddZeroCopyCast(Type::type in_type_id, InputType in_type, OutputType out_ty
CastFunction* func);
// OutputType::Resolver that returns a type the type from CastOptions
-Result<TypeHolder> ResolveOutputFromOptions(KernelContext* ctx,
- const std::vector<TypeHolder>& args);
-
ARROW_EXPORT extern OutputType kOutputTargetType;
// Add generic casts to out_ty from:
diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc b/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc
index 00c7cacf9c..e68b08c804 100644
--- a/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc
@@ -726,7 +726,7 @@ std::shared_ptr<CastFunction> GetCastToFloating(std::string name) {
}
std::shared_ptr<CastFunction> GetCastToDecimal128() {
- OutputType sig_out_ty(ResolveOutputFromOptions);
+ OutputType sig_out_ty = kOutputTargetType;
auto func = std::make_shared<CastFunction>("cast_decimal", Type::DECIMAL128);
AddCommonCasts(Type::DECIMAL128, sig_out_ty, func.get());
@@ -761,7 +761,7 @@ std::shared_ptr<CastFunction> GetCastToDecimal128() {
}
std::shared_ptr<CastFunction> GetCastToDecimal256() {
- OutputType sig_out_ty(ResolveOutputFromOptions);
+ OutputType sig_out_ty = kOutputTargetType;
auto func = std::make_shared<CastFunction>("cast_decimal256", Type::DECIMAL256);
AddCommonCasts(Type::DECIMAL256, sig_out_ty, func.get());
diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc
index 44e233f98c..68ba6268ae 100644
--- a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc
@@ -25,8 +25,10 @@
#include "arrow/compute/kernels/scalar_cast_internal.h"
#include "arrow/compute/kernels/temporal_internal.h"
#include "arrow/result.h"
+#include "arrow/util/cpu_info.h"
#include "arrow/util/formatting.h"
#include "arrow/util/int_util.h"
+#include "arrow/util/unreachable.h"
#include "arrow/util/utf8_internal.h"
#include "arrow/visit_data_inline.h"
@@ -284,107 +286,192 @@ Status CastBinaryToBinaryOffsets<int64_t, int32_t>(KernelContext* ctx,
}
template <typename O, typename I>
-enable_if_base_binary<I, Status> BinaryToBinaryCastExec(KernelContext* ctx,
- const ExecSpan& batch,
- ExecResult* out) {
- const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options;
+Status BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch,
+ ExecResult* out) {
const ArraySpan& input = batch[0].array;
- if (!I::is_utf8 && O::is_utf8 && !options.allow_invalid_utf8) {
+ // This presupposes that one was created in the invocation layer
+ ArrayData* output = out->array_data().get();
+ output->SetNullCount(input.null_count);
+
+ const auto& options = CastState::Get(ctx);
+ bool check_utf8 = !I::is_utf8 && O::is_utf8 && !options.allow_invalid_utf8;
+ if (check_utf8) {
InitializeUTF8();
- ArraySpanVisitor<I> visitor;
- Utf8Validator validator;
- RETURN_NOT_OK(visitor.Visit(input, &validator));
}
- // Start with a zero-copy cast, but change indices to expected size
- RETURN_NOT_OK(ZeroCopyCastExec(ctx, batch, out));
- return CastBinaryToBinaryOffsets<typename I::offset_type, typename O::offset_type>(
- ctx, input, out->array_data().get());
-}
+ auto SimpleUtf8Validation = [&] {
+ if (check_utf8) {
+ Utf8Validator validator;
+ return ArraySpanVisitor<I>::Visit(input, &validator);
+ }
+ return Status::OK();
+ };
-template <typename O, typename I>
-enable_if_t<std::is_same<I, FixedSizeBinaryType>::value &&
- !std::is_same<O, FixedSizeBinaryType>::value,
- Status>
-BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
- const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options;
- const ArraySpan& input = batch[0].array;
+ constexpr bool kInputOffsets =
+ std::is_base_of_v<BinaryType, I> || std::is_base_of_v<LargeBinaryType, I>;
- if (O::is_utf8 && !options.allow_invalid_utf8) {
- InitializeUTF8();
- ArraySpanVisitor<I> visitor;
- Utf8Validator validator;
- RETURN_NOT_OK(visitor.Visit(input, &validator));
+ constexpr bool kInputViews = std::is_base_of_v<BinaryViewType, I>;
+
+ constexpr bool kInputFixed = std::is_same_v<FixedSizeBinaryType, I>;
+
+ constexpr bool kOutputOffsets =
+ std::is_base_of_v<BinaryType, O> || std::is_base_of_v<LargeBinaryType, O>;
+
+ constexpr bool kOutputViews = std::is_base_of_v<BinaryViewType, O>;
+
+ constexpr bool kOutputFixed = std::is_same_v<FixedSizeBinaryType, O>;
+
+ if constexpr (kInputOffsets && kOutputOffsets) {
+ // FIXME(bkietz) this discards preallocated storage. It seems preferable to me to
+ // allocate a new null bitmap if necessary than to always allocate new offsets.
+ // Start with a zero-copy cast, but change indices to expected size
+ RETURN_NOT_OK(SimpleUtf8Validation());
+ RETURN_NOT_OK(ZeroCopyCastExec(ctx, batch, out));
+ return CastBinaryToBinaryOffsets<typename I::offset_type, typename O::offset_type>(
+ ctx, input, out->array_data().get());
}
- // Check for overflow
- using output_offset_type = typename O::offset_type;
- constexpr output_offset_type kMaxOffset =
- std::numeric_limits<output_offset_type>::max();
- const int32_t width = input.type->byte_width();
- const int64_t max_offset = width * input.length;
- if (max_offset > kMaxOffset) {
- return Status::Invalid("Failed casting from ", input.type->ToString(), " to ",
- out->type()->ToString(), ": input array too large");
+ if constexpr (kInputViews && kOutputViews) {
+ return SimpleUtf8Validation() & ZeroCopyCastExec(ctx, batch, out);
}
- // This presupposes that one was created in the invocation layer
- ArrayData* output = out->array_data().get();
+ if constexpr (kInputViews && kOutputOffsets) {
+ // FIXME(bkietz) this discards preallocated offset storage
+ typename TypeTraits<O>::BuilderType builder{ctx->memory_pool()};
- // Copy buffers over, then generate indices
- output->length = input.length;
- output->SetNullCount(input.null_count);
- if (input.offset == output->offset) {
- output->buffers[0] = input.GetBuffer(0);
- } else {
- ARROW_ASSIGN_OR_RAISE(
- output->buffers[0],
- arrow::internal::CopyBitmap(ctx->memory_pool(), input.buffers[0].data,
- input.offset, input.length));
- }
+ RETURN_NOT_OK(builder.Reserve(input.length));
+ // TODO(bkietz) if ArraySpan::buffers were a SmallVector, we could have access to all
+ // the character data buffers here and reserve character data accordingly.
+
+ // sweep through L1-sized chunks to reduce the frequency of allocation
+ int64_t chunk_size = ctx->exec_context()->cpu_info()->CacheSize(
+ ::arrow::internal::CpuInfo::CacheLevel::L1) /
+ sizeof(StringHeader) / 4;
+
+ RETURN_NOT_OK(::arrow::internal::VisitSlices(
+ input, chunk_size, [&](const ArraySpan& input_slice) {
+ int64_t num_chars = builder.value_data_length(), num_appended_chars = 0;
+ VisitArraySpanInline<I>(
+ input_slice,
+ [&](std::string_view v) {
+ num_appended_chars += static_cast<int64_t>(v.size());
+ },
+ [] {});
+
+ RETURN_NOT_OK(builder.ReserveData(num_appended_chars));
+
+ VisitArraySpanInline<I>(
+ input_slice, [&](std::string_view v) { builder.UnsafeAppend(v); },
+ [&] { builder.UnsafeAppendNull(); });
+
+ if (check_utf8) {
+ if (ARROW_PREDICT_FALSE(!ValidateUTF8Inline(builder.value_data() + num_chars,
+ num_appended_chars))) {
+ return Status::Invalid("Invalid UTF8 sequence");
+ }
+ }
+ return Status::OK();
+ }));
- // This buffer is preallocated
- output_offset_type* offsets = output->GetMutableValues<output_offset_type>(1);
- offsets[0] = static_cast<output_offset_type>(input.offset * width);
- for (int64_t i = 0; i < input.length; i++) {
- offsets[i + 1] = offsets[i] + width;
+ return builder.FinishInternal(std::get_if<std::shared_ptr<ArrayData>>(&out->value));
}
- // Data buffer (index 1) for FWBinary becomes data buffer for VarBinary
- // (index 2). After ARROW-16757, we need to copy this memory instead of
- // zero-copy it because a Scalar value promoted to an ArraySpan may be
- // referencing a temporary buffer whose scope does not extend beyond the
- // kernel execution. In that scenario, the validity bitmap above can be
- // zero-copied because it points to static memory (either a byte with a 1 or
- // a 0 depending on whether the value is null or not).
- std::shared_ptr<Buffer> input_data = input.GetBuffer(1);
- if (input_data != nullptr) {
- ARROW_ASSIGN_OR_RAISE(output->buffers[2], input_data->CopySlice(0, input_data->size(),
- ctx->memory_pool()));
- } else {
- // TODO(wesm): it should already be nullptr, so we may be able to remove
- // this
- output->buffers[2] = nullptr;
+ if constexpr ((kInputOffsets || kInputFixed) && kOutputViews) {
+ // we can reuse the data buffer here and just add views which reference it
+ if (input.MayHaveNulls()) {
+ ARROW_ASSIGN_OR_RAISE(
+ output->buffers[0],
+ arrow::internal::CopyBitmap(ctx->memory_pool(), input.buffers[0].data,
+ input.offset, input.length));
+ }
+ // FIXME(bkietz) segfault due to null buffer owner
+ // output->buffers[2] = input.GetBuffer(kInputFixed ? 1 : 2);
+
+ auto* headers = output->buffers[1]->mutable_data_as<StringHeader>();
+ if (check_utf8) {
+ Utf8Validator validator;
+ return VisitArraySpanInline<I>(
+ input,
+ [&](std::string_view v) {
+ *headers++ = StringHeader{v};
+ return validator.VisitValue(v);
+ },
+ [&] {
+ *headers++ = StringHeader{};
+ return Status::OK();
+ });
+ } else {
+ VisitArraySpanInline<I>(
+ input, [&](std::string_view v) { *headers++ = StringHeader{v}; },
+ [&] { *headers++ = StringHeader{}; });
+ return Status::OK();
+ }
}
- return Status::OK();
-}
+ if constexpr (kInputFixed && kOutputOffsets) {
+ RETURN_NOT_OK(SimpleUtf8Validation());
-template <typename O, typename I>
-enable_if_t<std::is_same<I, FixedSizeBinaryType>::value &&
- std::is_same<O, FixedSizeBinaryType>::value,
- Status>
-BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
- const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options;
- const int32_t in_width = batch[0].type()->byte_width();
- const int32_t out_width =
- checked_cast<const FixedSizeBinaryType&>(*options.to_type).byte_width();
- if (in_width != out_width) {
- return Status::Invalid("Failed casting from ", batch[0].type()->ToString(), " to ",
- options.to_type.ToString(), ": widths must match");
+ using output_offset_type = typename O::offset_type;
+
+ int32_t width = input.type->byte_width();
+
+ if constexpr (std::is_same_v<output_offset_type, int32_t>) {
+ // Check for overflow
+ if (width * input.length > std::numeric_limits<int32_t>::max()) {
+ return Status::Invalid("Failed casting from ", input.type->ToString(), " to ",
+ out->type()->ToString(), ": input array too large");
+ }
+ }
+
+ // Copy buffers over, then generate indices
+ output->length = input.length;
+ output->SetNullCount(input.null_count);
+ if (input.offset == output->offset) {
+ output->buffers[0] = input.GetBuffer(0);
+ } else {
+ ARROW_ASSIGN_OR_RAISE(
+ output->buffers[0],
+ arrow::internal::CopyBitmap(ctx->memory_pool(), input.buffers[0].data,
+ input.offset, input.length));
+ }
+
+ // This buffer is preallocated
+ auto* offsets = output->buffers[1]->mutable_data_as<output_offset_type>();
+ offsets[0] = static_cast<output_offset_type>(input.offset * width);
+ for (int64_t i = 0; i < input.length; i++) {
+ offsets[i + 1] = offsets[i] + width;
+ }
+
+ // Data buffer (index 1) for FWBinary becomes data buffer for VarBinary
+ // (index 2). After ARROW-16757, we need to copy this memory instead of
+ // zero-copy it because a Scalar value promoted to an ArraySpan may be
+ // referencing a temporary buffer whose scope does not extend beyond the
+ // kernel execution. In that scenario, the validity bitmap above can be
+ // zero-copied because it points to static memory (either a byte with a 1 or
+ // a 0 depending on whether the value is null or not).
+ if (std::shared_ptr<Buffer> input_data = input.GetBuffer(1)) {
+ ARROW_ASSIGN_OR_RAISE(
+ output->buffers[2],
+ input_data->CopySlice(0, input_data->size(), ctx->memory_pool()));
+ } else {
+ // TODO(wesm): it should already be nullptr, so we may be able to remove
+ // this
+ output->buffers[2] = nullptr;
+ }
+
+ return Status::OK();
}
- return ZeroCopyCastExec(ctx, batch, out);
+
+ if constexpr (kInputFixed && kOutputFixed) {
+ if (input.type->byte_width() != output->type->byte_width()) {
+ return Status::Invalid("Failed casting from ", input.type->ToString(), " to ",
+ output->type->ToString(), ": widths must match");
+ }
+ return ZeroCopyCastExec(ctx, batch, out);
+ }
+
+ Unreachable();
}
#if defined(_MSC_VER)
@@ -447,6 +534,8 @@ template <typename OutType>
void AddBinaryToBinaryCast(CastFunction* func) {
AddBinaryToBinaryCast<OutType, StringType>(func);
AddBinaryToBinaryCast<OutType, BinaryType>(func);
+ AddBinaryToBinaryCast<OutType, StringViewType>(func);
+ AddBinaryToBinaryCast<OutType, BinaryViewType>(func);
AddBinaryToBinaryCast<OutType, LargeStringType>(func);
AddBinaryToBinaryCast<OutType, LargeBinaryType>(func);
AddBinaryToBinaryCast<OutType, FixedSizeBinaryType>(func);
@@ -459,6 +548,11 @@ std::vector<std::shared_ptr<CastFunction>> GetBinaryLikeCasts() {
AddCommonCasts(Type::BINARY, binary(), cast_binary.get());
AddBinaryToBinaryCast<BinaryType>(cast_binary.get());
+ auto cast_binary_view =
+ std::make_shared<CastFunction>("cast_binary_view", Type::BINARY_VIEW);
+ AddCommonCasts(Type::BINARY_VIEW, binary_view(), cast_binary_view.get());
+ AddBinaryToBinaryCast<BinaryViewType>(cast_binary_view.get());
+
auto cast_large_binary =
std::make_shared<CastFunction>("cast_large_binary", Type::LARGE_BINARY);
AddCommonCasts(Type::LARGE_BINARY, large_binary(), cast_large_binary.get());
@@ -471,6 +565,14 @@ std::vector<std::shared_ptr<CastFunction>> GetBinaryLikeCasts() {
AddTemporalToStringCasts<StringType>(cast_string.get());
AddBinaryToBinaryCast<StringType>(cast_string.get());
+ auto cast_string_view =
+ std::make_shared<CastFunction>("cast_string_view", Type::STRING_VIEW);
+ AddCommonCasts(Type::STRING_VIEW, utf8_view(), cast_string_view.get());
+ AddNumberToStringCasts<StringViewType>(cast_string_view.get());
+ AddDecimalToStringCasts<StringViewType>(cast_string_view.get());
+ AddTemporalToStringCasts<StringViewType>(cast_string_view.get());
+ AddBinaryToBinaryCast<StringViewType>(cast_string_view.get());
+
auto cast_large_string =
std::make_shared<CastFunction>("cast_large_string", Type::LARGE_STRING);
AddCommonCasts(Type::LARGE_STRING, large_utf8(), cast_large_string.get());
@@ -481,15 +583,16 @@ std::vector<std::shared_ptr<CastFunction>> GetBinaryLikeCasts() {
auto cast_fsb =
std::make_shared<CastFunction>("cast_fixed_size_binary", Type::FIXED_SIZE_BINARY);
- AddCommonCasts(Type::FIXED_SIZE_BINARY, OutputType(ResolveOutputFromOptions),
- cast_fsb.get());
+ AddCommonCasts(Type::FIXED_SIZE_BINARY, kOutputTargetType, cast_fsb.get());
DCHECK_OK(cast_fsb->AddKernel(
- Type::FIXED_SIZE_BINARY, {InputType(Type::FIXED_SIZE_BINARY)},
- OutputType(FirstType),
+ Type::FIXED_SIZE_BINARY, {InputType(Type::FIXED_SIZE_BINARY)}, kOutputTargetType,
BinaryToBinaryCastExec<FixedSizeBinaryType, FixedSizeBinaryType>,
NullHandling::COMPUTED_NO_PREALLOCATE));
- return {cast_binary, cast_large_binary, cast_string, cast_large_string, cast_fsb};
+ return {
+ cast_binary, cast_binary_view, cast_large_binary, cast_string,
+ cast_string_view, cast_large_string, cast_fsb,
+ };
}
} // namespace internal
diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
index 85da81357b..9c10b85b3c 100644
--- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
@@ -145,7 +145,7 @@ static std::shared_ptr<Array> MaskArrayWithNullsAt(std::shared_ptr<Array> input,
using arrow::internal::Bitmap;
Bitmap is_valid(masked->buffers[0], 0, input->length());
- if (auto original = input->null_bitmap()) {
+ if (const auto& original = input->null_bitmap()) {
is_valid.CopyFrom(Bitmap(original, input->offset(), input->length()));
} else {
is_valid.SetBitsTo(true);
@@ -154,7 +154,7 @@ static std::shared_ptr<Array> MaskArrayWithNullsAt(std::shared_ptr<Array> input,
for (int i : indices_to_mask) {
is_valid.SetBitTo(i, false);
}
- return MakeArray(masked);
+ return MakeArray(std::move(masked));
}
TEST(Cast, CanCast) {
@@ -167,6 +167,9 @@ TEST(Cast, CanCast) {
}
};
+ ExpectCanCast(boolean(), {utf8()});
+ return;
+
auto ExpectCannotCast = [ExpectCanCast](std::shared_ptr<DataType> from,
std::vector<std::shared_ptr<DataType>> to_set) {
ExpectCanCast(from, to_set, /*expected=*/false);
@@ -198,17 +201,21 @@ TEST(Cast, CanCast) {
ExpectCannotCast(from_numeric, {null()});
}
- for (auto from_base_binary : kBaseBinaryTypes) {
- ExpectCanCast(from_base_binary, {boolean()});
- ExpectCanCast(from_base_binary, kNumericTypes);
- ExpectCanCast(from_base_binary, kBaseBinaryTypes);
- ExpectCanCast(dictionary(int64(), from_base_binary), {from_base_binary});
+ auto base_binary_and_view_types = kBaseBinaryTypes;
+ base_binary_and_view_types.push_back(binary_view());
+ base_binary_and_view_types.push_back(utf8_view());
+
+ for (auto from : base_binary_and_view_types) {
+ ExpectCanCast(from, {boolean()});
+ ExpectCanCast(from, kNumericTypes);
+ ExpectCanCast(from, base_binary_and_view_types);
+ ExpectCanCast(dictionary(int64(), from), {from});
// any cast which is valid for the dictionary is valid for the DictionaryArray
- ExpectCanCast(dictionary(uint32(), from_base_binary), kBaseBinaryTypes);
- ExpectCanCast(dictionary(int16(), from_base_binary), kNumericTypes);
+ ExpectCanCast(dictionary(uint32(), from), kBaseBinaryTypes);
+ ExpectCanCast(dictionary(int16(), from), kNumericTypes);
- ExpectCannotCast(from_base_binary, {null()});
+ ExpectCannotCast(from, {null()});
}
ExpectCanCast(utf8(), {timestamp(TimeUnit::MILLI)});
@@ -1029,7 +1036,7 @@ TEST(Cast, DecimalToFloating) {
}
TEST(Cast, DecimalToString) {
- for (auto string_type : {utf8(), large_utf8()}) {
+ for (auto string_type : {utf8(), large_utf8(), utf8_view()}) {
for (auto decimal_type : {decimal128(5, 2), decimal256(5, 2)}) {
CheckCast(ArrayFromJSON(decimal_type, R"(["0.00", null, "123.45", "999.99"])"),
ArrayFromJSON(string_type, R"(["0.00", null, "123.45", "999.99"])"));
@@ -1540,7 +1547,7 @@ TEST(Cast, TimeZeroCopy) {
}
TEST(Cast, DateToString) {
- for (auto string_type : {utf8(), large_utf8()}) {
+ for (auto string_type : {utf8(), large_utf8(), utf8_view()}) {
CheckCast(ArrayFromJSON(date32(), "[0, null]"),
ArrayFromJSON(string_type, R"(["1970-01-01", null])"));
CheckCast(ArrayFromJSON(date64(), "[86400000, null]"),
@@ -1549,7 +1556,7 @@ TEST(Cast, DateToString) {
}
TEST(Cast, TimeToString) {
- for (auto string_type : {utf8(), large_utf8()}) {
+ for (auto string_type : {utf8(), large_utf8(), utf8_view()}) {
CheckCast(ArrayFromJSON(time32(TimeUnit::SECOND), "[1, 62]"),
ArrayFromJSON(string_type, R"(["00:00:01", "00:01:02"])"));
CheckCast(
@@ -1559,7 +1566,7 @@ TEST(Cast, TimeToString) {
}
TEST(Cast, TimestampToString) {
- for (auto string_type : {utf8(), large_utf8()}) {
+ for (auto string_type : {utf8(), large_utf8(), utf8_view()}) {
CheckCast(
ArrayFromJSON(timestamp(TimeUnit::SECOND), "[-30610224000, -5364662400]"),
ArrayFromJSON(string_type, R"(["1000-01-01 00:00:00", "1800-01-01 00:00:00"])"));
@@ -1585,7 +1592,7 @@ TEST(Cast, TimestampToString) {
}
TEST_F(CastTimezone, TimestampWithZoneToString) {
- for (auto string_type : {utf8(), large_utf8()}) {
+ for (auto string_type : {utf8(), large_utf8(), utf8_view()}) {
CheckCast(
ArrayFromJSON(timestamp(TimeUnit::SECOND, "UTC"), "[-30610224000, -5364662400]"),
ArrayFromJSON(string_type,
@@ -1771,7 +1778,7 @@ TEST(Cast, DurationToDurationMultiplyOverflow) {
}
TEST(Cast, DurationToString) {
- for (auto string_type : {utf8(), large_utf8()}) {
+ for (auto string_type : {utf8(), large_utf8(), utf8_view()}) {
for (auto unit : TimeUnit::values()) {
CheckCast(ArrayFromJSON(duration(unit), "[0, null, 1234567, 2000]"),
ArrayFromJSON(string_type, R"(["0", null, "1234567", "2000"])"));
@@ -2008,6 +2015,10 @@ TEST(Cast, StringToTimestamp) {
}
static void AssertBinaryZeroCopy(std::shared_ptr<Array> lhs, std::shared_ptr<Array> rhs) {
+ for (auto id : {lhs->type_id(), rhs->type_id()}) {
+ // views cannot be zero copied
+ if (id == Type::BINARY_VIEW || id == Type::STRING_VIEW) return;
+ }
// null bitmap and data buffers are always zero-copied
AssertBufferSame(*lhs, *rhs, 0);
AssertBufferSame(*lhs, *rhs, 2);
@@ -2031,8 +2042,9 @@ static void AssertBinaryZeroCopy(std::shared_ptr<Array> lhs, std::shared_ptr<Arr
}
TEST(Cast, BinaryToString) {
- for (auto bin_type : {binary(), large_binary()}) {
- for (auto string_type : {utf8(), large_utf8()}) {
+ for (auto bin_type : {binary(), large_binary(), binary_view()}) {
+ for (auto string_type : {utf8(), large_utf8(), utf8_view()}) {
+ ARROW_SCOPED_TRACE(*bin_type, " to ", *string_type);
// empty -> empty always works
CheckCast(ArrayFromJSON(bin_type, "[]"), ArrayFromJSON(string_type, "[]"));
@@ -2050,13 +2062,14 @@ TEST(Cast, BinaryToString) {
options.allow_invalid_utf8 = true;
ASSERT_OK_AND_ASSIGN(auto strings, Cast(*invalid_utf8, string_type, options));
ASSERT_RAISES(Invalid, strings->ValidateFull());
+
AssertBinaryZeroCopy(invalid_utf8, strings);
}
}
auto from_type = fixed_size_binary(3);
auto invalid_utf8 = FixedSizeInvalidUtf8(from_type);
- for (auto string_type : {utf8(), large_utf8()}) {
+ for (auto string_type : {utf8(), large_utf8(), utf8_view()}) {
CheckCast(ArrayFromJSON(from_type, "[]"), ArrayFromJSON(string_type, "[]"));
// invalid utf-8 masked by a null bit is not an error
@@ -2075,9 +2088,12 @@ TEST(Cast, BinaryToString) {
// N.B. null buffer is not always the same if input sliced
AssertBufferSame(*invalid_utf8, *strings, 0);
- // ARROW-16757: we no longer zero copy, but the contents are equal
- ASSERT_NE(invalid_utf8->data()->buffers[1].get(), strings->data()->buffers[2].get());
- ASSERT_TRUE(invalid_utf8->data()->buffers[1]->Equals(*strings->data()->buffers[2]));
+ if (string_type->id() != Type::STRING_VIEW) {
+ // ARROW-16757: we no longer zero copy, but the contents are equal
+ ASSERT_NE(invalid_utf8->data()->buffers[1].get(),
+ strings->data()->buffers[2].get());
+ ASSERT_TRUE(invalid_utf8->data()->buffers[1]->Equals(*strings->data()->buffers[2]));
+ }
}
}
@@ -2146,7 +2162,7 @@ TEST(Cast, StringToString) {
}
TEST(Cast, IntToString) {
- for (auto string_type : {utf8(), large_utf8()}) {
+ for (auto string_type : {utf8(), large_utf8(), utf8_view()}) {
CheckCast(ArrayFromJSON(int8(), "[0, 1, 127, -128, null]"),
ArrayFromJSON(string_type, R"(["0", "1", "127", "-128", null])"));
@@ -2178,7 +2194,7 @@ TEST(Cast, IntToString) {
}
TEST(Cast, FloatingToString) {
- for (auto string_type : {utf8(), large_utf8()}) {
+ for (auto string_type : {utf8(), large_utf8(), utf8_view()}) {
CheckCast(
ArrayFromJSON(float32(), "[0.0, -0.0, 1.5, -Inf, Inf, NaN, null]"),
ArrayFromJSON(string_type, R"(["0", "-0", "1.5", "-inf", "inf", "nan", null])"));
@@ -2190,7 +2206,7 @@ TEST(Cast, FloatingToString) {
}
TEST(Cast, BooleanToString) {
- for (auto string_type : {utf8(), large_utf8()}) {
+ for (auto string_type : {utf8(), large_utf8(), utf8_view()}) {
CheckCast(ArrayFromJSON(boolean(), "[true, true, false, null]"),
ArrayFromJSON(string_type, R"(["true", "true", "false", null])"));
}