You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by bk...@apache.org on 2022/11/30 14:50:45 UTC
[arrow] 01/15: Draft basic scaffolding for Binary/StringView types and get compiling
This is an automated email from the ASF dual-hosted git repository.
bkietz pushed a commit to branch feature/format-string-view
in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 8e1c1442feebe9af2db607e50abd4b9bb900e3fb
Author: Wes McKinney <we...@apache.org>
AuthorDate: Fri Sep 9 16:35:27 2022 -0500
Draft basic scaffolding for Binary/StringView types and get compiling
---
LICENSE.txt | 16 +-
cpp/src/arrow/array/array_base.cc | 4 +
cpp/src/arrow/array/array_binary.cc | 12 +
cpp/src/arrow/array/array_binary.h | 58 +++++
cpp/src/arrow/array/builder_binary.cc | 86 +++++++
cpp/src/arrow/array/builder_binary.h | 248 +++++++++++++++++++++
cpp/src/arrow/array/builder_dict.cc | 6 +
cpp/src/arrow/array/builder_dict.h | 10 +
cpp/src/arrow/array/concatenate.cc | 4 +
cpp/src/arrow/array/util.cc | 13 ++
cpp/src/arrow/array/validate.cc | 20 +-
cpp/src/arrow/compare.cc | 13 +-
cpp/src/arrow/ipc/feather.cc | 4 +-
cpp/src/arrow/ipc/metadata_internal.cc | 10 +
cpp/src/arrow/ipc/reader.cc | 5 +
cpp/src/arrow/ipc/writer.cc | 4 +
cpp/src/arrow/json/test_common.h | 10 +-
cpp/src/arrow/scalar.cc | 14 ++
cpp/src/arrow/scalar.h | 29 +++
cpp/src/arrow/testing/json_internal.cc | 10 +-
cpp/src/arrow/type.cc | 16 +-
cpp/src/arrow/type.h | 46 ++++
cpp/src/arrow/type_fwd.h | 21 ++
cpp/src/arrow/type_test.cc | 12 +
cpp/src/arrow/type_traits.h | 57 ++++-
cpp/src/arrow/util/string_header.h | 219 ++++++++++++++++++
cpp/src/arrow/visitor.cc | 8 +-
cpp/src/arrow/visitor.h | 6 +
cpp/src/arrow/visitor_generate.h | 2 +
cpp/src/parquet/column_writer.cc | 1 +
python/pyarrow/src/arrow/python/arrow_to_pandas.cc | 38 +---
python/pyarrow/src/arrow/python/python_to_arrow.cc | 23 +-
32 files changed, 974 insertions(+), 51 deletions(-)
diff --git a/LICENSE.txt b/LICENSE.txt
index 86cfaf546c..d282bfe7b3 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -1965,7 +1965,7 @@ This project includes code from the autobrew project.
The following files are based on code from the autobrew project:
* r/tools/autobrew
* dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb
-* dev/tasks/homebrew-formulae/autobrew/apache-arrow-static.rb
+* dev/tasks/homebrew-formulae/autobrew/apache-arrow-static.rb
Copyright (c) 2019, Jeroen Ooms
License: MIT
@@ -2047,6 +2047,20 @@ License: http://www.apache.org/licenses/LICENSE-2.0
--------------------------------------------------------------------------------
+This project includes code from Velox.
+
+ * cpp/src/arrow/util/string_header.h
+
+is based on Velox's
+
+ * velox/type/StringView.h
+
+Copyright: Copyright (c) Facebook, Inc. and its affiliates.
+Home page: https://github.com/facebookincubator/velox
+License: http://www.apache.org/licenses/LICENSE-2.0
+
+--------------------------------------------------------------------------------
+
The file cpp/src/arrow/vendored/musl/strptime.c has the following license
Copyright © 2005-2020 Rich Felker, et al.
diff --git a/cpp/src/arrow/array/array_base.cc b/cpp/src/arrow/array/array_base.cc
index 5d27b2aedf..de9ab2e985 100644
--- a/cpp/src/arrow/array/array_base.cc
+++ b/cpp/src/arrow/array/array_base.cc
@@ -82,6 +82,10 @@ struct ScalarFromArraySlotImpl {
return Finish(a.GetString(index_));
}
+ // Placeholder overload for binary-view arrays: the array argument is not
+ // inspected and NotImplemented is always returned.
+ Status Visit(const BinaryViewArray& a) {
+ return Status::NotImplemented("ScalarFromArraySlot -> BinaryView");
+ }
+
Status Visit(const FixedSizeBinaryArray& a) { return Finish(a.GetString(index_)); }
Status Visit(const DayTimeIntervalArray& a) { return Finish(a.Value(index_)); }
diff --git a/cpp/src/arrow/array/array_binary.cc b/cpp/src/arrow/array/array_binary.cc
index 9466b5a48f..cfc467160a 100644
--- a/cpp/src/arrow/array/array_binary.cc
+++ b/cpp/src/arrow/array/array_binary.cc
@@ -89,6 +89,18 @@ LargeStringArray::LargeStringArray(int64_t length,
Status LargeStringArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); }
+// Wraps existing ArrayData. The data's type id is checked against
+// Type::BINARY_VIEW before the data is installed with SetData().
+BinaryViewArray::BinaryViewArray(const std::shared_ptr<ArrayData>& data) {
+ ARROW_CHECK_EQ(data->type->id(), Type::BINARY_VIEW);
+ SetData(data);
+}
+
+// Wraps existing ArrayData. The data's type id is checked against
+// Type::STRING_VIEW before the data is installed with SetData().
+StringViewArray::StringViewArray(const std::shared_ptr<ArrayData>& data) {
+ ARROW_CHECK_EQ(data->type->id(), Type::STRING_VIEW);
+ SetData(data);
+}
+
+// Delegates to internal::ValidateUTF8() on this array's ArrayData.
+Status StringViewArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); }
+
FixedSizeBinaryArray::FixedSizeBinaryArray(const std::shared_ptr<ArrayData>& data) {
SetData(data);
}
diff --git a/cpp/src/arrow/array/array_binary.h b/cpp/src/arrow/array/array_binary.h
index 7e58a96ff8..03ee77fab8 100644
--- a/cpp/src/arrow/array/array_binary.h
+++ b/cpp/src/arrow/array/array_binary.h
@@ -22,6 +22,7 @@
#include <cstdint>
#include <memory>
+#include <optional>
#include <string>
#include <string_view>
#include <vector>
@@ -217,6 +218,63 @@ class ARROW_EXPORT LargeStringArray : public LargeBinaryArray {
Status ValidateUTF8() const;
};
+// ----------------------------------------------------------------------
+// BinaryView and StringView
+
+/// Concrete Array class for variable-size binary view data using the
+/// StringHeader struct to reference in-line or out-of-line string values
+class ARROW_EXPORT BinaryViewArray : public PrimitiveArray {
+ public:
+  using TypeClass = BinaryViewType;
+  using IteratorType = stl::ArrayIterator<BinaryViewArray>;
+
+  /// \brief Construct from existing ArrayData
+  explicit BinaryViewArray(const std::shared_ptr<ArrayData>& data);
+
+  /// \brief Construct from a buffer of StringHeader values and an optional
+  /// validity bitmap
+  BinaryViewArray(int64_t length, const std::shared_ptr<Buffer>& data,
+                  const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+                  int64_t null_count = kUnknownNullCount, int64_t offset = 0)
+      : PrimitiveArray(binary_view(), length, data, null_bitmap, null_count, offset) {}
+
+  /// \brief Pointer to the header of the first element, i.e. the values
+  /// buffer advanced by the array offset
+  const StringHeader* raw_values() const {
+    return reinterpret_cast<const StringHeader*>(raw_values_) + data_->offset;
+  }
+
+  /// \brief Return a copy of the i-th header
+  StringHeader Value(int64_t i) const { return raw_values()[i]; }
+
+  // For API compatibility with BinaryArray etc.
+  //
+  // The view is created from the header stored in the values buffer, not from
+  // the temporary copy returned by Value(): a view of an in-line value
+  // presumably points into the header object itself (confirm against
+  // string_header.h), so a view taken from a temporary would dangle as soon
+  // as this function returns.
+  std::string_view GetView(int64_t i) const {
+    return std::string_view(raw_values()[i]);
+  }
+
+  // EXPERIMENTAL
+  std::optional<std::string_view> operator[](int64_t i) const {
+    return *IteratorType(*this, i);
+  }
+
+  IteratorType begin() const { return IteratorType(*this); }
+  IteratorType end() const { return IteratorType(*this, length()); }
+
+ protected:
+  // Exposes the PrimitiveArray constructors to derived classes
+  using PrimitiveArray::PrimitiveArray;
+};
+
+/// Concrete Array class for variable-size string view (utf-8) data using
+/// StringHeader to reference in-line or out-of-line string values
+class ARROW_EXPORT StringViewArray : public BinaryViewArray {
+ public:
+ using TypeClass = StringViewType;
+
+ /// \brief Construct from existing ArrayData
+ explicit StringViewArray(const std::shared_ptr<ArrayData>& data);
+
+ /// \brief Construct from a values buffer and an optional validity bitmap;
+ /// forwards to the base-class constructor with utf8_view() as the type
+ StringViewArray(int64_t length, const std::shared_ptr<Buffer>& data,
+ const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+ int64_t null_count = kUnknownNullCount, int64_t offset = 0)
+ : BinaryViewArray(utf8_view(), length, data, null_bitmap, null_count, offset) {}
+
+ /// \brief Validate that this array contains only valid UTF8 entries
+ ///
+ /// This check is also implied by ValidateFull()
+ Status ValidateUTF8() const;
+};
+
// ----------------------------------------------------------------------
// Fixed width binary
diff --git a/cpp/src/arrow/array/builder_binary.cc b/cpp/src/arrow/array/builder_binary.cc
index 571f450aab..e0a7bc1193 100644
--- a/cpp/src/arrow/array/builder_binary.cc
+++ b/cpp/src/arrow/array/builder_binary.cc
@@ -40,6 +40,92 @@ namespace arrow {
using internal::checked_cast;
+// ----------------------------------------------------------------------
+// Binary/StringView
+
+// Appends every string in `values`. When `valid_bytes` is non-null, a zero
+// byte at position i appends a null instead of values[i].
+Status BinaryViewBuilder::AppendValues(const std::vector<std::string>& values,
+                                       const uint8_t* valid_bytes) {
+  // We only need to allocate heap memory for the out-of-line strings
+  int64_t out_of_line_total = 0;
+  for (const std::string& value : values) {
+    if (value.size() > StringHeader::kInlineSize) {
+      out_of_line_total += static_cast<int64_t>(value.size());
+    }
+  }
+  RETURN_NOT_OK(Reserve(static_cast<int64_t>(values.size())));
+  RETURN_NOT_OK(ReserveData(out_of_line_total));
+
+  // UnsafeAppend() and UnsafeAppendNull() record the validity bit of each
+  // element themselves, so the bitmap must not be appended to again here.
+  for (std::size_t i = 0; i < values.size(); ++i) {
+    if (valid_bytes == nullptr || valid_bytes[i]) {
+      UnsafeAppend(values[i]);
+    } else {
+      UnsafeAppendNull();
+    }
+  }
+  return Status::OK();
+}
+
+// Appends `length` elements of `array` starting at `offset`. Out-of-line
+// values are copied into this builder's heap; in-line headers are appended
+// as they are.
+Status BinaryViewBuilder::AppendArraySlice(const ArraySpan& array, int64_t offset,
+                                           int64_t length) {
+  // The bitmap is fetched without the array offset applied, so bit positions
+  // below add array.offset explicitly.
+  const uint8_t* bitmap = array.GetValues<uint8_t>(0, 0);
+  const StringHeader* values = array.GetValues<StringHeader>(1) + offset;
+  const auto is_valid = [&](int64_t i) {
+    return bitmap == nullptr || bit_util::GetBit(bitmap, array.offset + offset + i);
+  };
+
+  // Only valid slots contribute to the heap reservation: the header of a
+  // null slot is never copied below, and its contents should not be trusted.
+  int64_t out_of_line_total = 0;
+  for (int64_t i = 0; i < length; i++) {
+    if (is_valid(i) && !values[i].IsInline()) {
+      out_of_line_total += static_cast<int64_t>(values[i].size());
+    }
+  }
+  RETURN_NOT_OK(Reserve(length));
+  RETURN_NOT_OK(ReserveData(out_of_line_total));
+
+  for (int64_t i = 0; i < length; i++) {
+    if (!is_valid(i)) {
+      UnsafeAppendNull();
+    } else if (values[i].IsInline()) {
+      UnsafeAppend(values[i]);
+    } else {
+      UnsafeAppend(values[i].data(), values[i].size());
+    }
+  }
+  return Status::OK();
+}
+
+// Assembles the ArrayData: validity bitmap, header buffer, then every heap
+// block produced by the out-of-line data builder. The builder is reset
+// afterwards.
+Status BinaryViewBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
+  ARROW_ASSIGN_OR_RAISE(auto validity, null_bitmap_builder_.FinishWithLength(length_));
+  ARROW_ASSIGN_OR_RAISE(auto headers, data_builder_.FinishWithLength(length_));
+
+  BufferVector all_buffers;
+  all_buffers.push_back(std::move(validity));
+  all_buffers.push_back(std::move(headers));
+  auto heap_blocks = data_heap_builder_.Finish();
+  for (auto& block : heap_blocks) {
+    all_buffers.push_back(std::move(block));
+  }
+
+  *out = ArrayData::Make(type(), length_, std::move(all_buffers), null_count_);
+  capacity_ = length_ = null_count_ = 0;
+  Reset();
+  return Status::OK();
+}
+
+// Reserves `length` bytes in the out-of-line data heap, rejecting requests
+// above ValueSizeLimit().
+// NOTE(review): the limit is compared against the whole reservation, which
+// callers in this file compute as a sum over several elements - confirm that
+// this is intended.
+Status BinaryViewBuilder::ReserveData(int64_t length) {
+  const int64_t limit = ValueSizeLimit();
+  if (ARROW_PREDICT_FALSE(limit < length)) {
+    return Status::CapacityError(
+        "BinaryView or StringView elements cannot reference "
+        "strings larger than 4GB");
+  }
+  return data_heap_builder_.Reserve(length);
+}
+
+// Resets the base builder state, then the header buffer builder and the
+// out-of-line data heap.
+void BinaryViewBuilder::Reset() {
+ ArrayBuilder::Reset();
+ data_builder_.Reset();
+ data_heap_builder_.Reset();
+}
+
// ----------------------------------------------------------------------
// Fixed width binary
diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h
index 25183ca169..c716e6d225 100644
--- a/cpp/src/arrow/array/builder_binary.h
+++ b/cpp/src/arrow/array/builder_binary.h
@@ -459,6 +459,254 @@ class ARROW_EXPORT LargeStringBuilder : public LargeBinaryBuilder {
std::shared_ptr<DataType> type() const override { return large_utf8(); }
};
+// ----------------------------------------------------------------------
+// BinaryViewBuilder, StringViewBuilder
+//
+// The builders permit two styles of use: one where appended data is
+// accumulated in a third buffer that is appended to the resulting ArrayData,
+// and one where only the StringHeaders are appended. If you only want to
+// append StringHeaders, then use the Append(const StringHeader&) methods
+
+namespace internal {
+
+// Because we construct StringHeader objects incrementally, resizing buffers is
+// not an option as memory addresses for out-of-line strings will change. Thus,
+// we allocate medium-sized memory chunks and accumulate data in those, which
+// may result in some waste if there are many large-ish strings. If a string
+// comes along that does not fit into a block, we allocate a new block and
+// write into that.
+//
+// Later we can implement optimizations to continue filling underfull blocks
+// after encountering a large string that required allocating a new block.
+class ARROW_EXPORT StringHeapBuilder {
+ public:
+  static constexpr int64_t kDefaultBlocksize = 1 << 20;  // 1MB
+
+  /// \param[in] pool memory pool used to allocate blocks
+  /// \param[in] blocksize minimum size in bytes of each newly allocated block
+  StringHeapBuilder(MemoryPool* pool, int64_t blocksize = kDefaultBlocksize)
+      : pool_(pool), blocksize_(blocksize) {}
+
+  /// \brief Copy num_bytes from data into the current block without checking
+  /// that the block has room
+  ///
+  /// \return pointer to the copied bytes inside the block
+  const uint8_t* UnsafeAppend(const uint8_t* data, int64_t num_bytes) {
+    memcpy(current_out_buffer_, data, static_cast<size_t>(num_bytes));
+    const uint8_t* result = current_out_buffer_;
+    current_out_buffer_ += num_bytes;
+    current_remaining_bytes_ -= num_bytes;
+    return result;
+  }
+
+  /// \brief Copy num_bytes from data, allocating a new block first when the
+  /// current one cannot hold them
+  Result<const uint8_t*> Append(const uint8_t* data, int64_t num_bytes) {
+    if (num_bytes > current_remaining_bytes_) {
+      ARROW_RETURN_NOT_OK(Reserve(num_bytes));
+    }
+    return UnsafeAppend(data, num_bytes);
+  }
+
+  /// \brief Ensure that the indicated number of bytes can be appended via
+  /// UnsafeAppend operations without the need to allocate more memory
+  Status Reserve(int64_t num_bytes) {
+    if (num_bytes > current_remaining_bytes_) {
+      // Honor the block size passed to the constructor; larger requests get
+      // a block of exactly the requested size.
+      const int64_t block_bytes = num_bytes > blocksize_ ? num_bytes : blocksize_;
+      ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> new_block,
+                            AllocateBuffer(block_bytes, pool_));
+      // State is only updated once the allocation has succeeded, so a failed
+      // allocation leaves the builder unchanged.
+      current_out_buffer_ = new_block->mutable_data();
+      current_remaining_bytes_ = block_bytes;
+      blocks_.emplace_back(std::move(new_block));
+    }
+    return Status::OK();
+  }
+
+  /// \brief Drop all blocks and return to the initial empty state
+  void Reset() {
+    current_out_buffer_ = nullptr;
+    current_remaining_bytes_ = 0;
+    blocks_.clear();
+  }
+
+  /// \brief Number of bytes still available in the current block
+  int64_t current_remaining_bytes() const { return current_remaining_bytes_; }
+
+  /// \brief Hand over the accumulated blocks; a later append starts a new block
+  std::vector<std::shared_ptr<Buffer>> Finish() {
+    current_out_buffer_ = nullptr;
+    current_remaining_bytes_ = 0;
+    return std::move(blocks_);
+  }
+
+ private:
+  MemoryPool* pool_;
+  const int64_t blocksize_;
+  std::vector<std::shared_ptr<Buffer>> blocks_;
+
+  // Write position inside the most recently allocated block
+  uint8_t* current_out_buffer_ = nullptr;
+  int64_t current_remaining_bytes_ = 0;
+};
+
+} // namespace internal
+
+/// \brief Builder for BinaryViewArray
+///
+/// Every element is a StringHeader appended to data_builder_. Values longer
+/// than StringHeader::kInlineSize are first copied into data_heap_builder_,
+/// and the header then references the copy.
+class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder {
+ public:
+  using TypeClass = BinaryViewType;
+
+  /// The type argument is not consulted; type() reports the builder's type
+  BinaryViewBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
+      : BinaryViewBuilder(pool) {}
+
+  /// \brief Bytes still available in the current out-of-line data block
+  int64_t current_block_bytes_remaining() const {
+    return data_heap_builder_.current_remaining_bytes();
+  }
+
+  /// \brief Append a value, copying it to the data heap when it does not fit
+  /// in-line
+  ///
+  /// \return CapacityError if length exceeds ValueSizeLimit()
+  Status Append(const uint8_t* value, int64_t length) {
+    ARROW_RETURN_NOT_OK(Reserve(1));
+    if (length > static_cast<int64_t>(StringHeader::kInlineSize)) {
+      // String is stored out-of-line
+      if (ARROW_PREDICT_FALSE(length > ValueSizeLimit())) {
+        return Status::CapacityError(
+            "BinaryView or StringView elements cannot reference "
+            "strings larger than 4GB");
+      }
+      // Overwrite 'value' since we will use that for the StringHeader value below
+      ARROW_ASSIGN_OR_RAISE(value, data_heap_builder_.Append(value, length));
+    }
+    // UnsafeAppend(StringHeader) also records the validity bit; appending it
+    // again here would advance the length twice for a single reserved slot.
+    UnsafeAppend(StringHeader(value, length));
+    return Status::OK();
+  }
+
+  Status Append(const char* value, int64_t length) {
+    return Append(reinterpret_cast<const uint8_t*>(value), length);
+  }
+
+  Status Append(std::string_view value) {
+    return Append(value.data(), static_cast<int64_t>(value.size()));
+  }
+
+  /// \brief Append a header as-is; nothing is copied to the data heap
+  Status Append(StringHeader value) {
+    ARROW_RETURN_NOT_OK(Reserve(1));
+    // The validity bit is recorded by UnsafeAppend(StringHeader)
+    UnsafeAppend(value);
+    return Status::OK();
+  }
+
+  /// \brief Append without checking capacity
+  ///
+  /// Builder should have been presized using Reserve() and ReserveData(),
+  /// respectively, and the value must not be larger than 4GB
+  void UnsafeAppend(const uint8_t* value, int64_t length) {
+    if (length > static_cast<int64_t>(StringHeader::kInlineSize)) {
+      // String is stored out-of-line
+      // Overwrite 'value' since we will use that for the StringHeader value below
+      value = data_heap_builder_.UnsafeAppend(value, length);
+    }
+    // The validity bit is recorded by UnsafeAppend(StringHeader)
+    UnsafeAppend(StringHeader(value, length));
+  }
+
+  void UnsafeAppend(const char* value, int64_t length) {
+    UnsafeAppend(reinterpret_cast<const uint8_t*>(value), length);
+  }
+
+  void UnsafeAppend(const std::string& value) {
+    UnsafeAppend(value.c_str(), static_cast<int64_t>(value.size()));
+  }
+
+  void UnsafeAppend(std::string_view value) {
+    UnsafeAppend(value.data(), static_cast<int64_t>(value.size()));
+  }
+
+  /// \brief Append a header and mark the element valid, without checking
+  /// capacity
+  ///
+  /// This is the single place where a non-null element is recorded; every
+  /// other Append/UnsafeAppend overload for values ends up here.
+  void UnsafeAppend(StringHeader value) {
+    data_builder_.UnsafeAppend(value);
+    UnsafeAppendToBitmap(true);
+  }
+
+  /// \brief Ensures there is enough allocated available capacity in the
+  /// out-of-line data heap to append the indicated number of bytes without
+  /// additional allocations
+  Status ReserveData(int64_t length);
+
+  /// \brief Append several null elements
+  Status AppendNulls(int64_t length) final {
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    data_builder_.UnsafeAppend(length, StringHeader());  // zero
+    UnsafeSetNull(length);
+    return Status::OK();
+  }
+
+  /// \brief Append a single null element
+  Status AppendNull() final {
+    ARROW_RETURN_NOT_OK(Reserve(1));
+    data_builder_.UnsafeAppend(StringHeader());  // zero
+    UnsafeAppendToBitmap(false);
+    return Status::OK();
+  }
+
+  /// \brief Append an empty element (length-0 inline string)
+  Status AppendEmptyValue() final {
+    ARROW_RETURN_NOT_OK(Reserve(1));
+    data_builder_.UnsafeAppend(StringHeader(""));  // zero
+    UnsafeAppendToBitmap(true);
+    return Status::OK();
+  }
+
+  /// \brief Append several empty elements
+  Status AppendEmptyValues(int64_t length) final {
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    data_builder_.UnsafeAppend(length, StringHeader(""));
+    UnsafeSetNotNull(length);
+    return Status::OK();
+  }
+
+  /// \brief Append a null element without checking capacity
+  void UnsafeAppendNull() {
+    data_builder_.UnsafeAppend(StringHeader());
+    UnsafeAppendToBitmap(false);
+  }
+
+  /// \brief Append an empty element without checking capacity
+  void UnsafeAppendEmptyValue() {
+    data_builder_.UnsafeAppend(StringHeader(""));
+    UnsafeAppendToBitmap(true);
+  }
+
+  /// \brief Append a sequence of strings in one shot.
+  ///
+  /// \param[in] values a vector of strings
+  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
+  /// indicates a valid (non-null) value
+  /// \return Status
+  Status AppendValues(const std::vector<std::string>& values,
+                      const uint8_t* valid_bytes = NULLPTR);
+
+  /// \brief Append a slice of a BinaryViewArray passed as an ArraySpan. Copies
+  /// the underlying out-of-line string memory to avoid memory lifetime issues
+  Status AppendArraySlice(const ArraySpan& array, int64_t offset,
+                          int64_t length) override;
+
+  void Reset() override;
+
+  /// \brief Resize the header buffer together with the base builder's buffers
+  Status Resize(int64_t capacity) override {
+    ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
+    capacity = std::max(capacity, kMinBuilderCapacity);
+    ARROW_RETURN_NOT_OK(data_builder_.Resize(capacity));
+    return ArrayBuilder::Resize(capacity);
+  }
+
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+  std::shared_ptr<DataType> type() const override { return binary_view(); }
+
+ protected:
+  explicit BinaryViewBuilder(MemoryPool* pool = default_memory_pool())
+      : ArrayBuilder(pool), data_builder_(pool), data_heap_builder_(pool) {}
+
+  /// Largest length accepted for a single value (maximum of uint32_t)
+  static constexpr int64_t ValueSizeLimit() {
+    return std::numeric_limits<uint32_t>::max();
+  }
+
+  // One StringHeader per appended element
+  TypedBufferBuilder<StringHeader> data_builder_;
+
+  // Accumulates out-of-line data in fixed-size chunks which are then attached
+  // to the resulting ArrayData
+  internal::StringHeapBuilder data_heap_builder_;
+};
+
+/// Builder for string view (utf8_view) arrays. It reuses the BinaryViewBuilder
+/// constructors and implementation and only changes the reported type.
+class ARROW_EXPORT StringViewBuilder : public BinaryViewBuilder {
+ public:
+ using BinaryViewBuilder::BinaryViewBuilder;
+ std::shared_ptr<DataType> type() const override { return utf8_view(); }
+};
+
// ----------------------------------------------------------------------
// FixedSizeBinaryBuilder
diff --git a/cpp/src/arrow/array/builder_dict.cc b/cpp/src/arrow/array/builder_dict.cc
index 061fb60041..c99a6facee 100644
--- a/cpp/src/arrow/array/builder_dict.cc
+++ b/cpp/src/arrow/array/builder_dict.cc
@@ -193,6 +193,12 @@ Status DictionaryMemoTable::GetOrInsert(const BinaryType*, std::string_view valu
return impl_->GetOrInsert<BinaryType>(value, out);
}
+// Binary-view overload: forwards to the BinaryType instantiation of the
+// implementation's GetOrInsert, so values are memoized as plain binary.
+Status DictionaryMemoTable::GetOrInsert(const BinaryViewType*, std::string_view value,
+ int32_t* out) {
+ // Create BinaryArray dictionary for now
+ return impl_->GetOrInsert<BinaryType>(value, out);
+}
+
Status DictionaryMemoTable::GetOrInsert(const LargeBinaryType*, std::string_view value,
int32_t* out) {
return impl_->GetOrInsert<LargeBinaryType>(value, out);
diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h
index cb0aaf3099..0cc82930a1 100644
--- a/cpp/src/arrow/array/builder_dict.h
+++ b/cpp/src/arrow/array/builder_dict.h
@@ -60,6 +60,12 @@ struct DictionaryValue<T, enable_if_base_binary<T>> {
BinaryType, LargeBinaryType>::type;
};
+// Specialization selected by enable_if_binary_view_like: dictionary values
+// are exchanged as std::string_view and the physical type is BinaryViewType.
+template <typename T>
+struct DictionaryValue<T, enable_if_binary_view_like<T>> {
+ using type = std::string_view;
+ using PhysicalType = BinaryViewType;
+};
+
template <typename T>
struct DictionaryValue<T, enable_if_fixed_size_binary<T>> {
using type = std::string_view;
@@ -115,6 +121,10 @@ class ARROW_EXPORT DictionaryMemoTable {
Status GetOrInsert(const BinaryType*, std::string_view value, int32_t* out);
Status GetOrInsert(const LargeBinaryType*, std::string_view value, int32_t* out);
+ // TODO: Consider working StringHeader throughout the hashing machinery to
+ // benefit from faster comparisons, reduced need to allocate memory
+ Status GetOrInsert(const BinaryViewType*, std::string_view value, int32_t* out);
+
class DictionaryMemoTableImpl;
std::unique_ptr<DictionaryMemoTableImpl> impl_;
};
diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc
index aab734284f..3dd0ccea93 100644
--- a/cpp/src/arrow/array/concatenate.cc
+++ b/cpp/src/arrow/array/concatenate.cc
@@ -227,6 +227,10 @@ class ConcatenateImpl {
return ConcatenateBuffers(value_buffers, pool_).Value(&out_->buffers[2]);
}
+ // Placeholder overload for binary-view types: always returns NotImplemented.
+ Status Visit(const BinaryViewType&) {
+ return Status::NotImplemented("binary / string view");
+ }
+
Status Visit(const ListType&) {
std::vector<Range> value_ranges;
ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, sizeof(int32_t)));
diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc
index c0cdcab730..ac9d76d469 100644
--- a/cpp/src/arrow/array/util.cc
+++ b/cpp/src/arrow/array/util.cc
@@ -264,6 +264,14 @@ class ArrayDataEndianSwapper {
return Status::OK();
}
+ // Overload enabled only when T is exactly BinaryViewType or StringViewType;
+ // it always returns NotImplemented.
+ template <typename T>
+ enable_if_t<std::is_same<BinaryViewType, T>::value ||
+ std::is_same<StringViewType, T>::value,
+ Status>
+ Visit(const T& type) {
+ return Status::NotImplemented("Binary / string view");
+ }
+
Status Visit(const ListType& type) {
RETURN_NOT_OK(SwapOffsets<int32_t>(1));
return Status::OK();
@@ -596,6 +604,11 @@ class RepeatedArrayFactory {
return Status::OK();
}
+ // Overload selected by enable_if_binary_view_like; it always returns
+ // NotImplemented.
+ template <typename T>
+ enable_if_binary_view_like<T, Status> Visit(const T&) {
+ return Status::NotImplemented("binary / string view");
+ }
+
template <typename T>
enable_if_var_size_list<T, Status> Visit(const T& type) {
using ScalarType = typename TypeTraits<T>::ScalarType;
diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc
index 56470ac74b..cddb086005 100644
--- a/cpp/src/arrow/array/validate.cc
+++ b/cpp/src/arrow/array/validate.cc
@@ -47,6 +47,19 @@ struct UTF8DataValidator {
return Status::NotImplemented("");
}
+  // Checks that every element of a string-view array is valid UTF8 and
+  // reports the index of the first offending element.
+  // NOTE(review): every slot is checked, the validity bitmap is not
+  // consulted - confirm that headers of null slots are always safe to read.
+  Status Visit(const StringViewType&) {
+    util::InitializeUTF8();
+
+    const auto* headers = data.GetValues<StringHeader>(1);
+    for (int64_t pos = 0; pos < data.length; ++pos) {
+      const auto& header = headers[pos];
+      const auto* bytes = reinterpret_cast<const uint8_t*>(header.data());
+      const bool is_utf8 = util::ValidateUTF8(bytes, header.size());
+      if (ARROW_PREDICT_FALSE(!is_utf8)) {
+        return Status::Invalid("Invalid UTF8 sequence at string index ", pos);
+      }
+    }
+    return Status::OK();
+  }
+
template <typename StringType>
enable_if_string<StringType, Status> Visit(const StringType&) {
util::InitializeUTF8();
@@ -247,6 +260,10 @@ struct ValidateArrayImpl {
Status Visit(const LargeBinaryType& type) { return ValidateBinaryLike(type); }
+ // Placeholder overload for binary-view types: always returns NotImplemented.
+ Status Visit(const BinaryViewType& type) {
+ return Status::NotImplemented("binary / string view");
+ }
+
Status Visit(const ListType& type) { return ValidateListLike(type); }
Status Visit(const LargeListType& type) { return ValidateListLike(type); }
@@ -716,7 +733,8 @@ Status ValidateArrayFull(const Array& array) { return ValidateArrayFull(*array.d
ARROW_EXPORT
Status ValidateUTF8(const ArrayData& data) {
- DCHECK(data.type->id() == Type::STRING || data.type->id() == Type::LARGE_STRING);
+ DCHECK(data.type->id() == Type::STRING || data.type->id() == Type::STRING_VIEW ||
+ data.type->id() == Type::LARGE_STRING);
UTF8DataValidator validator{data};
return VisitTypeInline(*data.type, &validator);
}
diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc
index baadd10cca..8ccc645046 100644
--- a/cpp/src/arrow/compare.cc
+++ b/cpp/src/arrow/compare.cc
@@ -259,6 +259,11 @@ class RangeDataEqualsImpl {
// Also matches StringType
Status Visit(const BinaryType& type) { return CompareBinary(type); }
+ // Also matches StringViewType
+ // Range comparison of binary-view data is a placeholder: always returns
+ // NotImplemented.
+ Status Visit(const BinaryViewType& type) {
+ return Status::NotImplemented("Binary / string view");
+ }
+
// Also matches LargeStringType
Status Visit(const LargeBinaryType& type) { return CompareBinary(type); }
@@ -577,7 +582,7 @@ class TypeEqualsVisitor {
template <typename T>
enable_if_t<is_null_type<T>::value || is_primitive_ctype<T>::value ||
- is_base_binary_type<T>::value,
+ is_base_binary_type<T>::value || is_binary_view_like_type<T>::value,
Status>
Visit(const T&) {
result_ = true;
@@ -729,6 +734,12 @@ class ScalarEqualsVisitor {
return Status::OK();
}
+  // Two binary-view scalars are equal when their StringHeader values compare
+  // equal with operator==.
+  Status Visit(const BinaryViewScalar& left) {
+    const auto& other = checked_cast<const BinaryViewScalar&>(right_);
+    const bool same_value = (left.value == other.value);
+    result_ = same_value;
+    return Status::OK();
+  }
+
Status Visit(const Decimal128Scalar& left) {
const auto& right = checked_cast<const Decimal128Scalar&>(right_);
result_ = left.value == right.value;
diff --git a/cpp/src/arrow/ipc/feather.cc b/cpp/src/arrow/ipc/feather.cc
index b6d3a3d7d8..1ef076fac4 100644
--- a/cpp/src/arrow/ipc/feather.cc
+++ b/cpp/src/arrow/ipc/feather.cc
@@ -536,8 +536,8 @@ struct ArrayWriterV1 {
is_nested_type<T>::value || is_null_type<T>::value || is_decimal_type<T>::value ||
std::is_same<DictionaryType, T>::value || is_duration_type<T>::value ||
is_interval_type<T>::value || is_fixed_size_binary_type<T>::value ||
- std::is_same<Date64Type, T>::value || std::is_same<Time64Type, T>::value ||
- std::is_same<ExtensionType, T>::value,
+ is_binary_view_like_type<T>::value || std::is_same<Date64Type, T>::value ||
+ std::is_same<Time64Type, T>::value || std::is_same<ExtensionType, T>::value,
Status>::type
Visit(const T& type) {
return Status::NotImplemented(type.ToString());
diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc
index 2e450b9d46..367b31d5dd 100644
--- a/cpp/src/arrow/ipc/metadata_internal.cc
+++ b/cpp/src/arrow/ipc/metadata_internal.cc
@@ -523,6 +523,16 @@ class FieldToFlatbufferVisitor {
return Status::OK();
}
+ // The view type argument is not inspected; the field is serialized by the
+ // BinaryType overload.
+ Status Visit(const BinaryViewType& type) {
+ // BinaryView will be written to IPC as a normal binary array
+ return Visit(BinaryType());
+ }
+
+ // The view type argument is not inspected; the field is serialized by the
+ // StringType overload.
+ Status Visit(const StringViewType& type) {
+ // StringView will be written to IPC as a normal UTF8 string array
+ return Visit(StringType());
+ }
+
Status Visit(const LargeBinaryType& type) {
fb_type_ = flatbuf::Type::LargeBinary;
type_offset_ = flatbuf::CreateLargeBinary(fbb_).Union();
diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc
index a1b17afaaf..843d5917b3 100644
--- a/cpp/src/arrow/ipc/reader.cc
+++ b/cpp/src/arrow/ipc/reader.cc
@@ -348,6 +348,11 @@ class ArrayLoader {
return LoadBinary<T>(type.id());
}
+ // Loading binary-view arrays from IPC is not supported. The DCHECK flags
+ // reaching this overload as unexpected; NotImplemented is returned after it.
+ Status Visit(const BinaryViewType& type) {
+ DCHECK(false);
+ return Status::NotImplemented("Reading IPC format to binary view is not supported");
+ }
+
Status Visit(const FixedSizeBinaryType& type) {
out_->buffers.resize(2);
RETURN_NOT_OK(LoadCommon(type.id()));
diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc
index b89604e6fe..d68da651f3 100644
--- a/cpp/src/arrow/ipc/writer.cc
+++ b/cpp/src/arrow/ipc/writer.cc
@@ -388,6 +388,10 @@ class RecordBatchSerializer {
return Status::OK();
}
+ // Placeholder overload for binary-view arrays: always returns NotImplemented.
+ Status Visit(const BinaryViewArray& array) {
+ return Status::NotImplemented("Binary / string view type");
+ }
+
Status Visit(const FixedSizeListArray& array) {
--max_recursion_depth_;
auto size = array.list_type()->list_size();
diff --git a/cpp/src/arrow/json/test_common.h b/cpp/src/arrow/json/test_common.h
index c01036047c..86a03c82ab 100644
--- a/cpp/src/arrow/json/test_common.h
+++ b/cpp/src/arrow/json/test_common.h
@@ -110,8 +110,7 @@ struct GenerateImpl {
return OK(writer.Double(val));
}
- template <typename T>
- enable_if_base_binary<T, Status> Visit(const T&) {
+ Status GenerateAscii(const DataType&) {
auto size = std::poisson_distribution<>{4}(e);
std::uniform_int_distribution<uint16_t> gen_char(32, 126); // FIXME generate UTF8
std::string s(size, '\0');
@@ -119,6 +118,13 @@ struct GenerateImpl {
return OK(writer.String(s.c_str()));
}
+ // Base binary types are generated through the shared GenerateAscii helper.
+ template <typename T>
+ enable_if_base_binary<T, Status> Visit(const T& t) {
+ return GenerateAscii(t);
+ }
+
+ // Binary-view types are generated through the same GenerateAscii helper.
+ Status Visit(const BinaryViewType& t) { return GenerateAscii(t); }
+
template <typename T>
enable_if_list_like<T, Status> Visit(const T& t) {
auto size = std::poisson_distribution<>{4}(e);
diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc
index 0ca08d7a82..d139845bd7 100644
--- a/cpp/src/arrow/scalar.cc
+++ b/cpp/src/arrow/scalar.cc
@@ -70,6 +70,12 @@ struct ScalarHashImpl {
Status Visit(const BaseBinaryScalar& s) { return BufferHash(*s.value); }
+  // Folds the hash of the bytes referenced by the scalar's StringHeader into
+  // the running hash with XOR.
+  Status Visit(const BinaryViewScalar& s) {
+    const auto string_hash =
+        internal::ComputeStringHash<1>(s.value.data(), s.value.size());
+    hash_ ^= string_hash;
+    return Status::OK();
+  }
+
template <typename T>
Status Visit(const TemporalScalar<T>& s) {
return ValueHash(s);
@@ -226,6 +232,14 @@ struct ScalarValidateImpl {
Status Visit(const StringScalar& s) { return ValidateStringScalar(s); }
+ // Placeholder overload for binary-view scalars: always returns NotImplemented.
+ Status Visit(const BinaryViewScalar& s) {
+ return Status::NotImplemented("Binary view");
+ }
+
+ // Placeholder overload for string-view scalars: always returns NotImplemented.
+ Status Visit(const StringViewScalar& s) {
+ return Status::NotImplemented("String view");
+ }
+
Status Visit(const LargeStringScalar& s) { return ValidateStringScalar(s); }
template <typename ScalarType>
diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h
index cf852dff36..9b7f604132 100644
--- a/cpp/src/arrow/scalar.h
+++ b/cpp/src/arrow/scalar.h
@@ -37,6 +37,7 @@
#include "arrow/type_traits.h"
#include "arrow/util/compare.h"
#include "arrow/util/decimal.h"
+#include "arrow/util/string_header.h"
#include "arrow/util/visibility.h"
#include "arrow/visit_type_inline.h"
@@ -282,6 +283,34 @@ struct ARROW_EXPORT StringScalar : public BinaryScalar {
StringScalar() : StringScalar(utf8()) {}
};
+/// Scalar holding a single StringHeader value of binary_view() type (or of a
+/// caller-supplied type, which StringViewScalar uses for utf8_view()).
+struct ARROW_EXPORT BinaryViewScalar : public internal::PrimitiveScalarBase {
+ using internal::PrimitiveScalarBase::PrimitiveScalarBase;
+ using TypeClass = BinaryViewType;
+
+ /// Construct a valid scalar with the given value and type
+ explicit BinaryViewScalar(StringHeader value, std::shared_ptr<DataType> type)
+ : internal::PrimitiveScalarBase(std::move(type), true), value(value) {}
+
+ /// Construct a valid scalar of type binary_view()
+ explicit BinaryViewScalar(StringHeader value)
+ : BinaryViewScalar(value, binary_view()) {}
+
+ /// Construct a scalar of type binary_view() that is marked as not valid
+ BinaryViewScalar() : internal::PrimitiveScalarBase(binary_view(), false) {}
+
+ /// Address of the stored header
+ void* mutable_data() override { return reinterpret_cast<void*>(&this->value); }
+
+ /// View built from the stored header
+ std::string_view view() const override { return std::string_view(this->value); }
+
+ StringHeader value;
+};
+
+/// Scalar of utf8_view() type; storage and behavior come from BinaryViewScalar.
+struct ARROW_EXPORT StringViewScalar : public BinaryViewScalar {
+ using TypeClass = StringViewType;
+
+ /// Construct a valid scalar of type utf8_view()
+ explicit StringViewScalar(StringHeader value)
+ : BinaryViewScalar(std::move(value), utf8_view()) {}
+
+ /// Construct from the type alone, through a base-class constructor
+ StringViewScalar() : BinaryViewScalar(utf8_view()) {}
+};
+
struct ARROW_EXPORT LargeBinaryScalar : public BaseBinaryScalar {
using BaseBinaryScalar::BaseBinaryScalar;
using TypeClass = LargeBinaryType;
diff --git a/cpp/src/arrow/testing/json_internal.cc b/cpp/src/arrow/testing/json_internal.cc
index c1d45aa2e0..a296e0fba7 100644
--- a/cpp/src/arrow/testing/json_internal.cc
+++ b/cpp/src/arrow/testing/json_internal.cc
@@ -227,8 +227,8 @@ class SchemaWriter {
template <typename T>
enable_if_t<is_null_type<T>::value || is_primitive_ctype<T>::value ||
- is_base_binary_type<T>::value || is_base_list_type<T>::value ||
- is_struct_type<T>::value>
+ is_base_binary_type<T>::value || is_binary_view_like_type<T>::value ||
+ is_base_list_type<T>::value || is_struct_type<T>::value>
WriteTypeMetadata(const T& type) {}
void WriteTypeMetadata(const MapType& type) {
@@ -386,6 +386,8 @@ class SchemaWriter {
Status Visit(const TimeType& type) { return WritePrimitive("time", type); }
Status Visit(const StringType& type) { return WriteVarBytes("utf8", type); }
Status Visit(const BinaryType& type) { return WriteVarBytes("binary", type); }
+ Status Visit(const StringViewType& type) { return WritePrimitive("utf8_view", type); }
+ Status Visit(const BinaryViewType& type) { return WritePrimitive("binary_view", type); }
Status Visit(const LargeStringType& type) { return WriteVarBytes("largeutf8", type); }
Status Visit(const LargeBinaryType& type) { return WriteVarBytes("largebinary", type); }
Status Visit(const FixedSizeBinaryType& type) {
@@ -1320,6 +1322,10 @@ class ArrayReader {
return FinishBuilder(&builder);
}
+ // Reading binary/string view arrays is not supported yet: this overload
+ // always reports NotImplemented and leaves the reader state untouched.
+ // NOTE(review): StringViewType derives from BinaryViewType, so it presumably
+ // reaches this overload as well -- confirm against the visitor dispatch.
+ Status Visit(const BinaryViewType& type) {
+ return Status::NotImplemented("Binary / string view");
+ }
+
Status Visit(const DayTimeIntervalType& type) {
DayTimeIntervalBuilder builder(pool_);
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
index ea9525404c..b976260ccd 100644
--- a/cpp/src/arrow/type.cc
+++ b/cpp/src/arrow/type.cc
@@ -59,10 +59,14 @@ constexpr Type::type FixedSizeListType::type_id;
constexpr Type::type BinaryType::type_id;
+constexpr Type::type BinaryViewType::type_id;
+
constexpr Type::type LargeBinaryType::type_id;
constexpr Type::type StringType::type_id;
+constexpr Type::type StringViewType::type_id;
+
constexpr Type::type LargeStringType::type_id;
constexpr Type::type FixedSizeBinaryType::type_id;
@@ -188,7 +192,9 @@ std::string ToString(Type::type id) {
TO_STRING_CASE(INTERVAL_MONTHS)
TO_STRING_CASE(DURATION)
TO_STRING_CASE(STRING)
+ TO_STRING_CASE(STRING_VIEW)
TO_STRING_CASE(BINARY)
+ TO_STRING_CASE(BINARY_VIEW)
TO_STRING_CASE(LARGE_STRING)
TO_STRING_CASE(LARGE_BINARY)
TO_STRING_CASE(FIXED_SIZE_BINARY)
@@ -564,10 +570,14 @@ std::string FixedSizeListType::ToString() const {
std::string BinaryType::ToString() const { return "binary"; }
+std::string BinaryViewType::ToString() const { return "binary_view"; }
+
std::string LargeBinaryType::ToString() const { return "large_binary"; }
std::string StringType::ToString() const { return "string"; }
+std::string StringViewType::ToString() const { return "string_view"; }
+
std::string LargeStringType::ToString() const { return "large_string"; }
int FixedSizeBinaryType::bit_width() const { return CHAR_BIT * byte_width(); }
@@ -2114,8 +2124,10 @@ PARAMETER_LESS_FINGERPRINT(HalfFloat)
PARAMETER_LESS_FINGERPRINT(Float)
PARAMETER_LESS_FINGERPRINT(Double)
PARAMETER_LESS_FINGERPRINT(Binary)
+PARAMETER_LESS_FINGERPRINT(BinaryView)
PARAMETER_LESS_FINGERPRINT(LargeBinary)
PARAMETER_LESS_FINGERPRINT(String)
+PARAMETER_LESS_FINGERPRINT(StringView)
PARAMETER_LESS_FINGERPRINT(LargeString)
PARAMETER_LESS_FINGERPRINT(Date32)
PARAMETER_LESS_FINGERPRINT(Date64)
@@ -2283,8 +2295,10 @@ TYPE_FACTORY(float16, HalfFloatType)
TYPE_FACTORY(float32, FloatType)
TYPE_FACTORY(float64, DoubleType)
TYPE_FACTORY(utf8, StringType)
+TYPE_FACTORY(utf8_view, StringViewType)
TYPE_FACTORY(large_utf8, LargeStringType)
TYPE_FACTORY(binary, BinaryType)
+TYPE_FACTORY(binary_view, BinaryViewType)
TYPE_FACTORY(large_binary, LargeBinaryType)
TYPE_FACTORY(date64, Date64Type)
TYPE_FACTORY(date32, Date32Type)
@@ -2532,7 +2546,7 @@ void InitStaticData() {
// * Time32
// * Time64
// * Timestamp
- g_primitive_types = {null(), boolean(), date32(), date64()};
+ g_primitive_types = {null(), boolean(), date32(), date64(), binary_view(), utf8_view()};
Extend(g_numeric_types, &g_primitive_types);
Extend(g_base_binary_types, &g_primitive_types);
}
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 415aaacf1c..f4e082b3f6 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -33,6 +33,7 @@
#include "arrow/util/checked_cast.h"
#include "arrow/util/endian.h"
#include "arrow/util/macros.h"
+#include "arrow/util/string_header.h"
#include "arrow/util/visibility.h"
#include "arrow/visitor.h" // IWYU pragma: keep
@@ -686,6 +687,33 @@ class ARROW_EXPORT BinaryType : public BaseBinaryType {
explicit BinaryType(Type::type logical_type) : BaseBinaryType(logical_type) {}
};
+/// \brief Concrete type class for variable-size binary view data using
+/// StringHeader structs
+///
+/// Each element is represented by one fixed-width StringHeader rather than by
+/// a pair of offsets.
+class ARROW_EXPORT BinaryViewType : public DataType {
+ public:
+ static constexpr Type::type type_id = Type::BINARY_VIEW;
+ static constexpr bool is_utf8 = false;
+ using PhysicalType = BinaryViewType;
+
+ static constexpr const char* type_name() { return "binary_view"; }
+
+ BinaryViewType() : BinaryViewType(Type::BINARY_VIEW) {}
+
+ // Two buffers are declared: a bitmap and a fixed-width buffer with one
+ // StringHeader-sized slot per element.
+ // NOTE(review): no buffer is declared here for out-of-line character data --
+ // confirm how data referenced by non-inline headers is accounted for.
+ DataTypeLayout layout() const override {
+ return DataTypeLayout(
+ {DataTypeLayout::Bitmap(), DataTypeLayout::FixedWidth(sizeof(StringHeader))});
+ }
+
+ std::string ToString() const override;
+ std::string name() const override { return "binary_view"; }
+
+ protected:
+ std::string ComputeFingerprint() const override;
+
+ // Allow subclasses like StringViewType to change the logical type.
+ explicit BinaryViewType(Type::type logical_type) : DataType(logical_type) {}
+};
+
/// \brief Concrete type class for large variable-size binary data
class ARROW_EXPORT LargeBinaryType : public BaseBinaryType {
public:
@@ -732,6 +760,24 @@ class ARROW_EXPORT StringType : public BinaryType {
std::string ComputeFingerprint() const override;
};
+/// \brief Concrete type class for variable-size string view data,
+/// utf8-encoded
+///
+/// Differs from BinaryViewType only in its type id, names and `is_utf8`
+/// flag; the physical type is BinaryViewType.
+class ARROW_EXPORT StringViewType : public BinaryViewType {
+ public:
+ static constexpr Type::type type_id = Type::STRING_VIEW;
+ static constexpr bool is_utf8 = true;
+ using PhysicalType = BinaryViewType;
+
+ static constexpr const char* type_name() { return "utf8_view"; }
+
+ // Reuses the protected BinaryViewType constructor to set the logical type.
+ StringViewType() : BinaryViewType(Type::STRING_VIEW) {}
+
+ std::string ToString() const override;
+ std::string name() const override { return "utf8_view"; }
+
+ protected:
+ std::string ComputeFingerprint() const override;
+};
+
/// \brief Concrete type class for large variable-size string data, utf8-encoded
class ARROW_EXPORT LargeStringType : public LargeBinaryType {
public:
diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h
index ba0e635f73..1066d50321 100644
--- a/cpp/src/arrow/type_fwd.h
+++ b/cpp/src/arrow/type_fwd.h
@@ -108,6 +108,11 @@ class BinaryArray;
class BinaryBuilder;
struct BinaryScalar;
+class BinaryViewType;
+class BinaryViewArray;
+class BinaryViewBuilder;
+struct BinaryViewScalar;
+
class LargeBinaryType;
class LargeBinaryArray;
class LargeBinaryBuilder;
@@ -123,6 +128,11 @@ class StringArray;
class StringBuilder;
struct StringScalar;
+class StringViewType;
+class StringViewArray;
+class StringViewBuilder;
+struct StringViewScalar;
+
class LargeStringType;
class LargeStringArray;
class LargeStringBuilder;
@@ -405,6 +415,13 @@ struct Type {
/// Calendar interval type with three fields.
INTERVAL_MONTH_DAY_NANO,
+ /// String (UTF8) view type with 4-byte prefix and inline small string
+ /// optimization
+ STRING_VIEW,
+
+ /// Bytes view type with 4-byte prefix and inline small string optimization
+ BINARY_VIEW,
+
// Leave this at the end
MAX_ID
};
@@ -446,10 +463,14 @@ ARROW_EXPORT const std::shared_ptr<DataType>& float32();
ARROW_EXPORT const std::shared_ptr<DataType>& float64();
/// \brief Return a StringType instance
ARROW_EXPORT const std::shared_ptr<DataType>& utf8();
+/// \brief Return a StringViewType instance
+ARROW_EXPORT const std::shared_ptr<DataType>& utf8_view();
/// \brief Return a LargeStringType instance
ARROW_EXPORT const std::shared_ptr<DataType>& large_utf8();
/// \brief Return a BinaryType instance
ARROW_EXPORT const std::shared_ptr<DataType>& binary();
+/// \brief Return a BinaryViewType instance
+ARROW_EXPORT const std::shared_ptr<DataType>& binary_view();
/// \brief Return a LargeBinaryType instance
ARROW_EXPORT const std::shared_ptr<DataType>& large_binary();
/// \brief Return a Date32Type instance
diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc
index 954ad63c8a..ad0804be8b 100644
--- a/cpp/src/arrow/type_test.cc
+++ b/cpp/src/arrow/type_test.cc
@@ -1189,9 +1189,21 @@ TEST(TestBinaryType, ToString) {
TEST(TestStringType, ToString) {
StringType str;
ASSERT_EQ(str.id(), Type::STRING);
+ ASSERT_EQ(str.name(), std::string("utf8"));
+ ASSERT_EQ(str.type_name(), std::string("utf8"));
ASSERT_EQ(str.ToString(), std::string("string"));
}
+// Two BinaryViewType instances must compare equal, must differ from
+// StringViewType, and must report the BINARY_VIEW id and "binary_view" text.
+TEST(TestBinaryViewType, ToString) {
+ BinaryViewType t1;
+ BinaryViewType e1;
+ StringViewType t2;
+ AssertTypeEqual(t1, e1);
+ AssertTypeNotEqual(t1, t2);
+ ASSERT_EQ(t1.id(), Type::BINARY_VIEW);
+ ASSERT_EQ(t1.ToString(), std::string("binary_view"));
+}
+
TEST(TestLargeBinaryTypes, ToString) {
BinaryType bt1;
LargeBinaryType t1;
diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h
index 5873969066..dcd7c36ba2 100644
--- a/cpp/src/arrow/type_traits.h
+++ b/cpp/src/arrow/type_traits.h
@@ -341,6 +341,16 @@ struct TypeTraits<BinaryType> {
static inline std::shared_ptr<DataType> type_singleton() { return binary(); }
};
+// Associates BinaryViewType with its array, builder and scalar classes; the
+// C type of one element is StringHeader.
+template <>
+struct TypeTraits<BinaryViewType> {
+ using ArrayType = BinaryViewArray;
+ using BuilderType = BinaryViewBuilder;
+ using ScalarType = BinaryViewScalar;
+ using CType = StringHeader;
+ constexpr static bool is_parameter_free = true;
+ static inline std::shared_ptr<DataType> type_singleton() { return binary_view(); }
+};
+
template <>
struct TypeTraits<LargeBinaryType> {
using ArrayType = LargeBinaryArray;
@@ -371,6 +381,16 @@ struct TypeTraits<StringType> {
static inline std::shared_ptr<DataType> type_singleton() { return utf8(); }
};
+// Associates StringViewType with its array, builder and scalar classes; the
+// C type of one element is StringHeader, as for BinaryViewType.
+template <>
+struct TypeTraits<StringViewType> {
+ using ArrayType = StringViewArray;
+ using BuilderType = StringViewBuilder;
+ using ScalarType = StringViewScalar;
+ using CType = StringHeader;
+ constexpr static bool is_parameter_free = true;
+ static inline std::shared_ptr<DataType> type_singleton() { return utf8_view(); }
+};
+
template <>
struct TypeTraits<LargeStringType> {
using ArrayType = LargeStringArray;
@@ -390,6 +410,11 @@ struct CTypeTraits<std::string> : public TypeTraits<StringType> {
using ArrowType = StringType;
};
+// Maps the C type StringHeader back to BinaryViewType (not StringViewType),
+// even though both Arrow types use StringHeader as their CType.
+template <>
+struct CTypeTraits<StringHeader> : public TypeTraits<BinaryViewType> {
+ using ArrowType = BinaryViewType;
+};
+
template <>
struct CTypeTraits<const char*> : public CTypeTraits<std::string> {};
@@ -605,9 +630,28 @@ using is_string_type =
template <typename T, typename R = void>
using enable_if_string = enable_if_t<is_string_type<T>::value, R>;
+// True for BinaryViewType and for any type derived from it (such as
+// StringViewType).
+template <typename T>
+using is_binary_view_like_type = std::is_base_of<BinaryViewType, T>;
+
+// True only for exactly BinaryViewType.
+template <typename T>
+using is_binary_view_type = std::is_same<BinaryViewType, T>;
+
+// True only for exactly StringViewType.
+template <typename T>
+using is_string_view_type = std::is_same<StringViewType, T>;
+
+// SFINAE helpers: each yields R (void by default) when the matching trait
+// above holds for T.
+template <typename T, typename R = void>
+using enable_if_binary_view_like = enable_if_t<is_binary_view_like_type<T>::value, R>;
+
+template <typename T, typename R = void>
+using enable_if_binary_view = enable_if_t<is_binary_view_type<T>::value, R>;
+
+template <typename T, typename R = void>
+using enable_if_string_view = enable_if_t<is_string_view_type<T>::value, R>;
+
template <typename T>
using is_string_like_type =
- std::integral_constant<bool, is_base_binary_type<T>::value && T::is_utf8>;
+ std::integral_constant<bool, (is_base_binary_type<T>::value && T::is_utf8) ||
+ is_string_view_type<T>::value>;
template <typename T, typename R = void>
using enable_if_string_like = enable_if_t<is_string_like_type<T>::value, R>;
@@ -630,10 +674,9 @@ template <typename T, typename R = void>
using enable_if_fixed_width_type = enable_if_t<is_fixed_width_type<T>::value, R>;
template <typename T>
-using is_binary_like_type =
- std::integral_constant<bool, (is_base_binary_type<T>::value &&
- !is_string_like_type<T>::value) ||
- is_fixed_size_binary_type<T>::value>;
+using is_binary_like_type = std::integral_constant<
+ bool, (is_base_binary_type<T>::value && !is_string_like_type<T>::value) ||
+ is_binary_view_type<T>::value || is_fixed_size_binary_type<T>::value>;
template <typename T, typename R = void>
using enable_if_binary_like = enable_if_t<is_binary_like_type<T>::value, R>;
@@ -786,8 +829,10 @@ using enable_if_has_c_type = enable_if_t<has_c_type<T>::value, R>;
template <typename T>
using has_string_view =
std::integral_constant<bool, std::is_same<BinaryType, T>::value ||
- std::is_same<LargeBinaryType, T>::value ||
+ std::is_same<BinaryViewType, T>::value ||
+ std::is_same<LargeBinaryType, T>::value ||
std::is_same<StringType, T>::value ||
+ std::is_same<StringViewType, T>::value ||
std::is_same<LargeStringType, T>::value ||
std::is_same<FixedSizeBinaryType, T>::value>;
diff --git a/cpp/src/arrow/util/string_header.h b/cpp/src/arrow/util/string_header.h
new file mode 100644
index 0000000000..29f378a580
--- /dev/null
+++ b/cpp/src/arrow/util/string_header.h
@@ -0,0 +1,219 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <ostream>
+#include <string>
+#include <string_view>
+
+namespace arrow {
+
+// Variable length string or binary with 4 byte prefix and inline optimization
+// for small values (12 bytes or fewer). This is similar to std::string_view
+// except that the referenced data is limited in size to UINT32_MAX and up to
+// the first four bytes of the string are copied into the struct. The prefix
+// allows failing comparisons early and can reduce the CPU cache working set
+// when dealing with short strings.
+//
+// Short string   |----|----|--------|
+//                 ^    ^      ^
+//                 |    |      |
+//                 size prefix remaining in-line portion
+//
+// Long string    |----|----|--------|
+//                 ^    ^      ^
+//                 |    |      |
+//                 size prefix pointer to out-of-line portion
+//
+// A long value is never copied: the header only stores the pointer it was
+// given, so the caller must keep the referenced bytes alive.
+//
+// Adapted from TU Munich's UmbraDB [1], Velox, DuckDB.
+//
+// [1]: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf
+struct StringHeader {
+ public:
+  using value_type = char;
+
+  // Number of leading bytes that are always stored in the header itself.
+  static constexpr size_t kPrefixSize = 4;
+  // Longest value (in bytes) that is stored entirely inside the header.
+  static constexpr size_t kInlineSize = 12;
+
+  // Constructs an empty value; every byte of the header is zero.
+  StringHeader() : size_(0) {
+    static_assert(sizeof(StringHeader) == 16, "struct expected by exactly 16 bytes");
+    ZeroPayload();
+  }
+
+  // Constructs a header that records only a size; prefix and payload are
+  // zeroed. For size > kInlineSize the data pointer is null, so the result
+  // must not be read or compared until it has been filled in by other means.
+  explicit StringHeader(uint32_t size) : size_(size) { ZeroPayload(); }
+
+  // Constructs a header for the `len` bytes starting at `data`. Values of at
+  // most kInlineSize bytes are copied into the header; longer values store the
+  // first kPrefixSize bytes plus the pointer `data` itself.
+  StringHeader(const char* data, size_t len) : size_(static_cast<uint32_t>(len)) {
+    // TODO: better option than assert?
+    assert(data || len == 0);
+    // Lengths are stored in 32 bits; anything larger would be truncated.
+    assert(len <= UINT32_MAX);
+    // Zero prefix and payload unconditionally (also for the empty value) so
+    // that every header has a fully defined 16-byte representation and inline
+    // values can be compared a whole word at a time.
+    ZeroPayload();
+    if (size_ == 0) {
+      return;
+    }
+    if (IsInline()) {
+      // Small string: copy it in two parts so that neither memcpy writes past
+      // the member array it targets.
+      const size_t head = size_ < kPrefixSize ? size_ : kPrefixSize;
+      memcpy(prefix_, data, head);
+      memcpy(value_.inlined, data + head, size_ - head);
+    } else {
+      // Large string: keep the prefix and a pointer to the caller's bytes.
+      memcpy(prefix_, data, kPrefixSize);
+      value_.data = data;
+    }
+  }
+
+  // Same as above for unsigned bytes with a signed length; `len` must be
+  // non-negative.
+  StringHeader(const uint8_t* data, int64_t len)
+      : StringHeader(reinterpret_cast<const char*>(data), static_cast<size_t>(len)) {}
+
+  // Making StringHeader implicitly constructible/convertible from char* and
+  // string literals, in order to allow for a more flexible API and optional
+  // interoperability. E.g:
+  //
+  //   StringHeader bh = "literal";
+  //   std::optional<StringHeader> obh = "literal";
+  //
+  /* implicit */ StringHeader(const char* data) : StringHeader(data, strlen(data)) {}
+
+  explicit StringHeader(const std::string& value)
+      : StringHeader(value.data(), value.size()) {}
+
+  explicit StringHeader(const std::string_view& value)
+      : StringHeader(value.data(), value.size()) {}
+
+  // True if the value is stored entirely inside the header.
+  bool IsInline() const { return IsInline(size_); }
+
+  static constexpr bool IsInline(uint32_t size) { return size <= kInlineSize; }
+
+  // Pointer to the first byte of the value. For inline values this points at
+  // `prefix_`; the bytes continue into `value_.inlined`, which relies on the
+  // two members being contiguous (see the layout note at the bottom).
+  const char* data() const { return IsInline() ? prefix_ : value_.data; }
+
+  size_t size() const { return size_; }
+
+  size_t capacity() const { return size_; }
+
+  friend std::ostream& operator<<(std::ostream& os, const StringHeader& header) {
+    os.write(header.data(), static_cast<std::streamsize>(header.size()));
+    return os;
+  }
+
+  bool operator==(const StringHeader& other) const {
+    // Compare lengths and first 4 characters. memcmp is used instead of
+    // reading the object through an int64_t pointer, which would violate
+    // strict aliasing; compilers lower it to the same word-sized compare.
+    if (size_ != other.size_ || memcmp(prefix_, other.prefix_, kPrefixSize) != 0) {
+      return false;
+    }
+    if (IsInline()) {
+      // The inline part is zeroed at construction, so the whole 8-byte tail
+      // can be compared at once when data extends past 'prefix_'.
+      return size_ <= kPrefixSize ||
+             memcmp(value_.inlined, other.value_.inlined, sizeof(value_.inlined)) == 0;
+    }
+    // Sizes are equal and this is not inline, therefore both are out
+    // of line and have kPrefixSize first in common.
+    return memcmp(value_.data + kPrefixSize, other.value_.data + kPrefixSize,
+                  size_ - kPrefixSize) == 0;
+  }
+
+  bool operator!=(const StringHeader& other) const { return !(*this == other); }
+
+  // Three-way bytewise comparison. Returns
+  //   0, if this == other
+  // < 0, if this < other
+  // > 0, if this > other
+  // Only the sign of the result is meaningful.
+  int32_t Compare(const StringHeader& other) const {
+    // The prefix is zero padded, so differing prefixes decide the result.
+    const int prefix_cmp = memcmp(prefix_, other.prefix_, kPrefixSize);
+    if (prefix_cmp != 0) {
+      return prefix_cmp;
+    }
+    // All length arithmetic stays unsigned: narrowing to int32_t would
+    // misorder values whose lengths differ by, or exceed, 2 GiB.
+    const uint32_t common = size_ < other.size_ ? size_ : other.size_;
+    if (common > kPrefixSize) {
+      const int tail_cmp = memcmp(Tail(), other.Tail(), common - kPrefixSize);
+      if (tail_cmp != 0) {
+        return tail_cmp;
+      }
+    }
+    // One value is a prefix of the other (or they are equal): the shorter
+    // one orders first.
+    return (size_ > other.size_) - (size_ < other.size_);
+  }
+
+  bool operator<(const StringHeader& other) const { return Compare(other) < 0; }
+
+  bool operator<=(const StringHeader& other) const { return Compare(other) <= 0; }
+
+  bool operator>(const StringHeader& other) const { return Compare(other) > 0; }
+
+  bool operator>=(const StringHeader& other) const { return Compare(other) >= 0; }
+
+  // Copies the referenced bytes into an owning std::string.
+  operator std::string() const { return std::string(data(), size()); }
+
+  std::string GetString() const { return *this; }
+
+  // Non-owning view of the referenced bytes. For an inline value the view
+  // points into this header and is only valid while the header is alive.
+  explicit operator std::string_view() const { return std::string_view(data(), size()); }
+
+  const char* begin() const { return data(); }
+
+  const char* end() const { return data() + size(); }
+
+  bool empty() const { return size() == 0; }
+
+ private:
+  // Zeroes the 12 bytes following `size_`.
+  void ZeroPayload() {
+    memset(prefix_, 0, sizeof(prefix_));
+    memset(&value_, 0, sizeof(value_));
+  }
+
+  // Pointer to the bytes that follow the first kPrefixSize bytes of the value.
+  const char* Tail() const {
+    return IsInline() ? value_.inlined : value_.data + kPrefixSize;
+  }
+
+  // We rely on all members being laid out top to bottom, without padding.
+  // C++ guarantees the order; the static_assert on the total size in the
+  // default constructor guards against padding.
+  uint32_t size_;
+  char prefix_[4];
+  union {
+    char inlined[8];
+    const char* data;
+  } value_;
+};
+
+} // namespace arrow
diff --git a/cpp/src/arrow/visitor.cc b/cpp/src/arrow/visitor.cc
index d22efc942e..03381a08a7 100644
--- a/cpp/src/arrow/visitor.cc
+++ b/cpp/src/arrow/visitor.cc
@@ -45,8 +45,10 @@ ARRAY_VISITOR_DEFAULT(UInt64Array)
ARRAY_VISITOR_DEFAULT(HalfFloatArray)
ARRAY_VISITOR_DEFAULT(FloatArray)
ARRAY_VISITOR_DEFAULT(DoubleArray)
-ARRAY_VISITOR_DEFAULT(BinaryArray)
ARRAY_VISITOR_DEFAULT(StringArray)
+ARRAY_VISITOR_DEFAULT(StringViewArray)
+ARRAY_VISITOR_DEFAULT(BinaryArray)
+ARRAY_VISITOR_DEFAULT(BinaryViewArray)
ARRAY_VISITOR_DEFAULT(LargeBinaryArray)
ARRAY_VISITOR_DEFAULT(LargeStringArray)
ARRAY_VISITOR_DEFAULT(FixedSizeBinaryArray)
@@ -95,7 +97,9 @@ TYPE_VISITOR_DEFAULT(HalfFloatType)
TYPE_VISITOR_DEFAULT(FloatType)
TYPE_VISITOR_DEFAULT(DoubleType)
TYPE_VISITOR_DEFAULT(StringType)
+TYPE_VISITOR_DEFAULT(StringViewType)
TYPE_VISITOR_DEFAULT(BinaryType)
+TYPE_VISITOR_DEFAULT(BinaryViewType)
TYPE_VISITOR_DEFAULT(LargeStringType)
TYPE_VISITOR_DEFAULT(LargeBinaryType)
TYPE_VISITOR_DEFAULT(FixedSizeBinaryType)
@@ -145,7 +149,9 @@ SCALAR_VISITOR_DEFAULT(HalfFloatScalar)
SCALAR_VISITOR_DEFAULT(FloatScalar)
SCALAR_VISITOR_DEFAULT(DoubleScalar)
SCALAR_VISITOR_DEFAULT(StringScalar)
+SCALAR_VISITOR_DEFAULT(StringViewScalar)
SCALAR_VISITOR_DEFAULT(BinaryScalar)
+SCALAR_VISITOR_DEFAULT(BinaryViewScalar)
SCALAR_VISITOR_DEFAULT(LargeStringScalar)
SCALAR_VISITOR_DEFAULT(LargeBinaryScalar)
SCALAR_VISITOR_DEFAULT(FixedSizeBinaryScalar)
diff --git a/cpp/src/arrow/visitor.h b/cpp/src/arrow/visitor.h
index 7f83c9ebab..58330de9d0 100644
--- a/cpp/src/arrow/visitor.h
+++ b/cpp/src/arrow/visitor.h
@@ -45,7 +45,9 @@ class ARROW_EXPORT ArrayVisitor {
virtual Status Visit(const FloatArray& array);
virtual Status Visit(const DoubleArray& array);
virtual Status Visit(const StringArray& array);
+ virtual Status Visit(const StringViewArray& array);
virtual Status Visit(const BinaryArray& array);
+ virtual Status Visit(const BinaryViewArray& array);
virtual Status Visit(const LargeStringArray& array);
virtual Status Visit(const LargeBinaryArray& array);
virtual Status Visit(const FixedSizeBinaryArray& array);
@@ -93,7 +95,9 @@ class ARROW_EXPORT TypeVisitor {
virtual Status Visit(const FloatType& type);
virtual Status Visit(const DoubleType& type);
virtual Status Visit(const StringType& type);
+ virtual Status Visit(const StringViewType& type);
virtual Status Visit(const BinaryType& type);
+ virtual Status Visit(const BinaryViewType& type);
virtual Status Visit(const LargeStringType& type);
virtual Status Visit(const LargeBinaryType& type);
virtual Status Visit(const FixedSizeBinaryType& type);
@@ -141,7 +145,9 @@ class ARROW_EXPORT ScalarVisitor {
virtual Status Visit(const FloatScalar& scalar);
virtual Status Visit(const DoubleScalar& scalar);
virtual Status Visit(const StringScalar& scalar);
+ virtual Status Visit(const StringViewScalar& scalar);
virtual Status Visit(const BinaryScalar& scalar);
+ virtual Status Visit(const BinaryViewScalar& scalar);
virtual Status Visit(const LargeStringScalar& scalar);
virtual Status Visit(const LargeBinaryScalar& scalar);
virtual Status Visit(const FixedSizeBinaryScalar& scalar);
diff --git a/cpp/src/arrow/visitor_generate.h b/cpp/src/arrow/visitor_generate.h
index 265c76197a..2c267576ca 100644
--- a/cpp/src/arrow/visitor_generate.h
+++ b/cpp/src/arrow/visitor_generate.h
@@ -40,7 +40,9 @@ namespace arrow {
ACTION(Boolean); \
ARROW_GENERATE_FOR_ALL_NUMERIC_TYPES(ACTION); \
ACTION(String); \
+ ACTION(StringView); \
ACTION(Binary); \
+ ACTION(BinaryView); \
ACTION(LargeString); \
ACTION(LargeBinary); \
ACTION(FixedSizeBinary); \
diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc
index f7898c02d4..e62e34abb0 100644
--- a/cpp/src/parquet/column_writer.cc
+++ b/cpp/src/parquet/column_writer.cc
@@ -129,6 +129,7 @@ struct ValueBufferSlicer {
NOT_IMPLEMENTED_VISIT(FixedSizeList);
NOT_IMPLEMENTED_VISIT(Dictionary);
NOT_IMPLEMENTED_VISIT(Extension);
+ NOT_IMPLEMENTED_VISIT(BinaryView);
#undef NOT_IMPLEMENTED_VISIT
diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
index f3cee6c65e..7e48f09889 100644
--- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
+++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
@@ -116,39 +116,21 @@ void BufferCapsule_Destructor(PyObject* capsule) {
using internal::arrow_traits;
using internal::npy_traits;
-template <typename T>
+template <typename T, typename Enable = void>
struct WrapBytes {};
-template <>
-struct WrapBytes<StringType> {
- static inline PyObject* Wrap(const char* data, int64_t length) {
- return PyUnicode_FromStringAndSize(data, length);
- }
-};
-
-template <>
-struct WrapBytes<LargeStringType> {
+template <typename T>
+struct WrapBytes<T, enable_if_t<is_string_type<T>::value ||
+ is_string_view_type<T>::value>> {
static inline PyObject* Wrap(const char* data, int64_t length) {
return PyUnicode_FromStringAndSize(data, length);
}
};
-template <>
-struct WrapBytes<BinaryType> {
- static inline PyObject* Wrap(const char* data, int64_t length) {
- return PyBytes_FromStringAndSize(data, length);
- }
-};
-
-template <>
-struct WrapBytes<LargeBinaryType> {
- static inline PyObject* Wrap(const char* data, int64_t length) {
- return PyBytes_FromStringAndSize(data, length);
- }
-};
-
-template <>
-struct WrapBytes<FixedSizeBinaryType> {
+template <typename T>
+struct WrapBytes<T, enable_if_t<is_binary_type<T>::value ||
+ is_binary_view_type<T>::value ||
+ is_fixed_size_binary_type<T>::value>> {
static inline PyObject* Wrap(const char* data, int64_t length) {
return PyBytes_FromStringAndSize(data, length);
}
@@ -1026,7 +1008,9 @@ struct ObjectWriterVisitor {
}
template <typename Type>
- enable_if_t<is_base_binary_type<Type>::value || is_fixed_size_binary_type<Type>::value,
+ enable_if_t<is_base_binary_type<Type>::value ||
+ is_binary_view_like_type<Type>::value ||
+ is_fixed_size_binary_type<Type>::value,
Status>
Visit(const Type& type) {
auto WrapValue = [](const std::string_view& view, PyObject** out) {
diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc
index 9e7f07ef81..3ffff8cf19 100644
--- a/python/pyarrow/src/arrow/python/python_to_arrow.cc
+++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc
@@ -479,13 +479,17 @@ class PyValue {
// The binary-like intermediate representation is PyBytesView because it keeps temporary
// python objects alive (non-contiguous memoryview) and stores whether the original
- // object was unicode encoded or not, which is used for unicode -> bytes coersion if
+ // object was unicode encoded or not, which is used for unicode -> bytes coercion if
// there is a non-unicode object observed.
static Status Convert(const BaseBinaryType*, const O&, I obj, PyBytesView& view) {
return view.ParseString(obj);
}
+ // Binary/string view values use the same PyBytesView intermediate as the
+ // BaseBinaryType overload above: the Python object is parsed into `view`.
+ static Status Convert(const BinaryViewType*, const O&, I obj, PyBytesView& view) {
+ return view.ParseString(obj);
+ }
+
static Status Convert(const FixedSizeBinaryType* type, const O&, I obj,
PyBytesView& view) {
ARROW_RETURN_NOT_OK(view.ParseString(obj));
@@ -672,12 +676,9 @@ class PyPrimitiveConverter<T, enable_if_t<std::is_same<T, FixedSizeBinaryType>::
PyBytesView view_;
};
-template <typename T>
-class PyPrimitiveConverter<T, enable_if_base_binary<T>>
- : public PrimitiveConverter<T, PyConverter> {
+template <typename T, typename OffsetType>
+class PyBinaryConverter : public PrimitiveConverter<T, PyConverter> {
public:
- using OffsetType = typename T::offset_type;
-
Status Append(PyObject* value) override {
if (PyValue::IsNull(this->options_, value)) {
this->primitive_builder_->UnsafeAppendNull();
@@ -701,7 +702,7 @@ class PyPrimitiveConverter<T, enable_if_base_binary<T>>
Result<std::shared_ptr<Array>> ToArray() override {
ARROW_ASSIGN_OR_RAISE(auto array, (PrimitiveConverter<T, PyConverter>::ToArray()));
if (observed_binary_) {
- // if we saw any non-unicode, cast results to BinaryArray
+ // if we saw any non-unicode, cast results to BinaryArray/BinaryViewArray
auto binary_type = TypeTraits<typename T::PhysicalType>::type_singleton();
return array->View(binary_type);
} else {
@@ -714,6 +715,14 @@ class PyPrimitiveConverter<T, enable_if_base_binary<T>>
bool observed_binary_ = false;
};
+// Offset-based binary/string types: reuse PyBinaryConverter with the type's
+// own offset_type.
+template <typename T>
+class PyPrimitiveConverter<T, enable_if_base_binary<T>>
+ : public PyBinaryConverter<T, typename T::offset_type> {};
+
+// Binary/string view types declare no offset_type, so int64_t is supplied as
+// the OffsetType argument instead.
+// NOTE(review): confirm that int64_t is the intended bound wherever
+// PyBinaryConverter uses OffsetType -- StringHeader stores a 32-bit size.
+template <typename T>
+class PyPrimitiveConverter<T, enable_if_binary_view_like<T>>
+ : public PyBinaryConverter<T, int64_t> {};
+
template <typename U>
class PyDictionaryConverter<U, enable_if_has_c_type<U>>
: public DictionaryConverter<U, PyConverter> {