You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by bk...@apache.org on 2022/11/30 14:50:53 UTC
[arrow] 09/15: fixes in substrait, rename in LICENSE, owning scalars
This is an automated email from the ASF dual-hosted git repository.
bkietz pushed a commit to branch feature/format-string-view
in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 94fcb95927b1edc0c9805b674e4df6c5a31f9277
Author: Benjamin Kietzman <be...@gmail.com>
AuthorDate: Tue Nov 15 14:36:15 2022 -0500
fixes in substrait, rename in LICENSE, owning scalars
---
LICENSE.txt | 2 +-
cpp/src/arrow/array/builder_binary.h | 3 +-
.../arrow/engine/substrait/expression_internal.cc | 9 ++++
cpp/src/arrow/engine/substrait/type_internal.cc | 7 +++
cpp/src/arrow/scalar.cc | 9 ----
cpp/src/arrow/scalar.h | 32 ++++--------
cpp/src/arrow/util/string_header.h | 60 ++++++++++------------
7 files changed, 55 insertions(+), 67 deletions(-)
diff --git a/LICENSE.txt b/LICENSE.txt
index d282bfe7b3..02ac840980 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -2049,7 +2049,7 @@ License: http://www.apache.org/licenses/LICENSE-2.0
This project includes code from Velox.
- * cpp/src/arrow/util/bytes_header.h
+ * cpp/src/arrow/util/string_header.h
is based on Velox's
diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h
index ca5209b81e..b9d926cb16 100644
--- a/cpp/src/arrow/array/builder_binary.h
+++ b/cpp/src/arrow/array/builder_binary.h
@@ -504,8 +504,7 @@ class ARROW_EXPORT StringHeapBuilder {
/// UnsafeAppend operations without the need to allocate more memory
Status Reserve(int64_t num_bytes) {
if (num_bytes > current_remaining_bytes_) {
- current_remaining_bytes_ =
- num_bytes > kDefaultBlocksize ? num_bytes : kDefaultBlocksize;
+ current_remaining_bytes_ = num_bytes > blocksize_ ? num_bytes : blocksize_;
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> new_block,
AllocateBuffer(current_remaining_bytes_, pool_));
current_out_buffer_ = new_block->mutable_data();
diff --git a/cpp/src/arrow/engine/substrait/expression_internal.cc b/cpp/src/arrow/engine/substrait/expression_internal.cc
index b8b545febc..19ec354cf6 100644
--- a/cpp/src/arrow/engine/substrait/expression_internal.cc
+++ b/cpp/src/arrow/engine/substrait/expression_internal.cc
@@ -606,6 +606,15 @@ struct ScalarToProtoImpl {
s);
}
+ Status Visit(const StringViewScalar& s) {
+ return FromBuffer([](Lit* lit, std::string&& s) { lit->set_string(std::move(s)); },
+ s);
+ }
+ Status Visit(const BinaryViewScalar& s) {
+ return FromBuffer([](Lit* lit, std::string&& s) { lit->set_binary(std::move(s)); },
+ s);
+ }
+
Status Visit(const FixedSizeBinaryScalar& s) {
return FromBuffer(
[](Lit* lit, std::string&& s) { lit->set_fixed_binary(std::move(s)); }, s);
diff --git a/cpp/src/arrow/engine/substrait/type_internal.cc b/cpp/src/arrow/engine/substrait/type_internal.cc
index 16032df67d..e6b0b41a14 100644
--- a/cpp/src/arrow/engine/substrait/type_internal.cc
+++ b/cpp/src/arrow/engine/substrait/type_internal.cc
@@ -256,6 +256,13 @@ struct DataTypeToProtoImpl {
return SetWith(&::substrait::Type::set_allocated_binary);
}
+ Status Visit(const StringViewType& t) {
+ return SetWith(&::substrait::Type::set_allocated_string);
+ }
+ Status Visit(const BinaryViewType& t) {
+ return SetWith(&::substrait::Type::set_allocated_binary);
+ }
+
Status Visit(const FixedSizeBinaryType& t) {
SetWithThen(&::substrait::Type::set_allocated_fixed_binary)
->set_length(t.byte_width());
diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc
index d139845bd7..bfe8a49a9e 100644
--- a/cpp/src/arrow/scalar.cc
+++ b/cpp/src/arrow/scalar.cc
@@ -70,12 +70,6 @@ struct ScalarHashImpl {
Status Visit(const BaseBinaryScalar& s) { return BufferHash(*s.value); }
- Status Visit(const BinaryViewScalar& s) {
- const StringHeader& v = s.value;
- hash_ ^= internal::ComputeStringHash<1>(v.data(), v.size());
- return Status::OK();
- }
-
template <typename T>
Status Visit(const TemporalScalar<T>& s) {
return ValueHash(s);
@@ -508,9 +502,6 @@ Status Scalar::ValidateFull() const {
BinaryScalar::BinaryScalar(std::string s)
: BinaryScalar(Buffer::FromString(std::move(s))) {}
-StringScalar::StringScalar(std::string s)
- : StringScalar(Buffer::FromString(std::move(s))) {}
-
LargeBinaryScalar::LargeBinaryScalar(std::string s)
: LargeBinaryScalar(Buffer::FromString(std::move(s))) {}
diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h
index 9b7f604132..9f41ad0975 100644
--- a/cpp/src/arrow/scalar.h
+++ b/cpp/src/arrow/scalar.h
@@ -251,7 +251,6 @@ struct ARROW_EXPORT BaseBinaryScalar : public internal::PrimitiveScalarBase {
return value ? std::string_view(*value) : std::string_view();
}
- protected:
BaseBinaryScalar(std::shared_ptr<Buffer> value, std::shared_ptr<DataType> type)
: internal::PrimitiveScalarBase{std::move(type), true}, value(std::move(value)) {}
};
@@ -260,9 +259,6 @@ struct ARROW_EXPORT BinaryScalar : public BaseBinaryScalar {
using BaseBinaryScalar::BaseBinaryScalar;
using TypeClass = BinaryType;
- BinaryScalar(std::shared_ptr<Buffer> value, std::shared_ptr<DataType> type)
- : BaseBinaryScalar(std::move(value), std::move(type)) {}
-
explicit BinaryScalar(std::shared_ptr<Buffer> value)
: BinaryScalar(std::move(value), binary()) {}
@@ -278,37 +274,29 @@ struct ARROW_EXPORT StringScalar : public BinaryScalar {
explicit StringScalar(std::shared_ptr<Buffer> value)
: StringScalar(std::move(value), utf8()) {}
- explicit StringScalar(std::string s);
-
StringScalar() : StringScalar(utf8()) {}
};
-struct ARROW_EXPORT BinaryViewScalar : public internal::PrimitiveScalarBase {
- using internal::PrimitiveScalarBase::PrimitiveScalarBase;
+struct ARROW_EXPORT BinaryViewScalar : public BaseBinaryScalar {
+ using BaseBinaryScalar::BaseBinaryScalar;
using TypeClass = BinaryViewType;
- explicit BinaryViewScalar(StringHeader value, std::shared_ptr<DataType> type)
- : internal::PrimitiveScalarBase(std::move(type), true), value(value) {}
-
- explicit BinaryViewScalar(StringHeader value)
- : BinaryViewScalar(value, binary_view()) {}
-
- BinaryViewScalar() : internal::PrimitiveScalarBase(binary_view(), false) {}
-
- void* mutable_data() override { return reinterpret_cast<void*>(&this->value); }
+ explicit BinaryViewScalar(std::shared_ptr<Buffer> value)
+ : BinaryViewScalar(std::move(value), binary_view()) {}
- std::string_view view() const override { return std::string_view(this->value); }
+ BinaryViewScalar() : BinaryViewScalar(binary_view()) {}
- StringHeader value;
+ std::string_view view() const override { return std::string_view(*this->value); }
};
struct ARROW_EXPORT StringViewScalar : public BinaryViewScalar {
+ using BinaryViewScalar::BinaryViewScalar;
using TypeClass = StringViewType;
- explicit StringViewScalar(StringHeader value)
- : BinaryViewScalar(std::move(value), utf8_view()) {}
+ explicit StringViewScalar(std::shared_ptr<Buffer> value)
+ : StringViewScalar(std::move(value), utf8_view()) {}
- StringViewScalar() : BinaryViewScalar(utf8_view()) {}
+ StringViewScalar() : StringViewScalar(utf8_view()) {}
};
struct ARROW_EXPORT LargeBinaryScalar : public BaseBinaryScalar {
diff --git a/cpp/src/arrow/util/string_header.h b/cpp/src/arrow/util/string_header.h
index 29f378a580..8ba18a8366 100644
--- a/cpp/src/arrow/util/string_header.h
+++ b/cpp/src/arrow/util/string_header.h
@@ -33,6 +33,7 @@
#pragma once
+#include <array>
#include <cassert>
#include <cstdint>
#include <cstring>
@@ -69,35 +70,27 @@ struct StringHeader {
static constexpr size_t kPrefixSize = 4;
static constexpr size_t kInlineSize = 12;
- StringHeader() {
- static_assert(sizeof(StringHeader) == 16, "struct expected by exactly 16 bytes");
- ;
- memset(this, 0, sizeof(StringHeader));
- }
+ StringHeader() = default;
- explicit StringHeader(uint32_t size) : size_(size) {
- memset(prefix_, 0, kPrefixSize);
- value_.data = nullptr;
+ static StringHeader makeInline(uint32_t size, char** data) {
+ assert(size <= kInlineSize);
+ StringHeader s;
+ s.size_ = size;
+ *data = const_cast<char*>(s.data());
+ return s;
}
- StringHeader(const char* data, size_t len) : size_(len) {
+ StringHeader(const char* data, size_t len) : size_(static_cast<uint32_t>(len)) {
+ if (size_ == 0) return;
+
// TODO: better option than assert?
- assert(data || size_ == 0);
+ assert(data);
if (IsInline()) {
- // Zero the inline part.
- // this makes sure that inline strings can be compared for equality with 2
- // int64 compares.
- memset(prefix_, 0, kPrefixSize);
- if (size_ == 0) {
- return;
- }
- // small string: inlined. Zero the last 8 bytes first to allow for whole
- // word comparison.
- value_.data = nullptr;
- memcpy(prefix_, data, size_);
+ // small string: inlined. Bytes beyond size_ are already 0
+ memcpy(prefix_.data(), data, size_);
} else {
// large string: store pointer
- memcpy(prefix_, data, kPrefixSize);
+ memcpy(prefix_.data(), data, kPrefixSize);
value_.data = data;
}
}
@@ -112,19 +105,20 @@ struct StringHeader {
// StringHeader bh = "literal";
// std::optional<BytesView> obh = "literal";
//
- /* implicit */ StringHeader(const char* data) : StringHeader(data, strlen(data)) {}
+ // NOLINTNEXTLINE runtime/explicit
+ StringHeader(const char* data) : StringHeader(data, strlen(data)) {}
explicit StringHeader(const std::string& value)
: StringHeader(value.data(), value.size()) {}
- explicit StringHeader(const std::string_view& value)
+ explicit StringHeader(std::string_view value)
: StringHeader(value.data(), value.size()) {}
bool IsInline() const { return IsInline(size_); }
static constexpr bool IsInline(uint32_t size) { return size <= kInlineSize; }
- const char* data() const { return IsInline() ? prefix_ : value_.data; }
+ const char* data() const { return IsInline() ? prefix_.data() : value_.data; }
size_t size() const { return size_; }
@@ -160,7 +154,7 @@ struct StringHeader {
if (PrefixAsInt() != other.PrefixAsInt()) {
// The result is decided on prefix. The shorter will be less
// because the prefix is padded with zeros.
- return memcmp(prefix_, other.prefix_, kPrefixSize);
+ return memcmp(prefix_.data(), other.prefix_.data(), kPrefixSize);
}
int32_t size = std::min(size_, other.size_) - kPrefixSize;
if (size <= 0) {
@@ -168,7 +162,7 @@ struct StringHeader {
return size_ - other.size_;
}
if (static_cast<uint32_t>(size) <= kInlineSize && IsInline() && other.IsInline()) {
- int32_t result = memcmp(value_.inlined, other.value_.inlined, size);
+ int32_t result = memcmp(value_.inlined.data(), other.value_.inlined.data(), size);
return (result != 0) ? result : size_ - other.size_;
}
int32_t result = memcmp(data() + kPrefixSize, other.data() + kPrefixSize, size);
@@ -183,9 +177,7 @@ struct StringHeader {
bool operator>=(const StringHeader& other) const { return Compare(other) >= 0; }
- operator std::string() const { return std::string(data(), size()); }
-
- std::string GetString() const { return *this; }
+ std::string GetString() const { return std::string(data(), size()); }
explicit operator std::string_view() const { return std::string_view(data(), size()); }
@@ -208,12 +200,14 @@ struct StringHeader {
// We rely on all members being laid out top to bottom . C++
// guarantees this.
- uint32_t size_;
- char prefix_[4];
+ uint32_t size_ = 0;
+ std::array<char, 4> prefix_ = {0};
union {
- char inlined[8];
+ std::array<char, 8> inlined = {0};
const char* data;
} value_;
};
+static_assert(sizeof(StringHeader) == 16, "struct expected by exactly 16 bytes");
+
} // namespace arrow