You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by bk...@apache.org on 2022/11/30 14:50:53 UTC

[arrow] 09/15: fixes in substrait, rename in LICENSE, owning scalars

This is an automated email from the ASF dual-hosted git repository.

bkietz pushed a commit to branch feature/format-string-view
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 94fcb95927b1edc0c9805b674e4df6c5a31f9277
Author: Benjamin Kietzman <be...@gmail.com>
AuthorDate: Tue Nov 15 14:36:15 2022 -0500

    fixes in substrait, rename in LICENSE, owning scalars
---
 LICENSE.txt                                        |  2 +-
 cpp/src/arrow/array/builder_binary.h               |  3 +-
 .../arrow/engine/substrait/expression_internal.cc  |  9 ++++
 cpp/src/arrow/engine/substrait/type_internal.cc    |  7 +++
 cpp/src/arrow/scalar.cc                            |  9 ----
 cpp/src/arrow/scalar.h                             | 32 ++++--------
 cpp/src/arrow/util/string_header.h                 | 60 ++++++++++------------
 7 files changed, 55 insertions(+), 67 deletions(-)

diff --git a/LICENSE.txt b/LICENSE.txt
index d282bfe7b3..02ac840980 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -2049,7 +2049,7 @@ License: http://www.apache.org/licenses/LICENSE-2.0
 
 This project includes code from Velox.
 
- * cpp/src/arrow/util/bytes_header.h
+ * cpp/src/arrow/util/string_header.h
 
 is based on Velox's
 
diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h
index ca5209b81e..b9d926cb16 100644
--- a/cpp/src/arrow/array/builder_binary.h
+++ b/cpp/src/arrow/array/builder_binary.h
@@ -504,8 +504,7 @@ class ARROW_EXPORT StringHeapBuilder {
   /// UnsafeAppend operations without the need to allocate more memory
   Status Reserve(int64_t num_bytes) {
     if (num_bytes > current_remaining_bytes_) {
-      current_remaining_bytes_ =
-          num_bytes > kDefaultBlocksize ? num_bytes : kDefaultBlocksize;
+      current_remaining_bytes_ = num_bytes > blocksize_ ? num_bytes : blocksize_;
       ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> new_block,
                             AllocateBuffer(current_remaining_bytes_, pool_));
       current_out_buffer_ = new_block->mutable_data();
diff --git a/cpp/src/arrow/engine/substrait/expression_internal.cc b/cpp/src/arrow/engine/substrait/expression_internal.cc
index b8b545febc..19ec354cf6 100644
--- a/cpp/src/arrow/engine/substrait/expression_internal.cc
+++ b/cpp/src/arrow/engine/substrait/expression_internal.cc
@@ -606,6 +606,15 @@ struct ScalarToProtoImpl {
                       s);
   }
 
+  Status Visit(const StringViewScalar& s) {
+    return FromBuffer([](Lit* lit, std::string&& s) { lit->set_string(std::move(s)); },
+                      s);
+  }
+  Status Visit(const BinaryViewScalar& s) {
+    return FromBuffer([](Lit* lit, std::string&& s) { lit->set_binary(std::move(s)); },
+                      s);
+  }
+
   Status Visit(const FixedSizeBinaryScalar& s) {
     return FromBuffer(
         [](Lit* lit, std::string&& s) { lit->set_fixed_binary(std::move(s)); }, s);
diff --git a/cpp/src/arrow/engine/substrait/type_internal.cc b/cpp/src/arrow/engine/substrait/type_internal.cc
index 16032df67d..e6b0b41a14 100644
--- a/cpp/src/arrow/engine/substrait/type_internal.cc
+++ b/cpp/src/arrow/engine/substrait/type_internal.cc
@@ -256,6 +256,13 @@ struct DataTypeToProtoImpl {
     return SetWith(&::substrait::Type::set_allocated_binary);
   }
 
+  Status Visit(const StringViewType& t) {
+    return SetWith(&::substrait::Type::set_allocated_string);
+  }
+  Status Visit(const BinaryViewType& t) {
+    return SetWith(&::substrait::Type::set_allocated_binary);
+  }
+
   Status Visit(const FixedSizeBinaryType& t) {
     SetWithThen(&::substrait::Type::set_allocated_fixed_binary)
         ->set_length(t.byte_width());
diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc
index d139845bd7..bfe8a49a9e 100644
--- a/cpp/src/arrow/scalar.cc
+++ b/cpp/src/arrow/scalar.cc
@@ -70,12 +70,6 @@ struct ScalarHashImpl {
 
   Status Visit(const BaseBinaryScalar& s) { return BufferHash(*s.value); }
 
-  Status Visit(const BinaryViewScalar& s) {
-    const StringHeader& v = s.value;
-    hash_ ^= internal::ComputeStringHash<1>(v.data(), v.size());
-    return Status::OK();
-  }
-
   template <typename T>
   Status Visit(const TemporalScalar<T>& s) {
     return ValueHash(s);
@@ -508,9 +502,6 @@ Status Scalar::ValidateFull() const {
 BinaryScalar::BinaryScalar(std::string s)
     : BinaryScalar(Buffer::FromString(std::move(s))) {}
 
-StringScalar::StringScalar(std::string s)
-    : StringScalar(Buffer::FromString(std::move(s))) {}
-
 LargeBinaryScalar::LargeBinaryScalar(std::string s)
     : LargeBinaryScalar(Buffer::FromString(std::move(s))) {}
 
diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h
index 9b7f604132..9f41ad0975 100644
--- a/cpp/src/arrow/scalar.h
+++ b/cpp/src/arrow/scalar.h
@@ -251,7 +251,6 @@ struct ARROW_EXPORT BaseBinaryScalar : public internal::PrimitiveScalarBase {
     return value ? std::string_view(*value) : std::string_view();
   }
 
- protected:
   BaseBinaryScalar(std::shared_ptr<Buffer> value, std::shared_ptr<DataType> type)
       : internal::PrimitiveScalarBase{std::move(type), true}, value(std::move(value)) {}
 };
@@ -260,9 +259,6 @@ struct ARROW_EXPORT BinaryScalar : public BaseBinaryScalar {
   using BaseBinaryScalar::BaseBinaryScalar;
   using TypeClass = BinaryType;
 
-  BinaryScalar(std::shared_ptr<Buffer> value, std::shared_ptr<DataType> type)
-      : BaseBinaryScalar(std::move(value), std::move(type)) {}
-
   explicit BinaryScalar(std::shared_ptr<Buffer> value)
       : BinaryScalar(std::move(value), binary()) {}
 
@@ -278,37 +274,29 @@ struct ARROW_EXPORT StringScalar : public BinaryScalar {
   explicit StringScalar(std::shared_ptr<Buffer> value)
       : StringScalar(std::move(value), utf8()) {}
 
-  explicit StringScalar(std::string s);
-
   StringScalar() : StringScalar(utf8()) {}
 };
 
-struct ARROW_EXPORT BinaryViewScalar : public internal::PrimitiveScalarBase {
-  using internal::PrimitiveScalarBase::PrimitiveScalarBase;
+struct ARROW_EXPORT BinaryViewScalar : public BaseBinaryScalar {
+  using BaseBinaryScalar::BaseBinaryScalar;
   using TypeClass = BinaryViewType;
 
-  explicit BinaryViewScalar(StringHeader value, std::shared_ptr<DataType> type)
-      : internal::PrimitiveScalarBase(std::move(type), true), value(value) {}
-
-  explicit BinaryViewScalar(StringHeader value)
-      : BinaryViewScalar(value, binary_view()) {}
-
-  BinaryViewScalar() : internal::PrimitiveScalarBase(binary_view(), false) {}
-
-  void* mutable_data() override { return reinterpret_cast<void*>(&this->value); }
+  explicit BinaryViewScalar(std::shared_ptr<Buffer> value)
+      : BinaryViewScalar(std::move(value), binary_view()) {}
 
-  std::string_view view() const override { return std::string_view(this->value); }
+  BinaryViewScalar() : BinaryViewScalar(binary_view()) {}
 
-  StringHeader value;
+  std::string_view view() const override { return std::string_view(*this->value); }
 };
 
 struct ARROW_EXPORT StringViewScalar : public BinaryViewScalar {
+  using BinaryViewScalar::BinaryViewScalar;
   using TypeClass = StringViewType;
 
-  explicit StringViewScalar(StringHeader value)
-      : BinaryViewScalar(std::move(value), utf8_view()) {}
+  explicit StringViewScalar(std::shared_ptr<Buffer> value)
+      : StringViewScalar(std::move(value), utf8_view()) {}
 
-  StringViewScalar() : BinaryViewScalar(utf8_view()) {}
+  StringViewScalar() : StringViewScalar(utf8_view()) {}
 };
 
 struct ARROW_EXPORT LargeBinaryScalar : public BaseBinaryScalar {
diff --git a/cpp/src/arrow/util/string_header.h b/cpp/src/arrow/util/string_header.h
index 29f378a580..8ba18a8366 100644
--- a/cpp/src/arrow/util/string_header.h
+++ b/cpp/src/arrow/util/string_header.h
@@ -33,6 +33,7 @@
 
 #pragma once
 
+#include <array>
 #include <cassert>
 #include <cstdint>
 #include <cstring>
@@ -69,35 +70,27 @@ struct StringHeader {
   static constexpr size_t kPrefixSize = 4;
   static constexpr size_t kInlineSize = 12;
 
-  StringHeader() {
-    static_assert(sizeof(StringHeader) == 16, "struct expected by exactly 16 bytes");
-    ;
-    memset(this, 0, sizeof(StringHeader));
-  }
+  StringHeader() = default;
 
-  explicit StringHeader(uint32_t size) : size_(size) {
-    memset(prefix_, 0, kPrefixSize);
-    value_.data = nullptr;
+  static StringHeader makeInline(uint32_t size, char** data) {
+    assert(size <= kInlineSize);
+    StringHeader s;
+    s.size_ = size;
+    *data = const_cast<char*>(s.data());
+    return s;
   }
 
-  StringHeader(const char* data, size_t len) : size_(len) {
+  StringHeader(const char* data, size_t len) : size_(static_cast<uint32_t>(len)) {
+    if (size_ == 0) return;
+
     // TODO: better option than assert?
-    assert(data || size_ == 0);
+    assert(data);
     if (IsInline()) {
-      // Zero the inline part.
-      // this makes sure that inline strings can be compared for equality with 2
-      // int64 compares.
-      memset(prefix_, 0, kPrefixSize);
-      if (size_ == 0) {
-        return;
-      }
-      // small string: inlined. Zero the last 8 bytes first to allow for whole
-      // word comparison.
-      value_.data = nullptr;
-      memcpy(prefix_, data, size_);
+      // small string: inlined. Bytes beyond size_ are already 0
+      memcpy(prefix_.data(), data, size_);
     } else {
       // large string: store pointer
-      memcpy(prefix_, data, kPrefixSize);
+      memcpy(prefix_.data(), data, kPrefixSize);
       value_.data = data;
     }
   }
@@ -112,19 +105,20 @@ struct StringHeader {
   //   StringHeader bh = "literal";
   //   std::optional<BytesView> obh = "literal";
   //
-  /* implicit */ StringHeader(const char* data) : StringHeader(data, strlen(data)) {}
+  // NOLINTNEXTLINE runtime/explicit
+  StringHeader(const char* data) : StringHeader(data, strlen(data)) {}
 
   explicit StringHeader(const std::string& value)
       : StringHeader(value.data(), value.size()) {}
 
-  explicit StringHeader(const std::string_view& value)
+  explicit StringHeader(std::string_view value)
       : StringHeader(value.data(), value.size()) {}
 
   bool IsInline() const { return IsInline(size_); }
 
   static constexpr bool IsInline(uint32_t size) { return size <= kInlineSize; }
 
-  const char* data() const { return IsInline() ? prefix_ : value_.data; }
+  const char* data() const { return IsInline() ? prefix_.data() : value_.data; }
 
   size_t size() const { return size_; }
 
@@ -160,7 +154,7 @@ struct StringHeader {
     if (PrefixAsInt() != other.PrefixAsInt()) {
       // The result is decided on prefix. The shorter will be less
       // because the prefix is padded with zeros.
-      return memcmp(prefix_, other.prefix_, kPrefixSize);
+      return memcmp(prefix_.data(), other.prefix_.data(), kPrefixSize);
     }
     int32_t size = std::min(size_, other.size_) - kPrefixSize;
     if (size <= 0) {
@@ -168,7 +162,7 @@ struct StringHeader {
       return size_ - other.size_;
     }
     if (static_cast<uint32_t>(size) <= kInlineSize && IsInline() && other.IsInline()) {
-      int32_t result = memcmp(value_.inlined, other.value_.inlined, size);
+      int32_t result = memcmp(value_.inlined.data(), other.value_.inlined.data(), size);
       return (result != 0) ? result : size_ - other.size_;
     }
     int32_t result = memcmp(data() + kPrefixSize, other.data() + kPrefixSize, size);
@@ -183,9 +177,7 @@ struct StringHeader {
 
   bool operator>=(const StringHeader& other) const { return Compare(other) >= 0; }
 
-  operator std::string() const { return std::string(data(), size()); }
-
-  std::string GetString() const { return *this; }
+  std::string GetString() const { return std::string(data(), size()); }
 
   explicit operator std::string_view() const { return std::string_view(data(), size()); }
 
@@ -208,12 +200,14 @@ struct StringHeader {
 
   // We rely on all members being laid out top to bottom . C++
   // guarantees this.
-  uint32_t size_;
-  char prefix_[4];
+  uint32_t size_ = 0;
+  std::array<char, 4> prefix_ = {0};
   union {
-    char inlined[8];
+    std::array<char, 8> inlined = {0};
     const char* data;
   } value_;
 };
 
+static_assert(sizeof(StringHeader) == 16, "struct expected by exactly 16 bytes");
+
 }  // namespace arrow