You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by bk...@apache.org on 2022/11/30 14:50:56 UTC
[arrow] 12/15: Adding comparison and concatenation
This is an automated email from the ASF dual-hosted git repository.
bkietz pushed a commit to branch feature/format-string-view
in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 04893f65e92f57bba7b8ff0bbc201dfd17ff3aa0
Author: Benjamin Kietzman <be...@gmail.com>
AuthorDate: Fri Nov 18 17:04:47 2022 -0500
Adding comparison and concatenation
---
cpp/src/arrow/array/builder_binary.h | 12 ++++++++----
cpp/src/arrow/array/concatenate.cc | 24 +++++++++++++++++++++++-
cpp/src/arrow/array/concatenate_test.cc | 8 ++++++++
cpp/src/arrow/compare.cc | 8 +++++++-
cpp/src/arrow/testing/random.cc | 14 +++++++++++---
cpp/src/arrow/testing/random.h | 16 ++++++++++++++++
6 files changed, 73 insertions(+), 9 deletions(-)
diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h
index 30ab4b9d4a..ccfcb8b2b2 100644
--- a/cpp/src/arrow/array/builder_binary.h
+++ b/cpp/src/arrow/array/builder_binary.h
@@ -542,9 +542,16 @@ class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder {
public:
using TypeClass = BinaryViewType;
- BinaryViewBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
+ // Provided for MakeBuilder compatibility; the DataType argument is ignored.
+ BinaryViewBuilder(const std::shared_ptr<DataType>&, MemoryPool* pool)
: BinaryViewBuilder(pool) {}
+ explicit BinaryViewBuilder(MemoryPool* pool = default_memory_pool(),
+ int64_t alignment = kDefaultBufferAlignment)
+ : ArrayBuilder(pool, alignment),
+ data_builder_(pool, alignment),
+ data_heap_builder_(pool) {}
+
int64_t current_block_bytes_remaining() const {
return data_heap_builder_.current_remaining_bytes();
}
@@ -683,9 +690,6 @@ class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder {
std::shared_ptr<DataType> type() const override { return binary_view(); }
protected:
- explicit BinaryViewBuilder(MemoryPool* pool = default_memory_pool())
- : ArrayBuilder(pool), data_builder_(pool), data_heap_builder_(pool) {}
-
static constexpr int64_t ValueSizeLimit() {
return std::numeric_limits<uint32_t>::max();
}
diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc
index 3dd0ccea93..6f7d61283e 100644
--- a/cpp/src/arrow/array/concatenate.cc
+++ b/cpp/src/arrow/array/concatenate.cc
@@ -228,7 +228,29 @@ class ConcatenateImpl {
}
Status Visit(const BinaryViewType&) {
- return Status::NotImplemented("binary / string view");
+ bool any_opted_out_of_view_validation = false;
+ out_->buffers.resize(2);
+
+ for (const auto& in_data : in_) {
+ auto begin = in_data->buffers.begin() + 2;
+ auto end = in_data->buffers.end();
+
+ if (BinaryViewArray::OptedOutOfViewValidation(*in_data)) {
+ any_opted_out_of_view_validation = true;
+ --end;
+ }
+
+ for (auto it = begin; it != end; ++it) {
+ out_->buffers.push_back(*it);
+ }
+ }
+
+ if (any_opted_out_of_view_validation) {
+ out_->buffers = BinaryViewArray::DoNotValidateViews(std::move(out_->buffers));
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto header_buffers, Buffers(1, sizeof(StringHeader)));
+ return ConcatenateBuffers(header_buffers, pool_).Value(&out_->buffers[1]);
}
Status Visit(const ListType&) {
diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc
index bff5d7eec1..1bc0c65bec 100644
--- a/cpp/src/arrow/array/concatenate_test.cc
+++ b/cpp/src/arrow/array/concatenate_test.cc
@@ -91,6 +91,7 @@ class ConcatenateTest : public ::testing::Test {
for (auto null_probability : this->null_probabilities_) {
std::shared_ptr<Array> array;
factory(size, null_probability, &array);
+ ASSERT_OK(array->ValidateFull());
auto expected = array->Slice(offsets.front(), offsets.back() - offsets.front());
auto slices = this->Slices(array, offsets);
ASSERT_OK_AND_ASSIGN(auto actual, Concatenate(slices));
@@ -154,6 +155,13 @@ TEST_F(ConcatenateTest, StringType) {
});
}
+TEST_F(ConcatenateTest, StringViewType) {
+ Check([this](int32_t size, double null_probability, std::shared_ptr<Array>* out) {
+ *out = rng_.StringView(size, /*min_length =*/0, /*max_length =*/15, null_probability);
+ ASSERT_OK((**out).ValidateFull());
+ });
+}
+
TEST_F(ConcatenateTest, LargeStringType) {
Check([this](int32_t size, double null_probability, std::shared_ptr<Array>* out) {
*out =
diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc
index 68250f0288..5d1c3294c0 100644
--- a/cpp/src/arrow/compare.cc
+++ b/cpp/src/arrow/compare.cc
@@ -261,7 +261,13 @@ class RangeDataEqualsImpl {
// Also matches StringViewType
Status Visit(const BinaryViewType& type) {
- return Status::NotImplemented("Binary / string view");
+ auto* left_values = left_.GetValues<StringHeader>(1) + left_start_idx_;
+ auto* right_values = right_.GetValues<StringHeader>(1) + right_start_idx_;
+ VisitValidRuns([&](int64_t i, int64_t length) {
+ return std::equal(left_values + i, left_values + i + length,
+ right_values + i, right_values + i + length);
+ });
+ return Status::OK();
}
// Also matches LargeStringType
diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc
index 3213273474..e45e296ff6 100644
--- a/cpp/src/arrow/testing/random.cc
+++ b/cpp/src/arrow/testing/random.cc
@@ -362,13 +362,12 @@ std::shared_ptr<Array> RandomArrayGenerator::Decimal256(std::shared_ptr<DataType
return gen.MakeRandomArray(size, null_probability, alignment, memory_pool);
}
-template <typename TypeClass>
+template <typename TypeClass, typename offset_type = typename TypeClass::offset_type>
static std::shared_ptr<Array> GenerateBinaryArray(RandomArrayGenerator* gen, int64_t size,
int32_t min_length, int32_t max_length,
double null_probability,
int64_t alignment,
MemoryPool* memory_pool) {
- using offset_type = typename TypeClass::offset_type;
using BuilderType = typename TypeTraits<TypeClass>::BuilderType;
using OffsetArrowType = typename CTypeTraits<offset_type>::ArrowType;
using OffsetArrayType = typename TypeTraits<OffsetArrowType>::ArrayType;
@@ -386,7 +385,7 @@ static std::shared_ptr<Array> GenerateBinaryArray(RandomArrayGenerator* gen, int
/*null_probability=*/0);
std::vector<uint8_t> str_buffer(max_length);
- BuilderType builder(memory_pool, alignment);
+ BuilderType builder{memory_pool, alignment};
for (int64_t i = 0; i < size; ++i) {
if (lengths->IsValid(i)) {
@@ -429,6 +428,15 @@ std::shared_ptr<Array> RandomArrayGenerator::BinaryWithRepeats(
return *strings->View(binary());
}
+std::shared_ptr<Array> RandomArrayGenerator::StringView(int64_t size, int32_t min_length,
+ int32_t max_length,
+ double null_probability,
+ int64_t alignment,
+ MemoryPool* memory_pool) {
+ return GenerateBinaryArray<StringViewType, uint32_t>(this, size, min_length, max_length,
+ null_probability, alignment, memory_pool);
+}
+
std::shared_ptr<Array> RandomArrayGenerator::StringWithRepeats(
int64_t size, int64_t unique, int32_t min_length, int32_t max_length,
double null_probability, int64_t alignment, MemoryPool* memory_pool) {
diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h
index b2e3a609a2..5b905896f2 100644
--- a/cpp/src/arrow/testing/random.h
+++ b/cpp/src/arrow/testing/random.h
@@ -367,6 +367,22 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
int64_t alignment = kDefaultBufferAlignment,
MemoryPool* memory_pool = default_memory_pool());
+ /// \brief Generate a random StringViewArray
+ ///
+ /// \param[in] size the size of the array to generate
+ /// \param[in] min_length the lower bound of the string length
+ /// determined by the uniform distribution
+ /// \param[in] max_length the upper bound of the string length
+ /// determined by the uniform distribution
+ /// \param[in] null_probability the probability of a value being null
+ /// \param[in] alignment alignment for memory allocations (in bytes)
+ ///
+ /// \return a generated Array
+ std::shared_ptr<Array> StringView(int64_t size, int32_t min_length, int32_t max_length,
+ double null_probability = 0,
+ int64_t alignment = kDefaultBufferAlignment,
+ MemoryPool* memory_pool = default_memory_pool());
+
/// \brief Generate a random LargeStringArray
///
/// \param[in] size the size of the array to generate