You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2022/05/11 16:54:42 UTC
[arrow] branch master updated: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array
This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 9ae12a56b2 ARROW-602: [C++] Provide iterator access to primitive elements inside an Array
9ae12a56b2 is described below
commit 9ae12a56b202a0638a313e26a3f133acfefa4dd4
Author: Alvin Chunga <al...@gmail.com>
AuthorDate: Wed May 11 18:54:31 2022 +0200
ARROW-602: [C++] Provide iterator access to primitive elements inside an Array
Create Iterator method in stl for Array and ChunkedArray
Closes #13009 from AlvinJ15/ARROW-602#Provide_iterator_access_to_primitive_elements_inside_chunked_arrays
Lead-authored-by: Alvin Chunga <al...@gmail.com>
Co-authored-by: Antoine Pitrou <an...@python.org>
Signed-off-by: Antoine Pitrou <an...@python.org>
---
cpp/src/arrow/chunked_array.h | 6 +
cpp/src/arrow/stl_iterator.h | 158 ++++++++++++++++++++
cpp/src/arrow/stl_iterator_test.cc | 297 +++++++++++++++++++++++++++++++++++++
3 files changed, 461 insertions(+)
diff --git a/cpp/src/arrow/chunked_array.h b/cpp/src/arrow/chunked_array.h
index e130a99bbe..956595b117 100644
--- a/cpp/src/arrow/chunked_array.h
+++ b/cpp/src/arrow/chunked_array.h
@@ -36,6 +36,10 @@ namespace arrow {
class Array;
class DataType;
class MemoryPool;
+namespace stl {
+template <typename T, typename V>
+class ChunkedArrayIterator;
+} // namespace stl
/// \class ChunkedArray
/// \brief A data structure managing a list of primitive Arrow arrays logically
@@ -183,6 +187,8 @@ class ARROW_EXPORT ChunkedArray {
int64_t null_count_;
private:
+ template <typename T, typename V>
+ friend class ::arrow::stl::ChunkedArrayIterator;
internal::ChunkResolver chunk_resolver_;
ARROW_DISALLOW_COPY_AND_ASSIGN(ChunkedArray);
};
diff --git a/cpp/src/arrow/stl_iterator.h b/cpp/src/arrow/stl_iterator.h
index 6225a89aae..e1eeb33fba 100644
--- a/cpp/src/arrow/stl_iterator.h
+++ b/cpp/src/arrow/stl_iterator.h
@@ -17,11 +17,15 @@
#pragma once
+#include <cassert>
#include <cstddef>
#include <iterator>
#include <utility>
+#include "arrow/chunked_array.h"
+#include "arrow/type.h"
#include "arrow/type_fwd.h"
+#include "arrow/type_traits.h"
#include "arrow/util/macros.h"
#include "arrow/util/optional.h"
@@ -59,10 +63,12 @@ class ArrayIterator {
// Value access
value_type operator*() const {
+ assert(array_);
return array_->IsNull(index_) ? value_type{} : array_->GetView(index_);
}
value_type operator[](difference_type n) const {
+ assert(array_);
return array_->IsNull(index_ + n) ? value_type{} : array_->GetView(index_ + n);
}
@@ -128,6 +134,148 @@ class ArrayIterator {
int64_t index_;
};
+template <typename ArrayType,
+ typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+ using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+ using difference_type = int64_t;
+ using pointer = value_type*;
+ using reference = value_type&;
+ using iterator_category = std::random_access_iterator_tag;
+
+ // Some algorithms need to default-construct an iterator
+ ChunkedArrayIterator() noexcept : chunked_array_(NULLPTR), index_(0) {}
+
+ explicit ChunkedArrayIterator(const ChunkedArray& chunked_array,
+ int64_t index = 0) noexcept
+ : chunked_array_(&chunked_array), index_(index) {}
+
+ // Value access
+ value_type operator*() const {
+ auto chunk_location = GetChunkLocation(index_);
+ ArrayIterator<ArrayType> target_iterator{
+ arrow::internal::checked_cast<const ArrayType&>(
+ *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index)))};
+ return target_iterator[chunk_location.index_in_chunk];
+ }
+
+ value_type operator[](difference_type n) const { return *(*this + n); }
+
+ int64_t index() const { return index_; }
+
+ // Forward / backward
+ ChunkedArrayIterator& operator++() {
+ (*this) += 1;
+ return *this;
+ }
+ ChunkedArrayIterator& operator--() {
+ (*this) -= 1;
+ return *this;
+ }
+
+ ChunkedArrayIterator operator++(int) {
+ ChunkedArrayIterator tmp(*this);
+ ++*this;
+ return tmp;
+ }
+ ChunkedArrayIterator operator--(int) {
+ ChunkedArrayIterator tmp(*this);
+ --*this;
+ return tmp;
+ }
+
+ // Arithmetic
+ difference_type operator-(const ChunkedArrayIterator& other) const {
+ return index_ - other.index_;
+ }
+ ChunkedArrayIterator operator+(difference_type n) const {
+ assert(chunked_array_);
+ return ChunkedArrayIterator(*chunked_array_, index_ + n);
+ }
+ ChunkedArrayIterator operator-(difference_type n) const {
+ assert(chunked_array_);
+ return ChunkedArrayIterator(*chunked_array_, index_ - n);
+ }
+ friend inline ChunkedArrayIterator operator+(difference_type diff,
+ const ChunkedArrayIterator& other) {
+ assert(other.chunked_array_);
+ return ChunkedArrayIterator(*other.chunked_array_, diff + other.index_);
+ }
+ friend inline ChunkedArrayIterator operator-(difference_type diff,
+ const ChunkedArrayIterator& other) {
+ assert(other.chunked_array_);
+ return ChunkedArrayIterator(*other.chunked_array_, diff - other.index_);
+ }
+ ChunkedArrayIterator& operator+=(difference_type n) {
+ index_ += n;
+ return *this;
+ }
+ ChunkedArrayIterator& operator-=(difference_type n) {
+ (*this) += -n;
+ return *this;
+ }
+
+ // Comparisons
+ bool operator==(const ChunkedArrayIterator& other) const {
+ return index_ == other.index_;
+ }
+ bool operator!=(const ChunkedArrayIterator& other) const {
+ return index_ != other.index_;
+ }
+ bool operator<(const ChunkedArrayIterator& other) const {
+ return index_ < other.index_;
+ }
+ bool operator>(const ChunkedArrayIterator& other) const {
+ return index_ > other.index_;
+ }
+ bool operator<=(const ChunkedArrayIterator& other) const {
+ return index_ <= other.index_;
+ }
+ bool operator>=(const ChunkedArrayIterator& other) const {
+ return index_ >= other.index_;
+ }
+
+ private:
+ arrow::internal::ChunkLocation GetChunkLocation(int64_t index) const {
+ assert(chunked_array_);
+ return chunked_array_->chunk_resolver_.Resolve(index);
+ }
+
+ const ChunkedArray* chunked_array_;
+ int64_t index_;
+};
+
+/// Return an iterator to the beginning of the chunked array
+template <typename Type, typename ArrayType = typename TypeTraits<Type>::ArrayType>
+ChunkedArrayIterator<ArrayType> Begin(const ChunkedArray& chunked_array) {
+ return ChunkedArrayIterator<ArrayType>(chunked_array);
+}
+
+/// Return an iterator to the end of the chunked array
+template <typename Type, typename ArrayType = typename TypeTraits<Type>::ArrayType>
+ChunkedArrayIterator<ArrayType> End(const ChunkedArray& chunked_array) {
+ return ChunkedArrayIterator<ArrayType>(chunked_array, chunked_array.length());
+}
+
+template <typename ArrayType>
+struct ChunkedArrayRange {
+ const ChunkedArray* chunked_array;
+
+ ChunkedArrayIterator<ArrayType> begin() {
+ return stl::ChunkedArrayIterator<ArrayType>(*chunked_array);
+ }
+ ChunkedArrayIterator<ArrayType> end() {
+ return stl::ChunkedArrayIterator<ArrayType>(*chunked_array, chunked_array->length());
+ }
+};
+
+/// Return an iterable range over the chunked array
+template <typename Type, typename ArrayType = typename TypeTraits<Type>::ArrayType>
+ChunkedArrayRange<ArrayType> Iterate(const ChunkedArray& chunked_array) {
+ return stl::ChunkedArrayRange<ArrayType>{&chunked_array};
+}
+
} // namespace stl
} // namespace arrow
@@ -143,4 +291,14 @@ struct iterator_traits<::arrow::stl::ArrayIterator<ArrayType>> {
using iterator_category = typename IteratorType::iterator_category;
};
+template <typename ArrayType>
+struct iterator_traits<::arrow::stl::ChunkedArrayIterator<ArrayType>> {
+ using IteratorType = ::arrow::stl::ChunkedArrayIterator<ArrayType>;
+ using difference_type = typename IteratorType::difference_type;
+ using value_type = typename IteratorType::value_type;
+ using pointer = typename IteratorType::pointer;
+ using reference = typename IteratorType::reference;
+ using iterator_category = typename IteratorType::iterator_category;
+};
+
} // namespace std
diff --git a/cpp/src/arrow/stl_iterator_test.cc b/cpp/src/arrow/stl_iterator_test.cc
index 864011dbfc..d4a011e450 100644
--- a/cpp/src/arrow/stl_iterator_test.cc
+++ b/cpp/src/arrow/stl_iterator_test.cc
@@ -248,5 +248,302 @@ TEST(ArrayIterator, StdMerge) {
ASSERT_EQ(values, expected);
}
+TEST(ChunkedArrayIterator, Basics) {
+ auto result = ChunkedArrayFromJSON(int32(), {R"([4, 5, null])", R"([6])"});
+ auto it = Begin<Int32Type>(*result);
+ optional<int32_t> v = *it;
+ ASSERT_EQ(v, 4);
+ ASSERT_EQ(it[0], 4);
+ ++it;
+ ASSERT_EQ(it[0], 5);
+ ASSERT_EQ(*it, 5);
+ ASSERT_EQ(it[1], nullopt);
+ ASSERT_EQ(it[2], 6);
+}
+
+TEST(ChunkedArrayIterator, Arithmetic) {
+ auto result = ChunkedArrayFromJSON(int32(), {R"([4, 5, null])", R"([6, null, 7])"});
+
+ auto it = Begin<Int32Type>(*result);
+ auto it2 = it + 2;
+ auto end = End<Int32Type>(*result);
+
+ ASSERT_EQ(*it, 4);
+ ASSERT_EQ(*it2, nullopt);
+ ASSERT_EQ(it2 - it, 2);
+ ASSERT_EQ(it - it2, -2);
+ ASSERT_EQ(end - it2, 4);
+ ASSERT_EQ(it2 - end, -4);
+ auto it3 = it++;
+ ASSERT_EQ(it2 - it, 1);
+ ASSERT_EQ(it2 - it3, 2);
+ ASSERT_EQ(*it3, 4);
+ ASSERT_EQ(*it, 5);
+ auto it4 = ++it;
+ ASSERT_EQ(*it, nullopt);
+ ASSERT_EQ(*it4, nullopt);
+ ASSERT_EQ(it2 - it, 0);
+ ASSERT_EQ(it2 - it4, 0);
+ auto it5 = it + 3;
+ ASSERT_EQ(*it5, 7);
+ ASSERT_EQ(*(it5 - 2), 6);
+ ASSERT_EQ(*(it5 + (-2)), 6);
+ auto it6 = (--it5)--;
+ ASSERT_EQ(*it6, nullopt);
+ ASSERT_EQ(*it5, 6);
+ ASSERT_EQ(it6 - it5, 1);
+}
+
+TEST(ChunkedArrayIterator, Comparison) {
+ auto result = ChunkedArrayFromJSON(int32(), {R"([4, 5, null])", R"([6, null, 7])"});
+
+ auto it = Begin<Int32Type>(*result) + 2;
+ auto it2 = Begin<Int32Type>(*result) + 2;
+ auto it3 = Begin<Int32Type>(*result) + 4;
+ auto it4 = Begin<Int32Type>(*result) + 6;
+ auto end = End<Int32Type>(*result);
+
+ ASSERT_TRUE(it == it2);
+ ASSERT_TRUE(it <= it2);
+ ASSERT_TRUE(it >= it2);
+ ASSERT_FALSE(it != it2);
+ ASSERT_FALSE(it < it2);
+ ASSERT_FALSE(it > it2);
+
+ ASSERT_FALSE(it == it3);
+ ASSERT_TRUE(it <= it3);
+ ASSERT_FALSE(it >= it3);
+ ASSERT_TRUE(it != it3);
+ ASSERT_TRUE(it < it3);
+ ASSERT_FALSE(it > it3);
+
+ ASSERT_FALSE(it3 == end);
+ ASSERT_TRUE(it3 <= end);
+ ASSERT_TRUE(it3 < end);
+ ASSERT_TRUE(it4 == end);
+ ASSERT_TRUE(it4 <= end);
+ ASSERT_FALSE(it4 < end);
+}
+
+TEST(ChunkedArrayIterator, MultipleChunks) {
+ auto result =
+ ChunkedArrayFromJSON(int32(), {R"([4, 5, null])", R"([6])", R"([7, 9, 10, 8])",
+ R"([11, 13])", R"([14])", R"([15])", R"([16])"});
+
+ auto it = Begin<Int32Type>(*result);
+ auto end = End<Int32Type>(*result);
+
+ ASSERT_EQ(it[8], 11);
+ ASSERT_EQ(it[9], 13);
+ it += 3;
+ ASSERT_EQ(it[0], 6);
+ ++it;
+ ASSERT_EQ(it[0], 7);
+ ASSERT_EQ(*it, 7);
+ ASSERT_EQ(it[1], 9);
+ ASSERT_EQ(it[2], 10);
+ it -= 4;
+ ASSERT_EQ(it[0], 4);
+ ASSERT_EQ(it[1], 5);
+ ASSERT_EQ(it[2], nullopt);
+ ASSERT_EQ(it[3], 6);
+ ASSERT_EQ(it[4], 7);
+ it += 9;
+ ASSERT_EQ(*it, 13);
+ --it;
+ ASSERT_EQ(*it, 11);
+ --it;
+ ASSERT_EQ(*it, 8);
+ ASSERT_EQ(it[0], 8);
+ ASSERT_EQ(it[1], 11);
+ ASSERT_EQ(it[2], 13);
+ ASSERT_EQ(it[3], 14);
+ ASSERT_EQ(it[4], 15);
+ ASSERT_EQ(it[5], 16);
+ ++it;
+ ASSERT_EQ(*it, 11);
+ ASSERT_EQ(it[0], 11);
+ ASSERT_EQ(it[1], 13);
+ ASSERT_EQ(it[2], 14);
+ ASSERT_EQ(it[3], 15);
+ ASSERT_EQ(it[4], 16);
+ ++it;
+ ASSERT_EQ(*it, 13);
+ ASSERT_EQ(it[0], 13);
+ ASSERT_EQ(it[1], 14);
+ ASSERT_EQ(it[2], 15);
+ ASSERT_EQ(it[3], 16);
+ ++it;
+ ++it;
+ ASSERT_EQ(*it, 15);
+ ASSERT_NE(it, end);
+ ASSERT_NE(it + 1, end);
+ ASSERT_EQ(it + 2, end);
+
+ ASSERT_EQ(it[0], 15);
+ ASSERT_EQ(it[1], 16);
+ --it;
+ ASSERT_EQ(*it, 14);
+ ASSERT_EQ(it[0], 14);
+ ASSERT_EQ(it[1], 15);
+ ASSERT_EQ(it[2], 16);
+ --it;
+ ASSERT_EQ(*it, 13);
+ ASSERT_EQ(it[0], 13);
+ ASSERT_EQ(it[1], 14);
+ ASSERT_EQ(it[2], 15);
+ it -= 2;
+ ASSERT_EQ(*it, 8);
+ ASSERT_EQ(it[0], 8);
+ ASSERT_EQ(it[1], 11);
+ ASSERT_EQ(it[2], 13);
+}
+
+TEST(ChunkedArrayIterator, EmptyChunks) {
+ auto result = ChunkedArrayFromJSON(int32(), {});
+ auto it = Begin<Int32Type>(*result);
+ auto end = End<Int32Type>(*result);
+ ASSERT_EQ(it, end);
+
+ result = ChunkedArrayFromJSON(int32(), {R"([4, 5, null])", R"([])", R"([7, 9, 10, 8])",
+ R"([11, 13])", R"([])", R"([])", R"([16])"});
+
+ it = Begin<Int32Type>(*result);
+ end = End<Int32Type>(*result);
+
+ ASSERT_NE(it, end);
+ ASSERT_EQ(it[8], 13);
+ ASSERT_EQ(it[9], 16);
+ it += 3;
+ ASSERT_EQ(it[0], 7);
+ ++it;
+ ASSERT_EQ(it[0], 9);
+ ASSERT_EQ(*it, 9);
+ ASSERT_EQ(it[1], 10);
+ ASSERT_EQ(it[2], 8);
+ it -= 4;
+ ASSERT_EQ(it[0], 4);
+ ASSERT_EQ(it[1], 5);
+ ASSERT_EQ(it[2], nullopt);
+ ASSERT_EQ(it[3], 7);
+ ASSERT_EQ(it[4], 9);
+ it += 9;
+ ASSERT_EQ(*it, 16);
+ --it;
+ ASSERT_EQ(*it, 13);
+ --it;
+ ASSERT_EQ(*it, 11);
+ ASSERT_EQ(it[0], 11);
+ ASSERT_EQ(it[1], 13);
+ ASSERT_EQ(it[2], 16);
+ ++it;
+ ASSERT_EQ(*it, 13);
+ ASSERT_EQ(it[0], 13);
+ ASSERT_EQ(it[1], 16);
+ ++it;
+ ASSERT_EQ(*it, 16);
+ ASSERT_EQ(it[0], 16);
+ ASSERT_NE(it, end);
+ ASSERT_EQ(it + 1, end);
+ --it;
+ ASSERT_EQ(*it, 13);
+ ASSERT_EQ(it[0], 13);
+ ASSERT_EQ(it[1], 16);
+ --it;
+ ASSERT_EQ(*it, 11);
+ ASSERT_EQ(it[0], 11);
+ ASSERT_EQ(it[1], 13);
+ ASSERT_EQ(it[2], 16);
+ it += 2;
+ ASSERT_EQ(*it, 16);
+ ASSERT_EQ(it[0], 16);
+ it -= 3;
+ ASSERT_EQ(*it, 8);
+ ASSERT_EQ(it[0], 8);
+ ASSERT_EQ(it[1], 11);
+ ASSERT_EQ(it[2], 13);
+}
+
+// Test compatibility with various STL algorithms
+
+TEST(ChunkedArrayIterator, StdFind) {
+ auto chunked_array1 =
+ ChunkedArrayFromJSON(int32(), {R"([5, 10])", R"([null])", R"([16])"});
+ auto it_begin = Begin<Int32Type>(*chunked_array1);
+ auto it_end = End<Int32Type>(*chunked_array1);
+
+ auto it = std::find(it_begin, it_end, 10);
+ ASSERT_EQ(it.index(), 1);
+ it = std::find(it_begin, it_end, nullopt);
+ ASSERT_EQ(it.index(), 2);
+ it = std::find(it_begin, it_end, 20);
+ ASSERT_EQ(it, it_end);
+}
+
+TEST(ChunkedArrayIterator, StdFindIf) {
+ auto chunked_array =
+ ChunkedArrayFromJSON(int32(), {R"([5, 10])", R"([null])", R"([16])"});
+ auto it_begin = Begin<Int32Type>(*chunked_array);
+ auto it_end = End<Int32Type>(*chunked_array);
+
+ auto it = std::find_if(it_begin, it_end, [](optional<int32_t> item) {
+ return item.has_value() && *item >= 10;
+ });
+ ASSERT_EQ(it.index(), 1);
+}
+
+TEST(ChunkedArrayIterator, StdCountIf) {
+ auto chunked_array1 =
+ ChunkedArrayFromJSON(boolean(), {R"([true, null])", R"([null])", R"([false])"});
+
+ auto n = std::count_if(Begin<BooleanType>(*chunked_array1),
+ End<BooleanType>(*chunked_array1),
+ [](optional<bool> v) { return !v.has_value(); });
+ ASSERT_EQ(n, 2);
+}
+
+TEST(ChunkedArrayIterator, StdCopy) {
+ auto chunked_array1 =
+ ChunkedArrayFromJSON(int8(), {R"([4, 5])", R"([6])", R"([null, 8])"});
+ auto it1 = Begin<Int8Type>(*chunked_array1);
+
+ std::vector<optional<int8_t>> values;
+ std::copy(it1 + 1, End<Int8Type>(*chunked_array1), std::back_inserter(values));
+ std::vector<optional<int8_t>> expected{5, 6, {}, 8};
+ ASSERT_EQ(values, expected);
+}
+
+TEST(ChunkedArrayIterator, StdMerge) {
+ auto chunked_array1 =
+ ChunkedArrayFromJSON(int8(), {R"([4, 5])", R"([6])", R"([7, 11, 13, 15, null])"});
+ auto chunked_array2 =
+ ChunkedArrayFromJSON(int8(), {R"([8, 9])", R"([10])", R"([14])", R"([16])"});
+
+ auto cmp_lt = [](optional<int8_t> u, optional<int8_t> v) {
+ return u.has_value() && (!v.has_value() || *u < *v);
+ };
+
+ std::vector<optional<int8_t>> values;
+
+ std::merge(Begin<Int8Type>(*chunked_array1), End<Int8Type>(*chunked_array1),
+ Begin<Int8Type>(*chunked_array2), End<Int8Type>(*chunked_array2),
+ std::back_inserter(values), cmp_lt);
+ std::vector<optional<int8_t>> expected{4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, {}};
+ ASSERT_EQ(values, expected);
+}
+
+TEST(ChunkedArrayIterator, ForEachIterator) {
+ auto chunked_array =
+ ChunkedArrayFromJSON(int8(), {R"([4, 5])", R"([6])", R"([null, 8])"});
+ std::vector<int8_t> expected = {4, 5, 6, 8};
+ std::vector<int8_t> values;
+ for (auto elem : Iterate<Int8Type>(*chunked_array)) {
+ if (elem.has_value()) {
+ values.push_back(*elem);
+ }
+ }
+ ASSERT_EQ(values, expected);
+}
+
} // namespace stl
} // namespace arrow