You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2022/05/11 16:54:42 UTC

[arrow] branch master updated: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 9ae12a56b2 ARROW-602: [C++] Provide iterator access to primitive elements inside an Array
9ae12a56b2 is described below

commit 9ae12a56b202a0638a313e26a3f133acfefa4dd4
Author: Alvin Chunga <al...@gmail.com>
AuthorDate: Wed May 11 18:54:31 2022 +0200

    ARROW-602: [C++] Provide iterator access to primitive elements inside an Array
    
    Create Iterator method in stl for Array and ChunkedArray
    
    Closes #13009 from AlvinJ15/ARROW-602#Provide_iterator_access_to_primitive_elements_inside_chunked_arrays
    
    Lead-authored-by: Alvin Chunga <al...@gmail.com>
    Co-authored-by: Antoine Pitrou <an...@python.org>
    Signed-off-by: Antoine Pitrou <an...@python.org>
---
 cpp/src/arrow/chunked_array.h      |   6 +
 cpp/src/arrow/stl_iterator.h       | 158 ++++++++++++++++++++
 cpp/src/arrow/stl_iterator_test.cc | 297 +++++++++++++++++++++++++++++++++++++
 3 files changed, 461 insertions(+)

diff --git a/cpp/src/arrow/chunked_array.h b/cpp/src/arrow/chunked_array.h
index e130a99bbe..956595b117 100644
--- a/cpp/src/arrow/chunked_array.h
+++ b/cpp/src/arrow/chunked_array.h
@@ -36,6 +36,10 @@ namespace arrow {
 class Array;
 class DataType;
 class MemoryPool;
+namespace stl {
+template <typename T, typename V>
+class ChunkedArrayIterator;
+}  // namespace stl
 
 /// \class ChunkedArray
 /// \brief A data structure managing a list of primitive Arrow arrays logically
@@ -183,6 +187,8 @@ class ARROW_EXPORT ChunkedArray {
   int64_t null_count_;
 
  private:
+  template <typename T, typename V>
+  friend class ::arrow::stl::ChunkedArrayIterator;
   internal::ChunkResolver chunk_resolver_;
   ARROW_DISALLOW_COPY_AND_ASSIGN(ChunkedArray);
 };
diff --git a/cpp/src/arrow/stl_iterator.h b/cpp/src/arrow/stl_iterator.h
index 6225a89aae..e1eeb33fba 100644
--- a/cpp/src/arrow/stl_iterator.h
+++ b/cpp/src/arrow/stl_iterator.h
@@ -17,11 +17,15 @@
 
 #pragma once
 
+#include <cassert>
 #include <cstddef>
 #include <iterator>
 #include <utility>
 
+#include "arrow/chunked_array.h"
+#include "arrow/type.h"
 #include "arrow/type_fwd.h"
+#include "arrow/type_traits.h"
 #include "arrow/util/macros.h"
 #include "arrow/util/optional.h"
 
@@ -59,10 +63,12 @@ class ArrayIterator {
 
   // Value access
   value_type operator*() const {
+    assert(array_);
     return array_->IsNull(index_) ? value_type{} : array_->GetView(index_);
   }
 
   value_type operator[](difference_type n) const {
+    assert(array_);
     return array_->IsNull(index_ + n) ? value_type{} : array_->GetView(index_ + n);
   }
 
@@ -128,6 +134,148 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() noexcept : chunked_array_(NULLPTR), index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array,
+                                int64_t index = 0) noexcept
+      : chunked_array_(&chunked_array), index_(index) {}
+
+  // Value access
+  value_type operator*() const {
+    auto chunk_location = GetChunkLocation(index_);
+    ArrayIterator<ArrayType> target_iterator{
+        arrow::internal::checked_cast<const ArrayType&>(
+            *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index)))};
+    return target_iterator[chunk_location.index_in_chunk];
+  }
+
+  value_type operator[](difference_type n) const { return *(*this + n); }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ChunkedArrayIterator& operator++() {
+    (*this) += 1;
+    return *this;
+  }
+  ChunkedArrayIterator& operator--() {
+    (*this) -= 1;
+    return *this;
+  }
+
+  ChunkedArrayIterator operator++(int) {
+    ChunkedArrayIterator tmp(*this);
+    ++*this;
+    return tmp;
+  }
+  ChunkedArrayIterator operator--(int) {
+    ChunkedArrayIterator tmp(*this);
+    --*this;
+    return tmp;
+  }
+
+  // Arithmetic
+  difference_type operator-(const ChunkedArrayIterator& other) const {
+    return index_ - other.index_;
+  }
+  ChunkedArrayIterator operator+(difference_type n) const {
+    assert(chunked_array_);
+    return ChunkedArrayIterator(*chunked_array_, index_ + n);
+  }
+  ChunkedArrayIterator operator-(difference_type n) const {
+    assert(chunked_array_);
+    return ChunkedArrayIterator(*chunked_array_, index_ - n);
+  }
+  friend inline ChunkedArrayIterator operator+(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    assert(other.chunked_array_);
+    return ChunkedArrayIterator(*other.chunked_array_, diff + other.index_);
+  }
+  friend inline ChunkedArrayIterator operator-(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    assert(other.chunked_array_);
+    return ChunkedArrayIterator(*other.chunked_array_, diff - other.index_);
+  }
+  ChunkedArrayIterator& operator+=(difference_type n) {
+    index_ += n;
+    return *this;
+  }
+  ChunkedArrayIterator& operator-=(difference_type n) {
+    (*this) += -n;
+    return *this;
+  }
+
+  // Comparisons
+  bool operator==(const ChunkedArrayIterator& other) const {
+    return index_ == other.index_;
+  }
+  bool operator!=(const ChunkedArrayIterator& other) const {
+    return index_ != other.index_;
+  }
+  bool operator<(const ChunkedArrayIterator& other) const {
+    return index_ < other.index_;
+  }
+  bool operator>(const ChunkedArrayIterator& other) const {
+    return index_ > other.index_;
+  }
+  bool operator<=(const ChunkedArrayIterator& other) const {
+    return index_ <= other.index_;
+  }
+  bool operator>=(const ChunkedArrayIterator& other) const {
+    return index_ >= other.index_;
+  }
+
+ private:
+  arrow::internal::ChunkLocation GetChunkLocation(int64_t index) const {
+    assert(chunked_array_);
+    return chunked_array_->chunk_resolver_.Resolve(index);
+  }
+
+  const ChunkedArray* chunked_array_;
+  int64_t index_;
+};
+
+/// Return an iterator to the beginning of the chunked array
+template <typename Type, typename ArrayType = typename TypeTraits<Type>::ArrayType>
+ChunkedArrayIterator<ArrayType> Begin(const ChunkedArray& chunked_array) {
+  return ChunkedArrayIterator<ArrayType>(chunked_array);
+}
+
+/// Return an iterator to the end of the chunked array
+template <typename Type, typename ArrayType = typename TypeTraits<Type>::ArrayType>
+ChunkedArrayIterator<ArrayType> End(const ChunkedArray& chunked_array) {
+  return ChunkedArrayIterator<ArrayType>(chunked_array, chunked_array.length());
+}
+
+template <typename ArrayType>
+struct ChunkedArrayRange {
+  const ChunkedArray* chunked_array;
+
+  ChunkedArrayIterator<ArrayType> begin() {
+    return stl::ChunkedArrayIterator<ArrayType>(*chunked_array);
+  }
+  ChunkedArrayIterator<ArrayType> end() {
+    return stl::ChunkedArrayIterator<ArrayType>(*chunked_array, chunked_array->length());
+  }
+};
+
+/// Return an iterable range over the chunked array
+template <typename Type, typename ArrayType = typename TypeTraits<Type>::ArrayType>
+ChunkedArrayRange<ArrayType> Iterate(const ChunkedArray& chunked_array) {
+  return stl::ChunkedArrayRange<ArrayType>{&chunked_array};
+}
+
 }  // namespace stl
 }  // namespace arrow
 
@@ -143,4 +291,14 @@ struct iterator_traits<::arrow::stl::ArrayIterator<ArrayType>> {
   using iterator_category = typename IteratorType::iterator_category;
 };
 
+template <typename ArrayType>
+struct iterator_traits<::arrow::stl::ChunkedArrayIterator<ArrayType>> {
+  using IteratorType = ::arrow::stl::ChunkedArrayIterator<ArrayType>;
+  using difference_type = typename IteratorType::difference_type;
+  using value_type = typename IteratorType::value_type;
+  using pointer = typename IteratorType::pointer;
+  using reference = typename IteratorType::reference;
+  using iterator_category = typename IteratorType::iterator_category;
+};
+
 }  // namespace std
diff --git a/cpp/src/arrow/stl_iterator_test.cc b/cpp/src/arrow/stl_iterator_test.cc
index 864011dbfc..d4a011e450 100644
--- a/cpp/src/arrow/stl_iterator_test.cc
+++ b/cpp/src/arrow/stl_iterator_test.cc
@@ -248,5 +248,302 @@ TEST(ArrayIterator, StdMerge) {
   ASSERT_EQ(values, expected);
 }
 
+TEST(ChunkedArrayIterator, Basics) {
+  auto result = ChunkedArrayFromJSON(int32(), {R"([4, 5, null])", R"([6])"});
+  auto it = Begin<Int32Type>(*result);
+  optional<int32_t> v = *it;
+  ASSERT_EQ(v, 4);
+  ASSERT_EQ(it[0], 4);
+  ++it;
+  ASSERT_EQ(it[0], 5);
+  ASSERT_EQ(*it, 5);
+  ASSERT_EQ(it[1], nullopt);
+  ASSERT_EQ(it[2], 6);
+}
+
+TEST(ChunkedArrayIterator, Arithmetic) {
+  auto result = ChunkedArrayFromJSON(int32(), {R"([4, 5, null])", R"([6, null, 7])"});
+
+  auto it = Begin<Int32Type>(*result);
+  auto it2 = it + 2;
+  auto end = End<Int32Type>(*result);
+
+  ASSERT_EQ(*it, 4);
+  ASSERT_EQ(*it2, nullopt);
+  ASSERT_EQ(it2 - it, 2);
+  ASSERT_EQ(it - it2, -2);
+  ASSERT_EQ(end - it2, 4);
+  ASSERT_EQ(it2 - end, -4);
+  auto it3 = it++;
+  ASSERT_EQ(it2 - it, 1);
+  ASSERT_EQ(it2 - it3, 2);
+  ASSERT_EQ(*it3, 4);
+  ASSERT_EQ(*it, 5);
+  auto it4 = ++it;
+  ASSERT_EQ(*it, nullopt);
+  ASSERT_EQ(*it4, nullopt);
+  ASSERT_EQ(it2 - it, 0);
+  ASSERT_EQ(it2 - it4, 0);
+  auto it5 = it + 3;
+  ASSERT_EQ(*it5, 7);
+  ASSERT_EQ(*(it5 - 2), 6);
+  ASSERT_EQ(*(it5 + (-2)), 6);
+  auto it6 = (--it5)--;
+  ASSERT_EQ(*it6, nullopt);
+  ASSERT_EQ(*it5, 6);
+  ASSERT_EQ(it6 - it5, 1);
+}
+
+TEST(ChunkedArrayIterator, Comparison) {
+  auto result = ChunkedArrayFromJSON(int32(), {R"([4, 5, null])", R"([6, null, 7])"});
+
+  auto it = Begin<Int32Type>(*result) + 2;
+  auto it2 = Begin<Int32Type>(*result) + 2;
+  auto it3 = Begin<Int32Type>(*result) + 4;
+  auto it4 = Begin<Int32Type>(*result) + 6;
+  auto end = End<Int32Type>(*result);
+
+  ASSERT_TRUE(it == it2);
+  ASSERT_TRUE(it <= it2);
+  ASSERT_TRUE(it >= it2);
+  ASSERT_FALSE(it != it2);
+  ASSERT_FALSE(it < it2);
+  ASSERT_FALSE(it > it2);
+
+  ASSERT_FALSE(it == it3);
+  ASSERT_TRUE(it <= it3);
+  ASSERT_FALSE(it >= it3);
+  ASSERT_TRUE(it != it3);
+  ASSERT_TRUE(it < it3);
+  ASSERT_FALSE(it > it3);
+
+  ASSERT_FALSE(it3 == end);
+  ASSERT_TRUE(it3 <= end);
+  ASSERT_TRUE(it3 < end);
+  ASSERT_TRUE(it4 == end);
+  ASSERT_TRUE(it4 <= end);
+  ASSERT_FALSE(it4 < end);
+}
+
+TEST(ChunkedArrayIterator, MultipleChunks) {
+  auto result =
+      ChunkedArrayFromJSON(int32(), {R"([4, 5, null])", R"([6])", R"([7, 9, 10, 8])",
+                                     R"([11, 13])", R"([14])", R"([15])", R"([16])"});
+
+  auto it = Begin<Int32Type>(*result);
+  auto end = End<Int32Type>(*result);
+
+  ASSERT_EQ(it[8], 11);
+  ASSERT_EQ(it[9], 13);
+  it += 3;
+  ASSERT_EQ(it[0], 6);
+  ++it;
+  ASSERT_EQ(it[0], 7);
+  ASSERT_EQ(*it, 7);
+  ASSERT_EQ(it[1], 9);
+  ASSERT_EQ(it[2], 10);
+  it -= 4;
+  ASSERT_EQ(it[0], 4);
+  ASSERT_EQ(it[1], 5);
+  ASSERT_EQ(it[2], nullopt);
+  ASSERT_EQ(it[3], 6);
+  ASSERT_EQ(it[4], 7);
+  it += 9;
+  ASSERT_EQ(*it, 13);
+  --it;
+  ASSERT_EQ(*it, 11);
+  --it;
+  ASSERT_EQ(*it, 8);
+  ASSERT_EQ(it[0], 8);
+  ASSERT_EQ(it[1], 11);
+  ASSERT_EQ(it[2], 13);
+  ASSERT_EQ(it[3], 14);
+  ASSERT_EQ(it[4], 15);
+  ASSERT_EQ(it[5], 16);
+  ++it;
+  ASSERT_EQ(*it, 11);
+  ASSERT_EQ(it[0], 11);
+  ASSERT_EQ(it[1], 13);
+  ASSERT_EQ(it[2], 14);
+  ASSERT_EQ(it[3], 15);
+  ASSERT_EQ(it[4], 16);
+  ++it;
+  ASSERT_EQ(*it, 13);
+  ASSERT_EQ(it[0], 13);
+  ASSERT_EQ(it[1], 14);
+  ASSERT_EQ(it[2], 15);
+  ASSERT_EQ(it[3], 16);
+  ++it;
+  ++it;
+  ASSERT_EQ(*it, 15);
+  ASSERT_NE(it, end);
+  ASSERT_NE(it + 1, end);
+  ASSERT_EQ(it + 2, end);
+
+  ASSERT_EQ(it[0], 15);
+  ASSERT_EQ(it[1], 16);
+  --it;
+  ASSERT_EQ(*it, 14);
+  ASSERT_EQ(it[0], 14);
+  ASSERT_EQ(it[1], 15);
+  ASSERT_EQ(it[2], 16);
+  --it;
+  ASSERT_EQ(*it, 13);
+  ASSERT_EQ(it[0], 13);
+  ASSERT_EQ(it[1], 14);
+  ASSERT_EQ(it[2], 15);
+  it -= 2;
+  ASSERT_EQ(*it, 8);
+  ASSERT_EQ(it[0], 8);
+  ASSERT_EQ(it[1], 11);
+  ASSERT_EQ(it[2], 13);
+}
+
+TEST(ChunkedArrayIterator, EmptyChunks) {
+  auto result = ChunkedArrayFromJSON(int32(), {});
+  auto it = Begin<Int32Type>(*result);
+  auto end = End<Int32Type>(*result);
+  ASSERT_EQ(it, end);
+
+  result = ChunkedArrayFromJSON(int32(), {R"([4, 5, null])", R"([])", R"([7, 9, 10, 8])",
+                                          R"([11, 13])", R"([])", R"([])", R"([16])"});
+
+  it = Begin<Int32Type>(*result);
+  end = End<Int32Type>(*result);
+
+  ASSERT_NE(it, end);
+  ASSERT_EQ(it[8], 13);
+  ASSERT_EQ(it[9], 16);
+  it += 3;
+  ASSERT_EQ(it[0], 7);
+  ++it;
+  ASSERT_EQ(it[0], 9);
+  ASSERT_EQ(*it, 9);
+  ASSERT_EQ(it[1], 10);
+  ASSERT_EQ(it[2], 8);
+  it -= 4;
+  ASSERT_EQ(it[0], 4);
+  ASSERT_EQ(it[1], 5);
+  ASSERT_EQ(it[2], nullopt);
+  ASSERT_EQ(it[3], 7);
+  ASSERT_EQ(it[4], 9);
+  it += 9;
+  ASSERT_EQ(*it, 16);
+  --it;
+  ASSERT_EQ(*it, 13);
+  --it;
+  ASSERT_EQ(*it, 11);
+  ASSERT_EQ(it[0], 11);
+  ASSERT_EQ(it[1], 13);
+  ASSERT_EQ(it[2], 16);
+  ++it;
+  ASSERT_EQ(*it, 13);
+  ASSERT_EQ(it[0], 13);
+  ASSERT_EQ(it[1], 16);
+  ++it;
+  ASSERT_EQ(*it, 16);
+  ASSERT_EQ(it[0], 16);
+  ASSERT_NE(it, end);
+  ASSERT_EQ(it + 1, end);
+  --it;
+  ASSERT_EQ(*it, 13);
+  ASSERT_EQ(it[0], 13);
+  ASSERT_EQ(it[1], 16);
+  --it;
+  ASSERT_EQ(*it, 11);
+  ASSERT_EQ(it[0], 11);
+  ASSERT_EQ(it[1], 13);
+  ASSERT_EQ(it[2], 16);
+  it += 2;
+  ASSERT_EQ(*it, 16);
+  ASSERT_EQ(it[0], 16);
+  it -= 3;
+  ASSERT_EQ(*it, 8);
+  ASSERT_EQ(it[0], 8);
+  ASSERT_EQ(it[1], 11);
+  ASSERT_EQ(it[2], 13);
+}
+
+// Test compatibility with various STL algorithms
+
+TEST(ChunkedArrayIterator, StdFind) {
+  auto chunked_array1 =
+      ChunkedArrayFromJSON(int32(), {R"([5, 10])", R"([null])", R"([16])"});
+  auto it_begin = Begin<Int32Type>(*chunked_array1);
+  auto it_end = End<Int32Type>(*chunked_array1);
+
+  auto it = std::find(it_begin, it_end, 10);
+  ASSERT_EQ(it.index(), 1);
+  it = std::find(it_begin, it_end, nullopt);
+  ASSERT_EQ(it.index(), 2);
+  it = std::find(it_begin, it_end, 20);
+  ASSERT_EQ(it, it_end);
+}
+
+TEST(ChunkedArrayIterator, StdFindIf) {
+  auto chunked_array =
+      ChunkedArrayFromJSON(int32(), {R"([5, 10])", R"([null])", R"([16])"});
+  auto it_begin = Begin<Int32Type>(*chunked_array);
+  auto it_end = End<Int32Type>(*chunked_array);
+
+  auto it = std::find_if(it_begin, it_end, [](optional<int32_t> item) {
+    return item.has_value() && *item >= 10;
+  });
+  ASSERT_EQ(it.index(), 1);
+}
+
+TEST(ChunkedArrayIterator, StdCountIf) {
+  auto chunked_array1 =
+      ChunkedArrayFromJSON(boolean(), {R"([true, null])", R"([null])", R"([false])"});
+
+  auto n = std::count_if(Begin<BooleanType>(*chunked_array1),
+                         End<BooleanType>(*chunked_array1),
+                         [](optional<bool> v) { return !v.has_value(); });
+  ASSERT_EQ(n, 2);
+}
+
+TEST(ChunkedArrayIterator, StdCopy) {
+  auto chunked_array1 =
+      ChunkedArrayFromJSON(int8(), {R"([4, 5])", R"([6])", R"([null, 8])"});
+  auto it1 = Begin<Int8Type>(*chunked_array1);
+
+  std::vector<optional<int8_t>> values;
+  std::copy(it1 + 1, End<Int8Type>(*chunked_array1), std::back_inserter(values));
+  std::vector<optional<int8_t>> expected{5, 6, {}, 8};
+  ASSERT_EQ(values, expected);
+}
+
+TEST(ChunkedArrayIterator, StdMerge) {
+  auto chunked_array1 =
+      ChunkedArrayFromJSON(int8(), {R"([4, 5])", R"([6])", R"([7, 11, 13, 15, null])"});
+  auto chunked_array2 =
+      ChunkedArrayFromJSON(int8(), {R"([8, 9])", R"([10])", R"([14])", R"([16])"});
+
+  auto cmp_lt = [](optional<int8_t> u, optional<int8_t> v) {
+    return u.has_value() && (!v.has_value() || *u < *v);
+  };
+
+  std::vector<optional<int8_t>> values;
+
+  std::merge(Begin<Int8Type>(*chunked_array1), End<Int8Type>(*chunked_array1),
+             Begin<Int8Type>(*chunked_array2), End<Int8Type>(*chunked_array2),
+             std::back_inserter(values), cmp_lt);
+  std::vector<optional<int8_t>> expected{4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, {}};
+  ASSERT_EQ(values, expected);
+}
+
+TEST(ChunkedArrayIterator, ForEachIterator) {
+  auto chunked_array =
+      ChunkedArrayFromJSON(int8(), {R"([4, 5])", R"([6])", R"([null, 8])"});
+  std::vector<int8_t> expected = {4, 5, 6, 8};
+  std::vector<int8_t> values;
+  for (auto elem : Iterate<Int8Type>(*chunked_array)) {
+    if (elem.has_value()) {
+      values.push_back(*elem);
+    }
+  }
+  ASSERT_EQ(values, expected);
+}
+
 }  // namespace stl
 }  // namespace arrow