You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/04/27 05:18:45 UTC

[GitHub] [arrow] AlvinJ15 opened a new pull request, #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

AlvinJ15 opened a new pull request, #13009:
URL: https://github.com/apache/arrow/pull/13009

   Create Iterator method in stl for Array and ChunkedArray


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] github-actions[bot] commented on pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
github-actions[bot] commented on PR #13009:
URL: https://github.com/apache/arrow/pull/13009#issuecomment-1110563745

   https://issues.apache.org/jira/browse/ARROW-602


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] edponce commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
edponce commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r859921831


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,215 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0), current_chunk_index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    InitializeComponents(*this);
+  }
+
+  // Value access
+  value_type operator*() const { return *iterators_list_[current_chunk_index_]; }
+
+  value_type operator[](difference_type n) const {
+    int64_t chunk_index = GetChunkIndex(index_ + n);
+    int64_t index_in_chunk =
+        chunk_index ? index_ + n - chunks_lengths_[chunk_index - 1] : index_ + n;
+    return iterators_list_[chunk_index]
+                          [index_in_chunk - iterators_list_[chunk_index].index()];

Review Comment:
   [`ChunkedArray` already has `GetScalar(index)`](https://github.com/apache/arrow/blob/master/cpp/src/arrow/chunked_array.h#L145) which is efficient for accessing sequential scalars at particular logical indices. I suggest to use it for the iteration. There is no need to perform complex index calculations/checks in the iterator. This would be somewhat analogous to the [`Array` iterator](https://github.com/apache/arrow/blob/master/cpp/src/arrow/stl_iterator.h#L61-L67).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] edponce commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
edponce commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r859988212


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,215 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0), current_chunk_index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    InitializeComponents(*this);
+  }
+
+  // Value access
+  value_type operator*() const { return *iterators_list_[current_chunk_index_]; }
+
+  value_type operator[](difference_type n) const {
+    int64_t chunk_index = GetChunkIndex(index_ + n);
+    int64_t index_in_chunk =
+        chunk_index ? index_ + n - chunks_lengths_[chunk_index - 1] : index_ + n;
+    return iterators_list_[chunk_index]
+                          [index_in_chunk - iterators_list_[chunk_index].index()];
+  }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ChunkedArrayIterator& operator++() {
+    ++index_;
+    if (iterators_list_[current_chunk_index_].index() ==
+        static_cast<int64_t>(
+            chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length()) -
+            1) {
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+      current_chunk_index_++;
+      if (!static_cast<int64_t>(
+              chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length())) {
+        current_chunk_index_ = GetChunkIndex(index_);
+      }
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+    } else {
+      iterators_list_[current_chunk_index_]++;
+    }
+    return *this;
+  }
+  ChunkedArrayIterator& operator--() {
+    --index_;
+    if (iterators_list_[current_chunk_index_].index()) {
+      iterators_list_[current_chunk_index_]--;
+    } else {
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+      current_chunk_index_--;
+      if (!static_cast<int64_t>(
+              chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length())) {
+        current_chunk_index_ = GetChunkIndex(index_);
+      }
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+      iterators_list_[current_chunk_index_] +=
+          static_cast<int64_t>(
+              chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length()) -
+          1;
+    }
+    return *this;
+  }
+
+  ChunkedArrayIterator operator++(int) {
+    ChunkedArrayIterator tmp(*this);
+    ++*this;
+    return tmp;
+  }
+  ChunkedArrayIterator operator--(int) {
+    ChunkedArrayIterator tmp(*this);
+    --*this;
+    return tmp;
+  }
+
+  // Arithmetic
+  difference_type operator-(const ChunkedArrayIterator& other) const {
+    return index_ - other.index_;
+  }
+  ChunkedArrayIterator operator+(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ + n);
+  }
+  ChunkedArrayIterator operator-(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ - n);
+  }
+  friend inline ChunkedArrayIterator operator+(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff + other.index_);
+  }
+  friend inline ChunkedArrayIterator operator-(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff - other.index_);
+  }
+  ChunkedArrayIterator& operator+=(difference_type n) {
+    if (n < static_cast<int64_t>(
+                chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length()) -
+                iterators_list_[current_chunk_index_].index()) {
+      index_ += n;
+      iterators_list_[current_chunk_index_] += n;
+      return *this;
+    } else {
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+      index_ += n;
+      InitializeComponents(*this, true);
+      return *this;
+    }
+  }
+  ChunkedArrayIterator& operator-=(difference_type n) {
+    if (n <= iterators_list_[current_chunk_index_].index()) {
+      index_ -= n;
+      iterators_list_[current_chunk_index_] -= n;
+      return *this;
+    } else {
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+      index_ -= n;
+      InitializeComponents(*this, true);
+      return *this;
+    }
+  }

Review Comment:
   This can be simplified to have a single impl because `operator-=(n) == operator+=(-n)`



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] edponce commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
edponce commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r860859259


##########
cpp/src/arrow/stl_iterator_test.cc:
##########
@@ -248,5 +248,216 @@ TEST(ArrayIterator, StdMerge) {
   ASSERT_EQ(values, expected);
 }
 
+TEST(ChunkedArrayIterator, Basics) {
+  auto chunk0 = ArrayFromJSON(int32(), "[4, 5, null]");
+  auto chunk1 = ArrayFromJSON(int32(), "[6]");
+
+  ASSERT_OK_AND_ASSIGN(auto result, ChunkedArray::Make({chunk0, chunk1}, int32()));
+  auto it = Iterate<Int32Type>(*result);
+  optional<int32_t> v = *it;
+  ASSERT_EQ(v, 4);
+  ASSERT_EQ(it[0], 4);
+  ++it;
+  ASSERT_EQ(it[0], 5);
+  ASSERT_EQ(*it, 5);
+  ASSERT_EQ(it[1], nullopt);
+  ASSERT_EQ(it[2], 6);

Review Comment:
   I would expect iterator tests to have a loop form. For example, you can iterate through a ChunkedArray and store the values into a `std::vector` and then make assertions against the vector.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] pitrou commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
pitrou commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r869975583


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,162 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    if (index_ != chunked_array.length()) {
+      auto chunk_location = GetChunkLocation(this->index_);
+      current_array_iterator_ = ArrayIterator<ArrayType>(
+          arrow::internal::checked_cast<const ArrayType&>(
+              *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index))),
+          chunk_location.index_in_chunk);
+    } else {
+      current_array_iterator_ = {};
+    }
+  }
+
+  // Value access
+  value_type operator*() const { return *current_array_iterator_; }
+
+  value_type operator[](difference_type n) const {
+    auto chunk_location = GetChunkLocation(index_ + n);
+    ArrayIterator<ArrayType> target_iterator{
+        arrow::internal::checked_cast<const ArrayType&>(
+            *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index)))};
+    return target_iterator[chunk_location.index_in_chunk];
+  }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ChunkedArrayIterator& operator++() {
+    (*this) += 1;
+    return *this;
+  }
+  ChunkedArrayIterator& operator--() {
+    (*this) -= 1;
+    return *this;
+  }
+
+  ChunkedArrayIterator operator++(int) {
+    ChunkedArrayIterator tmp(*this);
+    ++*this;
+    return tmp;
+  }
+  ChunkedArrayIterator operator--(int) {
+    ChunkedArrayIterator tmp(*this);
+    --*this;
+    return tmp;
+  }
+
+  // Arithmetic
+  difference_type operator-(const ChunkedArrayIterator& other) const {
+    return index_ - other.index_;
+  }
+  ChunkedArrayIterator operator+(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ + n);
+  }
+  ChunkedArrayIterator operator-(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ - n);
+  }
+  friend inline ChunkedArrayIterator operator+(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff + other.index_);
+  }
+  friend inline ChunkedArrayIterator operator-(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff - other.index_);
+  }
+  ChunkedArrayIterator& operator+=(difference_type n) {
+    index_ += n;
+    if (index_ != chunked_array_->length()) {

Review Comment:
   @edponce Hmm, possibly. The C++ spec doesn't clearly say whether creating an out-of-bounds iterator is well-defined. However, the [operational semantics](https://en.cppreference.com/w/cpp/named_req/RandomAccessIterator) involve incrementing/decrementing and it seems that incrementing/decrementing past extremities is undefined. @bkietz Do you have any thoughts on this?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] pitrou commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
pitrou commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r869615839


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,162 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    if (index_ != chunked_array.length()) {
+      auto chunk_location = GetChunkLocation(this->index_);
+      current_array_iterator_ = ArrayIterator<ArrayType>(
+          arrow::internal::checked_cast<const ArrayType&>(
+              *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index))),
+          chunk_location.index_in_chunk);
+    } else {
+      current_array_iterator_ = {};
+    }
+  }
+
+  // Value access
+  value_type operator*() const { return *current_array_iterator_; }
+
+  value_type operator[](difference_type n) const {
+    auto chunk_location = GetChunkLocation(index_ + n);
+    ArrayIterator<ArrayType> target_iterator{
+        arrow::internal::checked_cast<const ArrayType&>(
+            *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index)))};
+    return target_iterator[chunk_location.index_in_chunk];
+  }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ChunkedArrayIterator& operator++() {
+    (*this) += 1;
+    return *this;
+  }
+  ChunkedArrayIterator& operator--() {
+    (*this) -= 1;
+    return *this;
+  }
+
+  ChunkedArrayIterator operator++(int) {
+    ChunkedArrayIterator tmp(*this);
+    ++*this;
+    return tmp;
+  }
+  ChunkedArrayIterator operator--(int) {
+    ChunkedArrayIterator tmp(*this);
+    --*this;
+    return tmp;
+  }
+
+  // Arithmetic
+  difference_type operator-(const ChunkedArrayIterator& other) const {
+    return index_ - other.index_;
+  }
+  ChunkedArrayIterator operator+(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ + n);
+  }
+  ChunkedArrayIterator operator-(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ - n);
+  }
+  friend inline ChunkedArrayIterator operator+(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff + other.index_);
+  }
+  friend inline ChunkedArrayIterator operator-(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff - other.index_);
+  }
+  ChunkedArrayIterator& operator+=(difference_type n) {
+    index_ += n;
+    if (index_ != chunked_array_->length()) {

Review Comment:
   DCHECKs are not allowed in public headers. We could perhaps use an `assert` though.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] ursabot commented on pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
ursabot commented on PR #13009:
URL: https://github.com/apache/arrow/pull/13009#issuecomment-1125801395

   Benchmark runs are scheduled for baseline = 0bae875bb8d992aab762e7fd81eef24bda7963ad and contender = 9ae12a56b202a0638a313e26a3f133acfefa4dd4. 9ae12a56b202a0638a313e26a3f133acfefa4dd4 is a master commit associated with this PR. Results will be available as each benchmark for each run completes.
   Conbench compare runs links:
   [Finished :arrow_down:0.0% :arrow_up:0.0%] [ec2-t3-xlarge-us-east-2](https://conbench.ursa.dev/compare/runs/cb0c56eae5754aeeb7971646d7f09210...5f8eda00981a40e58a7aa6bab3ab550d/)
   [Failed :arrow_down:0.31% :arrow_up:0.0%] [test-mac-arm](https://conbench.ursa.dev/compare/runs/eea27e77e2164154b954c172cf773500...b7f12ccac3e0452a921af35079a2b335/)
   [Finished :arrow_down:2.5% :arrow_up:0.36%] [ursa-i9-9960x](https://conbench.ursa.dev/compare/runs/8d60cd7893f74a06b375d7a30c85a33e...00c1b6d43c0748cb8102e6ae37330c31/)
   [Finished :arrow_down:1.54% :arrow_up:0.43%] [ursa-thinkcentre-m75q](https://conbench.ursa.dev/compare/runs/896aec7197dd497297c2427d51e9a734...6041aa12ab31411eb2ad05524d6e5fa9/)
   Buildkite builds:
   [Finished] [`9ae12a56` ec2-t3-xlarge-us-east-2](https://buildkite.com/apache-arrow/arrow-bci-benchmark-on-ec2-t3-xlarge-us-east-2/builds/740)
   [Finished] [`9ae12a56` test-mac-arm](https://buildkite.com/apache-arrow/arrow-bci-benchmark-on-test-mac-arm/builds/737)
   [Finished] [`9ae12a56` ursa-i9-9960x](https://buildkite.com/apache-arrow/arrow-bci-benchmark-on-ursa-i9-9960x/builds/727)
   [Finished] [`9ae12a56` ursa-thinkcentre-m75q](https://buildkite.com/apache-arrow/arrow-bci-benchmark-on-ursa-thinkcentre-m75q/builds/742)
   [Finished] [`0bae875b` ec2-t3-xlarge-us-east-2](https://buildkite.com/apache-arrow/arrow-bci-benchmark-on-ec2-t3-xlarge-us-east-2/builds/739)
   [Failed] [`0bae875b` test-mac-arm](https://buildkite.com/apache-arrow/arrow-bci-benchmark-on-test-mac-arm/builds/736)
   [Finished] [`0bae875b` ursa-i9-9960x](https://buildkite.com/apache-arrow/arrow-bci-benchmark-on-ursa-i9-9960x/builds/726)
   [Finished] [`0bae875b` ursa-thinkcentre-m75q](https://buildkite.com/apache-arrow/arrow-bci-benchmark-on-ursa-thinkcentre-m75q/builds/741)
   Supported benchmarks:
   ec2-t3-xlarge-us-east-2: Supported benchmark langs: Python, R. Runs only benchmarks with cloud = True
   test-mac-arm: Supported benchmark langs: C++, Python, R
   ursa-i9-9960x: Supported benchmark langs: Python, R, JavaScript
   ursa-thinkcentre-m75q: Supported benchmark langs: C++, Java
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] pitrou commented on pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
pitrou commented on PR #13009:
URL: https://github.com/apache/arrow/pull/13009#issuecomment-1112397408

   > ...I still think it can work for the `Array` types that currently support iteration (std::enable_if magic).
   
   `begin()` and `end()` must return a well-known iterator type and it cannot depend on the datatype.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] pitrou commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
pitrou commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r859945687


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,215 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0), current_chunk_index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    InitializeComponents(*this);
+  }
+
+  // Value access
+  value_type operator*() const { return *iterators_list_[current_chunk_index_]; }
+
+  value_type operator[](difference_type n) const {
+    int64_t chunk_index = GetChunkIndex(index_ + n);
+    int64_t index_in_chunk =
+        chunk_index ? index_ + n - chunks_lengths_[chunk_index - 1] : index_ + n;
+    return iterators_list_[chunk_index]
+                          [index_in_chunk - iterators_list_[chunk_index].index()];

Review Comment:
   @edponce The `GetScalar` suggestion doesn't seem right to me. The iterator should just yield C values, not boxed scalars which are much more expensive.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] edponce commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
edponce commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r869625380


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,162 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    if (index_ != chunked_array.length()) {
+      auto chunk_location = GetChunkLocation(this->index_);
+      current_array_iterator_ = ArrayIterator<ArrayType>(
+          arrow::internal::checked_cast<const ArrayType&>(
+              *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index))),
+          chunk_location.index_in_chunk);
+    } else {
+      current_array_iterator_ = {};
+    }
+  }
+
+  // Value access
+  value_type operator*() const { return *current_array_iterator_; }
+
+  value_type operator[](difference_type n) const {
+    auto chunk_location = GetChunkLocation(index_ + n);
+    ArrayIterator<ArrayType> target_iterator{
+        arrow::internal::checked_cast<const ArrayType&>(
+            *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index)))};
+    return target_iterator[chunk_location.index_in_chunk];
+  }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ChunkedArrayIterator& operator++() {
+    (*this) += 1;
+    return *this;
+  }
+  ChunkedArrayIterator& operator--() {
+    (*this) -= 1;
+    return *this;
+  }
+
+  ChunkedArrayIterator operator++(int) {
+    ChunkedArrayIterator tmp(*this);
+    ++*this;
+    return tmp;
+  }
+  ChunkedArrayIterator operator--(int) {
+    ChunkedArrayIterator tmp(*this);
+    --*this;
+    return tmp;
+  }
+
+  // Arithmetic
+  difference_type operator-(const ChunkedArrayIterator& other) const {
+    return index_ - other.index_;
+  }
+  ChunkedArrayIterator operator+(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ + n);
+  }
+  ChunkedArrayIterator operator-(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ - n);
+  }
+  friend inline ChunkedArrayIterator operator+(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff + other.index_);
+  }
+  friend inline ChunkedArrayIterator operator-(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff - other.index_);
+  }
+  ChunkedArrayIterator& operator+=(difference_type n) {
+    index_ += n;
+    if (index_ != chunked_array_->length()) {

Review Comment:
   I think the check should be `if (index_ < chunked_array_->length())` so that it does not include index values that exceed or equal the length. Also, [here](https://github.com/apache/arrow/pull/13009/files#diff-bfad5d7b30a1afe013c36184f9a807cc4c2d9a5cfed3da2e1414b2662731e6efR149).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] bkietz commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
bkietz commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r870385124


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,162 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    if (index_ != chunked_array.length()) {
+      auto chunk_location = GetChunkLocation(this->index_);
+      current_array_iterator_ = ArrayIterator<ArrayType>(
+          arrow::internal::checked_cast<const ArrayType&>(
+              *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index))),
+          chunk_location.index_in_chunk);
+    } else {
+      current_array_iterator_ = {};
+    }
+  }
+
+  // Value access
+  value_type operator*() const { return *current_array_iterator_; }
+
+  value_type operator[](difference_type n) const {
+    auto chunk_location = GetChunkLocation(index_ + n);
+    ArrayIterator<ArrayType> target_iterator{
+        arrow::internal::checked_cast<const ArrayType&>(
+            *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index)))};
+    return target_iterator[chunk_location.index_in_chunk];
+  }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ChunkedArrayIterator& operator++() {
+    (*this) += 1;
+    return *this;
+  }
+  ChunkedArrayIterator& operator--() {
+    (*this) -= 1;
+    return *this;
+  }
+
+  ChunkedArrayIterator operator++(int) {
+    ChunkedArrayIterator tmp(*this);
+    ++*this;
+    return tmp;
+  }
+  ChunkedArrayIterator operator--(int) {
+    ChunkedArrayIterator tmp(*this);
+    --*this;
+    return tmp;
+  }
+
+  // Arithmetic
+  difference_type operator-(const ChunkedArrayIterator& other) const {
+    return index_ - other.index_;
+  }
+  ChunkedArrayIterator operator+(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ + n);
+  }
+  ChunkedArrayIterator operator-(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ - n);
+  }
+  friend inline ChunkedArrayIterator operator+(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff + other.index_);
+  }
+  friend inline ChunkedArrayIterator operator-(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff - other.index_);
+  }
+  ChunkedArrayIterator& operator+=(difference_type n) {
+    index_ += n;
+    if (index_ != chunked_array_->length()) {

Review Comment:
   Personally, my preference is to maximize the similarity of any random access iterators to indices/non-owning pointers. In this case I'd try to avoid accessing elements at all when incrementing the iterator, deferring the resolution of `current_array_iterator_` until something is actually dereferenced. This would also move out-of-bounds/invalid current array checking into the dereferencing member functions where correct behavior is far more obvious.
   
   The specific question of how to validate out-of-bounds iterators becomes easier to answer too: We can construct an iterator with whatever index and compare/increment/etc without worry since that operates only on the index.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] edponce commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
edponce commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r859959406


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,215 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0), current_chunk_index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    InitializeComponents(*this);
+  }
+
+  // Value access
+  value_type operator*() const { return *iterators_list_[current_chunk_index_]; }
+
+  value_type operator[](difference_type n) const {
+    int64_t chunk_index = GetChunkIndex(index_ + n);
+    int64_t index_in_chunk =
+        chunk_index ? index_ + n - chunks_lengths_[chunk_index - 1] : index_ + n;
+    return iterators_list_[chunk_index]
+                          [index_in_chunk - iterators_list_[chunk_index].index()];

Review Comment:
   Well, ok I see now that it is using the `Array` iterators.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] pitrou commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
pitrou commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r860876661


##########
cpp/src/arrow/stl_iterator_test.cc:
##########
@@ -248,5 +248,216 @@ TEST(ArrayIterator, StdMerge) {
   ASSERT_EQ(values, expected);
 }
 
+TEST(ChunkedArrayIterator, Basics) {
+  auto chunk0 = ArrayFromJSON(int32(), "[4, 5, null]");
+  auto chunk1 = ArrayFromJSON(int32(), "[6]");
+
+  ASSERT_OK_AND_ASSIGN(auto result, ChunkedArray::Make({chunk0, chunk1}, int32()));
+  auto it = Iterate<Int32Type>(*result);
+  optional<int32_t> v = *it;
+  ASSERT_EQ(v, 4);
+  ASSERT_EQ(it[0], 4);
+  ++it;
+  ASSERT_EQ(it[0], 5);
+  ASSERT_EQ(*it, 5);
+  ASSERT_EQ(it[1], nullopt);
+  ASSERT_EQ(it[2], 6);

Review Comment:
   It depends what you call "typical" :-) Some algorithms may require random access iterators, or be faster on random access iterators than on forward-only iterators.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] pitrou commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
pitrou commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r860702668


##########
cpp/src/arrow/chunked_array.h:
##########
@@ -141,6 +141,9 @@ class ARROW_EXPORT ChunkedArray {
   /// \brief Return the type of the chunked array
   const std::shared_ptr<DataType>& type() const { return type_; }
 
+  /// \brief Return chunk resolver of the chunked array
+  const internal::ChunkResolver& GetChunkResolver() const { return chunk_resolver_; }

Review Comment:
   This is exposing an internal facility as a public member API. I don't know what our API hygiene for this should be. @westonpace @xhochy @lidavidm Thoughts?
   
   (we could also make this private and use a `friend` declaration for the consuming class)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] edponce commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
edponce commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r862160969


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,148 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0), current_chunk_index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    auto chunk_location = GetChunkLocation(this->index_);
+    current_array_iterator_ =
+        ArrayIterator<ArrayType>(*arrow::internal::checked_pointer_cast<ArrayType>(
+            chunked_array_->chunk(chunk_location.chunk_index)), index_);
+    this->current_chunk_index_ = chunk_location.chunk_index;
+    current_array_iterator_ -=
+        this->index() - chunk_location.index_in_chunk;
+  }
+
+  // Value access
+  value_type operator*() const { return *current_array_iterator_; }
+
+  value_type operator[](difference_type n) const {
+    auto chunk_location = GetChunkLocation(index_ + n);
+    if (current_chunk_index_ == chunk_location.chunk_index) {
+      return current_array_iterator_[chunk_location.index_in_chunk -
+                                     current_array_iterator_.index()];
+    } else {
+      ArrayIterator<ArrayType> target_iterator{
+          *arrow::internal::checked_pointer_cast<ArrayType>(
+              chunked_array_->chunk(chunk_location.chunk_index))};
+      return target_iterator[chunk_location.index_in_chunk];
+    }
+  }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ChunkedArrayIterator& operator++() {
+    (*this) += 1;
+    return *this;
+  }
+  ChunkedArrayIterator& operator--() {
+    (*this) -= 1;
+    return *this;
+  }
+
+  ChunkedArrayIterator operator++(int) {
+    ChunkedArrayIterator tmp(*this);
+    ++*this;
+    return tmp;
+  }
+  ChunkedArrayIterator operator--(int) {
+    ChunkedArrayIterator tmp(*this);
+    --*this;
+    return tmp;
+  }
+
+  // Arithmetic
+  difference_type operator-(const ChunkedArrayIterator& other) const {
+    return index_ - other.index_;
+  }
+  ChunkedArrayIterator operator+(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ + n);
+  }
+  ChunkedArrayIterator operator-(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ - n);
+  }
+  friend inline ChunkedArrayIterator operator+(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff + other.index_);
+  }
+  friend inline ChunkedArrayIterator operator-(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff - other.index_);
+  }
+  ChunkedArrayIterator& operator+=(difference_type n) {
+    index_ += n;
+    auto chunk_location = GetChunkLocation(index_);
+    if (current_chunk_index_ == chunk_location.chunk_index) {
+      current_array_iterator_ -=
+          current_array_iterator_.index() - chunk_location.index_in_chunk;
+    } else {
+      current_array_iterator_ =
+          ArrayIterator<ArrayType>(*arrow::internal::checked_pointer_cast<ArrayType>(
+                                       chunked_array_->chunk(chunk_location.chunk_index)),
+                                   chunk_location.index_in_chunk);
+      current_chunk_index_ = chunk_location.chunk_index;
+    }
+    return *this;
+  }
+  ChunkedArrayIterator& operator-=(difference_type n) {
+    (*this) += -n;
+    return *this;
+  }
+
+  // Comparisons
+  bool operator==(const ChunkedArrayIterator& other) const {
+    return index_ == other.index_;
+  }
+  bool operator!=(const ChunkedArrayIterator& other) const {
+    return index_ != other.index_;
+  }
+  bool operator<(const ChunkedArrayIterator& other) const {
+    return index_ < other.index_;
+  }
+  bool operator>(const ChunkedArrayIterator& other) const {
+    return index_ > other.index_;
+  }
+  bool operator<=(const ChunkedArrayIterator& other) const {
+    return index_ <= other.index_;
+  }
+  bool operator>=(const ChunkedArrayIterator& other) const {
+    return index_ >= other.index_;
+  }
+
+ private:
+  arrow::internal::ChunkLocation GetChunkLocation(int64_t index) const {
+    return chunked_array_->chunk_resolver_.Resolve(index);
+  }
+
+  const ChunkedArray* chunked_array_;
+  int64_t index_;
+  int64_t current_chunk_index_;
+  ArrayIterator<ArrayType> current_array_iterator_;
+};
+
+template <typename Type, typename ArrayType = typename TypeTraits<Type>::ArrayType>
+ArrayIterator<ArrayType> Iterate(const Array& array) {
+  return stl::ArrayIterator<ArrayType>(&array);
+}
+
+template <typename Type, typename ArrayType = typename TypeTraits<Type>::ArrayType>
+ChunkedArrayIterator<ArrayType> Iterate(const ChunkedArray& chunked_array) {
+  return stl::ChunkedArrayIterator<ArrayType>(chunked_array);
+}

Review Comment:
   Not ideal.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] AlvinJ15 commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
AlvinJ15 commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r862089851


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,148 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0), current_chunk_index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    auto chunk_location = GetChunkLocation(this->index_);
+    current_array_iterator_ =
+        ArrayIterator<ArrayType>(*arrow::internal::checked_pointer_cast<ArrayType>(
+            chunked_array_->chunk(chunk_location.chunk_index)), index_);
+    this->current_chunk_index_ = chunk_location.chunk_index;
+    current_array_iterator_ -=
+        this->index() - chunk_location.index_in_chunk;
+  }
+
+  // Value access
+  value_type operator*() const { return *current_array_iterator_; }
+
+  value_type operator[](difference_type n) const {
+    auto chunk_location = GetChunkLocation(index_ + n);
+    if (current_chunk_index_ == chunk_location.chunk_index) {
+      return current_array_iterator_[chunk_location.index_in_chunk -
+                                     current_array_iterator_.index()];
+    } else {
+      ArrayIterator<ArrayType> target_iterator{
+          *arrow::internal::checked_pointer_cast<ArrayType>(
+              chunked_array_->chunk(chunk_location.chunk_index))};
+      return target_iterator[chunk_location.index_in_chunk];
+    }
+  }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ChunkedArrayIterator& operator++() {
+    (*this) += 1;
+    return *this;
+  }
+  ChunkedArrayIterator& operator--() {
+    (*this) -= 1;
+    return *this;
+  }
+
+  ChunkedArrayIterator operator++(int) {
+    ChunkedArrayIterator tmp(*this);
+    ++*this;
+    return tmp;
+  }
+  ChunkedArrayIterator operator--(int) {
+    ChunkedArrayIterator tmp(*this);
+    --*this;
+    return tmp;
+  }
+
+  // Arithmetic
+  difference_type operator-(const ChunkedArrayIterator& other) const {
+    return index_ - other.index_;
+  }
+  ChunkedArrayIterator operator+(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ + n);
+  }
+  ChunkedArrayIterator operator-(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ - n);
+  }
+  friend inline ChunkedArrayIterator operator+(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff + other.index_);
+  }
+  friend inline ChunkedArrayIterator operator-(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff - other.index_);
+  }
+  ChunkedArrayIterator& operator+=(difference_type n) {
+    index_ += n;
+    auto chunk_location = GetChunkLocation(index_);
+    if (current_chunk_index_ == chunk_location.chunk_index) {
+      current_array_iterator_ -=
+          current_array_iterator_.index() - chunk_location.index_in_chunk;
+    } else {
+      current_array_iterator_ =
+          ArrayIterator<ArrayType>(*arrow::internal::checked_pointer_cast<ArrayType>(
+                                       chunked_array_->chunk(chunk_location.chunk_index)),
+                                   chunk_location.index_in_chunk);
+      current_chunk_index_ = chunk_location.chunk_index;
+    }
+    return *this;
+  }
+  ChunkedArrayIterator& operator-=(difference_type n) {
+    (*this) += -n;
+    return *this;
+  }
+
+  // Comparisons
+  bool operator==(const ChunkedArrayIterator& other) const {
+    return index_ == other.index_;
+  }
+  bool operator!=(const ChunkedArrayIterator& other) const {
+    return index_ != other.index_;
+  }
+  bool operator<(const ChunkedArrayIterator& other) const {
+    return index_ < other.index_;
+  }
+  bool operator>(const ChunkedArrayIterator& other) const {
+    return index_ > other.index_;
+  }
+  bool operator<=(const ChunkedArrayIterator& other) const {
+    return index_ <= other.index_;
+  }
+  bool operator>=(const ChunkedArrayIterator& other) const {
+    return index_ >= other.index_;
+  }
+
+ private:
+  arrow::internal::ChunkLocation GetChunkLocation(int64_t index) const {
+    return chunked_array_->chunk_resolver_.Resolve(index);
+  }
+
+  const ChunkedArray* chunked_array_;
+  int64_t index_;
+  int64_t current_chunk_index_;
+  ArrayIterator<ArrayType> current_array_iterator_;
+};
+
+template <typename Type, typename ArrayType = typename TypeTraits<Type>::ArrayType>
+ArrayIterator<ArrayType> Iterate(const Array& array) {
+  return stl::ArrayIterator<ArrayType>(&array);
+}
+
+template <typename Type, typename ArrayType = typename TypeTraits<Type>::ArrayType>
+ChunkedArrayIterator<ArrayType> Iterate(const ChunkedArray& chunked_array) {
+  return stl::ChunkedArrayIterator<ArrayType>(chunked_array);
+}

Review Comment:
   I could add a condition for validate that and raise properly an error message, but I don't know if this is a good solution for that problem.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] pitrou commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
pitrou commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r869616447


##########
cpp/src/arrow/chunked_array.h:
##########
@@ -36,6 +36,10 @@ namespace arrow {
 class Array;
 class DataType;
 class MemoryPool;
+namespace stl {
+template <typename T, typename V>
+class ChunkedArrayIterator;
+}

Review Comment:
   It should indeed.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] edponce commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
edponce commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r870292161


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,162 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    if (index_ != chunked_array.length()) {
+      auto chunk_location = GetChunkLocation(this->index_);
+      current_array_iterator_ = ArrayIterator<ArrayType>(
+          arrow::internal::checked_cast<const ArrayType&>(
+              *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index))),
+          chunk_location.index_in_chunk);
+    } else {
+      current_array_iterator_ = {};
+    }
+  }
+
+  // Value access
+  value_type operator*() const { return *current_array_iterator_; }
+
+  value_type operator[](difference_type n) const {
+    auto chunk_location = GetChunkLocation(index_ + n);
+    ArrayIterator<ArrayType> target_iterator{
+        arrow::internal::checked_cast<const ArrayType&>(
+            *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index)))};
+    return target_iterator[chunk_location.index_in_chunk];
+  }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ChunkedArrayIterator& operator++() {
+    (*this) += 1;
+    return *this;
+  }
+  ChunkedArrayIterator& operator--() {
+    (*this) -= 1;
+    return *this;
+  }
+
+  ChunkedArrayIterator operator++(int) {
+    ChunkedArrayIterator tmp(*this);
+    ++*this;
+    return tmp;
+  }
+  ChunkedArrayIterator operator--(int) {
+    ChunkedArrayIterator tmp(*this);
+    --*this;
+    return tmp;
+  }
+
+  // Arithmetic
+  difference_type operator-(const ChunkedArrayIterator& other) const {
+    return index_ - other.index_;
+  }
+  ChunkedArrayIterator operator+(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ + n);
+  }
+  ChunkedArrayIterator operator-(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ - n);
+  }
+  friend inline ChunkedArrayIterator operator+(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff + other.index_);
+  }
+  friend inline ChunkedArrayIterator operator-(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff - other.index_);
+  }
+  ChunkedArrayIterator& operator+=(difference_type n) {
+    index_ += n;
+    if (index_ != chunked_array_->length()) {

Review Comment:
   It seems that any value is valid as long as you can return to the initial value with its inverse operation. So the check `!=` follows the operational semantics of a RandomAccessIterator.
   Actually, the `<` check would be insufficient because it allows negative indices (need absolute value of index).
   
   The question now becomes, should Arrow iterators guard against out-of-bounds accesses?
   From a performance perspective, they have basically the same cost (+ std::abs).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] edponce commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
edponce commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r859978529


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,215 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0), current_chunk_index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    InitializeComponents(*this);
+  }
+
+  // Value access
+  value_type operator*() const { return *iterators_list_[current_chunk_index_]; }
+
+  value_type operator[](difference_type n) const {
+    int64_t chunk_index = GetChunkIndex(index_ + n);
+    int64_t index_in_chunk =
+        chunk_index ? index_ + n - chunks_lengths_[chunk_index - 1] : index_ + n;
+    return iterators_list_[chunk_index]
+                          [index_in_chunk - iterators_list_[chunk_index].index()];
+  }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ChunkedArrayIterator& operator++() {
+    ++index_;
+    if (iterators_list_[current_chunk_index_].index() ==
+        static_cast<int64_t>(
+            chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length()) -
+            1) {
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+      current_chunk_index_++;
+      if (!static_cast<int64_t>(
+              chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length())) {
+        current_chunk_index_ = GetChunkIndex(index_);
+      }
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+    } else {
+      iterators_list_[current_chunk_index_]++;
+    }
+    return *this;
+  }
+  ChunkedArrayIterator& operator--() {
+    --index_;
+    if (iterators_list_[current_chunk_index_].index()) {
+      iterators_list_[current_chunk_index_]--;
+    } else {
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+      current_chunk_index_--;
+      if (!static_cast<int64_t>(
+              chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length())) {
+        current_chunk_index_ = GetChunkIndex(index_);
+      }
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+      iterators_list_[current_chunk_index_] +=
+          static_cast<int64_t>(
+              chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length()) -
+          1;
+    }
+    return *this;
+  }
+
+  ChunkedArrayIterator operator++(int) {
+    ChunkedArrayIterator tmp(*this);
+    ++*this;
+    return tmp;
+  }
+  ChunkedArrayIterator operator--(int) {
+    ChunkedArrayIterator tmp(*this);
+    --*this;
+    return tmp;
+  }
+
+  // Arithmetic
+  difference_type operator-(const ChunkedArrayIterator& other) const {
+    return index_ - other.index_;
+  }
+  ChunkedArrayIterator operator+(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ + n);
+  }
+  ChunkedArrayIterator operator-(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ - n);
+  }
+  friend inline ChunkedArrayIterator operator+(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff + other.index_);
+  }
+  friend inline ChunkedArrayIterator operator-(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff - other.index_);
+  }
+  ChunkedArrayIterator& operator+=(difference_type n) {
+    if (n < static_cast<int64_t>(
+                chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length()) -
+                iterators_list_[current_chunk_index_].index()) {
+      index_ += n;
+      iterators_list_[current_chunk_index_] += n;
+      return *this;
+    } else {
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+      index_ += n;
+      InitializeComponents(*this, true);
+      return *this;
+    }
+  }
+  ChunkedArrayIterator& operator-=(difference_type n) {
+    if (n <= iterators_list_[current_chunk_index_].index()) {
+      index_ -= n;
+      iterators_list_[current_chunk_index_] -= n;
+      return *this;
+    } else {
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+      index_ -= n;
+      InitializeComponents(*this, true);
+      return *this;
+    }
+  }
+
+  // Comparisons
+  bool operator==(const ChunkedArrayIterator& other) const {
+    return index_ == other.index_;
+  }
+  bool operator!=(const ChunkedArrayIterator& other) const {
+    return index_ != other.index_;
+  }
+  bool operator<(const ChunkedArrayIterator& other) const {
+    return index_ < other.index_;
+  }
+  bool operator>(const ChunkedArrayIterator& other) const {
+    return index_ > other.index_;
+  }
+  bool operator<=(const ChunkedArrayIterator& other) const {
+    return index_ <= other.index_;
+  }
+  bool operator>=(const ChunkedArrayIterator& other) const {
+    return index_ >= other.index_;
+  }
+
+ private:
+  int64_t GetChunkIndex(int64_t index) const {
+    return static_cast<int64_t>(
+        std::upper_bound(chunks_lengths_.begin(), chunks_lengths_.end(), index) -
+        chunks_lengths_.begin());
+  }
+
+  void InitializeComponents(ChunkedArrayIterator<ArrayType>& chunked_array_iterator,
+                            bool update = false) {
+    if (!update) {
+      int64_t chunk_index = 0;
+      for (const auto& array : chunked_array_iterator.chunked_array_->chunks()) {
+        chunked_array_iterator.iterators_list_.emplace_back(
+            *arrow::internal::checked_pointer_cast<ArrayType>(array));
+        auto chunk_length = static_cast<int64_t>(array->length());
+        if (chunk_index) {
+          chunked_array_iterator.chunks_lengths_.push_back(
+              chunk_length + chunked_array_iterator.chunks_lengths_[chunk_index - 1]);
+        } else {
+          chunked_array_iterator.chunks_lengths_.push_back(chunk_length);
+        }
+        chunk_index++;

Review Comment:
   Nit: Factor out the push_back operation from the `if-else`
   ```c++
   auto chunk_length = ...;
   if (chunk_index) {
      chunk_length += chunked_array_iterator.chunks_lengths_[chunk_index - 1]);
   }
   chunked_array_iterator.chunks_lengths_.push_back(chunk_length);
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] edponce commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
edponce commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r860861167


##########
cpp/src/arrow/chunked_array.h:
##########
@@ -141,6 +141,9 @@ class ARROW_EXPORT ChunkedArray {
   /// \brief Return the type of the chunked array
   const std::shared_ptr<DataType>& type() const { return type_; }
 
+  /// \brief Return chunk resolver of the chunked array
+  const internal::ChunkResolver& GetChunkResolver() const { return chunk_resolver_; }

Review Comment:
   I am in favor of using a `friend` declaration.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] AlvinJ15 commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
AlvinJ15 commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r869971878


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,162 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    if (index_ != chunked_array.length()) {
+      auto chunk_location = GetChunkLocation(this->index_);
+      current_array_iterator_ = ArrayIterator<ArrayType>(
+          arrow::internal::checked_cast<const ArrayType&>(
+              *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index))),
+          chunk_location.index_in_chunk);
+    } else {
+      current_array_iterator_ = {};
+    }
+  }
+
+  // Value access
+  value_type operator*() const { return *current_array_iterator_; }
+
+  value_type operator[](difference_type n) const {
+    auto chunk_location = GetChunkLocation(index_ + n);
+    ArrayIterator<ArrayType> target_iterator{
+        arrow::internal::checked_cast<const ArrayType&>(
+            *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index)))};
+    return target_iterator[chunk_location.index_in_chunk];
+  }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ChunkedArrayIterator& operator++() {
+    (*this) += 1;
+    return *this;
+  }
+  ChunkedArrayIterator& operator--() {
+    (*this) -= 1;
+    return *this;
+  }
+
+  ChunkedArrayIterator operator++(int) {
+    ChunkedArrayIterator tmp(*this);
+    ++*this;
+    return tmp;
+  }
+  ChunkedArrayIterator operator--(int) {
+    ChunkedArrayIterator tmp(*this);
+    --*this;
+    return tmp;
+  }
+
+  // Arithmetic
+  difference_type operator-(const ChunkedArrayIterator& other) const {
+    return index_ - other.index_;
+  }

Review Comment:
   There is a similar method in ArrayIterator, for calculate the difference between two iterators(like a distance), I think make a  + operator  would not be used



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] edponce commented on pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
edponce commented on PR #13009:
URL: https://github.com/apache/arrow/pull/13009#issuecomment-1113603762

   @pitrou @lidavidm I see now the issue with typing and `ChunkedArray`. Related to this, @AlvinJ15 attempted to add `ChunkedArrayIterator` as a friend class to `ChunkedArray`, but `ChunkedArrayIterator` requires the `ArrayType` template parameter.
   
   `ChunkedArray` is agnostic of the `Array` type which `ChunkedArrayIterator` requires. Nevertheless, we can get the `ArrayType` via `TypeTraits` if we have the data type at compile-time. [Specialized `Arrays` have the `TypeClass` attribute](https://github.com/apache/arrow/blob/master/cpp/src/arrow/array/array_primitive.h#L41) which would work, but this attribute is not part of the base `Array` class which is what `ChunkedArray` works with.
   
   If we could add a `TypeClass` attribute to base `Array` then I think we are good, but what value would it have?
   Other ideas?


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] lidavidm commented on pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
lidavidm commented on PR #13009:
URL: https://github.com/apache/arrow/pull/13009#issuecomment-1113609517

   > If we could add a `TypeClass` attribute to base `Array` then I think we are good, but what value would it have?
   
   How would this work? The value would vary at runtime, but we need it to be compile time.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] edponce commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
edponce commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r859921831


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,215 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0), current_chunk_index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    InitializeComponents(*this);
+  }
+
+  // Value access
+  value_type operator*() const { return *iterators_list_[current_chunk_index_]; }
+
+  value_type operator[](difference_type n) const {
+    int64_t chunk_index = GetChunkIndex(index_ + n);
+    int64_t index_in_chunk =
+        chunk_index ? index_ + n - chunks_lengths_[chunk_index - 1] : index_ + n;
+    return iterators_list_[chunk_index]
+                          [index_in_chunk - iterators_list_[chunk_index].index()];

Review Comment:
   [`ChunkedArray` already has `GetScalar(index)`](https://github.com/apache/arrow/blob/master/cpp/src/arrow/chunked_array.h#L145) which is efficient for accessing sequential scalars at particular logical indices. I suggest to use it for the iteration.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] lidavidm commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
lidavidm commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r860872205


##########
cpp/src/arrow/stl_iterator_test.cc:
##########
@@ -248,5 +248,216 @@ TEST(ArrayIterator, StdMerge) {
   ASSERT_EQ(values, expected);
 }
 
+TEST(ChunkedArrayIterator, Basics) {
+  auto chunk0 = ArrayFromJSON(int32(), "[4, 5, null]");
+  auto chunk1 = ArrayFromJSON(int32(), "[6]");
+
+  ASSERT_OK_AND_ASSIGN(auto result, ChunkedArray::Make({chunk0, chunk1}, int32()));
+  auto it = Iterate<Int32Type>(*result);
+  optional<int32_t> v = *it;
+  ASSERT_EQ(v, 4);
+  ASSERT_EQ(it[0], 4);
+  ++it;
+  ASSERT_EQ(it[0], 5);
+  ASSERT_EQ(*it, 5);
+  ASSERT_EQ(it[1], nullopt);
+  ASSERT_EQ(it[2], 6);

Review Comment:
   Right, but I don't think subscripting is part of the typical iterator operations right? It works for pointers but not necessarily other iterators?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] pitrou commented on pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
pitrou commented on PR #13009:
URL: https://github.com/apache/arrow/pull/13009#issuecomment-1122602707

   @westonpace @lidavidm Feel free to leave any comments on the API even after this is merged.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] edponce commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
edponce commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r859983654


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,215 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0), current_chunk_index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    InitializeComponents(*this);
+  }
+
+  // Value access
+  value_type operator*() const { return *iterators_list_[current_chunk_index_]; }
+
+  value_type operator[](difference_type n) const {
+    int64_t chunk_index = GetChunkIndex(index_ + n);
+    int64_t index_in_chunk =
+        chunk_index ? index_ + n - chunks_lengths_[chunk_index - 1] : index_ + n;
+    return iterators_list_[chunk_index]
+                          [index_in_chunk - iterators_list_[chunk_index].index()];
+  }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ChunkedArrayIterator& operator++() {
+    ++index_;
+    if (iterators_list_[current_chunk_index_].index() ==
+        static_cast<int64_t>(
+            chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length()) -
+            1) {
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+      current_chunk_index_++;
+      if (!static_cast<int64_t>(
+              chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length())) {
+        current_chunk_index_ = GetChunkIndex(index_);
+      }
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+    } else {
+      iterators_list_[current_chunk_index_]++;
+    }
+    return *this;
+  }
+  ChunkedArrayIterator& operator--() {
+    --index_;
+    if (iterators_list_[current_chunk_index_].index()) {
+      iterators_list_[current_chunk_index_]--;
+    } else {
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+      current_chunk_index_--;
+      if (!static_cast<int64_t>(
+              chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length())) {
+        current_chunk_index_ = GetChunkIndex(index_);
+      }
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+      iterators_list_[current_chunk_index_] +=
+          static_cast<int64_t>(
+              chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length()) -
+          1;
+    }
+    return *this;
+  }
+
+  ChunkedArrayIterator operator++(int) {
+    ChunkedArrayIterator tmp(*this);
+    ++*this;
+    return tmp;
+  }
+  ChunkedArrayIterator operator--(int) {
+    ChunkedArrayIterator tmp(*this);
+    --*this;
+    return tmp;
+  }
+
+  // Arithmetic
+  difference_type operator-(const ChunkedArrayIterator& other) const {
+    return index_ - other.index_;
+  }
+  ChunkedArrayIterator operator+(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ + n);
+  }
+  ChunkedArrayIterator operator-(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ - n);
+  }
+  friend inline ChunkedArrayIterator operator+(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff + other.index_);
+  }
+  friend inline ChunkedArrayIterator operator-(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff - other.index_);
+  }
+  ChunkedArrayIterator& operator+=(difference_type n) {
+    if (n < static_cast<int64_t>(
+                chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length()) -
+                iterators_list_[current_chunk_index_].index()) {
+      index_ += n;
+      iterators_list_[current_chunk_index_] += n;
+      return *this;
+    } else {
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+      index_ += n;
+      InitializeComponents(*this, true);
+      return *this;
+    }
+  }
+  ChunkedArrayIterator& operator-=(difference_type n) {
+    if (n <= iterators_list_[current_chunk_index_].index()) {
+      index_ -= n;
+      iterators_list_[current_chunk_index_] -= n;
+      return *this;
+    } else {
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+      index_ -= n;
+      InitializeComponents(*this, true);
+      return *this;
+    }
+  }
+
+  // Comparisons
+  bool operator==(const ChunkedArrayIterator& other) const {
+    return index_ == other.index_;
+  }
+  bool operator!=(const ChunkedArrayIterator& other) const {
+    return index_ != other.index_;
+  }
+  bool operator<(const ChunkedArrayIterator& other) const {
+    return index_ < other.index_;
+  }
+  bool operator>(const ChunkedArrayIterator& other) const {
+    return index_ > other.index_;
+  }
+  bool operator<=(const ChunkedArrayIterator& other) const {
+    return index_ <= other.index_;
+  }
+  bool operator>=(const ChunkedArrayIterator& other) const {
+    return index_ >= other.index_;
+  }
+
+ private:
+  int64_t GetChunkIndex(int64_t index) const {
+    return static_cast<int64_t>(
+        std::upper_bound(chunks_lengths_.begin(), chunks_lengths_.end(), index) -
+        chunks_lengths_.begin());
+  }
+
+  void InitializeComponents(ChunkedArrayIterator<ArrayType>& chunked_array_iterator,
+                            bool update = false) {
+    if (!update) {
+      int64_t chunk_index = 0;
+      for (const auto& array : chunked_array_iterator.chunked_array_->chunks()) {
+        chunked_array_iterator.iterators_list_.emplace_back(
+            *arrow::internal::checked_pointer_cast<ArrayType>(array));
+        auto chunk_length = static_cast<int64_t>(array->length());
+        if (chunk_index) {
+          chunked_array_iterator.chunks_lengths_.push_back(
+              chunk_length + chunked_array_iterator.chunks_lengths_[chunk_index - 1]);
+        } else {
+          chunked_array_iterator.chunks_lengths_.push_back(chunk_length);
+        }
+        chunk_index++;
+      }
+    }
+
+    chunked_array_iterator.current_chunk_index_ =
+        GetChunkIndex(chunked_array_iterator.index_);
+    auto& current_iterator =
+        chunked_array_iterator
+            .iterators_list_[chunked_array_iterator.current_chunk_index_];
+    current_iterator -= current_iterator.index();
+    if (chunked_array_iterator.current_chunk_index_)
+      current_iterator +=
+          chunked_array_iterator.index_ -
+          chunked_array_iterator
+              .chunks_lengths_[chunked_array_iterator.current_chunk_index_ - 1];
+    else
+      current_iterator += chunked_array_iterator.index_;
+  }

Review Comment:
   Not sure if this is enforced in Arrow but I suggest to always use curly-braces for `if-else` blocks.
   Nit: This can be simplified to
   ```c++
   auto& current_iterator = chunked_array_iterator.iterators_list_[chunked_array_iterator.current_chunk_index_] - current_iterator.index() + chunked_array_iterator.index_;
   if (chunked_array_iterator.current_chunk_index_) {
     current_iterator += chunked_array_iterator.chunks_lengths_[chunked_array_iterator.current_chunk_index_ - 1];
   }
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] lidavidm commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
lidavidm commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r869616519


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,162 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    if (index_ != chunked_array.length()) {
+      auto chunk_location = GetChunkLocation(this->index_);
+      current_array_iterator_ = ArrayIterator<ArrayType>(
+          arrow::internal::checked_cast<const ArrayType&>(
+              *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index))),
+          chunk_location.index_in_chunk);
+    } else {
+      current_array_iterator_ = {};
+    }
+  }
+
+  // Value access
+  value_type operator*() const { return *current_array_iterator_; }
+
+  value_type operator[](difference_type n) const {
+    auto chunk_location = GetChunkLocation(index_ + n);
+    ArrayIterator<ArrayType> target_iterator{
+        arrow::internal::checked_cast<const ArrayType&>(
+            *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index)))};
+    return target_iterator[chunk_location.index_in_chunk];
+  }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ChunkedArrayIterator& operator++() {
+    (*this) += 1;
+    return *this;
+  }
+  ChunkedArrayIterator& operator--() {
+    (*this) -= 1;
+    return *this;
+  }
+
+  ChunkedArrayIterator operator++(int) {
+    ChunkedArrayIterator tmp(*this);
+    ++*this;
+    return tmp;
+  }
+  ChunkedArrayIterator operator--(int) {
+    ChunkedArrayIterator tmp(*this);
+    --*this;
+    return tmp;
+  }
+
+  // Arithmetic
+  difference_type operator-(const ChunkedArrayIterator& other) const {
+    return index_ - other.index_;
+  }
+  ChunkedArrayIterator operator+(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ + n);
+  }
+  ChunkedArrayIterator operator-(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ - n);
+  }
+  friend inline ChunkedArrayIterator operator+(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff + other.index_);
+  }
+  friend inline ChunkedArrayIterator operator-(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff - other.index_);
+  }
+  ChunkedArrayIterator& operator+=(difference_type n) {
+    index_ += n;
+    if (index_ != chunked_array_->length()) {

Review Comment:
   Ah right, I always forget that…



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] edponce commented on pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
edponce commented on PR #13009:
URL: https://github.com/apache/arrow/pull/13009#issuecomment-1112381270

   > That wouldn't work, because there are no specialized subclasses of ChunkedArray. We want the iterator to yield raw C values whose type is datatype-specific.
   ...I still think it can work for the `Array` types that currently support iteration (std::enable_if magic). The `ChunkedArrayIterator` in this PR can extract raw C values from a `ChunkedArray` so I am not suggesting something different.
   
   > It's used by `ApplyBinaryChunked`.
   My mistake, did a grep and thought only declaration/definition were in chunked_array.h/cc
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] lidavidm commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
lidavidm commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r860854295


##########
cpp/src/arrow/chunked_array.h:
##########
@@ -141,6 +141,9 @@ class ARROW_EXPORT ChunkedArray {
   /// \brief Return the type of the chunked array
   const std::shared_ptr<DataType>& type() const { return type_; }
 
+  /// \brief Return chunk resolver of the chunked array
+  const internal::ChunkResolver& GetChunkResolver() const { return chunk_resolver_; }

Review Comment:
   At the very least it needs to be clearly marked off as an internal API, right now it looks like any other public API. Ideally it would be private with a friend declaration.
   
   ```suggestion
     const internal::ChunkResolver& chunk_resolver() const { return chunk_resolver_; }
   ```



##########
cpp/src/arrow/stl_iterator_test.cc:
##########
@@ -248,5 +248,216 @@ TEST(ArrayIterator, StdMerge) {
   ASSERT_EQ(values, expected);
 }
 
+TEST(ChunkedArrayIterator, Basics) {
+  auto chunk0 = ArrayFromJSON(int32(), "[4, 5, null]");
+  auto chunk1 = ArrayFromJSON(int32(), "[6]");
+
+  ASSERT_OK_AND_ASSIGN(auto result, ChunkedArray::Make({chunk0, chunk1}, int32()));

Review Comment:
   Note there's a `ChunkedArrayFromJSON`.
   
   Can we also test a chunked array with no chunks?



##########
cpp/src/arrow/stl_iterator_test.cc:
##########
@@ -248,5 +248,216 @@ TEST(ArrayIterator, StdMerge) {
   ASSERT_EQ(values, expected);
 }
 
+TEST(ChunkedArrayIterator, Basics) {
+  auto chunk0 = ArrayFromJSON(int32(), "[4, 5, null]");
+  auto chunk1 = ArrayFromJSON(int32(), "[6]");
+
+  ASSERT_OK_AND_ASSIGN(auto result, ChunkedArray::Make({chunk0, chunk1}, int32()));
+  auto it = Iterate<Int32Type>(*result);
+  optional<int32_t> v = *it;
+  ASSERT_EQ(v, 4);
+  ASSERT_EQ(it[0], 4);
+  ++it;
+  ASSERT_EQ(it[0], 5);
+  ASSERT_EQ(*it, 5);
+  ASSERT_EQ(it[1], nullopt);
+  ASSERT_EQ(it[2], 6);

Review Comment:
   Is indexing an iterator a typical operation?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] lidavidm commented on pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
lidavidm commented on PR #13009:
URL: https://github.com/apache/arrow/pull/13009#issuecomment-1112384179

   > > That wouldn't work, because there are no specialized subclasses of ChunkedArray. We want the iterator to yield raw C values whose type is datatype-specific.
   > 
   > ...I still think it can work for the `Array` types that currently support iteration (std::enable_if magic). The `ChunkedArrayIterator` in this PR can extract raw C values from a `ChunkedArray` so I am not suggesting something different.
   
   Er, you simply can't implement `begin` right? It would have to be
   
   ```
   template <typename Type>
   ChunkedArrayIterator<Type> begin() const;
   ```
   
   which is a little clumsy


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] edponce commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
edponce commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r869635511


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,162 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    if (index_ != chunked_array.length()) {
+      auto chunk_location = GetChunkLocation(this->index_);
+      current_array_iterator_ = ArrayIterator<ArrayType>(
+          arrow::internal::checked_cast<const ArrayType&>(
+              *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index))),
+          chunk_location.index_in_chunk);
+    } else {
+      current_array_iterator_ = {};
+    }
+  }
+
+  // Value access
+  value_type operator*() const { return *current_array_iterator_; }
+
+  value_type operator[](difference_type n) const {
+    auto chunk_location = GetChunkLocation(index_ + n);
+    ArrayIterator<ArrayType> target_iterator{
+        arrow::internal::checked_cast<const ArrayType&>(
+            *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index)))};
+    return target_iterator[chunk_location.index_in_chunk];
+  }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ChunkedArrayIterator& operator++() {
+    (*this) += 1;
+    return *this;
+  }
+  ChunkedArrayIterator& operator--() {
+    (*this) -= 1;
+    return *this;
+  }
+
+  ChunkedArrayIterator operator++(int) {
+    ChunkedArrayIterator tmp(*this);
+    ++*this;
+    return tmp;
+  }
+  ChunkedArrayIterator operator--(int) {
+    ChunkedArrayIterator tmp(*this);
+    --*this;
+    return tmp;
+  }
+
+  // Arithmetic
+  difference_type operator-(const ChunkedArrayIterator& other) const {
+    return index_ - other.index_;
+  }
+  ChunkedArrayIterator operator+(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ + n);
+  }
+  ChunkedArrayIterator operator-(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ - n);
+  }
+  friend inline ChunkedArrayIterator operator+(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff + other.index_);
+  }
+  friend inline ChunkedArrayIterator operator-(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff - other.index_);
+  }
+  ChunkedArrayIterator& operator+=(difference_type n) {
+    index_ += n;
+    if (index_ != chunked_array_->length()) {
+      auto chunk_location = GetChunkLocation(index_);
+      current_array_iterator_ = ArrayIterator<ArrayType>(
+          arrow::internal::checked_cast<const ArrayType&>(
+              *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index))),
+          chunk_location.index_in_chunk);
+    } else {
+      current_array_iterator_ = {};
+    }
+    return *this;
+  }
+  ChunkedArrayIterator& operator-=(difference_type n) {
+    (*this) += -n;
+    return *this;
+  }
+
+  // Comparisons
+  bool operator==(const ChunkedArrayIterator& other) const {
+    return index_ == other.index_;
+  }
+  bool operator!=(const ChunkedArrayIterator& other) const {
+    return index_ != other.index_;
+  }
+  bool operator<(const ChunkedArrayIterator& other) const {
+    return index_ < other.index_;
+  }
+  bool operator>(const ChunkedArrayIterator& other) const {
+    return index_ > other.index_;
+  }
+  bool operator<=(const ChunkedArrayIterator& other) const {
+    return index_ <= other.index_;
+  }
+  bool operator>=(const ChunkedArrayIterator& other) const {
+    return index_ >= other.index_;
+  }
+
+ private:
+  arrow::internal::ChunkLocation GetChunkLocation(int64_t index) const {
+    return chunked_array_->chunk_resolver_.Resolve(index);
+  }
+
+  const ChunkedArray* chunked_array_;
+  int64_t index_;
+  ArrayIterator<ArrayType> current_array_iterator_;
+};
+
+/// Return an iterator to the beginning of the chunked array
+template <typename Type, typename ArrayType = typename TypeTraits<Type>::ArrayType>
+ChunkedArrayIterator<ArrayType> Begin(const ChunkedArray& chunked_array) {
+  return stl::ChunkedArrayIterator<ArrayType>(chunked_array);
+}
+
+/// Return an iterator to the end of the chunked array
+template <typename Type, typename ArrayType = typename TypeTraits<Type>::ArrayType>
+ChunkedArrayIterator<ArrayType> End(const ChunkedArray& chunked_array) {
+  return stl::ChunkedArrayIterator<ArrayType>(chunked_array, chunked_array.length());
+}

Review Comment:
   Nit: Sometimes `ChunkedArrayIterator<>` is used and other times `stl::ChunkedArrayIterator<>`.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] pitrou commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
pitrou commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r869972999


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,162 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    if (index_ != chunked_array.length()) {
+      auto chunk_location = GetChunkLocation(this->index_);
+      current_array_iterator_ = ArrayIterator<ArrayType>(
+          arrow::internal::checked_cast<const ArrayType&>(
+              *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index))),
+          chunk_location.index_in_chunk);
+    } else {
+      current_array_iterator_ = {};
+    }
+  }
+
+  // Value access
+  value_type operator*() const { return *current_array_iterator_; }
+
+  value_type operator[](difference_type n) const {
+    auto chunk_location = GetChunkLocation(index_ + n);
+    ArrayIterator<ArrayType> target_iterator{
+        arrow::internal::checked_cast<const ArrayType&>(
+            *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index)))};
+    return target_iterator[chunk_location.index_in_chunk];
+  }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ChunkedArrayIterator& operator++() {
+    (*this) += 1;
+    return *this;
+  }
+  ChunkedArrayIterator& operator--() {
+    (*this) -= 1;
+    return *this;
+  }
+
+  ChunkedArrayIterator operator++(int) {
+    ChunkedArrayIterator tmp(*this);
+    ++*this;
+    return tmp;
+  }
+  ChunkedArrayIterator operator--(int) {
+    ChunkedArrayIterator tmp(*this);
+    --*this;
+    return tmp;
+  }
+
+  // Arithmetic
+  difference_type operator-(const ChunkedArrayIterator& other) const {
+    return index_ - other.index_;
+  }

Review Comment:
   @edponce See https://en.cppreference.com/w/cpp/named_req/RandomAccessIterator for the requirements for the random access iterator.



##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,162 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    if (index_ != chunked_array.length()) {
+      auto chunk_location = GetChunkLocation(this->index_);
+      current_array_iterator_ = ArrayIterator<ArrayType>(
+          arrow::internal::checked_cast<const ArrayType&>(
+              *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index))),
+          chunk_location.index_in_chunk);
+    } else {
+      current_array_iterator_ = {};
+    }
+  }
+
+  // Value access
+  value_type operator*() const { return *current_array_iterator_; }
+
+  value_type operator[](difference_type n) const {
+    auto chunk_location = GetChunkLocation(index_ + n);
+    ArrayIterator<ArrayType> target_iterator{
+        arrow::internal::checked_cast<const ArrayType&>(
+            *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index)))};
+    return target_iterator[chunk_location.index_in_chunk];
+  }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ChunkedArrayIterator& operator++() {
+    (*this) += 1;
+    return *this;
+  }
+  ChunkedArrayIterator& operator--() {
+    (*this) -= 1;
+    return *this;
+  }
+
+  ChunkedArrayIterator operator++(int) {
+    ChunkedArrayIterator tmp(*this);
+    ++*this;
+    return tmp;
+  }
+  ChunkedArrayIterator operator--(int) {
+    ChunkedArrayIterator tmp(*this);
+    --*this;
+    return tmp;
+  }
+
+  // Arithmetic
+  difference_type operator-(const ChunkedArrayIterator& other) const {
+    return index_ - other.index_;
+  }

Review Comment:
   @edponce See https://en.cppreference.com/w/cpp/named_req/RandomAccessIterator for the requirements for a random access iterator.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] pitrou commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
pitrou commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r870446880


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,162 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    if (index_ != chunked_array.length()) {
+      auto chunk_location = GetChunkLocation(this->index_);
+      current_array_iterator_ = ArrayIterator<ArrayType>(
+          arrow::internal::checked_cast<const ArrayType&>(
+              *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index))),
+          chunk_location.index_in_chunk);
+    } else {
+      current_array_iterator_ = {};
+    }
+  }
+
+  // Value access
+  value_type operator*() const { return *current_array_iterator_; }
+
+  value_type operator[](difference_type n) const {
+    auto chunk_location = GetChunkLocation(index_ + n);
+    ArrayIterator<ArrayType> target_iterator{
+        arrow::internal::checked_cast<const ArrayType&>(
+            *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index)))};
+    return target_iterator[chunk_location.index_in_chunk];
+  }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ChunkedArrayIterator& operator++() {
+    (*this) += 1;
+    return *this;
+  }
+  ChunkedArrayIterator& operator--() {
+    (*this) -= 1;
+    return *this;
+  }
+
+  ChunkedArrayIterator operator++(int) {
+    ChunkedArrayIterator tmp(*this);
+    ++*this;
+    return tmp;
+  }
+  ChunkedArrayIterator operator--(int) {
+    ChunkedArrayIterator tmp(*this);
+    --*this;
+    return tmp;
+  }
+
+  // Arithmetic
+  difference_type operator-(const ChunkedArrayIterator& other) const {
+    return index_ - other.index_;
+  }
+  ChunkedArrayIterator operator+(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ + n);
+  }
+  ChunkedArrayIterator operator-(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ - n);
+  }
+  friend inline ChunkedArrayIterator operator+(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff + other.index_);
+  }
+  friend inline ChunkedArrayIterator operator-(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff - other.index_);
+  }
+  ChunkedArrayIterator& operator+=(difference_type n) {
+    index_ += n;
+    if (index_ != chunked_array_->length()) {

Review Comment:
   @edponce In https://en.cppreference.com/w/cpp/named_req/ForwardIterator, you can see that only dereferenceable iterators can be incremented. And all other operations are derived from incrementation.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] pitrou commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
pitrou commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r860704871


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,199 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0), current_chunk_index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    UpdateComponents(*this);
+  }
+
+  // Value access
+  value_type operator*() const { return *iterators_list_[current_chunk_index_]; }
+
+  value_type operator[](difference_type n) const {
+    auto chunk_location = GetChunkLocation(index_ + n);
+    return iterators_list_[chunk_location.chunk_index]
+                          [chunk_location.index_in_chunk -
+                           iterators_list_[chunk_location.chunk_index].index()];
+  }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ChunkedArrayIterator& operator++() {
+    ++index_;
+    if (iterators_list_[current_chunk_index_].index() ==
+        chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length() - 1) {
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+      current_chunk_index_++;
+      if (!chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length()) {
+        current_chunk_index_ = GetChunkLocation(index_).chunk_index;
+      }
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+    } else {
+      iterators_list_[current_chunk_index_]++;
+    }
+    return *this;
+  }
+  ChunkedArrayIterator& operator--() {
+    --index_;
+    if (iterators_list_[current_chunk_index_].index()) {
+      iterators_list_[current_chunk_index_]--;
+    } else {
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+      current_chunk_index_--;
+      if (!chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length()) {
+        current_chunk_index_ = GetChunkLocation(index_).chunk_index;
+      }
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+      iterators_list_[current_chunk_index_] +=
+          chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length() - 1;
+    }
+    return *this;
+  }
+
+  ChunkedArrayIterator operator++(int) {
+    ChunkedArrayIterator tmp(*this);
+    ++*this;
+    return tmp;
+  }
+  ChunkedArrayIterator operator--(int) {
+    ChunkedArrayIterator tmp(*this);
+    --*this;
+    return tmp;
+  }
+
+  // Arithmetic
+  difference_type operator-(const ChunkedArrayIterator& other) const {
+    return index_ - other.index_;
+  }
+  ChunkedArrayIterator operator+(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ + n);
+  }
+  ChunkedArrayIterator operator-(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ - n);
+  }
+  friend inline ChunkedArrayIterator operator+(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff + other.index_);
+  }
+  friend inline ChunkedArrayIterator operator-(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff - other.index_);
+  }
+  ChunkedArrayIterator& operator+=(difference_type n) {
+    bool in_current_chunk;
+    auto& current_iterator = iterators_list_[current_chunk_index_];
+    if (n >= 0) {
+      in_current_chunk =
+          n < chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length() -
+                  current_iterator.index();
+    } else {
+      in_current_chunk = n > current_iterator.index();
+    }
+    if (in_current_chunk) {
+      index_ += n;
+      current_iterator += n;
+      return *this;
+    } else {
+      current_iterator -= current_iterator.index();
+      index_ += n;
+      UpdateComponents(*this, true);
+      return *this;
+    }
+  }
+  ChunkedArrayIterator& operator-=(difference_type n) {
+    (*this) += -n;
+    return *this;
+  }
+
+  // Comparisons
+  bool operator==(const ChunkedArrayIterator& other) const {
+    return index_ == other.index_;
+  }
+  bool operator!=(const ChunkedArrayIterator& other) const {
+    return index_ != other.index_;
+  }
+  bool operator<(const ChunkedArrayIterator& other) const {
+    return index_ < other.index_;
+  }
+  bool operator>(const ChunkedArrayIterator& other) const {
+    return index_ > other.index_;
+  }
+  bool operator<=(const ChunkedArrayIterator& other) const {
+    return index_ <= other.index_;
+  }
+  bool operator>=(const ChunkedArrayIterator& other) const {
+    return index_ >= other.index_;
+  }
+
+ private:
+  arrow::internal::ChunkLocation GetChunkLocation(int64_t index) const {
+    return chunked_array_->GetChunkResolver().Resolve(index);
+  }
+
+  void UpdateComponents(ChunkedArrayIterator<ArrayType>& chunked_array_iterator,
+                        bool update = false) {
+    if (!update) {
+      int64_t chunk_index = 0;
+      for (const auto& array : chunked_array_iterator.chunked_array_->chunks()) {
+        chunked_array_iterator.iterators_list_.emplace_back(
+            *arrow::internal::checked_pointer_cast<ArrayType>(array));
+        auto chunk_length = array->length();
+        if (chunk_index) {
+          chunk_length += chunked_array_iterator.chunks_lengths_[chunk_index - 1];
+        }
+        chunked_array_iterator.chunks_lengths_.push_back(chunk_length);
+        chunk_index++;
+      }
+    }
+
+    chunked_array_iterator.current_chunk_index_ =
+        GetChunkLocation(chunked_array_iterator.index_).chunk_index;
+    auto& current_iterator =
+        chunked_array_iterator
+            .iterators_list_[chunked_array_iterator.current_chunk_index_];
+    current_iterator -= current_iterator.index() - chunked_array_iterator.index_;
+    if (chunked_array_iterator.current_chunk_index_) {
+      current_iterator -=
+          chunked_array_iterator
+              .chunks_lengths_[chunked_array_iterator.current_chunk_index_ - 1];
+    }
+  }
+
+  const ChunkedArray* chunked_array_;
+  int64_t index_;
+  int64_t current_chunk_index_;
+  std::vector<int64_t> chunks_lengths_;
+  std::vector<ArrayIterator<ArrayType>> iterators_list_;

Review Comment:
   I'm a bit surprised, are these members still needed? Ideally, everything can be done using the ChunkResolver...



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] pitrou commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
pitrou commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r860697542


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,215 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0), current_chunk_index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    InitializeComponents(*this);
+  }
+
+  // Value access
+  value_type operator*() const { return *iterators_list_[current_chunk_index_]; }
+
+  value_type operator[](difference_type n) const {
+    int64_t chunk_index = GetChunkIndex(index_ + n);
+    int64_t index_in_chunk =
+        chunk_index ? index_ + n - chunks_lengths_[chunk_index - 1] : index_ + n;
+    return iterators_list_[chunk_index]
+                          [index_in_chunk - iterators_list_[chunk_index].index()];
+  }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ChunkedArrayIterator& operator++() {
+    ++index_;
+    if (iterators_list_[current_chunk_index_].index() ==
+        static_cast<int64_t>(
+            chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length()) -
+            1) {
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+      current_chunk_index_++;
+      if (!static_cast<int64_t>(
+              chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length())) {
+        current_chunk_index_ = GetChunkIndex(index_);
+      }
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+    } else {
+      iterators_list_[current_chunk_index_]++;
+    }
+    return *this;
+  }
+  ChunkedArrayIterator& operator--() {
+    --index_;
+    if (iterators_list_[current_chunk_index_].index()) {
+      iterators_list_[current_chunk_index_]--;
+    } else {
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+      current_chunk_index_--;
+      if (!static_cast<int64_t>(
+              chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length())) {
+        current_chunk_index_ = GetChunkIndex(index_);
+      }
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+      iterators_list_[current_chunk_index_] +=
+          static_cast<int64_t>(
+              chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length()) -
+          1;
+    }
+    return *this;
+  }
+
+  ChunkedArrayIterator operator++(int) {
+    ChunkedArrayIterator tmp(*this);
+    ++*this;
+    return tmp;
+  }
+  ChunkedArrayIterator operator--(int) {
+    ChunkedArrayIterator tmp(*this);
+    --*this;
+    return tmp;
+  }
+
+  // Arithmetic
+  difference_type operator-(const ChunkedArrayIterator& other) const {
+    return index_ - other.index_;
+  }
+  ChunkedArrayIterator operator+(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ + n);
+  }
+  ChunkedArrayIterator operator-(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ - n);
+  }
+  friend inline ChunkedArrayIterator operator+(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff + other.index_);
+  }
+  friend inline ChunkedArrayIterator operator-(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff - other.index_);
+  }
+  ChunkedArrayIterator& operator+=(difference_type n) {
+    if (n < static_cast<int64_t>(
+                chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length()) -
+                iterators_list_[current_chunk_index_].index()) {
+      index_ += n;
+      iterators_list_[current_chunk_index_] += n;
+      return *this;
+    } else {
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+      index_ += n;
+      InitializeComponents(*this, true);
+      return *this;
+    }
+  }
+  ChunkedArrayIterator& operator-=(difference_type n) {
+    if (n <= iterators_list_[current_chunk_index_].index()) {
+      index_ -= n;
+      iterators_list_[current_chunk_index_] -= n;
+      return *this;
+    } else {
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+      index_ -= n;
+      InitializeComponents(*this, true);
+      return *this;
+    }
+  }
+
+  // Comparisons
+  bool operator==(const ChunkedArrayIterator& other) const {
+    return index_ == other.index_;
+  }
+  bool operator!=(const ChunkedArrayIterator& other) const {
+    return index_ != other.index_;
+  }
+  bool operator<(const ChunkedArrayIterator& other) const {
+    return index_ < other.index_;
+  }
+  bool operator>(const ChunkedArrayIterator& other) const {
+    return index_ > other.index_;
+  }
+  bool operator<=(const ChunkedArrayIterator& other) const {
+    return index_ <= other.index_;
+  }
+  bool operator>=(const ChunkedArrayIterator& other) const {
+    return index_ >= other.index_;
+  }
+
+ private:
+  int64_t GetChunkIndex(int64_t index) const {
+    return static_cast<int64_t>(
+        std::upper_bound(chunks_lengths_.begin(), chunks_lengths_.end(), index) -
+        chunks_lengths_.begin());
+  }
+
+  void InitializeComponents(ChunkedArrayIterator<ArrayType>& chunked_array_iterator,
+                            bool update = false) {
+    if (!update) {
+      int64_t chunk_index = 0;
+      for (const auto& array : chunked_array_iterator.chunked_array_->chunks()) {
+        chunked_array_iterator.iterators_list_.emplace_back(
+            *arrow::internal::checked_pointer_cast<ArrayType>(array));
+        auto chunk_length = static_cast<int64_t>(array->length());
+        if (chunk_index) {
+          chunked_array_iterator.chunks_lengths_.push_back(
+              chunk_length + chunked_array_iterator.chunks_lengths_[chunk_index - 1]);
+        } else {
+          chunked_array_iterator.chunks_lengths_.push_back(chunk_length);
+        }
+        chunk_index++;
+      }
+    }
+
+    chunked_array_iterator.current_chunk_index_ =
+        GetChunkIndex(chunked_array_iterator.index_);
+    auto& current_iterator =
+        chunked_array_iterator
+            .iterators_list_[chunked_array_iterator.current_chunk_index_];
+    current_iterator -= current_iterator.index();
+    if (chunked_array_iterator.current_chunk_index_)
+      current_iterator +=
+          chunked_array_iterator.index_ -
+          chunked_array_iterator
+              .chunks_lengths_[chunked_array_iterator.current_chunk_index_ - 1];
+    else
+      current_iterator += chunked_array_iterator.index_;
+  }

Review Comment:
   I agree with this suggestion for the record. We don't enforce it but we try to maintain it.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] pitrou commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
pitrou commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r859948262


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,215 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0), current_chunk_index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    InitializeComponents(*this);
+  }
+
+  // Value access
+  value_type operator*() const { return *iterators_list_[current_chunk_index_]; }
+
+  value_type operator[](difference_type n) const {
+    int64_t chunk_index = GetChunkIndex(index_ + n);
+    int64_t index_in_chunk =
+        chunk_index ? index_ + n - chunks_lengths_[chunk_index - 1] : index_ + n;
+    return iterators_list_[chunk_index]
+                          [index_in_chunk - iterators_list_[chunk_index].index()];
+  }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ChunkedArrayIterator& operator++() {
+    ++index_;
+    if (iterators_list_[current_chunk_index_].index() ==
+        static_cast<int64_t>(
+            chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length()) -
+            1) {
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+      current_chunk_index_++;
+      if (!static_cast<int64_t>(
+              chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length())) {
+        current_chunk_index_ = GetChunkIndex(index_);
+      }
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+    } else {
+      iterators_list_[current_chunk_index_]++;
+    }
+    return *this;
+  }
+  ChunkedArrayIterator& operator--() {
+    --index_;
+    if (iterators_list_[current_chunk_index_].index()) {
+      iterators_list_[current_chunk_index_]--;
+    } else {
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+      current_chunk_index_--;
+      if (!static_cast<int64_t>(
+              chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length())) {
+        current_chunk_index_ = GetChunkIndex(index_);
+      }
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+      iterators_list_[current_chunk_index_] +=
+          static_cast<int64_t>(
+              chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length()) -
+          1;
+    }
+    return *this;
+  }
+
+  ChunkedArrayIterator operator++(int) {
+    ChunkedArrayIterator tmp(*this);
+    ++*this;
+    return tmp;
+  }
+  ChunkedArrayIterator operator--(int) {
+    ChunkedArrayIterator tmp(*this);
+    --*this;
+    return tmp;
+  }
+
+  // Arithmetic
+  difference_type operator-(const ChunkedArrayIterator& other) const {
+    return index_ - other.index_;
+  }
+  ChunkedArrayIterator operator+(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ + n);
+  }
+  ChunkedArrayIterator operator-(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ - n);
+  }
+  friend inline ChunkedArrayIterator operator+(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff + other.index_);
+  }
+  friend inline ChunkedArrayIterator operator-(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff - other.index_);
+  }
+  ChunkedArrayIterator& operator+=(difference_type n) {
+    if (n < static_cast<int64_t>(
+                chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length()) -
+                iterators_list_[current_chunk_index_].index()) {
+      index_ += n;
+      iterators_list_[current_chunk_index_] += n;
+      return *this;
+    } else {
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+      index_ += n;
+      InitializeComponents(*this, true);
+      return *this;
+    }
+  }
+  ChunkedArrayIterator& operator-=(difference_type n) {
+    if (n <= iterators_list_[current_chunk_index_].index()) {
+      index_ -= n;
+      iterators_list_[current_chunk_index_] -= n;
+      return *this;
+    } else {
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+      index_ -= n;
+      InitializeComponents(*this, true);
+      return *this;
+    }
+  }
+
+  // Comparisons
+  bool operator==(const ChunkedArrayIterator& other) const {
+    return index_ == other.index_;
+  }
+  bool operator!=(const ChunkedArrayIterator& other) const {
+    return index_ != other.index_;
+  }
+  bool operator<(const ChunkedArrayIterator& other) const {
+    return index_ < other.index_;
+  }
+  bool operator>(const ChunkedArrayIterator& other) const {
+    return index_ > other.index_;
+  }
+  bool operator<=(const ChunkedArrayIterator& other) const {
+    return index_ <= other.index_;
+  }
+  bool operator>=(const ChunkedArrayIterator& other) const {
+    return index_ >= other.index_;
+  }
+
+ private:
+  int64_t GetChunkIndex(int64_t index) const {
+    return static_cast<int64_t>(
+        std::upper_bound(chunks_lengths_.begin(), chunks_lengths_.end(), index) -
+        chunks_lengths_.begin());

Review Comment:
   We have `ChunkResolver` now which can be used without needing to reinvent it (and which should generally be faster).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] edponce commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
edponce commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r859988212


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,215 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0), current_chunk_index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    InitializeComponents(*this);
+  }
+
+  // Value access
+  value_type operator*() const { return *iterators_list_[current_chunk_index_]; }
+
+  value_type operator[](difference_type n) const {
+    int64_t chunk_index = GetChunkIndex(index_ + n);
+    int64_t index_in_chunk =
+        chunk_index ? index_ + n - chunks_lengths_[chunk_index - 1] : index_ + n;
+    return iterators_list_[chunk_index]
+                          [index_in_chunk - iterators_list_[chunk_index].index()];
+  }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ChunkedArrayIterator& operator++() {
+    ++index_;
+    if (iterators_list_[current_chunk_index_].index() ==
+        static_cast<int64_t>(
+            chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length()) -
+            1) {
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+      current_chunk_index_++;
+      if (!static_cast<int64_t>(
+              chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length())) {
+        current_chunk_index_ = GetChunkIndex(index_);
+      }
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+    } else {
+      iterators_list_[current_chunk_index_]++;
+    }
+    return *this;
+  }
+  ChunkedArrayIterator& operator--() {
+    --index_;
+    if (iterators_list_[current_chunk_index_].index()) {
+      iterators_list_[current_chunk_index_]--;
+    } else {
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+      current_chunk_index_--;
+      if (!static_cast<int64_t>(
+              chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length())) {
+        current_chunk_index_ = GetChunkIndex(index_);
+      }
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+      iterators_list_[current_chunk_index_] +=
+          static_cast<int64_t>(
+              chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length()) -
+          1;
+    }
+    return *this;
+  }
+
+  ChunkedArrayIterator operator++(int) {
+    ChunkedArrayIterator tmp(*this);
+    ++*this;
+    return tmp;
+  }
+  ChunkedArrayIterator operator--(int) {
+    ChunkedArrayIterator tmp(*this);
+    --*this;
+    return tmp;
+  }
+
+  // Arithmetic
+  difference_type operator-(const ChunkedArrayIterator& other) const {
+    return index_ - other.index_;
+  }
+  ChunkedArrayIterator operator+(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ + n);
+  }
+  ChunkedArrayIterator operator-(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ - n);
+  }
+  friend inline ChunkedArrayIterator operator+(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff + other.index_);
+  }
+  friend inline ChunkedArrayIterator operator-(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff - other.index_);
+  }
+  ChunkedArrayIterator& operator+=(difference_type n) {
+    if (n < static_cast<int64_t>(
+                chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length()) -
+                iterators_list_[current_chunk_index_].index()) {
+      index_ += n;
+      iterators_list_[current_chunk_index_] += n;
+      return *this;
+    } else {
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+      index_ += n;
+      InitializeComponents(*this, true);
+      return *this;
+    }
+  }
+  ChunkedArrayIterator& operator-=(difference_type n) {
+    if (n <= iterators_list_[current_chunk_index_].index()) {
+      index_ -= n;
+      iterators_list_[current_chunk_index_] -= n;
+      return *this;
+    } else {
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+      index_ -= n;
+      InitializeComponents(*this, true);
+      return *this;
+    }
+  }

Review Comment:
   These operators can be simplified to have a single impl and reuse it because `operator-=(n) == operator+=(-n)`



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] ursabot commented on pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
ursabot commented on PR #13009:
URL: https://github.com/apache/arrow/pull/13009#issuecomment-1125801512

   ['Python', 'R'] benchmarks have high level of regressions.
   [ursa-i9-9960x](https://conbench.ursa.dev/compare/runs/8d60cd7893f74a06b375d7a30c85a33e...00c1b6d43c0748cb8102e6ae37330c31/)
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] pitrou commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
pitrou commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r869615839


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,162 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    if (index_ != chunked_array.length()) {
+      auto chunk_location = GetChunkLocation(this->index_);
+      current_array_iterator_ = ArrayIterator<ArrayType>(
+          arrow::internal::checked_cast<const ArrayType&>(
+              *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index))),
+          chunk_location.index_in_chunk);
+    } else {
+      current_array_iterator_ = {};
+    }
+  }
+
+  // Value access
+  value_type operator*() const { return *current_array_iterator_; }
+
+  value_type operator[](difference_type n) const {
+    auto chunk_location = GetChunkLocation(index_ + n);
+    ArrayIterator<ArrayType> target_iterator{
+        arrow::internal::checked_cast<const ArrayType&>(
+            *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index)))};
+    return target_iterator[chunk_location.index_in_chunk];
+  }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ChunkedArrayIterator& operator++() {
+    (*this) += 1;
+    return *this;
+  }
+  ChunkedArrayIterator& operator--() {
+    (*this) -= 1;
+    return *this;
+  }
+
+  ChunkedArrayIterator operator++(int) {
+    ChunkedArrayIterator tmp(*this);
+    ++*this;
+    return tmp;
+  }
+  ChunkedArrayIterator operator--(int) {
+    ChunkedArrayIterator tmp(*this);
+    --*this;
+    return tmp;
+  }
+
+  // Arithmetic
+  difference_type operator-(const ChunkedArrayIterator& other) const {
+    return index_ - other.index_;
+  }
+  ChunkedArrayIterator operator+(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ + n);
+  }
+  ChunkedArrayIterator operator-(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ - n);
+  }
+  friend inline ChunkedArrayIterator operator+(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff + other.index_);
+  }
+  friend inline ChunkedArrayIterator operator-(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff - other.index_);
+  }
+  ChunkedArrayIterator& operator+=(difference_type n) {
+    index_ += n;
+    if (index_ != chunked_array_->length()) {

Review Comment:
   DCHECKs are not allowed in public headers.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] edponce commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
edponce commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r870470990


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,162 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    if (index_ != chunked_array.length()) {
+      auto chunk_location = GetChunkLocation(this->index_);
+      current_array_iterator_ = ArrayIterator<ArrayType>(
+          arrow::internal::checked_cast<const ArrayType&>(
+              *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index))),
+          chunk_location.index_in_chunk);
+    } else {
+      current_array_iterator_ = {};
+    }
+  }
+
+  // Value access
+  value_type operator*() const { return *current_array_iterator_; }
+
+  value_type operator[](difference_type n) const {
+    auto chunk_location = GetChunkLocation(index_ + n);
+    ArrayIterator<ArrayType> target_iterator{
+        arrow::internal::checked_cast<const ArrayType&>(
+            *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index)))};
+    return target_iterator[chunk_location.index_in_chunk];
+  }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ChunkedArrayIterator& operator++() {
+    (*this) += 1;
+    return *this;
+  }
+  ChunkedArrayIterator& operator--() {
+    (*this) -= 1;
+    return *this;
+  }
+
+  ChunkedArrayIterator operator++(int) {
+    ChunkedArrayIterator tmp(*this);
+    ++*this;
+    return tmp;
+  }
+  ChunkedArrayIterator operator--(int) {
+    ChunkedArrayIterator tmp(*this);
+    --*this;
+    return tmp;
+  }
+
+  // Arithmetic
+  difference_type operator-(const ChunkedArrayIterator& other) const {
+    return index_ - other.index_;
+  }
+  ChunkedArrayIterator operator+(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ + n);
+  }
+  ChunkedArrayIterator operator-(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ - n);
+  }
+  friend inline ChunkedArrayIterator operator+(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff + other.index_);
+  }
+  friend inline ChunkedArrayIterator operator-(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff - other.index_);
+  }
+  ChunkedArrayIterator& operator+=(difference_type n) {
+    index_ += n;
+    if (index_ != chunked_array_->length()) {

Review Comment:
   Additional observations on current `operator+=`:
   * [This dereference](https://github.com/apache/arrow/pull/13009/files/dddea9b448761ddf9e0137a9eb6d2c6de86bb1e3#diff-bfad5d7b30a1afe013c36184f9a807cc4c2d9a5cfed3da2e1414b2662731e6efR218) can be invalid because `chunk_index` is not validated and can be out-of-bounds.
   
   * An Array and an Iterator are constructed for every access via `+=`. There are opportunities to reuse `current_array_iterator_`.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] pitrou commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
pitrou commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r865038679


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +132,153 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0), current_chunk_index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    auto chunk_location = GetChunkLocation(this->index_);
+    if (chunked_array.length()) {
+      current_array_iterator_ = ArrayIterator<ArrayType>(
+          *arrow::internal::checked_pointer_cast<ArrayType>(
+              chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index))),
+          index_);
+    } else {
+      current_array_iterator_ = {};
+    }
+
+    this->current_chunk_index_ = chunk_location.chunk_index;
+    current_array_iterator_ -= this->index() - chunk_location.index_in_chunk;
+  }
+
+  // Value access
+  value_type operator*() const { return *current_array_iterator_; }
+
+  value_type operator[](difference_type n) const {
+    auto chunk_location = GetChunkLocation(index_ + n);
+    if (current_chunk_index_ == chunk_location.chunk_index) {

Review Comment:
   Well, first you should use `checked_cast` as there's no reason to create a new `shared_ptr`. Second, in release mode `checked_cast` is a static cast that doesn't have any actual cost.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] edponce commented on pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
edponce commented on PR #13009:
URL: https://github.com/apache/arrow/pull/13009#issuecomment-1113763772

   @pitrou @lidavidm I apologize for all the unnecessary comments/discussion I made, I thought this was a new JIRA issue and had no idea of all the discussions that had happened in the JIRA comments.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] edponce commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
edponce commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r859953533


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,215 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0), current_chunk_index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    InitializeComponents(*this);
+  }
+
+  // Value access
+  value_type operator*() const { return *iterators_list_[current_chunk_index_]; }
+
+  value_type operator[](difference_type n) const {
+    int64_t chunk_index = GetChunkIndex(index_ + n);
+    int64_t index_in_chunk =
+        chunk_index ? index_ + n - chunks_lengths_[chunk_index - 1] : index_ + n;
+    return iterators_list_[chunk_index]
+                          [index_in_chunk - iterators_list_[chunk_index].index()];

Review Comment:
   Ok, well what about adding a `GetView` method to `ChunkedArray` which invokes the `GetView` of the underlying `Arrays` and use the [`ChunkedResolver` to resolve the index](https://github.com/apache/arrow/blob/master/cpp/src/arrow/chunked_array.cc#L152-L157)?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] pitrou commented on pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
pitrou commented on PR #13009:
URL: https://github.com/apache/arrow/pull/13009#issuecomment-1112325513

   > 1. I suggest we add begin/end methods to `ChunkedArray`
   
   That wouldn't work, because there are no specialized subclasses of ChunkedArray. We want the iterator to yield raw C values whose type is datatype-specific.
   
   > There is an [experimental MultipleChunkIterator](https://github.com/apache/arrow/blob/master/cpp/src/arrow/chunked_array.h#L194) that seems to not be used in the codebase.
   
   It's used by `ApplyBinaryChunked`.
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] edponce commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
edponce commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r862073194


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,148 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0), current_chunk_index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    auto chunk_location = GetChunkLocation(this->index_);
+    current_array_iterator_ =
+        ArrayIterator<ArrayType>(*arrow::internal::checked_pointer_cast<ArrayType>(
+            chunked_array_->chunk(chunk_location.chunk_index)), index_);
+    this->current_chunk_index_ = chunk_location.chunk_index;
+    current_array_iterator_ -=
+        this->index() - chunk_location.index_in_chunk;
+  }
+
+  // Value access
+  value_type operator*() const { return *current_array_iterator_; }
+
+  value_type operator[](difference_type n) const {
+    auto chunk_location = GetChunkLocation(index_ + n);
+    if (current_chunk_index_ == chunk_location.chunk_index) {
+      return current_array_iterator_[chunk_location.index_in_chunk -
+                                     current_array_iterator_.index()];
+    } else {
+      ArrayIterator<ArrayType> target_iterator{
+          *arrow::internal::checked_pointer_cast<ArrayType>(
+              chunked_array_->chunk(chunk_location.chunk_index))};
+      return target_iterator[chunk_location.index_in_chunk];
+    }
+  }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ChunkedArrayIterator& operator++() {
+    (*this) += 1;
+    return *this;
+  }
+  ChunkedArrayIterator& operator--() {
+    (*this) -= 1;
+    return *this;
+  }
+
+  ChunkedArrayIterator operator++(int) {
+    ChunkedArrayIterator tmp(*this);
+    ++*this;
+    return tmp;
+  }
+  ChunkedArrayIterator operator--(int) {
+    ChunkedArrayIterator tmp(*this);
+    --*this;
+    return tmp;
+  }
+
+  // Arithmetic
+  difference_type operator-(const ChunkedArrayIterator& other) const {
+    return index_ - other.index_;
+  }
+  ChunkedArrayIterator operator+(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ + n);
+  }
+  ChunkedArrayIterator operator-(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ - n);
+  }
+  friend inline ChunkedArrayIterator operator+(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff + other.index_);
+  }
+  friend inline ChunkedArrayIterator operator-(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff - other.index_);
+  }
+  ChunkedArrayIterator& operator+=(difference_type n) {
+    index_ += n;
+    auto chunk_location = GetChunkLocation(index_);
+    if (current_chunk_index_ == chunk_location.chunk_index) {
+      current_array_iterator_ -=
+          current_array_iterator_.index() - chunk_location.index_in_chunk;
+    } else {
+      current_array_iterator_ =
+          ArrayIterator<ArrayType>(*arrow::internal::checked_pointer_cast<ArrayType>(
+                                       chunked_array_->chunk(chunk_location.chunk_index)),
+                                   chunk_location.index_in_chunk);
+      current_chunk_index_ = chunk_location.chunk_index;
+    }
+    return *this;
+  }
+  ChunkedArrayIterator& operator-=(difference_type n) {
+    (*this) += -n;
+    return *this;
+  }
+
+  // Comparisons
+  bool operator==(const ChunkedArrayIterator& other) const {
+    return index_ == other.index_;
+  }
+  bool operator!=(const ChunkedArrayIterator& other) const {
+    return index_ != other.index_;
+  }
+  bool operator<(const ChunkedArrayIterator& other) const {
+    return index_ < other.index_;
+  }
+  bool operator>(const ChunkedArrayIterator& other) const {
+    return index_ > other.index_;
+  }
+  bool operator<=(const ChunkedArrayIterator& other) const {
+    return index_ <= other.index_;
+  }
+  bool operator>=(const ChunkedArrayIterator& other) const {
+    return index_ >= other.index_;
+  }
+
+ private:
+  arrow::internal::ChunkLocation GetChunkLocation(int64_t index) const {
+    return chunked_array_->chunk_resolver_.Resolve(index);
+  }
+
+  const ChunkedArray* chunked_array_;
+  int64_t index_;
+  int64_t current_chunk_index_;
+  ArrayIterator<ArrayType> current_array_iterator_;
+};
+
+template <typename Type, typename ArrayType = typename TypeTraits<Type>::ArrayType>
+ArrayIterator<ArrayType> Iterate(const Array& array) {
+  return stl::ArrayIterator<ArrayType>(&array);
+}
+
+template <typename Type, typename ArrayType = typename TypeTraits<Type>::ArrayType>
+ChunkedArrayIterator<ArrayType> Iterate(const ChunkedArray& chunked_array) {
+  return stl::ChunkedArrayIterator<ArrayType>(chunked_array);
+}

Review Comment:
   Do we need these `Iterate` methods? One can simply create the corresponding iterator directly. Also, it is prone to error because it allows for the `Type` parameter to be different from the actual `Arrays` contained in the `chunked_array`.
   
   On the other hand, if we are to keep factory functions, the convention is to prefix them with `Make`, so `MakeIterator` would be more clear, as `Iterate` is an action verb.
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] edponce commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
edponce commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r862160969


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,148 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0), current_chunk_index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    auto chunk_location = GetChunkLocation(this->index_);
+    current_array_iterator_ =
+        ArrayIterator<ArrayType>(*arrow::internal::checked_pointer_cast<ArrayType>(
+            chunked_array_->chunk(chunk_location.chunk_index)), index_);
+    this->current_chunk_index_ = chunk_location.chunk_index;
+    current_array_iterator_ -=
+        this->index() - chunk_location.index_in_chunk;
+  }
+
+  // Value access
+  value_type operator*() const { return *current_array_iterator_; }
+
+  value_type operator[](difference_type n) const {
+    auto chunk_location = GetChunkLocation(index_ + n);
+    if (current_chunk_index_ == chunk_location.chunk_index) {
+      return current_array_iterator_[chunk_location.index_in_chunk -
+                                     current_array_iterator_.index()];
+    } else {
+      ArrayIterator<ArrayType> target_iterator{
+          *arrow::internal::checked_pointer_cast<ArrayType>(
+              chunked_array_->chunk(chunk_location.chunk_index))};
+      return target_iterator[chunk_location.index_in_chunk];
+    }
+  }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ChunkedArrayIterator& operator++() {
+    (*this) += 1;
+    return *this;
+  }
+  ChunkedArrayIterator& operator--() {
+    (*this) -= 1;
+    return *this;
+  }
+
+  ChunkedArrayIterator operator++(int) {
+    ChunkedArrayIterator tmp(*this);
+    ++*this;
+    return tmp;
+  }
+  ChunkedArrayIterator operator--(int) {
+    ChunkedArrayIterator tmp(*this);
+    --*this;
+    return tmp;
+  }
+
+  // Arithmetic
+  difference_type operator-(const ChunkedArrayIterator& other) const {
+    return index_ - other.index_;
+  }
+  ChunkedArrayIterator operator+(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ + n);
+  }
+  ChunkedArrayIterator operator-(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ - n);
+  }
+  friend inline ChunkedArrayIterator operator+(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff + other.index_);
+  }
+  friend inline ChunkedArrayIterator operator-(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff - other.index_);
+  }
+  ChunkedArrayIterator& operator+=(difference_type n) {
+    index_ += n;
+    auto chunk_location = GetChunkLocation(index_);
+    if (current_chunk_index_ == chunk_location.chunk_index) {
+      current_array_iterator_ -=
+          current_array_iterator_.index() - chunk_location.index_in_chunk;
+    } else {
+      current_array_iterator_ =
+          ArrayIterator<ArrayType>(*arrow::internal::checked_pointer_cast<ArrayType>(
+                                       chunked_array_->chunk(chunk_location.chunk_index)),
+                                   chunk_location.index_in_chunk);
+      current_chunk_index_ = chunk_location.chunk_index;
+    }
+    return *this;
+  }
+  ChunkedArrayIterator& operator-=(difference_type n) {
+    (*this) += -n;
+    return *this;
+  }
+
+  // Comparisons
+  bool operator==(const ChunkedArrayIterator& other) const {
+    return index_ == other.index_;
+  }
+  bool operator!=(const ChunkedArrayIterator& other) const {
+    return index_ != other.index_;
+  }
+  bool operator<(const ChunkedArrayIterator& other) const {
+    return index_ < other.index_;
+  }
+  bool operator>(const ChunkedArrayIterator& other) const {
+    return index_ > other.index_;
+  }
+  bool operator<=(const ChunkedArrayIterator& other) const {
+    return index_ <= other.index_;
+  }
+  bool operator>=(const ChunkedArrayIterator& other) const {
+    return index_ >= other.index_;
+  }
+
+ private:
+  arrow::internal::ChunkLocation GetChunkLocation(int64_t index) const {
+    return chunked_array_->chunk_resolver_.Resolve(index);
+  }
+
+  const ChunkedArray* chunked_array_;
+  int64_t index_;
+  int64_t current_chunk_index_;
+  ArrayIterator<ArrayType> current_array_iterator_;
+};
+
+template <typename Type, typename ArrayType = typename TypeTraits<Type>::ArrayType>
+ArrayIterator<ArrayType> Iterate(const Array& array) {
+  return stl::ArrayIterator<ArrayType>(&array);
+}
+
+template <typename Type, typename ArrayType = typename TypeTraits<Type>::ArrayType>
+ChunkedArrayIterator<ArrayType> Iterate(const ChunkedArray& chunked_array) {
+  return stl::ChunkedArrayIterator<ArrayType>(chunked_array);
+}

Review Comment:
   Not ideal.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] pitrou closed pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
pitrou closed pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array
URL: https://github.com/apache/arrow/pull/13009


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] pitrou commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
pitrou commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r870491260


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,162 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    if (index_ != chunked_array.length()) {
+      auto chunk_location = GetChunkLocation(this->index_);
+      current_array_iterator_ = ArrayIterator<ArrayType>(
+          arrow::internal::checked_cast<const ArrayType&>(
+              *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index))),
+          chunk_location.index_in_chunk);
+    } else {
+      current_array_iterator_ = {};
+    }
+  }
+
+  // Value access
+  value_type operator*() const { return *current_array_iterator_; }
+
+  value_type operator[](difference_type n) const {
+    auto chunk_location = GetChunkLocation(index_ + n);
+    ArrayIterator<ArrayType> target_iterator{
+        arrow::internal::checked_cast<const ArrayType&>(
+            *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index)))};
+    return target_iterator[chunk_location.index_in_chunk];
+  }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ChunkedArrayIterator& operator++() {
+    (*this) += 1;
+    return *this;
+  }
+  ChunkedArrayIterator& operator--() {
+    (*this) -= 1;
+    return *this;
+  }
+
+  ChunkedArrayIterator operator++(int) {
+    ChunkedArrayIterator tmp(*this);
+    ++*this;
+    return tmp;
+  }
+  ChunkedArrayIterator operator--(int) {
+    ChunkedArrayIterator tmp(*this);
+    --*this;
+    return tmp;
+  }
+
+  // Arithmetic
+  difference_type operator-(const ChunkedArrayIterator& other) const {
+    return index_ - other.index_;
+  }
+  ChunkedArrayIterator operator+(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ + n);
+  }
+  ChunkedArrayIterator operator-(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ - n);
+  }
+  friend inline ChunkedArrayIterator operator+(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff + other.index_);
+  }
+  friend inline ChunkedArrayIterator operator-(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff - other.index_);
+  }
+  ChunkedArrayIterator& operator+=(difference_type n) {
+    index_ += n;
+    if (index_ != chunked_array_->length()) {

Review Comment:
   @bkietz Ok, I did that. It also turns out it's not very useful to memoize `current_array_iterator_` currently.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] pitrou commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
pitrou commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r865030073


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -59,11 +62,12 @@ class ArrayIterator {
 
   // Value access
   value_type operator*() const {
-    return array_->IsNull(index_) ? value_type{} : array_->GetView(index_);
+    return !array_ || array_->IsNull(index_) ? value_type{} : array_->GetView(index_);

Review Comment:
   I don't think dereferencing such an iterator should be a valid operation, though, so the check should not be needed.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] pitrou commented on pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
pitrou commented on PR #13009:
URL: https://github.com/apache/arrow/pull/13009#issuecomment-1118497444

   @AlvinJ15 Well, I think the error log explains it:
   https://ci.appveyor.com/project/ApacheSoftwareFoundation/arrow/builds/43442933/job/21gmmvnmdmdjvsh5#L996
   
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] AlvinJ15 commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
AlvinJ15 commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r865035604


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +132,153 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0), current_chunk_index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    auto chunk_location = GetChunkLocation(this->index_);
+    if (chunked_array.length()) {
+      current_array_iterator_ = ArrayIterator<ArrayType>(
+          *arrow::internal::checked_pointer_cast<ArrayType>(
+              chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index))),
+          index_);
+    } else {
+      current_array_iterator_ = {};
+    }
+
+    this->current_chunk_index_ = chunk_location.chunk_index;
+    current_array_iterator_ -= this->index() - chunk_location.index_in_chunk;
+  }
+
+  // Value access
+  value_type operator*() const { return *current_array_iterator_; }
+
+  value_type operator[](difference_type n) const {
+    auto chunk_location = GetChunkLocation(index_ + n);
+    if (current_chunk_index_ == chunk_location.chunk_index) {

Review Comment:
   The `arrow::internal::checked_pointer_cast<ArrayType>` when creating an ArrayIterator, is less expensive than `int64_t` comparissons and a arithmetic operation?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] AlvinJ15 commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
AlvinJ15 commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r865026443


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -59,11 +62,12 @@ class ArrayIterator {
 
   // Value access
   value_type operator*() const {
-    return array_->IsNull(index_) ? value_type{} : array_->GetView(index_);
+    return !array_ || array_->IsNull(index_) ? value_type{} : array_->GetView(index_);

Review Comment:
   When a `ChunkedArray` is empty, I initialize an `ArrayIterator` with the empty constructor which initialize the `array_` with a `NULLPTR`, and this extra condition is more feasible/short than adding more conditions on ChunkedArrayIterator for validate multiple cases.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] AlvinJ15 commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
AlvinJ15 commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r865128767


##########
cpp/src/arrow/stl_iterator_test.cc:
##########
@@ -248,5 +248,201 @@ TEST(ArrayIterator, StdMerge) {
   ASSERT_EQ(values, expected);
 }
 
+TEST(ChunkedArrayIterator, Basics) {
+  auto result = ChunkedArrayFromJSON(int32(), {R"([4, 5, null])", R"([6])"});
+  auto it = Iterate<Int32Type>(*result);
+  optional<int32_t> v = *it;
+  ASSERT_EQ(v, 4);
+  ASSERT_EQ(it[0], 4);
+  ++it;
+  ASSERT_EQ(it[0], 5);
+  ASSERT_EQ(*it, 5);
+  ASSERT_EQ(it[1], nullopt);
+  ASSERT_EQ(it[2], 6);
+}
+
+TEST(ChunkedArrayIterator, Arithmetic) {
+  auto result = ChunkedArrayFromJSON(int32(), {R"([4, 5, null])", R"([6, null, 7])"});
+
+  auto it = Iterate<Int32Type>(*result);
+  auto it2 = it + 2;
+  ASSERT_EQ(*it, 4);
+  ASSERT_EQ(*it2, nullopt);
+  ASSERT_EQ(it2 - it, 2);
+  ASSERT_EQ(it - it2, -2);
+  auto it3 = it++;
+  ASSERT_EQ(it2 - it, 1);
+  ASSERT_EQ(it2 - it3, 2);
+  ASSERT_EQ(*it3, 4);
+  ASSERT_EQ(*it, 5);
+  auto it4 = ++it;
+  ASSERT_EQ(*it, nullopt);
+  ASSERT_EQ(*it4, nullopt);
+  ASSERT_EQ(it2 - it, 0);
+  ASSERT_EQ(it2 - it4, 0);
+  auto it5 = it + 3;
+  ASSERT_EQ(*it5, 7);
+  ASSERT_EQ(*(it5 - 2), 6);
+  ASSERT_EQ(*(it5 + (-2)), 6);
+  auto it6 = (--it5)--;
+  ASSERT_EQ(*it6, nullopt);
+  ASSERT_EQ(*it5, 6);
+  ASSERT_EQ(it6 - it5, 1);
+}
+
+TEST(ChunkedArrayIterator, Comparison) {
+  auto result = ChunkedArrayFromJSON(int32(), {R"([4, 5, null])", R"([6, null, 7])"});
+
+  auto it = Iterate<Int32Type>(*result) + 2;
+  auto it2 = Iterate<Int32Type>(*result) + 2;
+  auto it3 = Iterate<Int32Type>(*result) + 4;
+
+  ASSERT_TRUE(it == it2);
+  ASSERT_TRUE(it <= it2);
+  ASSERT_TRUE(it >= it2);
+  ASSERT_FALSE(it != it2);
+  ASSERT_FALSE(it < it2);
+  ASSERT_FALSE(it > it2);
+
+  ASSERT_FALSE(it == it3);
+  ASSERT_TRUE(it <= it3);
+  ASSERT_FALSE(it >= it3);
+  ASSERT_TRUE(it != it3);
+  ASSERT_TRUE(it < it3);
+  ASSERT_FALSE(it > it3);
+}
+
+TEST(ChunkedArrayIterator, MultipleChunks) {
+  auto result =
+      ChunkedArrayFromJSON(int32(), {R"([4, 5, null])", R"([6])", R"([7, 9, 10, 8])",
+                                     R"([11, 13])", R"([14])", R"([15])", R"([16])"});
+
+  auto it = Iterate<Int32Type>(*result);
+  ASSERT_EQ(it[8], 11);
+  ASSERT_EQ(it[9], 13);
+  it += 3;
+  ASSERT_EQ(it[0], 6);
+  ++it;
+  ASSERT_EQ(it[0], 7);
+  ASSERT_EQ(*it, 7);
+  ASSERT_EQ(it[1], 9);
+  ASSERT_EQ(it[2], 10);
+  it -= 4;
+  ASSERT_EQ(it[0], 4);
+  ASSERT_EQ(it[1], 5);
+  ASSERT_EQ(it[2], nullopt);
+  ASSERT_EQ(it[3], 6);
+  ASSERT_EQ(it[4], 7);
+  it += 9;
+  ASSERT_EQ(*it, 13);
+  --it;
+  ASSERT_EQ(*it, 11);
+  --it;
+  ASSERT_EQ(*it, 8);
+  ASSERT_EQ(it[0], 8);
+  ASSERT_EQ(it[1], 11);
+  ASSERT_EQ(it[2], 13);
+  ASSERT_EQ(it[3], 14);
+  ASSERT_EQ(it[4], 15);
+  ASSERT_EQ(it[5], 16);
+  ++it;
+  ASSERT_EQ(*it, 11);
+  ASSERT_EQ(it[0], 11);
+  ASSERT_EQ(it[1], 13);
+  ASSERT_EQ(it[2], 14);
+  ASSERT_EQ(it[3], 15);
+  ASSERT_EQ(it[4], 16);
+  ++it;
+  ASSERT_EQ(*it, 13);
+  ASSERT_EQ(it[0], 13);
+  ASSERT_EQ(it[1], 14);
+  ASSERT_EQ(it[2], 15);
+  ASSERT_EQ(it[3], 16);
+  ++it;
+  ++it;
+  ASSERT_EQ(*it, 15);
+  ASSERT_EQ(it[0], 15);
+  ASSERT_EQ(it[1], 16);
+  --it;
+  ASSERT_EQ(*it, 14);
+  ASSERT_EQ(it[0], 14);
+  ASSERT_EQ(it[1], 15);
+  ASSERT_EQ(it[2], 16);
+  --it;
+  ASSERT_EQ(*it, 13);
+  ASSERT_EQ(it[0], 13);
+  ASSERT_EQ(it[1], 14);
+  ASSERT_EQ(it[2], 15);
+  it -= 2;
+  ASSERT_EQ(*it, 8);
+  ASSERT_EQ(it[0], 8);
+  ASSERT_EQ(it[1], 11);
+  ASSERT_EQ(it[2], 13);
+}
+
+TEST(ChunkedArrayIterator, EmptyChunks) {
+  auto result = ChunkedArrayFromJSON(int32(), {});
+  auto it = Iterate<Int32Type>(*result);
+  ASSERT_EQ(*it, nullopt);
+  ASSERT_EQ(it[0], nullopt);
+  it++;
+  ASSERT_EQ(it[0], nullopt);
+  it += 2;
+  ASSERT_EQ(it[1], nullopt);
+
+  result = ChunkedArrayFromJSON(int32(), {R"([4, 5, null])", R"([])", R"([7, 9, 10, 8])",
+                                          R"([11, 13])", R"([])", R"([])", R"([16])"});
+
+  it = Iterate<Int32Type>(*result);
+  ASSERT_EQ(it[8], 13);
+  ASSERT_EQ(it[9], 16);
+  it += 3;
+  ASSERT_EQ(it[0], 7);
+  ++it;
+  ASSERT_EQ(it[0], 9);
+  ASSERT_EQ(*it, 9);
+  ASSERT_EQ(it[1], 10);
+  ASSERT_EQ(it[2], 8);
+  it -= 4;
+  ASSERT_EQ(it[0], 4);
+  ASSERT_EQ(it[1], 5);
+  ASSERT_EQ(it[2], nullopt);
+  ASSERT_EQ(it[3], 7);
+  ASSERT_EQ(it[4], 9);
+  it += 9;
+  ASSERT_EQ(*it, 16);
+  --it;
+  ASSERT_EQ(*it, 13);
+  --it;
+  ASSERT_EQ(*it, 11);
+  ASSERT_EQ(it[0], 11);
+  ASSERT_EQ(it[1], 13);
+  ASSERT_EQ(it[2], 16);
+  ++it;
+  ASSERT_EQ(*it, 13);
+  ASSERT_EQ(it[0], 13);
+  ASSERT_EQ(it[1], 16);
+  ++it;
+  ASSERT_EQ(*it, 16);
+  ASSERT_EQ(it[0], 16);
+  --it;
+  ASSERT_EQ(*it, 13);
+  ASSERT_EQ(it[0], 13);
+  ASSERT_EQ(it[1], 16);
+  --it;
+  ASSERT_EQ(*it, 11);
+  ASSERT_EQ(it[0], 11);
+  ASSERT_EQ(it[1], 13);
+  ASSERT_EQ(it[2], 16);
+  it += 2;
+  ASSERT_EQ(*it, 16);
+  ASSERT_EQ(it[0], 16);
+  it -= 3;
+  ASSERT_EQ(*it, 8);
+  ASSERT_EQ(it[0], 8);
+  ASSERT_EQ(it[1], 11);
+  ASSERT_EQ(it[2], 13);
+}
+

Review Comment:
   1. At the beginning of the `EmptyChunks` it has and empty `ChunkedArray`
   ```
   TEST(ChunkedArrayIterator, EmptyChunks) {
     auto result = ChunkedArrayFromJSON(int32(), {});
     auto it = Iterate<Int32Type>(*result);
     ASSERT_EQ(*it, nullopt);
     ASSERT_EQ(it[0], nullopt);
   ```
   2. Added



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] lidavidm commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
lidavidm commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r869545622


##########
cpp/src/arrow/chunked_array.h:
##########
@@ -36,6 +36,10 @@ namespace arrow {
 class Array;
 class DataType;
 class MemoryPool;
+namespace stl {
+template <typename T, typename V>
+class ChunkedArrayIterator;
+}

Review Comment:
   weird, doesn't the linter usually require `}  // namespace stl` here?



##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,162 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    if (index_ != chunked_array.length()) {
+      auto chunk_location = GetChunkLocation(this->index_);
+      current_array_iterator_ = ArrayIterator<ArrayType>(
+          arrow::internal::checked_cast<const ArrayType&>(
+              *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index))),
+          chunk_location.index_in_chunk);
+    } else {
+      current_array_iterator_ = {};
+    }
+  }
+
+  // Value access
+  value_type operator*() const { return *current_array_iterator_; }
+
+  value_type operator[](difference_type n) const {

Review Comment:
   Should we DCHECK(chunked_array_) in most of these operations to guard against use of a default-initialized iterator?



##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,162 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    if (index_ != chunked_array.length()) {
+      auto chunk_location = GetChunkLocation(this->index_);
+      current_array_iterator_ = ArrayIterator<ArrayType>(
+          arrow::internal::checked_cast<const ArrayType&>(
+              *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index))),
+          chunk_location.index_in_chunk);
+    } else {
+      current_array_iterator_ = {};
+    }
+  }
+
+  // Value access
+  value_type operator*() const { return *current_array_iterator_; }
+
+  value_type operator[](difference_type n) const {
+    auto chunk_location = GetChunkLocation(index_ + n);
+    ArrayIterator<ArrayType> target_iterator{
+        arrow::internal::checked_cast<const ArrayType&>(
+            *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index)))};
+    return target_iterator[chunk_location.index_in_chunk];
+  }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ChunkedArrayIterator& operator++() {
+    (*this) += 1;
+    return *this;
+  }
+  ChunkedArrayIterator& operator--() {
+    (*this) -= 1;
+    return *this;
+  }
+
+  ChunkedArrayIterator operator++(int) {
+    ChunkedArrayIterator tmp(*this);
+    ++*this;
+    return tmp;
+  }
+  ChunkedArrayIterator operator--(int) {
+    ChunkedArrayIterator tmp(*this);
+    --*this;
+    return tmp;
+  }
+
+  // Arithmetic
+  difference_type operator-(const ChunkedArrayIterator& other) const {
+    return index_ - other.index_;
+  }
+  ChunkedArrayIterator operator+(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ + n);
+  }
+  ChunkedArrayIterator operator-(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ - n);
+  }
+  friend inline ChunkedArrayIterator operator+(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff + other.index_);
+  }
+  friend inline ChunkedArrayIterator operator-(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff - other.index_);
+  }
+  ChunkedArrayIterator& operator+=(difference_type n) {
+    index_ += n;
+    if (index_ != chunked_array_->length()) {

Review Comment:
   Maybe also DCHECK here and in the constructor that we aren't exceeding end()?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] AlvinJ15 commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
AlvinJ15 commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r869971878


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,162 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    if (index_ != chunked_array.length()) {
+      auto chunk_location = GetChunkLocation(this->index_);
+      current_array_iterator_ = ArrayIterator<ArrayType>(
+          arrow::internal::checked_cast<const ArrayType&>(
+              *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index))),
+          chunk_location.index_in_chunk);
+    } else {
+      current_array_iterator_ = {};
+    }
+  }
+
+  // Value access
+  value_type operator*() const { return *current_array_iterator_; }
+
+  value_type operator[](difference_type n) const {
+    auto chunk_location = GetChunkLocation(index_ + n);
+    ArrayIterator<ArrayType> target_iterator{
+        arrow::internal::checked_cast<const ArrayType&>(
+            *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index)))};
+    return target_iterator[chunk_location.index_in_chunk];
+  }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ChunkedArrayIterator& operator++() {
+    (*this) += 1;
+    return *this;
+  }
+  ChunkedArrayIterator& operator--() {
+    (*this) -= 1;
+    return *this;
+  }
+
+  ChunkedArrayIterator operator++(int) {
+    ChunkedArrayIterator tmp(*this);
+    ++*this;
+    return tmp;
+  }
+  ChunkedArrayIterator operator--(int) {
+    ChunkedArrayIterator tmp(*this);
+    --*this;
+    return tmp;
+  }
+
+  // Arithmetic
+  difference_type operator-(const ChunkedArrayIterator& other) const {
+    return index_ - other.index_;
+  }

Review Comment:
   Is used in the Test ChunkedArrayIterator.Arithmetic `ASSERT_EQ(it - it2, -2);`
   There is a similar method in ArrayIterator, for calculate the difference between two iterators(like a distance), I think make a  + operator  would not be used
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] edponce commented on pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
edponce commented on PR #13009:
URL: https://github.com/apache/arrow/pull/13009#issuecomment-1112406182

   I am not fully convinced but no need on continuing this rant. I will wait for this PR to be merged and try something out.
   Thanks for the all the great feedback!


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] edponce commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
edponce commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r859921831


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,215 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0), current_chunk_index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    InitializeComponents(*this);
+  }
+
+  // Value access
+  value_type operator*() const { return *iterators_list_[current_chunk_index_]; }
+
+  value_type operator[](difference_type n) const {
+    int64_t chunk_index = GetChunkIndex(index_ + n);
+    int64_t index_in_chunk =
+        chunk_index ? index_ + n - chunks_lengths_[chunk_index - 1] : index_ + n;
+    return iterators_list_[chunk_index]
+                          [index_in_chunk - iterators_list_[chunk_index].index()];

Review Comment:
   [`ChunkedArray`](https://github.com/apache/arrow/blob/master/cpp/src/arrow/chunked_array.h#L145) already has `GetScalar(index)` which is efficient for accessing sequential scalars at particular logical indices. I would expect this iterator to make use of it.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] edponce commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
edponce commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r859956356


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,215 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0), current_chunk_index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    InitializeComponents(*this);
+  }
+
+  // Value access
+  value_type operator*() const { return *iterators_list_[current_chunk_index_]; }
+
+  value_type operator[](difference_type n) const {
+    int64_t chunk_index = GetChunkIndex(index_ + n);
+    int64_t index_in_chunk =
+        chunk_index ? index_ + n - chunks_lengths_[chunk_index - 1] : index_ + n;
+    return iterators_list_[chunk_index]
+                          [index_in_chunk - iterators_list_[chunk_index].index()];
+  }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ChunkedArrayIterator& operator++() {
+    ++index_;
+    if (iterators_list_[current_chunk_index_].index() ==
+        static_cast<int64_t>(
+            chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length()) -
+            1) {
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+      current_chunk_index_++;
+      if (!static_cast<int64_t>(
+              chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length())) {
+        current_chunk_index_ = GetChunkIndex(index_);
+      }
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+    } else {
+      iterators_list_[current_chunk_index_]++;
+    }
+    return *this;
+  }
+  ChunkedArrayIterator& operator--() {
+    --index_;
+    if (iterators_list_[current_chunk_index_].index()) {
+      iterators_list_[current_chunk_index_]--;
+    } else {
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+      current_chunk_index_--;
+      if (!static_cast<int64_t>(
+              chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length())) {
+        current_chunk_index_ = GetChunkIndex(index_);
+      }
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+      iterators_list_[current_chunk_index_] +=
+          static_cast<int64_t>(
+              chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length()) -
+          1;
+    }
+    return *this;
+  }
+
+  ChunkedArrayIterator operator++(int) {
+    ChunkedArrayIterator tmp(*this);
+    ++*this;
+    return tmp;
+  }
+  ChunkedArrayIterator operator--(int) {
+    ChunkedArrayIterator tmp(*this);
+    --*this;
+    return tmp;
+  }
+
+  // Arithmetic
+  difference_type operator-(const ChunkedArrayIterator& other) const {
+    return index_ - other.index_;
+  }
+  ChunkedArrayIterator operator+(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ + n);
+  }
+  ChunkedArrayIterator operator-(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ - n);
+  }
+  friend inline ChunkedArrayIterator operator+(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff + other.index_);
+  }
+  friend inline ChunkedArrayIterator operator-(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff - other.index_);
+  }
+  ChunkedArrayIterator& operator+=(difference_type n) {
+    if (n < static_cast<int64_t>(
+                chunked_array_->chunk(static_cast<int>(current_chunk_index_))->length()) -
+                iterators_list_[current_chunk_index_].index()) {
+      index_ += n;
+      iterators_list_[current_chunk_index_] += n;
+      return *this;
+    } else {
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+      index_ += n;
+      InitializeComponents(*this, true);
+      return *this;
+    }
+  }
+  ChunkedArrayIterator& operator-=(difference_type n) {
+    if (n <= iterators_list_[current_chunk_index_].index()) {
+      index_ -= n;
+      iterators_list_[current_chunk_index_] -= n;
+      return *this;
+    } else {
+      iterators_list_[current_chunk_index_] -=
+          iterators_list_[current_chunk_index_].index();
+      index_ -= n;
+      InitializeComponents(*this, true);
+      return *this;
+    }
+  }
+
+  // Comparisons
+  bool operator==(const ChunkedArrayIterator& other) const {
+    return index_ == other.index_;
+  }
+  bool operator!=(const ChunkedArrayIterator& other) const {
+    return index_ != other.index_;
+  }
+  bool operator<(const ChunkedArrayIterator& other) const {
+    return index_ < other.index_;
+  }
+  bool operator>(const ChunkedArrayIterator& other) const {
+    return index_ > other.index_;
+  }
+  bool operator<=(const ChunkedArrayIterator& other) const {
+    return index_ <= other.index_;
+  }
+  bool operator>=(const ChunkedArrayIterator& other) const {
+    return index_ >= other.index_;
+  }
+
+ private:
+  int64_t GetChunkIndex(int64_t index) const {
+    return static_cast<int64_t>(
+        std::upper_bound(chunks_lengths_.begin(), chunks_lengths_.end(), index) -
+        chunks_lengths_.begin());
+  }
+
+  void InitializeComponents(ChunkedArrayIterator<ArrayType>& chunked_array_iterator,

Review Comment:
   Maybe call this `UpdateComponents` as it doesn't only initialize but is also used for updates.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] edponce commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
edponce commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r859953533


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,215 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0), current_chunk_index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    InitializeComponents(*this);
+  }
+
+  // Value access
+  value_type operator*() const { return *iterators_list_[current_chunk_index_]; }
+
+  value_type operator[](difference_type n) const {
+    int64_t chunk_index = GetChunkIndex(index_ + n);
+    int64_t index_in_chunk =
+        chunk_index ? index_ + n - chunks_lengths_[chunk_index - 1] : index_ + n;
+    return iterators_list_[chunk_index]
+                          [index_in_chunk - iterators_list_[chunk_index].index()];

Review Comment:
   Ok, well what about adding a `GetView` method to `ChunkedArray` which invokes the `GetView` of the underlying `Arrays` and use the [`ChunkedResolver` to resolve the logical index](https://github.com/apache/arrow/blob/master/cpp/src/arrow/chunked_array.cc#L152-L157)?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] edponce commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
edponce commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r859959406


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,215 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0), current_chunk_index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    InitializeComponents(*this);
+  }
+
+  // Value access
+  value_type operator*() const { return *iterators_list_[current_chunk_index_]; }
+
+  value_type operator[](difference_type n) const {
+    int64_t chunk_index = GetChunkIndex(index_ + n);
+    int64_t index_in_chunk =
+        chunk_index ? index_ + n - chunks_lengths_[chunk_index - 1] : index_ + n;
+    return iterators_list_[chunk_index]
+                          [index_in_chunk - iterators_list_[chunk_index].index()];

Review Comment:
   Well, ok I see that it is using the `Array` iterators.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] edponce commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
edponce commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r869635511


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,162 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    if (index_ != chunked_array.length()) {
+      auto chunk_location = GetChunkLocation(this->index_);
+      current_array_iterator_ = ArrayIterator<ArrayType>(
+          arrow::internal::checked_cast<const ArrayType&>(
+              *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index))),
+          chunk_location.index_in_chunk);
+    } else {
+      current_array_iterator_ = {};
+    }
+  }
+
+  // Value access
+  value_type operator*() const { return *current_array_iterator_; }
+
+  value_type operator[](difference_type n) const {
+    auto chunk_location = GetChunkLocation(index_ + n);
+    ArrayIterator<ArrayType> target_iterator{
+        arrow::internal::checked_cast<const ArrayType&>(
+            *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index)))};
+    return target_iterator[chunk_location.index_in_chunk];
+  }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ChunkedArrayIterator& operator++() {
+    (*this) += 1;
+    return *this;
+  }
+  ChunkedArrayIterator& operator--() {
+    (*this) -= 1;
+    return *this;
+  }
+
+  ChunkedArrayIterator operator++(int) {
+    ChunkedArrayIterator tmp(*this);
+    ++*this;
+    return tmp;
+  }
+  ChunkedArrayIterator operator--(int) {
+    ChunkedArrayIterator tmp(*this);
+    --*this;
+    return tmp;
+  }
+
+  // Arithmetic
+  difference_type operator-(const ChunkedArrayIterator& other) const {
+    return index_ - other.index_;
+  }
+  ChunkedArrayIterator operator+(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ + n);
+  }
+  ChunkedArrayIterator operator-(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ - n);
+  }
+  friend inline ChunkedArrayIterator operator+(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff + other.index_);
+  }
+  friend inline ChunkedArrayIterator operator-(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff - other.index_);
+  }
+  ChunkedArrayIterator& operator+=(difference_type n) {
+    index_ += n;
+    if (index_ != chunked_array_->length()) {
+      auto chunk_location = GetChunkLocation(index_);
+      current_array_iterator_ = ArrayIterator<ArrayType>(
+          arrow::internal::checked_cast<const ArrayType&>(
+              *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index))),
+          chunk_location.index_in_chunk);
+    } else {
+      current_array_iterator_ = {};
+    }
+    return *this;
+  }
+  ChunkedArrayIterator& operator-=(difference_type n) {
+    (*this) += -n;
+    return *this;
+  }
+
+  // Comparisons
+  bool operator==(const ChunkedArrayIterator& other) const {
+    return index_ == other.index_;
+  }
+  bool operator!=(const ChunkedArrayIterator& other) const {
+    return index_ != other.index_;
+  }
+  bool operator<(const ChunkedArrayIterator& other) const {
+    return index_ < other.index_;
+  }
+  bool operator>(const ChunkedArrayIterator& other) const {
+    return index_ > other.index_;
+  }
+  bool operator<=(const ChunkedArrayIterator& other) const {
+    return index_ <= other.index_;
+  }
+  bool operator>=(const ChunkedArrayIterator& other) const {
+    return index_ >= other.index_;
+  }
+
+ private:
+  arrow::internal::ChunkLocation GetChunkLocation(int64_t index) const {
+    return chunked_array_->chunk_resolver_.Resolve(index);
+  }
+
+  const ChunkedArray* chunked_array_;
+  int64_t index_;
+  ArrayIterator<ArrayType> current_array_iterator_;
+};
+
+/// Return an iterator to the beginning of the chunked array
+template <typename Type, typename ArrayType = typename TypeTraits<Type>::ArrayType>
+ChunkedArrayIterator<ArrayType> Begin(const ChunkedArray& chunked_array) {
+  return stl::ChunkedArrayIterator<ArrayType>(chunked_array);
+}
+
+/// Return an iterator to the end of the chunked array
+template <typename Type, typename ArrayType = typename TypeTraits<Type>::ArrayType>
+ChunkedArrayIterator<ArrayType> End(const ChunkedArray& chunked_array) {
+  return stl::ChunkedArrayIterator<ArrayType>(chunked_array, chunked_array.length());
+}

Review Comment:
   Nit: Sometimes `ChunkedArrayIterator<>` is used and other times `std::ChunkedArrayIterator<>`.



##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,162 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    if (index_ != chunked_array.length()) {
+      auto chunk_location = GetChunkLocation(this->index_);
+      current_array_iterator_ = ArrayIterator<ArrayType>(
+          arrow::internal::checked_cast<const ArrayType&>(
+              *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index))),
+          chunk_location.index_in_chunk);
+    } else {
+      current_array_iterator_ = {};
+    }
+  }
+
+  // Value access
+  value_type operator*() const { return *current_array_iterator_; }
+
+  value_type operator[](difference_type n) const {
+    auto chunk_location = GetChunkLocation(index_ + n);
+    ArrayIterator<ArrayType> target_iterator{
+        arrow::internal::checked_cast<const ArrayType&>(
+            *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index)))};
+    return target_iterator[chunk_location.index_in_chunk];
+  }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ChunkedArrayIterator& operator++() {
+    (*this) += 1;
+    return *this;
+  }
+  ChunkedArrayIterator& operator--() {
+    (*this) -= 1;
+    return *this;
+  }
+
+  ChunkedArrayIterator operator++(int) {
+    ChunkedArrayIterator tmp(*this);
+    ++*this;
+    return tmp;
+  }
+  ChunkedArrayIterator operator--(int) {
+    ChunkedArrayIterator tmp(*this);
+    --*this;
+    return tmp;
+  }
+
+  // Arithmetic
+  difference_type operator-(const ChunkedArrayIterator& other) const {
+    return index_ - other.index_;
+  }

Review Comment:
   Is this method part of the public API of a standard iterator? or is it a helper method that can be private?
   I couldn't find where it is being used.
   
   Also, why is there not an equivalent method for addition?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] AlvinJ15 commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
AlvinJ15 commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r870017986


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,162 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    if (index_ != chunked_array.length()) {
+      auto chunk_location = GetChunkLocation(this->index_);
+      current_array_iterator_ = ArrayIterator<ArrayType>(
+          arrow::internal::checked_cast<const ArrayType&>(
+              *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index))),
+          chunk_location.index_in_chunk);
+    } else {
+      current_array_iterator_ = {};
+    }
+  }
+
+  // Value access
+  value_type operator*() const { return *current_array_iterator_; }
+
+  value_type operator[](difference_type n) const {

Review Comment:
   I added some `assert` for check it.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] edponce commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
edponce commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r870292161


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,162 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    if (index_ != chunked_array.length()) {
+      auto chunk_location = GetChunkLocation(this->index_);
+      current_array_iterator_ = ArrayIterator<ArrayType>(
+          arrow::internal::checked_cast<const ArrayType&>(
+              *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index))),
+          chunk_location.index_in_chunk);
+    } else {
+      current_array_iterator_ = {};
+    }
+  }
+
+  // Value access
+  value_type operator*() const { return *current_array_iterator_; }
+
+  value_type operator[](difference_type n) const {
+    auto chunk_location = GetChunkLocation(index_ + n);
+    ArrayIterator<ArrayType> target_iterator{
+        arrow::internal::checked_cast<const ArrayType&>(
+            *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index)))};
+    return target_iterator[chunk_location.index_in_chunk];
+  }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ChunkedArrayIterator& operator++() {
+    (*this) += 1;
+    return *this;
+  }
+  ChunkedArrayIterator& operator--() {
+    (*this) -= 1;
+    return *this;
+  }
+
+  ChunkedArrayIterator operator++(int) {
+    ChunkedArrayIterator tmp(*this);
+    ++*this;
+    return tmp;
+  }
+  ChunkedArrayIterator operator--(int) {
+    ChunkedArrayIterator tmp(*this);
+    --*this;
+    return tmp;
+  }
+
+  // Arithmetic
+  difference_type operator-(const ChunkedArrayIterator& other) const {
+    return index_ - other.index_;
+  }
+  ChunkedArrayIterator operator+(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ + n);
+  }
+  ChunkedArrayIterator operator-(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ - n);
+  }
+  friend inline ChunkedArrayIterator operator+(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff + other.index_);
+  }
+  friend inline ChunkedArrayIterator operator-(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff - other.index_);
+  }
+  ChunkedArrayIterator& operator+=(difference_type n) {
+    index_ += n;
+    if (index_ != chunked_array_->length()) {

Review Comment:
   It seems that any value is valid as long as you can return to the initial value with its inverse operation. So the check `!=` follows the operation semantics of a RandomAccessIterator.
   Actually, the `<` check would be insufficient because it allows negative indices (need absolute value of index).
   
   The question now becomes, do Arrow iterators guard against out-of-bounds accesses?
   From a performance perspective, they have basically the same cost.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] edponce commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
edponce commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r859921831


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,215 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0), current_chunk_index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    InitializeComponents(*this);
+  }
+
+  // Value access
+  value_type operator*() const { return *iterators_list_[current_chunk_index_]; }
+
+  value_type operator[](difference_type n) const {
+    int64_t chunk_index = GetChunkIndex(index_ + n);
+    int64_t index_in_chunk =
+        chunk_index ? index_ + n - chunks_lengths_[chunk_index - 1] : index_ + n;
+    return iterators_list_[chunk_index]
+                          [index_in_chunk - iterators_list_[chunk_index].index()];

Review Comment:
   [`ChunkedArray` already has `GetScalar(index)`](https://github.com/apache/arrow/blob/master/cpp/src/arrow/chunked_array.h#L145) which is efficient for accessing sequential scalars at particular logical indices. I suggest to use it for the iteration. There is no need to perform complex index calculations/checks in the iterator. This would be somewhat analogous to the [`Array` iterator`](https://github.com/apache/arrow/blob/master/cpp/src/arrow/stl_iterator.h#L61-L67).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] edponce commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
edponce commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r860859259


##########
cpp/src/arrow/stl_iterator_test.cc:
##########
@@ -248,5 +248,216 @@ TEST(ArrayIterator, StdMerge) {
   ASSERT_EQ(values, expected);
 }
 
+TEST(ChunkedArrayIterator, Basics) {
+  auto chunk0 = ArrayFromJSON(int32(), "[4, 5, null]");
+  auto chunk1 = ArrayFromJSON(int32(), "[6]");
+
+  ASSERT_OK_AND_ASSIGN(auto result, ChunkedArray::Make({chunk0, chunk1}, int32()));
+  auto it = Iterate<Int32Type>(*result);
+  optional<int32_t> v = *it;
+  ASSERT_EQ(v, 4);
+  ASSERT_EQ(it[0], 4);
+  ++it;
+  ASSERT_EQ(it[0], 5);
+  ASSERT_EQ(*it, 5);
+  ASSERT_EQ(it[1], nullopt);
+  ASSERT_EQ(it[2], 6);

Review Comment:
   I would expect iterator tests to have a loop from. For example, you can iterator through a ChunkedArray and store the values into a `std::vector` and then make assertions against the vector.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] lidavidm commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
lidavidm commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r860912317


##########
cpp/src/arrow/stl_iterator_test.cc:
##########
@@ -248,5 +248,216 @@ TEST(ArrayIterator, StdMerge) {
   ASSERT_EQ(values, expected);
 }
 
+TEST(ChunkedArrayIterator, Basics) {
+  auto chunk0 = ArrayFromJSON(int32(), "[4, 5, null]");
+  auto chunk1 = ArrayFromJSON(int32(), "[6]");
+
+  ASSERT_OK_AND_ASSIGN(auto result, ChunkedArray::Make({chunk0, chunk1}, int32()));
+  auto it = Iterate<Int32Type>(*result);
+  optional<int32_t> v = *it;
+  ASSERT_EQ(v, 4);
+  ASSERT_EQ(it[0], 4);
+  ++it;
+  ASSERT_EQ(it[0], 5);
+  ASSERT_EQ(*it, 5);
+  ASSERT_EQ(it[1], nullopt);
+  ASSERT_EQ(it[2], 6);

Review Comment:
   Ah I see now, https://en.cppreference.com/w/cpp/iterator/random_access_iterator includes "array notation with subscripting" (I was scanning the code snippets not the text).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] edponce commented on pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
edponce commented on PR #13009:
URL: https://github.com/apache/arrow/pull/13009#issuecomment-1112256516

   Here are some observations on the `Array/ChunkedArray` iterator support in Arrow.
   
   1. For `ChunkedArray`, how would we know if the iterator is in the begin/end?
       I suggest we add begin/end methods to `ChunkedArray` ( [for example, see `NumericArray`](https://github.com/apache/arrow/blob/master/cpp/src/arrow/array/array_primitive.h#L118-L120)) so that it can be traversed in for-loop idioms.
       Note: Currently, not all Arrow `Arrays` have begin/end methods.
       ```c++
       # Iterator
       for (auto it = chunked_array.begin(); it != chunked_array.end(); ++it) { ... }
   
       # STL for_each
       std::for_each(chunked_array.begin(), chunked_array.end(), [](T const& elem) { ... }
   
       # range-for
       for (const auto& elem: chunked_array) { ... }
       ```
   
   2. C++ has begin/end and their const variants cbegin/cend. In Arrow `Array` and `ChunkedArray` are immutable types, so I would expect for them to only support cbegin/cend, but range-for requires begin/end. A solution is to have begin/end return a const iterator.
   
   3. There is an [experimental `MultipleChunkIterator`](https://github.com/apache/arrow/blob/master/cpp/src/arrow/chunked_array.h#L194) that seems to not be used in the codebase. Should it be removed?


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] pitrou commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
pitrou commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r864576556


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -59,11 +62,12 @@ class ArrayIterator {
 
   // Value access
   value_type operator*() const {
-    return array_->IsNull(index_) ? value_type{} : array_->GetView(index_);
+    return !array_ || array_->IsNull(index_) ? value_type{} : array_->GetView(index_);

Review Comment:
   Is there any circumstance where `array_` is null and you're still calling this operator? This doesn't seem right.



##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +132,153 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0), current_chunk_index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    auto chunk_location = GetChunkLocation(this->index_);
+    if (chunked_array.length()) {
+      current_array_iterator_ = ArrayIterator<ArrayType>(
+          *arrow::internal::checked_pointer_cast<ArrayType>(
+              chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index))),
+          index_);
+    } else {
+      current_array_iterator_ = {};
+    }
+
+    this->current_chunk_index_ = chunk_location.chunk_index;
+    current_array_iterator_ -= this->index() - chunk_location.index_in_chunk;
+  }
+
+  // Value access
+  value_type operator*() const { return *current_array_iterator_; }
+
+  value_type operator[](difference_type n) const {
+    auto chunk_location = GetChunkLocation(index_ + n);
+    if (current_chunk_index_ == chunk_location.chunk_index) {
+      return current_array_iterator_[chunk_location.index_in_chunk -
+                                     current_array_iterator_.index()];
+    } else {
+      ArrayIterator<ArrayType> target_iterator{
+          *arrow::internal::checked_pointer_cast<ArrayType>(
+              chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index)))};
+      return target_iterator[chunk_location.index_in_chunk];
+    }
+  }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ChunkedArrayIterator& operator++() {
+    (*this) += 1;
+    return *this;
+  }
+  ChunkedArrayIterator& operator--() {
+    (*this) -= 1;
+    return *this;
+  }

Review Comment:
   This is the one place where `current_array_iterator_` could be useful for performance but you're not using it here?



##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +132,153 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0), current_chunk_index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    auto chunk_location = GetChunkLocation(this->index_);
+    if (chunked_array.length()) {
+      current_array_iterator_ = ArrayIterator<ArrayType>(
+          *arrow::internal::checked_pointer_cast<ArrayType>(
+              chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index))),
+          index_);
+    } else {
+      current_array_iterator_ = {};
+    }
+
+    this->current_chunk_index_ = chunk_location.chunk_index;
+    current_array_iterator_ -= this->index() - chunk_location.index_in_chunk;
+  }
+
+  // Value access
+  value_type operator*() const { return *current_array_iterator_; }
+
+  value_type operator[](difference_type n) const {
+    auto chunk_location = GetChunkLocation(index_ + n);
+    if (current_chunk_index_ == chunk_location.chunk_index) {
+      return current_array_iterator_[chunk_location.index_in_chunk -
+                                     current_array_iterator_.index()];
+    } else {
+      ArrayIterator<ArrayType> target_iterator{
+          *arrow::internal::checked_pointer_cast<ArrayType>(
+              chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index)))};
+      return target_iterator[chunk_location.index_in_chunk];
+    }
+  }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ChunkedArrayIterator& operator++() {
+    (*this) += 1;
+    return *this;
+  }
+  ChunkedArrayIterator& operator--() {
+    (*this) -= 1;
+    return *this;
+  }
+
+  ChunkedArrayIterator operator++(int) {
+    ChunkedArrayIterator tmp(*this);
+    ++*this;
+    return tmp;
+  }
+  ChunkedArrayIterator operator--(int) {
+    ChunkedArrayIterator tmp(*this);
+    --*this;
+    return tmp;
+  }
+
+  // Arithmetic
+  difference_type operator-(const ChunkedArrayIterator& other) const {
+    return index_ - other.index_;
+  }
+  ChunkedArrayIterator operator+(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ + n);
+  }
+  ChunkedArrayIterator operator-(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ - n);
+  }
+  friend inline ChunkedArrayIterator operator+(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff + other.index_);
+  }
+  friend inline ChunkedArrayIterator operator-(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff - other.index_);
+  }
+  ChunkedArrayIterator& operator+=(difference_type n) {
+    index_ += n;
+    auto chunk_location = GetChunkLocation(index_);
+    if (current_chunk_index_ == chunk_location.chunk_index) {

Review Comment:
   Again, I don't think this `if` branch is needed as constructing an `ArrayIterator` should be cheap.



##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +132,153 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0), current_chunk_index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    auto chunk_location = GetChunkLocation(this->index_);
+    if (chunked_array.length()) {
+      current_array_iterator_ = ArrayIterator<ArrayType>(
+          *arrow::internal::checked_pointer_cast<ArrayType>(
+              chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index))),
+          index_);
+    } else {
+      current_array_iterator_ = {};
+    }
+
+    this->current_chunk_index_ = chunk_location.chunk_index;
+    current_array_iterator_ -= this->index() - chunk_location.index_in_chunk;
+  }
+
+  // Value access
+  value_type operator*() const { return *current_array_iterator_; }
+
+  value_type operator[](difference_type n) const {
+    auto chunk_location = GetChunkLocation(index_ + n);
+    if (current_chunk_index_ == chunk_location.chunk_index) {

Review Comment:
   I don't think this `if` branch is needed, constructing a `ArrayIterator` should be cheap.



##########
cpp/src/arrow/stl_iterator_test.cc:
##########
@@ -248,5 +248,201 @@ TEST(ArrayIterator, StdMerge) {
   ASSERT_EQ(values, expected);
 }
 
+TEST(ChunkedArrayIterator, Basics) {
+  auto result = ChunkedArrayFromJSON(int32(), {R"([4, 5, null])", R"([6])"});
+  auto it = Iterate<Int32Type>(*result);
+  optional<int32_t> v = *it;
+  ASSERT_EQ(v, 4);
+  ASSERT_EQ(it[0], 4);
+  ++it;
+  ASSERT_EQ(it[0], 5);
+  ASSERT_EQ(*it, 5);
+  ASSERT_EQ(it[1], nullopt);
+  ASSERT_EQ(it[2], 6);
+}
+
+TEST(ChunkedArrayIterator, Arithmetic) {
+  auto result = ChunkedArrayFromJSON(int32(), {R"([4, 5, null])", R"([6, null, 7])"});
+
+  auto it = Iterate<Int32Type>(*result);
+  auto it2 = it + 2;
+  ASSERT_EQ(*it, 4);
+  ASSERT_EQ(*it2, nullopt);
+  ASSERT_EQ(it2 - it, 2);
+  ASSERT_EQ(it - it2, -2);
+  auto it3 = it++;
+  ASSERT_EQ(it2 - it, 1);
+  ASSERT_EQ(it2 - it3, 2);
+  ASSERT_EQ(*it3, 4);
+  ASSERT_EQ(*it, 5);
+  auto it4 = ++it;
+  ASSERT_EQ(*it, nullopt);
+  ASSERT_EQ(*it4, nullopt);
+  ASSERT_EQ(it2 - it, 0);
+  ASSERT_EQ(it2 - it4, 0);
+  auto it5 = it + 3;
+  ASSERT_EQ(*it5, 7);
+  ASSERT_EQ(*(it5 - 2), 6);
+  ASSERT_EQ(*(it5 + (-2)), 6);
+  auto it6 = (--it5)--;
+  ASSERT_EQ(*it6, nullopt);
+  ASSERT_EQ(*it5, 6);
+  ASSERT_EQ(it6 - it5, 1);
+}
+
+TEST(ChunkedArrayIterator, Comparison) {
+  auto result = ChunkedArrayFromJSON(int32(), {R"([4, 5, null])", R"([6, null, 7])"});
+
+  auto it = Iterate<Int32Type>(*result) + 2;
+  auto it2 = Iterate<Int32Type>(*result) + 2;
+  auto it3 = Iterate<Int32Type>(*result) + 4;
+
+  ASSERT_TRUE(it == it2);
+  ASSERT_TRUE(it <= it2);
+  ASSERT_TRUE(it >= it2);
+  ASSERT_FALSE(it != it2);
+  ASSERT_FALSE(it < it2);
+  ASSERT_FALSE(it > it2);
+
+  ASSERT_FALSE(it == it3);
+  ASSERT_TRUE(it <= it3);
+  ASSERT_FALSE(it >= it3);
+  ASSERT_TRUE(it != it3);
+  ASSERT_TRUE(it < it3);
+  ASSERT_FALSE(it > it3);
+}
+
+TEST(ChunkedArrayIterator, MultipleChunks) {
+  auto result =
+      ChunkedArrayFromJSON(int32(), {R"([4, 5, null])", R"([6])", R"([7, 9, 10, 8])",
+                                     R"([11, 13])", R"([14])", R"([15])", R"([16])"});
+
+  auto it = Iterate<Int32Type>(*result);
+  ASSERT_EQ(it[8], 11);
+  ASSERT_EQ(it[9], 13);
+  it += 3;
+  ASSERT_EQ(it[0], 6);
+  ++it;
+  ASSERT_EQ(it[0], 7);
+  ASSERT_EQ(*it, 7);
+  ASSERT_EQ(it[1], 9);
+  ASSERT_EQ(it[2], 10);
+  it -= 4;
+  ASSERT_EQ(it[0], 4);
+  ASSERT_EQ(it[1], 5);
+  ASSERT_EQ(it[2], nullopt);
+  ASSERT_EQ(it[3], 6);
+  ASSERT_EQ(it[4], 7);
+  it += 9;
+  ASSERT_EQ(*it, 13);
+  --it;
+  ASSERT_EQ(*it, 11);
+  --it;
+  ASSERT_EQ(*it, 8);
+  ASSERT_EQ(it[0], 8);
+  ASSERT_EQ(it[1], 11);
+  ASSERT_EQ(it[2], 13);
+  ASSERT_EQ(it[3], 14);
+  ASSERT_EQ(it[4], 15);
+  ASSERT_EQ(it[5], 16);
+  ++it;
+  ASSERT_EQ(*it, 11);
+  ASSERT_EQ(it[0], 11);
+  ASSERT_EQ(it[1], 13);
+  ASSERT_EQ(it[2], 14);
+  ASSERT_EQ(it[3], 15);
+  ASSERT_EQ(it[4], 16);
+  ++it;
+  ASSERT_EQ(*it, 13);
+  ASSERT_EQ(it[0], 13);
+  ASSERT_EQ(it[1], 14);
+  ASSERT_EQ(it[2], 15);
+  ASSERT_EQ(it[3], 16);
+  ++it;
+  ++it;
+  ASSERT_EQ(*it, 15);
+  ASSERT_EQ(it[0], 15);
+  ASSERT_EQ(it[1], 16);
+  --it;
+  ASSERT_EQ(*it, 14);
+  ASSERT_EQ(it[0], 14);
+  ASSERT_EQ(it[1], 15);
+  ASSERT_EQ(it[2], 16);
+  --it;
+  ASSERT_EQ(*it, 13);
+  ASSERT_EQ(it[0], 13);
+  ASSERT_EQ(it[1], 14);
+  ASSERT_EQ(it[2], 15);
+  it -= 2;
+  ASSERT_EQ(*it, 8);
+  ASSERT_EQ(it[0], 8);
+  ASSERT_EQ(it[1], 11);
+  ASSERT_EQ(it[2], 13);
+}
+
+TEST(ChunkedArrayIterator, EmptyChunks) {
+  auto result = ChunkedArrayFromJSON(int32(), {});
+  auto it = Iterate<Int32Type>(*result);
+  ASSERT_EQ(*it, nullopt);
+  ASSERT_EQ(it[0], nullopt);
+  it++;
+  ASSERT_EQ(it[0], nullopt);
+  it += 2;
+  ASSERT_EQ(it[1], nullopt);
+
+  result = ChunkedArrayFromJSON(int32(), {R"([4, 5, null])", R"([])", R"([7, 9, 10, 8])",
+                                          R"([11, 13])", R"([])", R"([])", R"([16])"});
+
+  it = Iterate<Int32Type>(*result);
+  ASSERT_EQ(it[8], 13);
+  ASSERT_EQ(it[9], 16);
+  it += 3;
+  ASSERT_EQ(it[0], 7);
+  ++it;
+  ASSERT_EQ(it[0], 9);
+  ASSERT_EQ(*it, 9);
+  ASSERT_EQ(it[1], 10);
+  ASSERT_EQ(it[2], 8);
+  it -= 4;
+  ASSERT_EQ(it[0], 4);
+  ASSERT_EQ(it[1], 5);
+  ASSERT_EQ(it[2], nullopt);
+  ASSERT_EQ(it[3], 7);
+  ASSERT_EQ(it[4], 9);
+  it += 9;
+  ASSERT_EQ(*it, 16);
+  --it;
+  ASSERT_EQ(*it, 13);
+  --it;
+  ASSERT_EQ(*it, 11);
+  ASSERT_EQ(it[0], 11);
+  ASSERT_EQ(it[1], 13);
+  ASSERT_EQ(it[2], 16);
+  ++it;
+  ASSERT_EQ(*it, 13);
+  ASSERT_EQ(it[0], 13);
+  ASSERT_EQ(it[1], 16);
+  ++it;
+  ASSERT_EQ(*it, 16);
+  ASSERT_EQ(it[0], 16);
+  --it;
+  ASSERT_EQ(*it, 13);
+  ASSERT_EQ(it[0], 13);
+  ASSERT_EQ(it[1], 16);
+  --it;
+  ASSERT_EQ(*it, 11);
+  ASSERT_EQ(it[0], 11);
+  ASSERT_EQ(it[1], 13);
+  ASSERT_EQ(it[2], 16);
+  it += 2;
+  ASSERT_EQ(*it, 16);
+  ASSERT_EQ(it[0], 16);
+  it -= 3;
+  ASSERT_EQ(*it, 8);
+  ASSERT_EQ(it[0], 8);
+  ASSERT_EQ(it[1], 11);
+  ASSERT_EQ(it[2], 13);
+}
+

Review Comment:
   Two things:
   1) can you add tests with an empty chunked array?
   2) can you add tests with some standard algorithms as done above for `ArrayIterator`?



##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +132,153 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0), current_chunk_index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    auto chunk_location = GetChunkLocation(this->index_);
+    if (chunked_array.length()) {
+      current_array_iterator_ = ArrayIterator<ArrayType>(
+          *arrow::internal::checked_pointer_cast<ArrayType>(
+              chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index))),
+          index_);

Review Comment:
   This seems more logical:
   ```suggestion
         current_array_iterator_ = ArrayIterator<ArrayType>(
             *arrow::internal::checked_pointer_cast<ArrayType>(
                 chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index))),
             chunk_location.index_in_chunk);
   ```
   ... and then you don't need the subtraction below.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] pitrou commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
pitrou commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r864575696


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +131,148 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0), current_chunk_index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    auto chunk_location = GetChunkLocation(this->index_);
+    current_array_iterator_ =
+        ArrayIterator<ArrayType>(*arrow::internal::checked_pointer_cast<ArrayType>(
+            chunked_array_->chunk(chunk_location.chunk_index)), index_);
+    this->current_chunk_index_ = chunk_location.chunk_index;
+    current_array_iterator_ -=
+        this->index() - chunk_location.index_in_chunk;
+  }
+
+  // Value access
+  value_type operator*() const { return *current_array_iterator_; }
+
+  value_type operator[](difference_type n) const {
+    auto chunk_location = GetChunkLocation(index_ + n);
+    if (current_chunk_index_ == chunk_location.chunk_index) {
+      return current_array_iterator_[chunk_location.index_in_chunk -
+                                     current_array_iterator_.index()];
+    } else {
+      ArrayIterator<ArrayType> target_iterator{
+          *arrow::internal::checked_pointer_cast<ArrayType>(
+              chunked_array_->chunk(chunk_location.chunk_index))};
+      return target_iterator[chunk_location.index_in_chunk];
+    }
+  }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ChunkedArrayIterator& operator++() {
+    (*this) += 1;
+    return *this;
+  }
+  ChunkedArrayIterator& operator--() {
+    (*this) -= 1;
+    return *this;
+  }
+
+  ChunkedArrayIterator operator++(int) {
+    ChunkedArrayIterator tmp(*this);
+    ++*this;
+    return tmp;
+  }
+  ChunkedArrayIterator operator--(int) {
+    ChunkedArrayIterator tmp(*this);
+    --*this;
+    return tmp;
+  }
+
+  // Arithmetic
+  difference_type operator-(const ChunkedArrayIterator& other) const {
+    return index_ - other.index_;
+  }
+  ChunkedArrayIterator operator+(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ + n);
+  }
+  ChunkedArrayIterator operator-(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ - n);
+  }
+  friend inline ChunkedArrayIterator operator+(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff + other.index_);
+  }
+  friend inline ChunkedArrayIterator operator-(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff - other.index_);
+  }
+  ChunkedArrayIterator& operator+=(difference_type n) {
+    index_ += n;
+    auto chunk_location = GetChunkLocation(index_);
+    if (current_chunk_index_ == chunk_location.chunk_index) {
+      current_array_iterator_ -=
+          current_array_iterator_.index() - chunk_location.index_in_chunk;
+    } else {
+      current_array_iterator_ =
+          ArrayIterator<ArrayType>(*arrow::internal::checked_pointer_cast<ArrayType>(
+                                       chunked_array_->chunk(chunk_location.chunk_index)),
+                                   chunk_location.index_in_chunk);
+      current_chunk_index_ = chunk_location.chunk_index;
+    }
+    return *this;
+  }
+  ChunkedArrayIterator& operator-=(difference_type n) {
+    (*this) += -n;
+    return *this;
+  }
+
+  // Comparisons
+  bool operator==(const ChunkedArrayIterator& other) const {
+    return index_ == other.index_;
+  }
+  bool operator!=(const ChunkedArrayIterator& other) const {
+    return index_ != other.index_;
+  }
+  bool operator<(const ChunkedArrayIterator& other) const {
+    return index_ < other.index_;
+  }
+  bool operator>(const ChunkedArrayIterator& other) const {
+    return index_ > other.index_;
+  }
+  bool operator<=(const ChunkedArrayIterator& other) const {
+    return index_ <= other.index_;
+  }
+  bool operator>=(const ChunkedArrayIterator& other) const {
+    return index_ >= other.index_;
+  }
+
+ private:
+  arrow::internal::ChunkLocation GetChunkLocation(int64_t index) const {
+    return chunked_array_->chunk_resolver_.Resolve(index);
+  }
+
+  const ChunkedArray* chunked_array_;
+  int64_t index_;
+  int64_t current_chunk_index_;
+  ArrayIterator<ArrayType> current_array_iterator_;
+};
+
+template <typename Type, typename ArrayType = typename TypeTraits<Type>::ArrayType>
+ArrayIterator<ArrayType> Iterate(const Array& array) {
+  return stl::ArrayIterator<ArrayType>(&array);
+}
+
+template <typename Type, typename ArrayType = typename TypeTraits<Type>::ArrayType>
+ChunkedArrayIterator<ArrayType> Iterate(const ChunkedArray& chunked_array) {
+  return stl::ChunkedArrayIterator<ArrayType>(chunked_array);
+}

Review Comment:
   @edponce Iterating is what this API is meant for. We have a separate `Iterator` concept in Arrow C++ that we don't want to mix up with this.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] AlvinJ15 commented on a diff in pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
AlvinJ15 commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r865043510


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +132,153 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0), current_chunk_index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    auto chunk_location = GetChunkLocation(this->index_);
+    if (chunked_array.length()) {
+      current_array_iterator_ = ArrayIterator<ArrayType>(
+          *arrow::internal::checked_pointer_cast<ArrayType>(
+              chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index))),
+          index_);
+    } else {
+      current_array_iterator_ = {};
+    }
+
+    this->current_chunk_index_ = chunk_location.chunk_index;
+    current_array_iterator_ -= this->index() - chunk_location.index_in_chunk;
+  }
+
+  // Value access
+  value_type operator*() const { return *current_array_iterator_; }
+
+  value_type operator[](difference_type n) const {
+    auto chunk_location = GetChunkLocation(index_ + n);
+    if (current_chunk_index_ == chunk_location.chunk_index) {
+      return current_array_iterator_[chunk_location.index_in_chunk -
+                                     current_array_iterator_.index()];
+    } else {
+      ArrayIterator<ArrayType> target_iterator{
+          *arrow::internal::checked_pointer_cast<ArrayType>(
+              chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index)))};
+      return target_iterator[chunk_location.index_in_chunk];
+    }
+  }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ChunkedArrayIterator& operator++() {
+    (*this) += 1;
+    return *this;
+  }
+  ChunkedArrayIterator& operator--() {
+    (*this) -= 1;
+    return *this;
+  }

Review Comment:
   Extra conditions would have to be added to control overflows, so I better call the `operator +=` which already has the necessary conditions, and also the ChunkResolver implementation managing a cache which avoids repeated searches when it is in the same chunk



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] edponce commented on pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
edponce commented on PR #13009:
URL: https://github.com/apache/arrow/pull/13009#issuecomment-1113736711

   A solution is to have the factory `MakeIterator` not be templated and check the type of the `chunked_array` at runtime, and use a switch-case to dispatch the correct `ChunkedArrayIterator`.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] AlvinJ15 commented on pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
AlvinJ15 commented on PR #13009:
URL: https://github.com/apache/arrow/pull/13009#issuecomment-1118097502

   @lidavidm @pitrou @edponce How can I check why it is failing just in Windows?


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] pitrou commented on pull request #13009: ARROW-602: [C++] Provide iterator access to primitive elements inside an Array

Posted by GitBox <gi...@apache.org>.
pitrou commented on PR #13009:
URL: https://github.com/apache/arrow/pull/13009#issuecomment-1118508742

   Hmm, I think my `Iterate` suggestion was a bit too vague. To satisfy the requirements for iteration and for call STL algorithms, we need "begin" and "end" iterators. Basically, I would suggest something like:
   ```c++
   /// Return an iterator to the beginning of the chunked array
   template <typename Type, typename ArrayType = typename TypeTraits<Type>::ArrayType>
   ChunkedArrayIterator<ArrayType> Begin(const ChunkedArray& chunked_array) {
     return stl::ChunkedArrayIterator<ArrayType>(chunked_array);
   }
   
   /// Return an iterator to the end of the chunked array
   template <typename Type, typename ArrayType = typename TypeTraits<Type>::ArrayType>
   ChunkedArrayIterator<ArrayType> End(const ChunkedArray& chunked_array) {
     return stl::ChunkedArrayIterator<ArrayType>(chunked_array, chunked_array.length());
   }
   
   template <typename ArrayType>
   struct ChunkedArrayRange {
     const ChunkedArray* chunked_array;
   
     ChunkedArrayIterator<ArrayType> begin() {
       return stl::ChunkedArrayIterator<ArrayType>(*chunked_array);
     }
     ChunkedArrayIterator<ArrayType> end() {
       return stl::ChunkedArrayIterator<ArrayType>(*chunked_array, chunked_array.length());
     }
   };
   
   /// Return an iterable range over the chunked array
   ///
   /// This makes it easy to use a chunked array in a for-range construct.
   template <typename Type, typename ArrayType = typename TypeTraits<Type>::ArrayType>
   ChunkedArrayRange<ArrayType> Iterate(const ChunkedArray& chunked_array) {
     return stl::ChunkedArrayRange<ArrayType>(&chunked_array);
   }
   ```
   
   This way you can write:
   ```c++
   for (util::optional<int64_t> : Iterate<Int64Type>(chunked_array)) {
     ...
   ```
   
   as well as:
   ```c++
   // Find the first item that's non-null and at least 42
   auto it = std::find_if(
       Begin<Int64Type>(chunked_array), End<Int64Type>(chunked_array),
       [util::optional<int64_t> item]() { return item.has_value() && *item >= 42});
   ```
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org