You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/10/25 18:29:13 UTC

[GitHub] [arrow] WillAyd opened a new pull request, #14505: Initial Rank Implementation for ChunkedArray

WillAyd opened a new pull request, #14505:
URL: https://github.com/apache/arrow/pull/14505

   Initial implementation - could use some guidance on a few points


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] pitrou commented on a diff in pull request #14505: ARROW-16707 [C++] Implement Rank kernel on chunked arrays

Posted by GitBox <gi...@apache.org>.
pitrou commented on code in PR #14505:
URL: https://github.com/apache/arrow/pull/14505#discussion_r1019313438


##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -2078,6 +2078,261 @@ class ArrayRanker : public TypeVisitor {
   Datum* output_;
 };
 
+class ChunkedArrayRanker : public TypeVisitor {
+ public:
+  // TODO: here we accept order / null_placement / tiebreaker as separate arguments
+  // whereas the ArrayRanker accepts them as the RankOptions struct; this is consistent
+  // with ArraySorter / ChunkedArraySorter, so likely should refactor ArrayRanker
+  ChunkedArrayRanker(ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end,
+                     const ChunkedArray& chunked_array, const SortOrder order,
+                     const NullPlacement null_placement,
+                     const RankOptions::Tiebreaker tiebreaker, Datum* output)
+      : TypeVisitor(),
+        ctx_(ctx),
+        indices_begin_(indices_begin),
+        indices_end_(indices_end),
+        chunked_array_(chunked_array),
+        physical_type_(GetPhysicalType(chunked_array.type())),
+        physical_chunks_(GetPhysicalChunks(chunked_array_, physical_type_)),
+        order_(order),
+        null_placement_(null_placement),
+        tiebreaker_(tiebreaker),
+        output_(output) {}
+
+  Status Run() { return physical_type_->Accept(this); }
+
+#define VISIT(TYPE) \
+  Status Visit(const TYPE& type) { return RankInternal<TYPE>(); }
+
+  VISIT_SORTABLE_PHYSICAL_TYPES(VISIT)
+
+#undef VISIT
+
+  template <typename InType>
+  Status RankInternal() {
+    using T = typename GetViewType<InType>::T;
+    using ArrayType = typename TypeTraits<InType>::ArrayType;
+
+    const auto num_chunks = chunked_array_.num_chunks();
+    if (num_chunks == 0) {
+      return Status::OK();
+    }
+    const auto arrays = GetArrayPointers(physical_chunks_);
+
+    ArraySortOptions array_options(order_, null_placement_);
+
+    ARROW_ASSIGN_OR_RAISE(auto array_sorter, GetArraySorter(*physical_type_));
+
+    // See related ChunkedArraySort method for comments
+    std::vector<NullPartitionResult> sorted(num_chunks);
+    int64_t begin_offset = 0;
+    int64_t end_offset = 0;
+    int64_t null_count = 0;
+    for (int i = 0; i < num_chunks; ++i) {
+      const auto array = checked_cast<const ArrayType*>(arrays[i]);
+      end_offset += array->length();
+      null_count += array->null_count();
+      sorted[i] = array_sorter(indices_begin_ + begin_offset, indices_begin_ + end_offset,
+                               *array, begin_offset, array_options);
+      begin_offset = end_offset;
+    }
+    DCHECK_EQ(end_offset, indices_end_ - indices_begin_);
+
+    auto resolver = ChunkedArrayResolver(arrays);
+    if (sorted.size() > 1) {
+      auto merge_nulls = [&](uint64_t* nulls_begin, uint64_t* nulls_middle,
+                             uint64_t* nulls_end, uint64_t* temp_indices,
+                             int64_t null_count) {
+        if (has_null_like_values<typename ArrayType::TypeClass>::value) {
+          PartitionNullsOnly<StablePartitioner>(nulls_begin, nulls_end, resolver,
+                                                null_count, null_placement_);
+        }
+      };
+      auto merge_non_nulls = [&](uint64_t* range_begin, uint64_t* range_middle,
+                                 uint64_t* range_end, uint64_t* temp_indices) {
+        MergeNonNulls<ArrayType>(range_begin, range_middle, range_end, arrays,
+                                 temp_indices);
+      };
+
+      MergeImpl merge_impl{null_placement_, std::move(merge_nulls),
+                           std::move(merge_non_nulls)};
+      // std::merge is only called on non-null values, so size temp indices accordingly
+      RETURN_NOT_OK(merge_impl.Init(ctx_, indices_end_ - indices_begin_ - null_count));
+
+      while (sorted.size() > 1) {
+        auto out_it = sorted.begin();
+        auto it = sorted.begin();
+        while (it < sorted.end() - 1) {
+          const auto& left = *it++;
+          const auto& right = *it++;
+          DCHECK_EQ(left.overall_end(), right.overall_begin());
+          const auto merged = merge_impl.Merge(left, right, null_count);
+          *out_it++ = merged;
+        }
+        if (it < sorted.end()) {
+          *out_it++ = *it++;
+        }
+        sorted.erase(out_it, sorted.end());
+      }
+    }
+
+    DCHECK_EQ(sorted.size(), 1);
+    DCHECK_EQ(sorted[0].overall_begin(), indices_begin_);
+    DCHECK_EQ(sorted[0].overall_end(), indices_end_);
+    // Note that "nulls" can also include NaNs, hence the >= check
+    DCHECK_GE(sorted[0].null_count(), null_count);
+
+    auto length = indices_end_ - indices_begin_;

Review Comment:
   (or a template callable arg of some kind; do note that we use C++17 now so metaprogramming becomes more comfortable :-))



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] WillAyd commented on a diff in pull request #14505: ARROW-16707 [C++] Implement Rank kernel on chunked arrays

Posted by GitBox <gi...@apache.org>.
WillAyd commented on code in PR #14505:
URL: https://github.com/apache/arrow/pull/14505#discussion_r1006264959


##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -2078,6 +2078,293 @@ class ArrayRanker : public TypeVisitor {
   Datum* output_;
 };
 
+class ChunkedArrayRanker : public TypeVisitor {
+ public:
+  // TODO: here we accept order / null_placement / tiebreaker as separate arguments
+  // whereas the ArrayRanker accepts them as the RankOptions struct; this is consistent
+  // with ArraySorter / ChunkedArraySorter, so likely should refactor ArrayRanker
+  ChunkedArrayRanker(ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end,
+                     const ChunkedArray& chunked_array, const SortOrder order,
+                     const NullPlacement null_placement, const RankOptions::Tiebreaker tiebreaker, Datum* output)
+      : TypeVisitor(),
+        ctx_(ctx),
+        indices_begin_(indices_begin),
+        indices_end_(indices_end),
+        chunked_array_(chunked_array),
+        physical_type_(GetPhysicalType(chunked_array.type())),
+        physical_chunks_(GetPhysicalChunks(chunked_array_, physical_type_)),
+        order_(order),
+        null_placement_(null_placement),
+        tiebreaker_(tiebreaker),
+        output_(output) {}
+
+  Status Run() { return physical_type_->Accept(this); }
+
+#define VISIT(TYPE) \
+  Status Visit(const TYPE& type) { return RankInternal<TYPE>(); }
+
+  VISIT_SORTABLE_PHYSICAL_TYPES(VISIT)
+
+#undef VISIT
+
+  template <typename InType>
+  Status RankInternal() {
+    using GetView = GetViewType<InType>;
+    using T = typename GetViewType<InType>::T;
+    using ArrayType = typename TypeTraits<InType>::ArrayType;
+
+    const auto num_chunks = chunked_array_.num_chunks();
+    if (num_chunks == 0) {
+      return Status::OK();
+    }
+    const auto arrays = GetArrayPointers(physical_chunks_);
+
+    ArraySortOptions array_options(order_, null_placement_);
+
+    ARROW_ASSIGN_OR_RAISE(auto array_sorter, GetArraySorter(*physical_type_));
+
+    // See related ChunkedArraySort method for comments
+    std::vector<NullPartitionResult> sorted(num_chunks);
+    int64_t begin_offset = 0;
+    int64_t end_offset = 0;
+    int64_t null_count = 0;
+    for (int i = 0; i < num_chunks; ++i) {
+      const auto array = checked_cast<const ArrayType*>(arrays[i]);
+      end_offset += array->length();
+      null_count += array->null_count();
+      sorted[i] = array_sorter(indices_begin_ + begin_offset, indices_begin_ + end_offset,
+                               *array, begin_offset, array_options);
+      begin_offset = end_offset;
+    }
+    DCHECK_EQ(end_offset, indices_end_ - indices_begin_);
+
+    if (sorted.size() > 1) {
+      auto merge_nulls = [&](uint64_t* nulls_begin, uint64_t* nulls_middle,
+                             uint64_t* nulls_end, uint64_t* temp_indices,
+                             int64_t null_count) {
+        if (has_null_like_values<typename ArrayType::TypeClass>::value) {
+          PartitionNullsOnly<StablePartitioner>(nulls_begin, nulls_end,
+                                                ChunkedArrayResolver(arrays), null_count,
+                                                null_placement_);
+        }
+      };
+      auto merge_non_nulls = [&](uint64_t* range_begin, uint64_t* range_middle,
+                                 uint64_t* range_end, uint64_t* temp_indices) {
+        MergeNonNulls<ArrayType>(range_begin, range_middle, range_end, arrays,
+                                 temp_indices);
+      };
+
+      MergeImpl merge_impl{null_placement_, std::move(merge_nulls),
+                           std::move(merge_non_nulls)};
+      // std::merge is only called on non-null values, so size temp indices accordingly
+      RETURN_NOT_OK(merge_impl.Init(ctx_, indices_end_ - indices_begin_ - null_count));
+
+      while (sorted.size() > 1) {
+        auto out_it = sorted.begin();
+        auto it = sorted.begin();
+        while (it < sorted.end() - 1) {
+          const auto& left = *it++;
+          const auto& right = *it++;
+          DCHECK_EQ(left.overall_end(), right.overall_begin());
+          const auto merged = merge_impl.Merge(left, right, null_count);
+          *out_it++ = merged;
+        }
+        if (it < sorted.end()) {
+          *out_it++ = *it++;
+        }
+        sorted.erase(out_it, sorted.end());
+      }
+    }
+
+    DCHECK_EQ(sorted.size(), 1);
+    DCHECK_EQ(sorted[0].overall_begin(), indices_begin_);
+    DCHECK_EQ(sorted[0].overall_end(), indices_end_);
+    // Note that "nulls" can also include NaNs, hence the >= check
+    DCHECK_GE(sorted[0].null_count(), null_count);
+
+    auto length = indices_end_ - indices_begin_;
+    ARROW_ASSIGN_OR_RAISE(auto rankings,
+                          MakeMutableUInt64Array(uint64(), length, ctx_->memory_pool()));
+    auto out_begin = rankings->GetMutableValues<uint64_t>(1);
+    uint64_t rank;
+
+    switch (tiebreaker_) {
+      case RankOptions::Dense: {
+        T curr_value, prev_value{};
+        rank = 0;
+
+        if (null_placement_ == NullPlacement::AtStart && sorted[0].null_count() > 0) {
+          rank++;
+          for (auto it = sorted[0].nulls_begin; it < sorted[0].nulls_end; it++) {
+            out_begin[*it] = rank;
+          }
+        }
+
+        for (auto it = sorted[0].non_nulls_begin; it < sorted[0].non_nulls_end; it++) {
+          // Below code wasn't working for string specialization as -> value returned a buffer
+          // but T is a basic_string_view
+          // using ScalarType = typename TypeTraits<InType>::ScalarType;
+          // auto scalar = std::dynamic_pointer_cast<ScalarType>(
+          //   chunked_array_.GetScalar(*it).ValueOrDie());
+          // curr_value = scalar->value;
+          // TODO: can we use chunk_resolver_ from chunked array externally?          
+          if (*it >= 2) {

Review Comment:
   My current best guess is to execute `take` with the indices and iterate over that result. I don't know that I've seen that pattern internally yet so happy to go a different route if that is too heavy handed



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] WillAyd closed pull request #14505: ARROW-16707 [C++] Implement Rank kernel on chunked arrays

Posted by "WillAyd (via GitHub)" <gi...@apache.org>.
WillAyd closed pull request #14505: ARROW-16707 [C++] Implement Rank kernel on chunked arrays
URL: https://github.com/apache/arrow/pull/14505


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] lidavidm commented on pull request #14505: ARROW-16707 [C++] Implement Rank kernel on chunked arrays

Posted by GitBox <gi...@apache.org>.
lidavidm commented on PR #14505:
URL: https://github.com/apache/arrow/pull/14505#issuecomment-1310335239

   Hey - I'll try to get to this soon but it may be a few more days


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] WillAyd commented on a diff in pull request #14505: ARROW-16707 [C++] Implement Rank kernel on chunked arrays

Posted by GitBox <gi...@apache.org>.
WillAyd commented on code in PR #14505:
URL: https://github.com/apache/arrow/pull/14505#discussion_r1059187675


##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -2078,6 +2078,293 @@ class ArrayRanker : public TypeVisitor {
   Datum* output_;
 };
 
+class ChunkedArrayRanker : public TypeVisitor {
+ public:
+  // TODO: here we accept order / null_placement / tiebreaker as separate arguments
+  // whereas the ArrayRanker accepts them as the RankOptions struct; this is consistent
+  // with ArraySorter / ChunkedArraySorter, so likely should refactor ArrayRanker
+  ChunkedArrayRanker(ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end,
+                     const ChunkedArray& chunked_array, const SortOrder order,
+                     const NullPlacement null_placement, const RankOptions::Tiebreaker tiebreaker, Datum* output)
+      : TypeVisitor(),
+        ctx_(ctx),
+        indices_begin_(indices_begin),
+        indices_end_(indices_end),
+        chunked_array_(chunked_array),
+        physical_type_(GetPhysicalType(chunked_array.type())),
+        physical_chunks_(GetPhysicalChunks(chunked_array_, physical_type_)),
+        order_(order),
+        null_placement_(null_placement),
+        tiebreaker_(tiebreaker),
+        output_(output) {}
+
+  Status Run() { return physical_type_->Accept(this); }
+
+#define VISIT(TYPE) \
+  Status Visit(const TYPE& type) { return RankInternal<TYPE>(); }
+
+  VISIT_SORTABLE_PHYSICAL_TYPES(VISIT)
+
+#undef VISIT
+
+  template <typename InType>
+  Status RankInternal() {
+    using GetView = GetViewType<InType>;
+    using T = typename GetViewType<InType>::T;
+    using ArrayType = typename TypeTraits<InType>::ArrayType;
+
+    const auto num_chunks = chunked_array_.num_chunks();
+    if (num_chunks == 0) {
+      return Status::OK();
+    }
+    const auto arrays = GetArrayPointers(physical_chunks_);
+
+    ArraySortOptions array_options(order_, null_placement_);
+
+    ARROW_ASSIGN_OR_RAISE(auto array_sorter, GetArraySorter(*physical_type_));
+
+    // See related ChunkedArraySort method for comments

Review Comment:
   I've been jumping through the `ArraySorter` a bit and am not sure of a really clean way to make that work. I looked at both templating the existing `ArraySortFunc` as well as creating a new `GetChunkedArraySorter` factory, but I think both are limited by the fact that `ArraySortFunc` requires an `Array& argument`, so would have to overhaul things significantly to share code.
   
   The other caveat is that the `ChunkedArraySorter` and `TableSorter` implementations right now both internally manage their own implementations to merge the `NullPartitionResult` vector. 
   
   I'm thinking of creating a visitor that returns the sorted NullPartitionResult, though even then am a bit unsure as the ChunkedArray / Table sort classes currently write the sorted output array as they step through their internal merge implementations



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] WillAyd commented on pull request #14505: ARROW-16707 [C++] Implement Rank kernel on chunked arrays

Posted by GitBox <gi...@apache.org>.
WillAyd commented on PR #14505:
URL: https://github.com/apache/arrow/pull/14505#issuecomment-1307661130

   @pitrou @lidavidm if either of you have time to review would be appreciated as always. I think the CI failures are unrelated


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] pitrou commented on a diff in pull request #14505: ARROW-16707 [C++] Implement Rank kernel on chunked arrays

Posted by GitBox <gi...@apache.org>.
pitrou commented on code in PR #14505:
URL: https://github.com/apache/arrow/pull/14505#discussion_r1019265314


##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -2078,6 +2078,261 @@ class ArrayRanker : public TypeVisitor {
   Datum* output_;
 };
 
+class ChunkedArrayRanker : public TypeVisitor {
+ public:
+  // TODO: here we accept order / null_placement / tiebreaker as separate arguments
+  // whereas the ArrayRanker accepts them as the RankOptions struct; this is consistent
+  // with ArraySorter / ChunkedArraySorter, so likely should refactor ArrayRanker
+  ChunkedArrayRanker(ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end,
+                     const ChunkedArray& chunked_array, const SortOrder order,
+                     const NullPlacement null_placement,
+                     const RankOptions::Tiebreaker tiebreaker, Datum* output)
+      : TypeVisitor(),
+        ctx_(ctx),
+        indices_begin_(indices_begin),
+        indices_end_(indices_end),
+        chunked_array_(chunked_array),
+        physical_type_(GetPhysicalType(chunked_array.type())),
+        physical_chunks_(GetPhysicalChunks(chunked_array_, physical_type_)),
+        order_(order),
+        null_placement_(null_placement),
+        tiebreaker_(tiebreaker),
+        output_(output) {}
+
+  Status Run() { return physical_type_->Accept(this); }
+
+#define VISIT(TYPE) \
+  Status Visit(const TYPE& type) { return RankInternal<TYPE>(); }
+
+  VISIT_SORTABLE_PHYSICAL_TYPES(VISIT)
+
+#undef VISIT
+
+  template <typename InType>
+  Status RankInternal() {
+    using T = typename GetViewType<InType>::T;
+    using ArrayType = typename TypeTraits<InType>::ArrayType;
+
+    const auto num_chunks = chunked_array_.num_chunks();
+    if (num_chunks == 0) {
+      return Status::OK();
+    }
+    const auto arrays = GetArrayPointers(physical_chunks_);
+
+    ArraySortOptions array_options(order_, null_placement_);
+
+    ARROW_ASSIGN_OR_RAISE(auto array_sorter, GetArraySorter(*physical_type_));
+
+    // See related ChunkedArraySort method for comments
+    std::vector<NullPartitionResult> sorted(num_chunks);
+    int64_t begin_offset = 0;
+    int64_t end_offset = 0;
+    int64_t null_count = 0;
+    for (int i = 0; i < num_chunks; ++i) {
+      const auto array = checked_cast<const ArrayType*>(arrays[i]);
+      end_offset += array->length();
+      null_count += array->null_count();
+      sorted[i] = array_sorter(indices_begin_ + begin_offset, indices_begin_ + end_offset,
+                               *array, begin_offset, array_options);
+      begin_offset = end_offset;
+    }
+    DCHECK_EQ(end_offset, indices_end_ - indices_begin_);
+
+    auto resolver = ChunkedArrayResolver(arrays);
+    if (sorted.size() > 1) {
+      auto merge_nulls = [&](uint64_t* nulls_begin, uint64_t* nulls_middle,
+                             uint64_t* nulls_end, uint64_t* temp_indices,
+                             int64_t null_count) {
+        if (has_null_like_values<typename ArrayType::TypeClass>::value) {
+          PartitionNullsOnly<StablePartitioner>(nulls_begin, nulls_end, resolver,
+                                                null_count, null_placement_);
+        }
+      };
+      auto merge_non_nulls = [&](uint64_t* range_begin, uint64_t* range_middle,
+                                 uint64_t* range_end, uint64_t* temp_indices) {
+        MergeNonNulls<ArrayType>(range_begin, range_middle, range_end, arrays,
+                                 temp_indices);
+      };
+
+      MergeImpl merge_impl{null_placement_, std::move(merge_nulls),
+                           std::move(merge_non_nulls)};
+      // std::merge is only called on non-null values, so size temp indices accordingly
+      RETURN_NOT_OK(merge_impl.Init(ctx_, indices_end_ - indices_begin_ - null_count));
+
+      while (sorted.size() > 1) {
+        auto out_it = sorted.begin();
+        auto it = sorted.begin();
+        while (it < sorted.end() - 1) {
+          const auto& left = *it++;
+          const auto& right = *it++;
+          DCHECK_EQ(left.overall_end(), right.overall_begin());
+          const auto merged = merge_impl.Merge(left, right, null_count);
+          *out_it++ = merged;
+        }
+        if (it < sorted.end()) {
+          *out_it++ = *it++;
+        }
+        sorted.erase(out_it, sorted.end());
+      }
+    }
+
+    DCHECK_EQ(sorted.size(), 1);
+    DCHECK_EQ(sorted[0].overall_begin(), indices_begin_);
+    DCHECK_EQ(sorted[0].overall_end(), indices_end_);
+    // Note that "nulls" can also include NaNs, hence the >= check
+    DCHECK_GE(sorted[0].null_count(), null_count);
+
+    auto length = indices_end_ - indices_begin_;

Review Comment:
   I wonder if it's possible to share this with the contiguous array rank implementation.
   IIUC the only different thing is value access. It should be doable to factor that out in a template class?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] WillAyd commented on a diff in pull request #14505: ARROW-16707 [C++] Implement Rank kernel on chunked arrays

Posted by GitBox <gi...@apache.org>.
WillAyd commented on code in PR #14505:
URL: https://github.com/apache/arrow/pull/14505#discussion_r1019308073


##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -2078,6 +2078,293 @@ class ArrayRanker : public TypeVisitor {
   Datum* output_;
 };
 
+class ChunkedArrayRanker : public TypeVisitor {
+ public:
+  // TODO: here we accept order / null_placement / tiebreaker as separate arguments
+  // whereas the ArrayRanker accepts them as the RankOptions struct; this is consistent
+  // with ArraySorter / ChunkedArraySorter, so likely should refactor ArrayRanker
+  ChunkedArrayRanker(ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end,
+                     const ChunkedArray& chunked_array, const SortOrder order,
+                     const NullPlacement null_placement, const RankOptions::Tiebreaker tiebreaker, Datum* output)
+      : TypeVisitor(),
+        ctx_(ctx),
+        indices_begin_(indices_begin),
+        indices_end_(indices_end),
+        chunked_array_(chunked_array),
+        physical_type_(GetPhysicalType(chunked_array.type())),
+        physical_chunks_(GetPhysicalChunks(chunked_array_, physical_type_)),
+        order_(order),
+        null_placement_(null_placement),
+        tiebreaker_(tiebreaker),
+        output_(output) {}
+
+  Status Run() { return physical_type_->Accept(this); }
+
+#define VISIT(TYPE) \
+  Status Visit(const TYPE& type) { return RankInternal<TYPE>(); }
+
+  VISIT_SORTABLE_PHYSICAL_TYPES(VISIT)
+
+#undef VISIT
+
+  template <typename InType>
+  Status RankInternal() {
+    using GetView = GetViewType<InType>;
+    using T = typename GetViewType<InType>::T;
+    using ArrayType = typename TypeTraits<InType>::ArrayType;
+
+    const auto num_chunks = chunked_array_.num_chunks();
+    if (num_chunks == 0) {
+      return Status::OK();
+    }
+    const auto arrays = GetArrayPointers(physical_chunks_);
+
+    ArraySortOptions array_options(order_, null_placement_);
+
+    ARROW_ASSIGN_OR_RAISE(auto array_sorter, GetArraySorter(*physical_type_));
+
+    // See related ChunkedArraySort method for comments

Review Comment:
   I might try a precursor to this to refactor and make the NullPartition result easily accessible to both Sort and Rank implementations unless you disagree



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] WillAyd commented on a diff in pull request #14505: ARROW-16707 [C++] Implement Rank kernel on chunked arrays

Posted by GitBox <gi...@apache.org>.
WillAyd commented on code in PR #14505:
URL: https://github.com/apache/arrow/pull/14505#discussion_r1004837884


##########
cpp/src/arrow/compute/kernels/vector_sort_test.cc:
##########
@@ -2182,5 +2185,24 @@ TEST(TestRankForFixedSizeBinary, RankFixedSizeBinary) {
       ArrayFromJSON(binary_type, R"(["aaa", "   ", "eee", null, "eee", null, "   "])"));
 }
 
+
+TEST(TestRankForReal, RankRealChunked) {

Review Comment:
   Here I have created the Chunk test as a separate item, which duplicates all of the looping logic. If we didn't want to do that, could also just implement the ChunkedArray constructors in the existing tests. 
   
   I think most tests go with this pattern, so happy to continue and copy over all of the rest of the type tests if that's what we want



##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -2078,6 +2078,293 @@ class ArrayRanker : public TypeVisitor {
   Datum* output_;
 };
 
+class ChunkedArrayRanker : public TypeVisitor {
+ public:
+  // TODO: here we accept order / null_placement / tiebreaker as separate arguments
+  // whereas the ArrayRanker accepts them as the RankOptions struct; this is consistent
+  // with ArraySorter / ChunkedArraySorter, so likely should refactor ArrayRanker
+  ChunkedArrayRanker(ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end,
+                     const ChunkedArray& chunked_array, const SortOrder order,
+                     const NullPlacement null_placement, const RankOptions::Tiebreaker tiebreaker, Datum* output)
+      : TypeVisitor(),
+        ctx_(ctx),
+        indices_begin_(indices_begin),
+        indices_end_(indices_end),
+        chunked_array_(chunked_array),
+        physical_type_(GetPhysicalType(chunked_array.type())),
+        physical_chunks_(GetPhysicalChunks(chunked_array_, physical_type_)),
+        order_(order),
+        null_placement_(null_placement),
+        tiebreaker_(tiebreaker),
+        output_(output) {}
+
+  Status Run() { return physical_type_->Accept(this); }
+
+#define VISIT(TYPE) \
+  Status Visit(const TYPE& type) { return RankInternal<TYPE>(); }
+
+  VISIT_SORTABLE_PHYSICAL_TYPES(VISIT)
+
+#undef VISIT
+
+  template <typename InType>
+  Status RankInternal() {
+    using GetView = GetViewType<InType>;
+    using T = typename GetViewType<InType>::T;
+    using ArrayType = typename TypeTraits<InType>::ArrayType;
+
+    const auto num_chunks = chunked_array_.num_chunks();
+    if (num_chunks == 0) {
+      return Status::OK();
+    }
+    const auto arrays = GetArrayPointers(physical_chunks_);
+
+    ArraySortOptions array_options(order_, null_placement_);
+
+    ARROW_ASSIGN_OR_RAISE(auto array_sorter, GetArraySorter(*physical_type_));
+
+    // See related ChunkedArraySort method for comments

Review Comment:
   This and the next 20 lines of merging code is copied from the ChunkedArraySort implementation. Could refactor to make it a function each class could call instead I think



##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -2078,6 +2078,293 @@ class ArrayRanker : public TypeVisitor {
   Datum* output_;
 };
 
+class ChunkedArrayRanker : public TypeVisitor {
+ public:
+  // TODO: here we accept order / null_placement / tiebreaker as separate arguments
+  // whereas the ArrayRanker accepts them as the RankOptions struct; this is consistent
+  // with ArraySorter / ChunkedArraySorter, so likely should refactor ArrayRanker
+  ChunkedArrayRanker(ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end,
+                     const ChunkedArray& chunked_array, const SortOrder order,
+                     const NullPlacement null_placement, const RankOptions::Tiebreaker tiebreaker, Datum* output)
+      : TypeVisitor(),
+        ctx_(ctx),
+        indices_begin_(indices_begin),
+        indices_end_(indices_end),
+        chunked_array_(chunked_array),
+        physical_type_(GetPhysicalType(chunked_array.type())),
+        physical_chunks_(GetPhysicalChunks(chunked_array_, physical_type_)),
+        order_(order),
+        null_placement_(null_placement),
+        tiebreaker_(tiebreaker),
+        output_(output) {}
+
+  Status Run() { return physical_type_->Accept(this); }
+
+#define VISIT(TYPE) \
+  Status Visit(const TYPE& type) { return RankInternal<TYPE>(); }
+
+  VISIT_SORTABLE_PHYSICAL_TYPES(VISIT)
+
+#undef VISIT
+
+  template <typename InType>
+  Status RankInternal() {
+    using GetView = GetViewType<InType>;
+    using T = typename GetViewType<InType>::T;
+    using ArrayType = typename TypeTraits<InType>::ArrayType;
+
+    const auto num_chunks = chunked_array_.num_chunks();
+    if (num_chunks == 0) {
+      return Status::OK();
+    }
+    const auto arrays = GetArrayPointers(physical_chunks_);
+
+    ArraySortOptions array_options(order_, null_placement_);
+
+    ARROW_ASSIGN_OR_RAISE(auto array_sorter, GetArraySorter(*physical_type_));
+
+    // See related ChunkedArraySort method for comments
+    std::vector<NullPartitionResult> sorted(num_chunks);
+    int64_t begin_offset = 0;
+    int64_t end_offset = 0;
+    int64_t null_count = 0;
+    for (int i = 0; i < num_chunks; ++i) {
+      const auto array = checked_cast<const ArrayType*>(arrays[i]);
+      end_offset += array->length();
+      null_count += array->null_count();
+      sorted[i] = array_sorter(indices_begin_ + begin_offset, indices_begin_ + end_offset,
+                               *array, begin_offset, array_options);
+      begin_offset = end_offset;
+    }
+    DCHECK_EQ(end_offset, indices_end_ - indices_begin_);
+
+    if (sorted.size() > 1) {
+      auto merge_nulls = [&](uint64_t* nulls_begin, uint64_t* nulls_middle,
+                             uint64_t* nulls_end, uint64_t* temp_indices,
+                             int64_t null_count) {
+        if (has_null_like_values<typename ArrayType::TypeClass>::value) {
+          PartitionNullsOnly<StablePartitioner>(nulls_begin, nulls_end,
+                                                ChunkedArrayResolver(arrays), null_count,
+                                                null_placement_);
+        }
+      };
+      auto merge_non_nulls = [&](uint64_t* range_begin, uint64_t* range_middle,
+                                 uint64_t* range_end, uint64_t* temp_indices) {
+        MergeNonNulls<ArrayType>(range_begin, range_middle, range_end, arrays,
+                                 temp_indices);
+      };
+
+      MergeImpl merge_impl{null_placement_, std::move(merge_nulls),
+                           std::move(merge_non_nulls)};
+      // std::merge is only called on non-null values, so size temp indices accordingly
+      RETURN_NOT_OK(merge_impl.Init(ctx_, indices_end_ - indices_begin_ - null_count));
+
+      while (sorted.size() > 1) {
+        auto out_it = sorted.begin();
+        auto it = sorted.begin();
+        while (it < sorted.end() - 1) {
+          const auto& left = *it++;
+          const auto& right = *it++;
+          DCHECK_EQ(left.overall_end(), right.overall_begin());
+          const auto merged = merge_impl.Merge(left, right, null_count);
+          *out_it++ = merged;
+        }
+        if (it < sorted.end()) {
+          *out_it++ = *it++;
+        }
+        sorted.erase(out_it, sorted.end());
+      }
+    }
+
+    DCHECK_EQ(sorted.size(), 1);
+    DCHECK_EQ(sorted[0].overall_begin(), indices_begin_);
+    DCHECK_EQ(sorted[0].overall_end(), indices_end_);
+    // Note that "nulls" can also include NaNs, hence the >= check
+    DCHECK_GE(sorted[0].null_count(), null_count);
+
+    auto length = indices_end_ - indices_begin_;
+    ARROW_ASSIGN_OR_RAISE(auto rankings,
+                          MakeMutableUInt64Array(uint64(), length, ctx_->memory_pool()));
+    auto out_begin = rankings->GetMutableValues<uint64_t>(1);
+    uint64_t rank;
+
+    switch (tiebreaker_) {
+      case RankOptions::Dense: {
+        T curr_value, prev_value{};
+        rank = 0;
+
+        if (null_placement_ == NullPlacement::AtStart && sorted[0].null_count() > 0) {
+          rank++;
+          for (auto it = sorted[0].nulls_begin; it < sorted[0].nulls_end; it++) {
+            out_begin[*it] = rank;
+          }
+        }
+
+        for (auto it = sorted[0].non_nulls_begin; it < sorted[0].non_nulls_end; it++) {
+          // Below code wasn't working for string specialization as -> value returned a buffer
+          // but T is a basic_string_view
+          // using ScalarType = typename TypeTraits<InType>::ScalarType;
+          // auto scalar = std::dynamic_pointer_cast<ScalarType>(
+          //   chunked_array_.GetScalar(*it).ValueOrDie());
+          // curr_value = scalar->value;
+          // TODO: can we use chunk_resolver_ from chunked array externally?          
+          if (*it >= 2) {

Review Comment:
   This is hacked together just to work for the provided test case but obviously needs a better implementation. I wasn't sure what the best approach was to get the logical value from a particular index in a ChunkedArray. Alternately we can bypass the ChunkedArray and try to use the `arrays` variable, but I don't know that that would be any cleaner



##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -2078,6 +2078,293 @@ class ArrayRanker : public TypeVisitor {
   Datum* output_;
 };
 
+class ChunkedArrayRanker : public TypeVisitor {
+ public:
+  // TODO: here we accept order / null_placement / tiebreaker as separate arguments
+  // whereas the ArrayRanker accepts them as the RankOptions struct; this is consistent
+  // with ArraySorter / ChunkedArraySorter, so likely should refactor ArrayRanker
+  ChunkedArrayRanker(ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end,
+                     const ChunkedArray& chunked_array, const SortOrder order,
+                     const NullPlacement null_placement, const RankOptions::Tiebreaker tiebreaker, Datum* output)
+      : TypeVisitor(),
+        ctx_(ctx),
+        indices_begin_(indices_begin),
+        indices_end_(indices_end),
+        chunked_array_(chunked_array),
+        physical_type_(GetPhysicalType(chunked_array.type())),
+        physical_chunks_(GetPhysicalChunks(chunked_array_, physical_type_)),
+        order_(order),
+        null_placement_(null_placement),
+        tiebreaker_(tiebreaker),
+        output_(output) {}
+
+  Status Run() { return physical_type_->Accept(this); }
+
+#define VISIT(TYPE) \
+  Status Visit(const TYPE& type) { return RankInternal<TYPE>(); }
+
+  VISIT_SORTABLE_PHYSICAL_TYPES(VISIT)
+
+#undef VISIT
+
+  template <typename InType>
+  Status RankInternal() {
+    using GetView = GetViewType<InType>;
+    using T = typename GetViewType<InType>::T;
+    using ArrayType = typename TypeTraits<InType>::ArrayType;
+
+    const auto num_chunks = chunked_array_.num_chunks();
+    if (num_chunks == 0) {
+      return Status::OK();
+    }
+    const auto arrays = GetArrayPointers(physical_chunks_);
+
+    ArraySortOptions array_options(order_, null_placement_);
+
+    ARROW_ASSIGN_OR_RAISE(auto array_sorter, GetArraySorter(*physical_type_));
+
+    // See related ChunkedArraySort method for comments
+    std::vector<NullPartitionResult> sorted(num_chunks);
+    int64_t begin_offset = 0;
+    int64_t end_offset = 0;
+    int64_t null_count = 0;
+    for (int i = 0; i < num_chunks; ++i) {
+      const auto array = checked_cast<const ArrayType*>(arrays[i]);
+      end_offset += array->length();
+      null_count += array->null_count();
+      sorted[i] = array_sorter(indices_begin_ + begin_offset, indices_begin_ + end_offset,
+                               *array, begin_offset, array_options);
+      begin_offset = end_offset;
+    }
+    DCHECK_EQ(end_offset, indices_end_ - indices_begin_);
+
+    if (sorted.size() > 1) {
+      auto merge_nulls = [&](uint64_t* nulls_begin, uint64_t* nulls_middle,
+                             uint64_t* nulls_end, uint64_t* temp_indices,
+                             int64_t null_count) {
+        if (has_null_like_values<typename ArrayType::TypeClass>::value) {
+          PartitionNullsOnly<StablePartitioner>(nulls_begin, nulls_end,
+                                                ChunkedArrayResolver(arrays), null_count,
+                                                null_placement_);
+        }
+      };
+      auto merge_non_nulls = [&](uint64_t* range_begin, uint64_t* range_middle,
+                                 uint64_t* range_end, uint64_t* temp_indices) {
+        MergeNonNulls<ArrayType>(range_begin, range_middle, range_end, arrays,
+                                 temp_indices);
+      };
+
+      MergeImpl merge_impl{null_placement_, std::move(merge_nulls),
+                           std::move(merge_non_nulls)};
+      // std::merge is only called on non-null values, so size temp indices accordingly
+      RETURN_NOT_OK(merge_impl.Init(ctx_, indices_end_ - indices_begin_ - null_count));
+
+      while (sorted.size() > 1) {
+        auto out_it = sorted.begin();
+        auto it = sorted.begin();
+        while (it < sorted.end() - 1) {
+          const auto& left = *it++;
+          const auto& right = *it++;
+          DCHECK_EQ(left.overall_end(), right.overall_begin());
+          const auto merged = merge_impl.Merge(left, right, null_count);
+          *out_it++ = merged;
+        }
+        if (it < sorted.end()) {
+          *out_it++ = *it++;
+        }
+        sorted.erase(out_it, sorted.end());
+      }
+    }
+
+    DCHECK_EQ(sorted.size(), 1);
+    DCHECK_EQ(sorted[0].overall_begin(), indices_begin_);
+    DCHECK_EQ(sorted[0].overall_end(), indices_end_);
+    // Note that "nulls" can also include NaNs, hence the >= check
+    DCHECK_GE(sorted[0].null_count(), null_count);
+
+    auto length = indices_end_ - indices_begin_;
+    ARROW_ASSIGN_OR_RAISE(auto rankings,
+                          MakeMutableUInt64Array(uint64(), length, ctx_->memory_pool()));
+    auto out_begin = rankings->GetMutableValues<uint64_t>(1);
+    uint64_t rank;
+
+    switch (tiebreaker_) {

Review Comment:
   Except for nuance around how to retrieve a value from an array at a particular index in a performant manner, this Rank algorithm is copy / paste from the `ArrayRanker` class, so likely can refactor into a function outside of the classes



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] WillAyd commented on pull request #14505: ARROW-16707 [C++] Implement Rank kernel on chunked arrays

Posted by "WillAyd (via GitHub)" <gi...@apache.org>.
WillAyd commented on PR #14505:
URL: https://github.com/apache/arrow/pull/14505#issuecomment-1403936332

   Superseded by #33846


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] WillAyd commented on a diff in pull request #14505: ARROW-16707 [C++] Implement Rank kernel on chunked arrays

Posted by GitBox <gi...@apache.org>.
WillAyd commented on code in PR #14505:
URL: https://github.com/apache/arrow/pull/14505#discussion_r1006264959


##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -2078,6 +2078,293 @@ class ArrayRanker : public TypeVisitor {
   Datum* output_;
 };
 
+class ChunkedArrayRanker : public TypeVisitor {
+ public:
+  // TODO: here we accept order / null_placement / tiebreaker as separate arguments
+  // whereas the ArrayRanker accepts them as the RankOptions struct; this is consistent
+  // with ArraySorter / ChunkedArraySorter, so likely should refactor ArrayRanker
+  ChunkedArrayRanker(ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end,
+                     const ChunkedArray& chunked_array, const SortOrder order,
+                     const NullPlacement null_placement, const RankOptions::Tiebreaker tiebreaker, Datum* output)
+      : TypeVisitor(),
+        ctx_(ctx),
+        indices_begin_(indices_begin),
+        indices_end_(indices_end),
+        chunked_array_(chunked_array),
+        physical_type_(GetPhysicalType(chunked_array.type())),
+        physical_chunks_(GetPhysicalChunks(chunked_array_, physical_type_)),
+        order_(order),
+        null_placement_(null_placement),
+        tiebreaker_(tiebreaker),
+        output_(output) {}
+
+  Status Run() { return physical_type_->Accept(this); }
+
+#define VISIT(TYPE) \
+  Status Visit(const TYPE& type) { return RankInternal<TYPE>(); }
+
+  VISIT_SORTABLE_PHYSICAL_TYPES(VISIT)
+
+#undef VISIT
+
+  template <typename InType>
+  Status RankInternal() {
+    using GetView = GetViewType<InType>;
+    using T = typename GetViewType<InType>::T;
+    using ArrayType = typename TypeTraits<InType>::ArrayType;
+
+    const auto num_chunks = chunked_array_.num_chunks();
+    if (num_chunks == 0) {
+      return Status::OK();
+    }
+    const auto arrays = GetArrayPointers(physical_chunks_);
+
+    ArraySortOptions array_options(order_, null_placement_);
+
+    ARROW_ASSIGN_OR_RAISE(auto array_sorter, GetArraySorter(*physical_type_));
+
+    // See related ChunkedArraySort method for comments
+    std::vector<NullPartitionResult> sorted(num_chunks);
+    int64_t begin_offset = 0;
+    int64_t end_offset = 0;
+    int64_t null_count = 0;
+    for (int i = 0; i < num_chunks; ++i) {
+      const auto array = checked_cast<const ArrayType*>(arrays[i]);
+      end_offset += array->length();
+      null_count += array->null_count();
+      sorted[i] = array_sorter(indices_begin_ + begin_offset, indices_begin_ + end_offset,
+                               *array, begin_offset, array_options);
+      begin_offset = end_offset;
+    }
+    DCHECK_EQ(end_offset, indices_end_ - indices_begin_);
+
+    if (sorted.size() > 1) {
+      auto merge_nulls = [&](uint64_t* nulls_begin, uint64_t* nulls_middle,
+                             uint64_t* nulls_end, uint64_t* temp_indices,
+                             int64_t null_count) {
+        if (has_null_like_values<typename ArrayType::TypeClass>::value) {
+          PartitionNullsOnly<StablePartitioner>(nulls_begin, nulls_end,
+                                                ChunkedArrayResolver(arrays), null_count,
+                                                null_placement_);
+        }
+      };
+      auto merge_non_nulls = [&](uint64_t* range_begin, uint64_t* range_middle,
+                                 uint64_t* range_end, uint64_t* temp_indices) {
+        MergeNonNulls<ArrayType>(range_begin, range_middle, range_end, arrays,
+                                 temp_indices);
+      };
+
+      MergeImpl merge_impl{null_placement_, std::move(merge_nulls),
+                           std::move(merge_non_nulls)};
+      // std::merge is only called on non-null values, so size temp indices accordingly
+      RETURN_NOT_OK(merge_impl.Init(ctx_, indices_end_ - indices_begin_ - null_count));
+
+      while (sorted.size() > 1) {
+        auto out_it = sorted.begin();
+        auto it = sorted.begin();
+        while (it < sorted.end() - 1) {
+          const auto& left = *it++;
+          const auto& right = *it++;
+          DCHECK_EQ(left.overall_end(), right.overall_begin());
+          const auto merged = merge_impl.Merge(left, right, null_count);
+          *out_it++ = merged;
+        }
+        if (it < sorted.end()) {
+          *out_it++ = *it++;
+        }
+        sorted.erase(out_it, sorted.end());
+      }
+    }
+
+    DCHECK_EQ(sorted.size(), 1);
+    DCHECK_EQ(sorted[0].overall_begin(), indices_begin_);
+    DCHECK_EQ(sorted[0].overall_end(), indices_end_);
+    // Note that "nulls" can also include NaNs, hence the >= check
+    DCHECK_GE(sorted[0].null_count(), null_count);
+
+    auto length = indices_end_ - indices_begin_;
+    ARROW_ASSIGN_OR_RAISE(auto rankings,
+                          MakeMutableUInt64Array(uint64(), length, ctx_->memory_pool()));
+    auto out_begin = rankings->GetMutableValues<uint64_t>(1);
+    uint64_t rank;
+
+    switch (tiebreaker_) {
+      case RankOptions::Dense: {
+        T curr_value, prev_value{};
+        rank = 0;
+
+        if (null_placement_ == NullPlacement::AtStart && sorted[0].null_count() > 0) {
+          rank++;
+          for (auto it = sorted[0].nulls_begin; it < sorted[0].nulls_end; it++) {
+            out_begin[*it] = rank;
+          }
+        }
+
+        for (auto it = sorted[0].non_nulls_begin; it < sorted[0].non_nulls_end; it++) {
+          // Below code wasn't working for string specialization as -> value returned a buffer
+          // but T is a basic_string_view
+          // using ScalarType = typename TypeTraits<InType>::ScalarType;
+          // auto scalar = std::dynamic_pointer_cast<ScalarType>(
+          //   chunked_array_.GetScalar(*it).ValueOrDie());
+          // curr_value = scalar->value;
+          // TODO: can we use chunk_resolver_ from chunked array externally?          
+          if (*it >= 2) {

Review Comment:
   My current best guess is to execute `take` with the indices and iterate over that result via `ChunkedArrayIterator`. I don't know that I've seen that pattern internally yet so happy to go a different route if that is too heavy handed



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] pitrou commented on a diff in pull request #14505: ARROW-16707 [C++] Implement Rank kernel on chunked arrays

Posted by GitBox <gi...@apache.org>.
pitrou commented on code in PR #14505:
URL: https://github.com/apache/arrow/pull/14505#discussion_r1019312562


##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -2078,6 +2078,293 @@ class ArrayRanker : public TypeVisitor {
   Datum* output_;
 };
 
+class ChunkedArrayRanker : public TypeVisitor {
+ public:
+  // TODO: here we accept order / null_placement / tiebreaker as separate arguments
+  // whereas the ArrayRanker accepts them as the RankOptions struct; this is consistent
+  // with ArraySorter / ChunkedArraySorter, so likely should refactor ArrayRanker
+  ChunkedArrayRanker(ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end,
+                     const ChunkedArray& chunked_array, const SortOrder order,
+                     const NullPlacement null_placement, const RankOptions::Tiebreaker tiebreaker, Datum* output)
+      : TypeVisitor(),
+        ctx_(ctx),
+        indices_begin_(indices_begin),
+        indices_end_(indices_end),
+        chunked_array_(chunked_array),
+        physical_type_(GetPhysicalType(chunked_array.type())),
+        physical_chunks_(GetPhysicalChunks(chunked_array_, physical_type_)),
+        order_(order),
+        null_placement_(null_placement),
+        tiebreaker_(tiebreaker),
+        output_(output) {}
+
+  Status Run() { return physical_type_->Accept(this); }
+
+#define VISIT(TYPE) \
+  Status Visit(const TYPE& type) { return RankInternal<TYPE>(); }
+
+  VISIT_SORTABLE_PHYSICAL_TYPES(VISIT)
+
+#undef VISIT
+
+  template <typename InType>
+  Status RankInternal() {
+    using GetView = GetViewType<InType>;
+    using T = typename GetViewType<InType>::T;
+    using ArrayType = typename TypeTraits<InType>::ArrayType;
+
+    const auto num_chunks = chunked_array_.num_chunks();
+    if (num_chunks == 0) {
+      return Status::OK();
+    }
+    const auto arrays = GetArrayPointers(physical_chunks_);
+
+    ArraySortOptions array_options(order_, null_placement_);
+
+    ARROW_ASSIGN_OR_RAISE(auto array_sorter, GetArraySorter(*physical_type_));
+
+    // See related ChunkedArraySort method for comments

Review Comment:
   That sounds like a good idea. Perhaps an intermediate abstraction such as ArraySorter?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] pitrou commented on a diff in pull request #14505: ARROW-16707 [C++] Implement Rank kernel on chunked arrays

Posted by GitBox <gi...@apache.org>.
pitrou commented on code in PR #14505:
URL: https://github.com/apache/arrow/pull/14505#discussion_r1019255519


##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -2078,6 +2078,293 @@ class ArrayRanker : public TypeVisitor {
   Datum* output_;
 };
 
+class ChunkedArrayRanker : public TypeVisitor {
+ public:
+  // TODO: here we accept order / null_placement / tiebreaker as separate arguments
+  // whereas the ArrayRanker accepts them as the RankOptions struct; this is consistent
+  // with ArraySorter / ChunkedArraySorter, so likely should refactor ArrayRanker
+  ChunkedArrayRanker(ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end,
+                     const ChunkedArray& chunked_array, const SortOrder order,
+                     const NullPlacement null_placement, const RankOptions::Tiebreaker tiebreaker, Datum* output)
+      : TypeVisitor(),
+        ctx_(ctx),
+        indices_begin_(indices_begin),
+        indices_end_(indices_end),
+        chunked_array_(chunked_array),
+        physical_type_(GetPhysicalType(chunked_array.type())),
+        physical_chunks_(GetPhysicalChunks(chunked_array_, physical_type_)),
+        order_(order),
+        null_placement_(null_placement),
+        tiebreaker_(tiebreaker),
+        output_(output) {}
+
+  Status Run() { return physical_type_->Accept(this); }
+
+#define VISIT(TYPE) \
+  Status Visit(const TYPE& type) { return RankInternal<TYPE>(); }
+
+  VISIT_SORTABLE_PHYSICAL_TYPES(VISIT)
+
+#undef VISIT
+
+  template <typename InType>
+  Status RankInternal() {
+    using GetView = GetViewType<InType>;
+    using T = typename GetViewType<InType>::T;
+    using ArrayType = typename TypeTraits<InType>::ArrayType;
+
+    const auto num_chunks = chunked_array_.num_chunks();
+    if (num_chunks == 0) {
+      return Status::OK();
+    }
+    const auto arrays = GetArrayPointers(physical_chunks_);
+
+    ArraySortOptions array_options(order_, null_placement_);
+
+    ARROW_ASSIGN_OR_RAISE(auto array_sorter, GetArraySorter(*physical_type_));
+
+    // See related ChunkedArraySort method for comments

Review Comment:
   It would be nice indeed. Or perhaps you can even call `ChunkedArraySort` directly?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] WillAyd commented on a diff in pull request #14505: ARROW-16707 [C++] Implement Rank kernel on chunked arrays

Posted by GitBox <gi...@apache.org>.
WillAyd commented on code in PR #14505:
URL: https://github.com/apache/arrow/pull/14505#discussion_r1033005185


##########
cpp/src/arrow/compute/kernels/vector_sort_test.cc:
##########
@@ -2182,5 +2185,146 @@ TEST(TestRankForFixedSizeBinary, RankFixedSizeBinary) {
       ArrayFromJSON(binary_type, R"(["aaa", "   ", "eee", null, "eee", null, "   "])"));
 }
 
+TEST(TestRankForReal, RankRealChunked) {
+  for (auto real_type : ::arrow::FloatingPointTypes()) {
+    for (auto null_placement : AllNullPlacements()) {
+      for (auto tiebreaker : AllTiebreakers()) {
+        for (auto order : AllOrders()) {
+          AssertRankEmpty(real_type, order, null_placement, tiebreaker);
+        }
+
+        AssertRankSimple(
+            ChunkedArrayFromJSON(real_type, {"[2.1, 3.2]", "[1.0, 0.0, 5.5]"}),
+            null_placement, tiebreaker);
+      }
+    }
+    AssertRankAllTiebreakers(
+        ChunkedArrayFromJSON(real_type, {"[1.2, 0.0]", "[5.3, null, 5.3, null, 0.0]"}));
+  }
+}
+
+TEST(TestRankForIntegral, RankIntegralChunked) {

Review Comment:
   I think `AssertRankEmpty` already does this but let me know if you had something else in mind



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] pitrou commented on pull request #14505: ARROW-16707 [C++] Implement Rank kernel on chunked arrays

Posted by GitBox <gi...@apache.org>.
pitrou commented on PR #14505:
URL: https://github.com/apache/arrow/pull/14505#issuecomment-1349287137

   Hoping to look at this soon - sorry for having forgotten about it.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] WillAyd commented on a diff in pull request #14505: ARROW-16707 [C++] Implement Rank kernel on chunked arrays

Posted by GitBox <gi...@apache.org>.
WillAyd commented on code in PR #14505:
URL: https://github.com/apache/arrow/pull/14505#discussion_r1033005300


##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -2078,6 +2078,261 @@ class ArrayRanker : public TypeVisitor {
   Datum* output_;
 };
 
+class ChunkedArrayRanker : public TypeVisitor {
+ public:
+  // TODO: here we accept order / null_placement / tiebreaker as separate arguments
+  // whereas the ArrayRanker accepts them as the RankOptions struct; this is consistent
+  // with ArraySorter / ChunkedArraySorter, so likely should refactor ArrayRanker
+  ChunkedArrayRanker(ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end,
+                     const ChunkedArray& chunked_array, const SortOrder order,
+                     const NullPlacement null_placement,
+                     const RankOptions::Tiebreaker tiebreaker, Datum* output)
+      : TypeVisitor(),
+        ctx_(ctx),
+        indices_begin_(indices_begin),
+        indices_end_(indices_end),
+        chunked_array_(chunked_array),
+        physical_type_(GetPhysicalType(chunked_array.type())),
+        physical_chunks_(GetPhysicalChunks(chunked_array_, physical_type_)),
+        order_(order),
+        null_placement_(null_placement),
+        tiebreaker_(tiebreaker),
+        output_(output) {}
+
+  Status Run() { return physical_type_->Accept(this); }
+
+#define VISIT(TYPE) \
+  Status Visit(const TYPE& type) { return RankInternal<TYPE>(); }
+
+  VISIT_SORTABLE_PHYSICAL_TYPES(VISIT)
+
+#undef VISIT
+
+  template <typename InType>
+  Status RankInternal() {
+    using T = typename GetViewType<InType>::T;
+    using ArrayType = typename TypeTraits<InType>::ArrayType;
+
+    const auto num_chunks = chunked_array_.num_chunks();
+    if (num_chunks == 0) {
+      return Status::OK();
+    }
+    const auto arrays = GetArrayPointers(physical_chunks_);
+
+    ArraySortOptions array_options(order_, null_placement_);
+
+    ARROW_ASSIGN_OR_RAISE(auto array_sorter, GetArraySorter(*physical_type_));
+
+    // See related ChunkedArraySort method for comments
+    std::vector<NullPartitionResult> sorted(num_chunks);
+    int64_t begin_offset = 0;
+    int64_t end_offset = 0;
+    int64_t null_count = 0;
+    for (int i = 0; i < num_chunks; ++i) {
+      const auto array = checked_cast<const ArrayType*>(arrays[i]);
+      end_offset += array->length();
+      null_count += array->null_count();
+      sorted[i] = array_sorter(indices_begin_ + begin_offset, indices_begin_ + end_offset,
+                               *array, begin_offset, array_options);
+      begin_offset = end_offset;
+    }
+    DCHECK_EQ(end_offset, indices_end_ - indices_begin_);
+
+    auto resolver = ChunkedArrayResolver(arrays);
+    if (sorted.size() > 1) {
+      auto merge_nulls = [&](uint64_t* nulls_begin, uint64_t* nulls_middle,
+                             uint64_t* nulls_end, uint64_t* temp_indices,
+                             int64_t null_count) {
+        if (has_null_like_values<typename ArrayType::TypeClass>::value) {
+          PartitionNullsOnly<StablePartitioner>(nulls_begin, nulls_end, resolver,
+                                                null_count, null_placement_);
+        }
+      };
+      auto merge_non_nulls = [&](uint64_t* range_begin, uint64_t* range_middle,
+                                 uint64_t* range_end, uint64_t* temp_indices) {
+        MergeNonNulls<ArrayType>(range_begin, range_middle, range_end, arrays,
+                                 temp_indices);
+      };
+
+      MergeImpl merge_impl{null_placement_, std::move(merge_nulls),
+                           std::move(merge_non_nulls)};
+      // std::merge is only called on non-null values, so size temp indices accordingly
+      RETURN_NOT_OK(merge_impl.Init(ctx_, indices_end_ - indices_begin_ - null_count));
+
+      while (sorted.size() > 1) {
+        auto out_it = sorted.begin();
+        auto it = sorted.begin();
+        while (it < sorted.end() - 1) {
+          const auto& left = *it++;
+          const auto& right = *it++;
+          DCHECK_EQ(left.overall_end(), right.overall_begin());
+          const auto merged = merge_impl.Merge(left, right, null_count);
+          *out_it++ = merged;
+        }
+        if (it < sorted.end()) {
+          *out_it++ = *it++;
+        }
+        sorted.erase(out_it, sorted.end());
+      }
+    }
+
+    DCHECK_EQ(sorted.size(), 1);
+    DCHECK_EQ(sorted[0].overall_begin(), indices_begin_);
+    DCHECK_EQ(sorted[0].overall_end(), indices_end_);
+    // Note that "nulls" can also include NaNs, hence the >= check
+    DCHECK_GE(sorted[0].null_count(), null_count);
+
+    auto length = indices_end_ - indices_begin_;

Review Comment:
   Tried doing a template callable but didn't seem to work well with the required capture clauses. Just ended up passing the function as an argument to the newly created `CreateRankings` function



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] github-actions[bot] commented on pull request #14505: ARROW-16707 [C++] Implement Rank kernel on chunked arrays

Posted by GitBox <gi...@apache.org>.
github-actions[bot] commented on PR #14505:
URL: https://github.com/apache/arrow/pull/14505#issuecomment-1291066800

   :warning: Ticket **has not been started in JIRA**, please click 'Start Progress'.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] github-actions[bot] commented on pull request #14505: ARROW-16707 [C++] Implement Rank kernel on chunked arrays

Posted by GitBox <gi...@apache.org>.
github-actions[bot] commented on PR #14505:
URL: https://github.com/apache/arrow/pull/14505#issuecomment-1291066765

   https://issues.apache.org/jira/browse/ARROW-16707


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] pitrou commented on a diff in pull request #14505: ARROW-16707 [C++] Implement Rank kernel on chunked arrays

Posted by GitBox <gi...@apache.org>.
pitrou commented on code in PR #14505:
URL: https://github.com/apache/arrow/pull/14505#discussion_r1019266032


##########
cpp/src/arrow/compute/kernels/vector_sort_test.cc:
##########
@@ -2182,5 +2185,146 @@ TEST(TestRankForFixedSizeBinary, RankFixedSizeBinary) {
       ArrayFromJSON(binary_type, R"(["aaa", "   ", "eee", null, "eee", null, "   "])"));
 }
 
+TEST(TestRankForReal, RankRealChunked) {
+  for (auto real_type : ::arrow::FloatingPointTypes()) {
+    for (auto null_placement : AllNullPlacements()) {
+      for (auto tiebreaker : AllTiebreakers()) {
+        for (auto order : AllOrders()) {
+          AssertRankEmpty(real_type, order, null_placement, tiebreaker);
+        }
+
+        AssertRankSimple(
+            ChunkedArrayFromJSON(real_type, {"[2.1, 3.2]", "[1.0, 0.0, 5.5]"}),
+            null_placement, tiebreaker);
+      }
+    }
+    AssertRankAllTiebreakers(
+        ChunkedArrayFromJSON(real_type, {"[1.2, 0.0]", "[5.3, null, 5.3, null, 0.0]"}));
+  }
+}
+
+TEST(TestRankForIntegral, RankIntegralChunked) {

Review Comment:
   Can you add some tests for zero chunks and/or empty chunks?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] WillAyd commented on a diff in pull request #14505: ARROW-16707 [C++] Implement Rank kernel on chunked arrays

Posted by GitBox <gi...@apache.org>.
WillAyd commented on code in PR #14505:
URL: https://github.com/apache/arrow/pull/14505#discussion_r1019285497


##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -2078,6 +2078,293 @@ class ArrayRanker : public TypeVisitor {
   Datum* output_;
 };
 
+class ChunkedArrayRanker : public TypeVisitor {
+ public:
+  // TODO: here we accept order / null_placement / tiebreaker as separate arguments
+  // whereas the ArrayRanker accepts them as the RankOptions struct; this is consistent
+  // with ArraySorter / ChunkedArraySorter, so likely should refactor ArrayRanker
+  ChunkedArrayRanker(ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end,
+                     const ChunkedArray& chunked_array, const SortOrder order,
+                     const NullPlacement null_placement, const RankOptions::Tiebreaker tiebreaker, Datum* output)
+      : TypeVisitor(),
+        ctx_(ctx),
+        indices_begin_(indices_begin),
+        indices_end_(indices_end),
+        chunked_array_(chunked_array),
+        physical_type_(GetPhysicalType(chunked_array.type())),
+        physical_chunks_(GetPhysicalChunks(chunked_array_, physical_type_)),
+        order_(order),
+        null_placement_(null_placement),
+        tiebreaker_(tiebreaker),
+        output_(output) {}
+
+  Status Run() { return physical_type_->Accept(this); }
+
+#define VISIT(TYPE) \
+  Status Visit(const TYPE& type) { return RankInternal<TYPE>(); }
+
+  VISIT_SORTABLE_PHYSICAL_TYPES(VISIT)
+
+#undef VISIT
+
+  template <typename InType>
+  Status RankInternal() {
+    using GetView = GetViewType<InType>;
+    using T = typename GetViewType<InType>::T;
+    using ArrayType = typename TypeTraits<InType>::ArrayType;
+
+    const auto num_chunks = chunked_array_.num_chunks();
+    if (num_chunks == 0) {
+      return Status::OK();
+    }
+    const auto arrays = GetArrayPointers(physical_chunks_);
+
+    ArraySortOptions array_options(order_, null_placement_);
+
+    ARROW_ASSIGN_OR_RAISE(auto array_sorter, GetArraySorter(*physical_type_));
+
+    // See related ChunkedArraySort method for comments

Review Comment:
   Yea was unsure how to best structure that. I could call ChunkedArraySort directly but then lose access to the NullPartitionResults which the ranking mechanism uses to assign ranks



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] WillAyd commented on pull request #14505: ARROW-16707 [C++] Implement Rank kernel on chunked arrays

Posted by GitBox <gi...@apache.org>.
WillAyd commented on PR #14505:
URL: https://github.com/apache/arrow/pull/14505#issuecomment-1349597723

   No problem at all - I'll be on vacation the next 5 days anyway so no rush from my end
   
   I'm still not very happy with the copy / paste of the ChunkedArraySort and merge code. I have been concurrently trying a few things with the `ArraySort` implementation to make that more generalizable just haven't found the right solution yet


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] pitrou commented on a diff in pull request #14505: ARROW-16707 [C++] Implement Rank kernel on chunked arrays

Posted by GitBox <gi...@apache.org>.
pitrou commented on code in PR #14505:
URL: https://github.com/apache/arrow/pull/14505#discussion_r1071313595


##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -2078,6 +2078,293 @@ class ArrayRanker : public TypeVisitor {
   Datum* output_;
 };
 
+class ChunkedArrayRanker : public TypeVisitor {
+ public:
+  // TODO: here we accept order / null_placement / tiebreaker as separate arguments
+  // whereas the ArrayRanker accepts them as the RankOptions struct; this is consistent
+  // with ArraySorter / ChunkedArraySorter, so likely should refactor ArrayRanker
+  ChunkedArrayRanker(ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end,
+                     const ChunkedArray& chunked_array, const SortOrder order,
+                     const NullPlacement null_placement, const RankOptions::Tiebreaker tiebreaker, Datum* output)
+      : TypeVisitor(),
+        ctx_(ctx),
+        indices_begin_(indices_begin),
+        indices_end_(indices_end),
+        chunked_array_(chunked_array),
+        physical_type_(GetPhysicalType(chunked_array.type())),
+        physical_chunks_(GetPhysicalChunks(chunked_array_, physical_type_)),
+        order_(order),
+        null_placement_(null_placement),
+        tiebreaker_(tiebreaker),
+        output_(output) {}
+
+  Status Run() { return physical_type_->Accept(this); }
+
+#define VISIT(TYPE) \
+  Status Visit(const TYPE& type) { return RankInternal<TYPE>(); }
+
+  VISIT_SORTABLE_PHYSICAL_TYPES(VISIT)
+
+#undef VISIT
+
+  template <typename InType>
+  Status RankInternal() {
+    using GetView = GetViewType<InType>;
+    using T = typename GetViewType<InType>::T;
+    using ArrayType = typename TypeTraits<InType>::ArrayType;
+
+    const auto num_chunks = chunked_array_.num_chunks();
+    if (num_chunks == 0) {
+      return Status::OK();
+    }
+    const auto arrays = GetArrayPointers(physical_chunks_);
+
+    ArraySortOptions array_options(order_, null_placement_);
+
+    ARROW_ASSIGN_OR_RAISE(auto array_sorter, GetArraySorter(*physical_type_));
+
+    // See related ChunkedArraySort method for comments

Review Comment:
   Sorry for the delay. Perhaps you can simply turn:
   ```c++
   Status SortChunkedArray(ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end,
                           const ChunkedArray& values, SortOrder sort_order,
                           NullPlacement null_placement);
   ```
   into:
   ```c++
   Result<NullPartitionResult> SortChunkedArray(ExecContext* ctx,
                           uint64_t* indices_begin, uint64_t* indices_end,
                           const ChunkedArray& values, SortOrder sort_order,
                           NullPlacement null_placement);
   ```
   ?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org