You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/05/19 10:43:58 UTC

[GitHub] [arrow] pitrou commented on a diff in pull request #12963: ARROW-16234: [C++] Vector Kernel for Rank

pitrou commented on code in PR #12963:
URL: https://github.com/apache/arrow/pull/12963#discussion_r876866655


##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -1909,6 +1909,168 @@ class SelectKUnstableMetaFunction : public MetaFunction {
   }
 };
 
+// ----------------------------------------------------------------------
+// Rank implementation
+
+const RankOptions* GetDefaultRankOptions() {
+  static const auto kDefaultRankOptions = RankOptions::Defaults();
+  return &kDefaultRankOptions;
+}
+
+class ArrayRanker : public TypeVisitor {
+ public:
+  ArrayRanker(ExecContext* ctx, const Array& array, const RankOptions& options,
+              Datum* output)
+      : TypeVisitor(),
+        ctx_(ctx),
+        array_(array),
+        order_(options.order),
+        null_placement_(options.null_placement),
+        tiebreaker_(options.tiebreaker),
+        physical_type_(GetPhysicalType(array.type())),
+        output_(output) {}
+
+  Status Run() { return physical_type_->Accept(this); }
+
+#define VISIT(TYPE) \
+  Status Visit(const TYPE& type) { return RankInternal<TYPE>(); }
+
+  VISIT_SORTABLE_PHYSICAL_TYPES(VISIT)
+
+#undef VISIT
+
+  template <typename InType>
+  Status RankInternal() {
+    using GetView = GetViewType<InType>;
+    using T = typename GetViewType<InType>::T;
+    using ArrayType = typename TypeTraits<InType>::ArrayType;
+
+    ArrayType arr(array_.data());
+    ArraySortOptions array_options(order_, null_placement_);
+
+    auto length = array_.length();
+    ARROW_ASSIGN_OR_RAISE(auto sort_indices,
+                          MakeMutableUInt64Array(uint64(), length, ctx_->memory_pool()));
+    auto sort_begin = sort_indices->GetMutableValues<uint64_t>(1);
+    auto sort_end = sort_begin + length;
+    std::iota(sort_begin, sort_end, 0);
+
+    ARROW_ASSIGN_OR_RAISE(auto array_sorter, GetArraySorter(*physical_type_));
+
+    NullPartitionResult sorted =
+        array_sorter(sort_begin, sort_end, arr, 0, array_options);
+    uint64_t rank = 0;
+
+    ARROW_ASSIGN_OR_RAISE(auto rankings,
+                          MakeMutableUInt64Array(uint64(), length, ctx_->memory_pool()));
+    auto out_begin = rankings->GetMutableValues<uint64_t>(1);
+
+    switch (tiebreaker_) {
+      case RankOptions::Dense: {
+        T currValue, prevValue;
+        for (auto it = sorted.overall_begin(); it < sorted.overall_end(); it++) {
+          currValue = GetView::LogicalValue(arr.GetView(*it));

Review Comment:
   Null entries can have arbitrary physical values behind them, so we should definitely not take them into account. This means you must write two separate loops: one on null entries, one on non-null entries. (it will also make the loop themselves simpler than this one)



##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -1909,6 +1909,168 @@ class SelectKUnstableMetaFunction : public MetaFunction {
   }
 };
 
+// ----------------------------------------------------------------------
+// Rank implementation
+
+const RankOptions* GetDefaultRankOptions() {
+  static const auto kDefaultRankOptions = RankOptions::Defaults();
+  return &kDefaultRankOptions;
+}
+
+class ArrayRanker : public TypeVisitor {
+ public:
+  ArrayRanker(ExecContext* ctx, const Array& array, const RankOptions& options,
+              Datum* output)
+      : TypeVisitor(),
+        ctx_(ctx),
+        array_(array),
+        order_(options.order),
+        null_placement_(options.null_placement),
+        tiebreaker_(options.tiebreaker),
+        physical_type_(GetPhysicalType(array.type())),
+        output_(output) {}
+
+  Status Run() { return physical_type_->Accept(this); }
+
+#define VISIT(TYPE) \
+  Status Visit(const TYPE& type) { return RankInternal<TYPE>(); }
+
+  VISIT_SORTABLE_PHYSICAL_TYPES(VISIT)
+
+#undef VISIT
+
+  template <typename InType>
+  Status RankInternal() {
+    using GetView = GetViewType<InType>;
+    using T = typename GetViewType<InType>::T;
+    using ArrayType = typename TypeTraits<InType>::ArrayType;
+
+    ArrayType arr(array_.data());
+    ArraySortOptions array_options(order_, null_placement_);
+
+    auto length = array_.length();
+    ARROW_ASSIGN_OR_RAISE(auto sort_indices,
+                          MakeMutableUInt64Array(uint64(), length, ctx_->memory_pool()));
+    auto sort_begin = sort_indices->GetMutableValues<uint64_t>(1);
+    auto sort_end = sort_begin + length;
+    std::iota(sort_begin, sort_end, 0);
+
+    ARROW_ASSIGN_OR_RAISE(auto array_sorter, GetArraySorter(*physical_type_));
+
+    NullPartitionResult sorted =
+        array_sorter(sort_begin, sort_end, arr, 0, array_options);
+    uint64_t rank = 0;
+
+    ARROW_ASSIGN_OR_RAISE(auto rankings,
+                          MakeMutableUInt64Array(uint64(), length, ctx_->memory_pool()));
+    auto out_begin = rankings->GetMutableValues<uint64_t>(1);
+
+    switch (tiebreaker_) {
+      case RankOptions::Dense: {
+        T currValue, prevValue;
+        for (auto it = sorted.overall_begin(); it < sorted.overall_end(); it++) {
+          currValue = GetView::LogicalValue(arr.GetView(*it));
+          if (rank == 0 || it == sorted.nulls_begin || it == sorted.nulls_end ||
+              currValue != prevValue) {
+            rank++;
+          }
+
+          out_begin[*it] = rank;
+          prevValue = currValue;
+        }
+        break;
+      }
+      case RankOptions::First: {
+        for (auto it = sorted.overall_begin(); it < sorted.overall_end(); it++) {
+          out_begin[*it] = ++rank;
+        }
+        break;
+      }
+      case RankOptions::Min: {
+        T currValue, prevValue;
+        for (auto it = sorted.overall_begin(); it < sorted.overall_end(); it++) {
+          currValue = GetView::LogicalValue(arr.GetView(*it));
+          if (rank == 0 || it == sorted.nulls_begin || it == sorted.nulls_end ||
+              currValue != prevValue) {
+            rank = (it - sorted.overall_begin()) + 1;
+          }
+          out_begin[*it] = rank;
+          prevValue = currValue;
+        }
+        break;
+      }
+      case RankOptions::Max: {
+        T currValue, prevValue;
+        auto rank = sorted.overall_end() - sorted.overall_begin();
+        for (auto it = sorted.overall_end() - 1; it >= sorted.overall_begin(); it--) {
+          currValue = GetView::LogicalValue(arr.GetView(*it));
+          if ((it < sorted.overall_end() - 1 && (currValue != prevValue)) ||
+              it == sorted.nulls_begin - 1 || it == sorted.nulls_end - 1) {
+            rank = it - sorted.overall_begin() + 1;
+          }
+          out_begin[*it] = rank;
+          prevValue = currValue;
+        }
+        break;
+      }
+    }
+
+    *output_ = Datum(rankings);
+    return Status::OK();
+  }
+
+  ExecContext* ctx_;
+  const Array& array_;
+  SortOrder order_;
+  NullPlacement null_placement_;
+  RankOptions::Tiebreaker tiebreaker_;

Review Comment:
   These can be made `const` as well.



##########
cpp/src/arrow/compute/api_vector.h:
##########
@@ -174,6 +174,25 @@ class ARROW_EXPORT SelectKOptions : public FunctionOptions {
   std::vector<SortKey> sort_keys;
 };
 
+/// \brief Rank options
+class ARROW_EXPORT RankOptions : public FunctionOptions {
+ public:
+  enum Tiebreaker { Min, Max, First, Dense };
+
+  explicit RankOptions(SortOrder order = SortOrder::Ascending,
+                       NullPlacement null_placement = NullPlacement::AtEnd,
+                       Tiebreaker = RankOptions::First);
+  static constexpr char const kTypeName[] = "RankOptions";
+  static RankOptions Defaults() { return RankOptions(); }
+
+  /// Sorting order
+  SortOrder order;

Review Comment:
   Do we want to be forward-looking? If at some point we want to implement multi-column ranking just like we have multi-column sorting, we may want this to be `std::vector<SortKey> sort_keys` instead.
   
   We can still keep the current constructor which would initialize a single sort key with an empty field name, since that's sufficient for arrays.
   



##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -1909,6 +1909,168 @@ class SelectKUnstableMetaFunction : public MetaFunction {
   }
 };
 
+// ----------------------------------------------------------------------
+// Rank implementation
+
+const RankOptions* GetDefaultRankOptions() {
+  static const auto kDefaultRankOptions = RankOptions::Defaults();
+  return &kDefaultRankOptions;
+}
+
+class ArrayRanker : public TypeVisitor {
+ public:
+  ArrayRanker(ExecContext* ctx, const Array& array, const RankOptions& options,
+              Datum* output)
+      : TypeVisitor(),
+        ctx_(ctx),
+        array_(array),
+        order_(options.order),
+        null_placement_(options.null_placement),
+        tiebreaker_(options.tiebreaker),
+        physical_type_(GetPhysicalType(array.type())),
+        output_(output) {}
+
+  Status Run() { return physical_type_->Accept(this); }
+
+#define VISIT(TYPE) \
+  Status Visit(const TYPE& type) { return RankInternal<TYPE>(); }
+
+  VISIT_SORTABLE_PHYSICAL_TYPES(VISIT)
+
+#undef VISIT
+
+  template <typename InType>
+  Status RankInternal() {
+    using GetView = GetViewType<InType>;
+    using T = typename GetViewType<InType>::T;
+    using ArrayType = typename TypeTraits<InType>::ArrayType;
+
+    ArrayType arr(array_.data());
+    ArraySortOptions array_options(order_, null_placement_);
+
+    auto length = array_.length();
+    ARROW_ASSIGN_OR_RAISE(auto sort_indices,
+                          MakeMutableUInt64Array(uint64(), length, ctx_->memory_pool()));
+    auto sort_begin = sort_indices->GetMutableValues<uint64_t>(1);
+    auto sort_end = sort_begin + length;
+    std::iota(sort_begin, sort_end, 0);
+
+    ARROW_ASSIGN_OR_RAISE(auto array_sorter, GetArraySorter(*physical_type_));
+
+    NullPartitionResult sorted =
+        array_sorter(sort_begin, sort_end, arr, 0, array_options);
+    uint64_t rank = 0;
+
+    ARROW_ASSIGN_OR_RAISE(auto rankings,
+                          MakeMutableUInt64Array(uint64(), length, ctx_->memory_pool()));
+    auto out_begin = rankings->GetMutableValues<uint64_t>(1);
+
+    switch (tiebreaker_) {
+      case RankOptions::Dense: {
+        T currValue, prevValue;
+        for (auto it = sorted.overall_begin(); it < sorted.overall_end(); it++) {
+          currValue = GetView::LogicalValue(arr.GetView(*it));
+          if (rank == 0 || it == sorted.nulls_begin || it == sorted.nulls_end ||
+              currValue != prevValue) {
+            rank++;
+          }
+
+          out_begin[*it] = rank;
+          prevValue = currValue;
+        }
+        break;
+      }
+      case RankOptions::First: {
+        for (auto it = sorted.overall_begin(); it < sorted.overall_end(); it++) {
+          out_begin[*it] = ++rank;
+        }
+        break;
+      }
+      case RankOptions::Min: {
+        T currValue, prevValue;
+        for (auto it = sorted.overall_begin(); it < sorted.overall_end(); it++) {
+          currValue = GetView::LogicalValue(arr.GetView(*it));
+          if (rank == 0 || it == sorted.nulls_begin || it == sorted.nulls_end ||
+              currValue != prevValue) {
+            rank = (it - sorted.overall_begin()) + 1;
+          }
+          out_begin[*it] = rank;
+          prevValue = currValue;
+        }
+        break;
+      }
+      case RankOptions::Max: {
+        T currValue, prevValue;
+        auto rank = sorted.overall_end() - sorted.overall_begin();
+        for (auto it = sorted.overall_end() - 1; it >= sorted.overall_begin(); it--) {
+          currValue = GetView::LogicalValue(arr.GetView(*it));
+          if ((it < sorted.overall_end() - 1 && (currValue != prevValue)) ||
+              it == sorted.nulls_begin - 1 || it == sorted.nulls_end - 1) {
+            rank = it - sorted.overall_begin() + 1;
+          }
+          out_begin[*it] = rank;
+          prevValue = currValue;
+        }
+        break;
+      }
+    }
+
+    *output_ = Datum(rankings);
+    return Status::OK();
+  }
+
+  ExecContext* ctx_;
+  const Array& array_;
+  SortOrder order_;
+  NullPlacement null_placement_;
+  RankOptions::Tiebreaker tiebreaker_;
+  const std::shared_ptr<DataType> physical_type_;
+  Datum* output_;
+};
+
+const FunctionDoc rank_doc(
+    "Returns the ranking of an array",

Review Comment:
   Just a suggestion for improvement, feel free to disregard.
   ```suggestion
       "Compute numerical ranks of an array (1-based)",
   ```



##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -1909,6 +1909,168 @@ class SelectKUnstableMetaFunction : public MetaFunction {
   }
 };
 
+// ----------------------------------------------------------------------
+// Rank implementation
+
+const RankOptions* GetDefaultRankOptions() {
+  static const auto kDefaultRankOptions = RankOptions::Defaults();
+  return &kDefaultRankOptions;
+}
+
+class ArrayRanker : public TypeVisitor {
+ public:
+  ArrayRanker(ExecContext* ctx, const Array& array, const RankOptions& options,
+              Datum* output)
+      : TypeVisitor(),
+        ctx_(ctx),
+        array_(array),
+        order_(options.order),
+        null_placement_(options.null_placement),
+        tiebreaker_(options.tiebreaker),
+        physical_type_(GetPhysicalType(array.type())),
+        output_(output) {}
+
+  Status Run() { return physical_type_->Accept(this); }
+
+#define VISIT(TYPE) \
+  Status Visit(const TYPE& type) { return RankInternal<TYPE>(); }
+
+  VISIT_SORTABLE_PHYSICAL_TYPES(VISIT)
+
+#undef VISIT
+
+  template <typename InType>
+  Status RankInternal() {
+    using GetView = GetViewType<InType>;
+    using T = typename GetViewType<InType>::T;
+    using ArrayType = typename TypeTraits<InType>::ArrayType;
+
+    ArrayType arr(array_.data());
+    ArraySortOptions array_options(order_, null_placement_);
+
+    auto length = array_.length();
+    ARROW_ASSIGN_OR_RAISE(auto sort_indices,
+                          MakeMutableUInt64Array(uint64(), length, ctx_->memory_pool()));
+    auto sort_begin = sort_indices->GetMutableValues<uint64_t>(1);
+    auto sort_end = sort_begin + length;
+    std::iota(sort_begin, sort_end, 0);
+
+    ARROW_ASSIGN_OR_RAISE(auto array_sorter, GetArraySorter(*physical_type_));
+
+    NullPartitionResult sorted =
+        array_sorter(sort_begin, sort_end, arr, 0, array_options);
+    uint64_t rank = 0;
+
+    ARROW_ASSIGN_OR_RAISE(auto rankings,
+                          MakeMutableUInt64Array(uint64(), length, ctx_->memory_pool()));
+    auto out_begin = rankings->GetMutableValues<uint64_t>(1);
+
+    switch (tiebreaker_) {
+      case RankOptions::Dense: {
+        T currValue, prevValue;

Review Comment:
   Styling: we use lowercase for local variables, so `curr_value` and `prev_value` for example.



##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -1909,6 +1909,168 @@ class SelectKUnstableMetaFunction : public MetaFunction {
   }
 };
 
+// ----------------------------------------------------------------------
+// Rank implementation
+
+const RankOptions* GetDefaultRankOptions() {
+  static const auto kDefaultRankOptions = RankOptions::Defaults();
+  return &kDefaultRankOptions;
+}
+
+class ArrayRanker : public TypeVisitor {
+ public:
+  ArrayRanker(ExecContext* ctx, const Array& array, const RankOptions& options,
+              Datum* output)
+      : TypeVisitor(),
+        ctx_(ctx),
+        array_(array),
+        order_(options.order),
+        null_placement_(options.null_placement),
+        tiebreaker_(options.tiebreaker),
+        physical_type_(GetPhysicalType(array.type())),
+        output_(output) {}
+
+  Status Run() { return physical_type_->Accept(this); }
+
+#define VISIT(TYPE) \
+  Status Visit(const TYPE& type) { return RankInternal<TYPE>(); }
+
+  VISIT_SORTABLE_PHYSICAL_TYPES(VISIT)
+
+#undef VISIT
+
+  template <typename InType>
+  Status RankInternal() {
+    using GetView = GetViewType<InType>;
+    using T = typename GetViewType<InType>::T;
+    using ArrayType = typename TypeTraits<InType>::ArrayType;
+
+    ArrayType arr(array_.data());
+    ArraySortOptions array_options(order_, null_placement_);
+
+    auto length = array_.length();
+    ARROW_ASSIGN_OR_RAISE(auto sort_indices,
+                          MakeMutableUInt64Array(uint64(), length, ctx_->memory_pool()));
+    auto sort_begin = sort_indices->GetMutableValues<uint64_t>(1);
+    auto sort_end = sort_begin + length;
+    std::iota(sort_begin, sort_end, 0);
+
+    ARROW_ASSIGN_OR_RAISE(auto array_sorter, GetArraySorter(*physical_type_));
+
+    NullPartitionResult sorted =
+        array_sorter(sort_begin, sort_end, arr, 0, array_options);
+    uint64_t rank = 0;
+
+    ARROW_ASSIGN_OR_RAISE(auto rankings,
+                          MakeMutableUInt64Array(uint64(), length, ctx_->memory_pool()));
+    auto out_begin = rankings->GetMutableValues<uint64_t>(1);
+
+    switch (tiebreaker_) {
+      case RankOptions::Dense: {
+        T currValue, prevValue;
+        for (auto it = sorted.overall_begin(); it < sorted.overall_end(); it++) {
+          currValue = GetView::LogicalValue(arr.GetView(*it));
+          if (rank == 0 || it == sorted.nulls_begin || it == sorted.nulls_end ||
+              currValue != prevValue) {
+            rank++;
+          }
+
+          out_begin[*it] = rank;
+          prevValue = currValue;
+        }
+        break;
+      }
+      case RankOptions::First: {
+        for (auto it = sorted.overall_begin(); it < sorted.overall_end(); it++) {
+          out_begin[*it] = ++rank;
+        }
+        break;
+      }
+      case RankOptions::Min: {
+        T currValue, prevValue;
+        for (auto it = sorted.overall_begin(); it < sorted.overall_end(); it++) {
+          currValue = GetView::LogicalValue(arr.GetView(*it));
+          if (rank == 0 || it == sorted.nulls_begin || it == sorted.nulls_end ||
+              currValue != prevValue) {
+            rank = (it - sorted.overall_begin()) + 1;
+          }
+          out_begin[*it] = rank;
+          prevValue = currValue;
+        }
+        break;
+      }
+      case RankOptions::Max: {
+        T currValue, prevValue;
+        auto rank = sorted.overall_end() - sorted.overall_begin();
+        for (auto it = sorted.overall_end() - 1; it >= sorted.overall_begin(); it--) {
+          currValue = GetView::LogicalValue(arr.GetView(*it));
+          if ((it < sorted.overall_end() - 1 && (currValue != prevValue)) ||
+              it == sorted.nulls_begin - 1 || it == sorted.nulls_end - 1) {
+            rank = it - sorted.overall_begin() + 1;
+          }
+          out_begin[*it] = rank;
+          prevValue = currValue;
+        }
+        break;
+      }
+    }
+
+    *output_ = Datum(rankings);
+    return Status::OK();
+  }
+
+  ExecContext* ctx_;
+  const Array& array_;
+  SortOrder order_;
+  NullPlacement null_placement_;
+  RankOptions::Tiebreaker tiebreaker_;
+  const std::shared_ptr<DataType> physical_type_;
+  Datum* output_;
+};
+
+const FunctionDoc rank_doc(
+    "Returns the ranking of an array",
+    ("This function computes a rank of the input array.\n"
+     "By default, null values are considered greater than any other value and\n"
+     "are therefore sorted at the end of the input. For floating-point types,\n"
+     "NaNs are considered greater than any other non-null value, but smaller\n"
+     "than null values. The default tiebreaker is to assign ranks in order of\n"
+     "when ties appear in the input\n"

Review Comment:
   Nit: add stop at end of sentence.
   ```suggestion
        "than null values. The default tiebreaker is to assign ranks in order of\n"
        "when ties appear in the input.\n"
   ```



##########
cpp/src/arrow/compute/kernels/vector_sort_test.cc:
##########
@@ -1936,6 +1940,399 @@ TEST_P(TestTableSortIndicesRandom, Sort) {
   }
 }
 
+// ----------------------------------------------------------------------
+// Tests for Rank
+
+template <typename T>
+void AssertRank(const std::shared_ptr<T>& input, SortOrder order,
+                NullPlacement null_placement, RankOptions::Tiebreaker tiebreaker,
+                const std::shared_ptr<Array>& expected) {
+  RankOptions options(order, null_placement, tiebreaker);
+  ASSERT_OK_AND_ASSIGN(auto actual, CallFunction("rank", {input}, &options));
+  AssertDatumsEqual(expected, actual, /*verbose=*/true);
+}
+
+void AssertRank(const std::shared_ptr<DataType>& type, const std::string& values,
+                SortOrder order, NullPlacement null_placement,
+                RankOptions::Tiebreaker tiebreaker, const std::string& expected) {
+  AssertRank(ArrayFromJSON(type, values), order, null_placement, tiebreaker,
+             ArrayFromJSON(uint64(), expected));
+}
+
+class TestRankBase : public ::testing::Test {
+ public:
+  virtual std::shared_ptr<DataType> type() = 0;
+
+  virtual void AssertRank(const std::string& values, SortOrder order,
+                          NullPlacement null_placement,
+                          RankOptions::Tiebreaker tiebreaker,
+                          const std::string& expected) {
+    arrow::compute::AssertRank(this->type(), values, order, null_placement, tiebreaker,
+                               expected);
+  }
+};
+
+template <typename ArrowType>
+class TestRank : public TestRankBase {
+ public:
+  std::shared_ptr<DataType> type() override {
+    // Will choose default parameters for temporal types
+    return std::make_shared<ArrowType>();
+  }
+};
+
+template <typename ArrowType>
+class TestRankForReal : public TestRank<ArrowType> {};
+TYPED_TEST_SUITE(TestRankForReal, RealArrowTypes);
+
+template <typename ArrowType>
+class TestRankForBool : public TestRank<ArrowType> {};
+TYPED_TEST_SUITE(TestRankForBool, ::testing::Types<BooleanType>);
+
+template <typename ArrowType>
+class TestRankForIntegral : public TestRank<ArrowType> {};
+TYPED_TEST_SUITE(TestRankForIntegral, IntegralArrowTypes);
+
+template <typename ArrowType>
+class TestRankForTemporal : public TestRank<ArrowType> {};
+TYPED_TEST_SUITE(TestRankForTemporal, TemporalArrowTypes);
+
+template <typename ArrowType>
+class TestRankForStrings : public TestRank<ArrowType> {};
+TYPED_TEST_SUITE(TestRankForStrings, StringSortTestTypes);
+
+class TestRankForFixedSizeBinary : public TestRankBase {
+ public:
+  std::shared_ptr<DataType> type() override { return fixed_size_binary(3); }
+};
+
+TYPED_TEST(TestRankForReal, RankReal) {

Review Comment:
   I know we use type-parameterized tests a lot, but we should refrain from adding too many of them especially when the actual test code doesn't vary (the type parameter appears nowhere below AFAICT), because the compilation time for tests is not trivial.
   
   The nice thing with JSON-specified test values is that the same code can be run just with a different `shared_ptr<DataType>` argument (and possibly a different JSON string, if we want to get sophisticated).
   
   So there are two possible alternatives:
   * use value-parameterized tests, with the various `shared_ptr<DataType>` instances as parameter
   * use a simple for loop on supported type instances
   



##########
cpp/src/arrow/compute/api_vector.h:
##########
@@ -174,6 +174,25 @@ class ARROW_EXPORT SelectKOptions : public FunctionOptions {
   std::vector<SortKey> sort_keys;
 };
 
+/// \brief Rank options
+class ARROW_EXPORT RankOptions : public FunctionOptions {
+ public:
+  enum Tiebreaker { Min, Max, First, Dense };
+
+  explicit RankOptions(SortOrder order = SortOrder::Ascending,
+                       NullPlacement null_placement = NullPlacement::AtEnd,
+                       Tiebreaker = RankOptions::First);
+  static constexpr char const kTypeName[] = "RankOptions";
+  static RankOptions Defaults() { return RankOptions(); }
+
+  /// Sorting order
+  SortOrder order;

Review Comment:
   @lidavidm @kou What is your opinion on this?



##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -1909,6 +1909,168 @@ class SelectKUnstableMetaFunction : public MetaFunction {
   }
 };
 
+// ----------------------------------------------------------------------
+// Rank implementation
+
+const RankOptions* GetDefaultRankOptions() {
+  static const auto kDefaultRankOptions = RankOptions::Defaults();
+  return &kDefaultRankOptions;
+}
+
+class ArrayRanker : public TypeVisitor {
+ public:
+  ArrayRanker(ExecContext* ctx, const Array& array, const RankOptions& options,
+              Datum* output)
+      : TypeVisitor(),
+        ctx_(ctx),
+        array_(array),
+        order_(options.order),
+        null_placement_(options.null_placement),
+        tiebreaker_(options.tiebreaker),
+        physical_type_(GetPhysicalType(array.type())),
+        output_(output) {}
+
+  Status Run() { return physical_type_->Accept(this); }
+
+#define VISIT(TYPE) \
+  Status Visit(const TYPE& type) { return RankInternal<TYPE>(); }
+
+  VISIT_SORTABLE_PHYSICAL_TYPES(VISIT)
+
+#undef VISIT
+
+  template <typename InType>
+  Status RankInternal() {
+    using GetView = GetViewType<InType>;
+    using T = typename GetViewType<InType>::T;
+    using ArrayType = typename TypeTraits<InType>::ArrayType;
+
+    ArrayType arr(array_.data());
+    ArraySortOptions array_options(order_, null_placement_);
+
+    auto length = array_.length();
+    ARROW_ASSIGN_OR_RAISE(auto sort_indices,
+                          MakeMutableUInt64Array(uint64(), length, ctx_->memory_pool()));
+    auto sort_begin = sort_indices->GetMutableValues<uint64_t>(1);
+    auto sort_end = sort_begin + length;
+    std::iota(sort_begin, sort_end, 0);
+
+    ARROW_ASSIGN_OR_RAISE(auto array_sorter, GetArraySorter(*physical_type_));
+
+    NullPartitionResult sorted =
+        array_sorter(sort_begin, sort_end, arr, 0, array_options);
+    uint64_t rank = 0;
+
+    ARROW_ASSIGN_OR_RAISE(auto rankings,
+                          MakeMutableUInt64Array(uint64(), length, ctx_->memory_pool()));
+    auto out_begin = rankings->GetMutableValues<uint64_t>(1);
+
+    switch (tiebreaker_) {
+      case RankOptions::Dense: {
+        T currValue, prevValue;
+        for (auto it = sorted.overall_begin(); it < sorted.overall_end(); it++) {
+          currValue = GetView::LogicalValue(arr.GetView(*it));

Review Comment:
   Note you may also have to differentiate between `NullPlacement::AtStart` and `NullPlacement::AtEnd`.



##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -1909,6 +1909,168 @@ class SelectKUnstableMetaFunction : public MetaFunction {
   }
 };
 
+// ----------------------------------------------------------------------
+// Rank implementation
+
+const RankOptions* GetDefaultRankOptions() {
+  static const auto kDefaultRankOptions = RankOptions::Defaults();
+  return &kDefaultRankOptions;
+}
+
+class ArrayRanker : public TypeVisitor {
+ public:
+  ArrayRanker(ExecContext* ctx, const Array& array, const RankOptions& options,
+              Datum* output)
+      : TypeVisitor(),
+        ctx_(ctx),
+        array_(array),
+        order_(options.order),
+        null_placement_(options.null_placement),
+        tiebreaker_(options.tiebreaker),
+        physical_type_(GetPhysicalType(array.type())),
+        output_(output) {}
+
+  Status Run() { return physical_type_->Accept(this); }
+
+#define VISIT(TYPE) \
+  Status Visit(const TYPE& type) { return RankInternal<TYPE>(); }
+
+  VISIT_SORTABLE_PHYSICAL_TYPES(VISIT)
+
+#undef VISIT
+
+  template <typename InType>
+  Status RankInternal() {
+    using GetView = GetViewType<InType>;
+    using T = typename GetViewType<InType>::T;
+    using ArrayType = typename TypeTraits<InType>::ArrayType;
+
+    ArrayType arr(array_.data());
+    ArraySortOptions array_options(order_, null_placement_);
+
+    auto length = array_.length();
+    ARROW_ASSIGN_OR_RAISE(auto sort_indices,
+                          MakeMutableUInt64Array(uint64(), length, ctx_->memory_pool()));
+    auto sort_begin = sort_indices->GetMutableValues<uint64_t>(1);
+    auto sort_end = sort_begin + length;
+    std::iota(sort_begin, sort_end, 0);
+
+    ARROW_ASSIGN_OR_RAISE(auto array_sorter, GetArraySorter(*physical_type_));
+
+    NullPartitionResult sorted =
+        array_sorter(sort_begin, sort_end, arr, 0, array_options);
+    uint64_t rank = 0;
+
+    ARROW_ASSIGN_OR_RAISE(auto rankings,
+                          MakeMutableUInt64Array(uint64(), length, ctx_->memory_pool()));
+    auto out_begin = rankings->GetMutableValues<uint64_t>(1);
+
+    switch (tiebreaker_) {
+      case RankOptions::Dense: {
+        T currValue, prevValue;
+        for (auto it = sorted.overall_begin(); it < sorted.overall_end(); it++) {
+          currValue = GetView::LogicalValue(arr.GetView(*it));

Review Comment:
   (these comments need not apply to `RankOptions::First` obviously)



##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -1909,6 +1909,120 @@ class SelectKUnstableMetaFunction : public MetaFunction {
   }
 };
 
+// ----------------------------------------------------------------------
+// Rank implementation
+
+const RankOptions* GetDefaultRankOptions() {
+  static const auto kDefaultRankOptions = RankOptions::Defaults();
+  return &kDefaultRankOptions;
+}

Review Comment:
   Yes, I don't think a static member would make this easier.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org