You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2022/06/01 13:21:43 UTC
[arrow] branch master updated: ARROW-16234: [C++] Vector Kernel for Rank (#12963)
This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 2dc64563bc ARROW-16234: [C++] Vector Kernel for Rank (#12963)
2dc64563bc is described below
commit 2dc64563bcd0d9774fd69d271035940653203126
Author: William Ayd <wi...@icloud.com>
AuthorDate: Wed Jun 1 06:21:33 2022 -0700
ARROW-16234: [C++] Vector Kernel for Rank (#12963)
Lead-authored-by: Will Ayd <wi...@icloud.com>
Co-authored-by: William Ayd <wi...@icloud.com>
Signed-off-by: Antoine Pitrou <an...@python.org>
---
cpp/src/arrow/compute/api_vector.cc | 33 ++++
cpp/src/arrow/compute/api_vector.h | 37 ++++
cpp/src/arrow/compute/kernels/vector_sort.cc | 216 ++++++++++++++++++++
cpp/src/arrow/compute/kernels/vector_sort_test.cc | 227 ++++++++++++++++++++++
docs/source/cpp/compute.rst | 13 +-
5 files changed, 522 insertions(+), 4 deletions(-)
diff --git a/cpp/src/arrow/compute/api_vector.cc b/cpp/src/arrow/compute/api_vector.cc
index e3db022536..ad4248fc6c 100644
--- a/cpp/src/arrow/compute/api_vector.cc
+++ b/cpp/src/arrow/compute/api_vector.cc
@@ -43,6 +43,7 @@ namespace internal {
using compute::DictionaryEncodeOptions;
using compute::FilterOptions;
using compute::NullPlacement;
+using compute::RankOptions;
template <>
struct EnumTraits<FilterOptions::NullSelectionBehavior>
@@ -88,6 +89,25 @@ struct EnumTraits<NullPlacement>
return "<INVALID>";
}
};
+template <>
+struct EnumTraits<RankOptions::Tiebreaker>
+ : BasicEnumTraits<RankOptions::Tiebreaker, RankOptions::Min, RankOptions::Max,
+ RankOptions::First, RankOptions::Dense> {
+ static std::string name() { return "Tiebreaker"; }
+ static std::string value_name(RankOptions::Tiebreaker value) {
+ switch (value) {
+ case RankOptions::Min:
+ return "Min";
+ case RankOptions::Max:
+ return "Max";
+ case RankOptions::First:
+ return "First";
+ case RankOptions::Dense:
+ return "Dense";
+ }
+ return "<INVALID>";
+ }
+};
} // namespace internal
@@ -139,6 +159,10 @@ static auto kCumulativeSumOptionsType = GetFunctionOptionsType<CumulativeSumOpti
DataMember("start", &CumulativeSumOptions::start),
DataMember("skip_nulls", &CumulativeSumOptions::skip_nulls),
DataMember("check_overflow", &CumulativeSumOptions::check_overflow));
+static auto kRankOptionsType = GetFunctionOptionsType<RankOptions>(
+ DataMember("sort_keys", &RankOptions::sort_keys),
+ DataMember("null_placement", &RankOptions::null_placement),
+ DataMember("tiebreaker", &RankOptions::tiebreaker));
} // namespace
} // namespace internal
@@ -192,6 +216,14 @@ CumulativeSumOptions::CumulativeSumOptions(std::shared_ptr<Scalar> start, bool s
check_overflow(check_overflow) {}
constexpr char CumulativeSumOptions::kTypeName[];
+RankOptions::RankOptions(std::vector<SortKey> sort_keys, NullPlacement null_placement,
+ RankOptions::Tiebreaker tiebreaker)
+ : FunctionOptions(internal::kRankOptionsType),
+ sort_keys(std::move(sort_keys)),
+ null_placement(null_placement),
+ tiebreaker(tiebreaker) {}
+constexpr char RankOptions::kTypeName[];
+
namespace internal {
void RegisterVectorOptions(FunctionRegistry* registry) {
DCHECK_OK(registry->AddFunctionOptionsType(kFilterOptionsType));
@@ -202,6 +234,7 @@ void RegisterVectorOptions(FunctionRegistry* registry) {
DCHECK_OK(registry->AddFunctionOptionsType(kPartitionNthOptionsType));
DCHECK_OK(registry->AddFunctionOptionsType(kSelectKOptionsType));
DCHECK_OK(registry->AddFunctionOptionsType(kCumulativeSumOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kRankOptionsType));
}
} // namespace internal
diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h
index b5daddb17b..1f4581566f 100644
--- a/cpp/src/arrow/compute/api_vector.h
+++ b/cpp/src/arrow/compute/api_vector.h
@@ -174,6 +174,43 @@ class ARROW_EXPORT SelectKOptions : public FunctionOptions {
std::vector<SortKey> sort_keys;
};
+/// \brief Rank options
+class ARROW_EXPORT RankOptions : public FunctionOptions {
+ public:
+ /// Configure how ties between equal values are handled
+ enum Tiebreaker {
+ /// Ties get the smallest possible rank in sorted order.
+ Min,
+ /// Ties get the largest possible rank in sorted order.
+ Max,
+ /// Ranks are assigned in order of when ties appear in the input.
+ /// This ensures the ranks are a stable permutation of the input.
+ First,
+ /// The ranks span a dense [1, M] interval where M is the number
+ /// of distinct values in the input.
+ Dense
+ };
+
+ explicit RankOptions(std::vector<SortKey> sort_keys = {},
+ NullPlacement null_placement = NullPlacement::AtEnd,
+ Tiebreaker tiebreaker = RankOptions::First);
+ /// Convenience constructor for array inputs
+ explicit RankOptions(SortOrder order,
+ NullPlacement null_placement = NullPlacement::AtEnd,
+ Tiebreaker tiebreaker = RankOptions::First)
+ : RankOptions({SortKey("", order)}, null_placement, tiebreaker) {}
+
+ static constexpr char const kTypeName[] = "RankOptions";
+ static RankOptions Defaults() { return RankOptions(); }
+
+ /// Column key(s) to order by and how to order by these sort keys.
+ std::vector<SortKey> sort_keys;
+ /// Whether nulls and NaNs are placed at the start or at the end
+ NullPlacement null_placement;
+ /// Tiebreaker for dealing with equal values in ranks
+ Tiebreaker tiebreaker;
+};
+
/// \brief Partitioning options for NthToIndices
class ARROW_EXPORT PartitionNthOptions : public FunctionOptions {
public:
diff --git a/cpp/src/arrow/compute/kernels/vector_sort.cc b/cpp/src/arrow/compute/kernels/vector_sort.cc
index 8a108621d3..93ed8a7860 100644
--- a/cpp/src/arrow/compute/kernels/vector_sort.cc
+++ b/cpp/src/arrow/compute/kernels/vector_sort.cc
@@ -1905,6 +1905,221 @@ class SelectKUnstableMetaFunction : public MetaFunction {
}
};
+// ----------------------------------------------------------------------
+// Rank implementation
+
+const RankOptions* GetDefaultRankOptions() {
+ static const auto kDefaultRankOptions = RankOptions::Defaults();
+ return &kDefaultRankOptions;
+}
+
+class ArrayRanker : public TypeVisitor {
+ public:
+ ArrayRanker(ExecContext* ctx, const Array& array, const RankOptions& options,
+ Datum* output)
+ : TypeVisitor(),
+ ctx_(ctx),
+ array_(array),
+ options_(options),
+ null_placement_(options.null_placement),
+ tiebreaker_(options.tiebreaker),
+ physical_type_(GetPhysicalType(array.type())),
+ output_(output) {}
+
+ Status Run() { return physical_type_->Accept(this); }
+
+#define VISIT(TYPE) \
+ Status Visit(const TYPE& type) { return RankInternal<TYPE>(); }
+
+ VISIT_SORTABLE_PHYSICAL_TYPES(VISIT)
+
+#undef VISIT
+
+ template <typename InType>
+ Status RankInternal() {
+ using GetView = GetViewType<InType>;
+ using T = typename GetViewType<InType>::T;
+ using ArrayType = typename TypeTraits<InType>::ArrayType;
+
+ ArrayType arr(array_.data());
+
+ SortOrder order = SortOrder::Ascending;
+ if (!options_.sort_keys.empty()) {
+ order = options_.sort_keys[0].order;
+ }
+ ArraySortOptions array_options(order, null_placement_);
+
+ auto length = array_.length();
+ ARROW_ASSIGN_OR_RAISE(auto sort_indices,
+ MakeMutableUInt64Array(uint64(), length, ctx_->memory_pool()));
+ auto sort_begin = sort_indices->GetMutableValues<uint64_t>(1);
+ auto sort_end = sort_begin + length;
+ std::iota(sort_begin, sort_end, 0);
+
+ ARROW_ASSIGN_OR_RAISE(auto array_sorter, GetArraySorter(*physical_type_));
+
+ NullPartitionResult sorted =
+ array_sorter(sort_begin, sort_end, arr, 0, array_options);
+ uint64_t rank;
+
+ ARROW_ASSIGN_OR_RAISE(auto rankings,
+ MakeMutableUInt64Array(uint64(), length, ctx_->memory_pool()));
+ auto out_begin = rankings->GetMutableValues<uint64_t>(1);
+
+ switch (tiebreaker_) {
+ case RankOptions::Dense: {
+ T curr_value, prev_value;
+ rank = 0;
+
+ if (null_placement_ == NullPlacement::AtStart && sorted.null_count() > 0) {
+ rank++;
+ for (auto it = sorted.nulls_begin; it < sorted.nulls_end; it++) {
+ out_begin[*it] = rank;
+ }
+ }
+
+ for (auto it = sorted.non_nulls_begin; it < sorted.non_nulls_end; it++) {
+ curr_value = GetView::LogicalValue(arr.GetView(*it));
+ if (it == sorted.non_nulls_begin || curr_value != prev_value) {
+ rank++;
+ }
+
+ out_begin[*it] = rank;
+ prev_value = curr_value;
+ }
+
+ if (null_placement_ == NullPlacement::AtEnd) {
+ rank++;
+ for (auto it = sorted.nulls_begin; it < sorted.nulls_end; it++) {
+ out_begin[*it] = rank;
+ }
+ }
+ break;
+ }
+
+ case RankOptions::First: {
+ rank = 0;
+ for (auto it = sorted.overall_begin(); it < sorted.overall_end(); it++) {
+ out_begin[*it] = ++rank;
+ }
+ break;
+ }
+
+ case RankOptions::Min: {
+ T curr_value, prev_value;
+
+ if (null_placement_ == NullPlacement::AtStart) {
+ rank = 1;
+ for (auto it = sorted.nulls_begin; it < sorted.nulls_end; it++) {
+ out_begin[*it] = rank;
+ }
+ }
+
+ for (auto it = sorted.non_nulls_begin; it < sorted.non_nulls_end; it++) {
+ curr_value = GetView::LogicalValue(arr.GetView(*it));
+ if (it == sorted.non_nulls_begin || curr_value != prev_value) {
+ rank = (it - sorted.overall_begin()) + 1;
+ }
+ out_begin[*it] = rank;
+ prev_value = curr_value;
+ }
+
+ if (null_placement_ == NullPlacement::AtEnd) {
+ rank = sorted.non_null_count() + 1;
+ for (auto it = sorted.nulls_begin; it < sorted.nulls_end; it++) {
+ out_begin[*it] = rank;
+ }
+ }
+ break;
+ }
+
+ case RankOptions::Max: {
+ // The algorithm for Max is just like Min, but in reverse order.
+ T curr_value, prev_value;
+
+ if (null_placement_ == NullPlacement::AtEnd) {
+ rank = length;
+ for (auto it = sorted.nulls_begin; it < sorted.nulls_end; it++) {
+ out_begin[*it] = rank;
+ }
+ }
+
+ for (auto it = sorted.non_nulls_end - 1; it >= sorted.non_nulls_begin; it--) {
+ curr_value = GetView::LogicalValue(arr.GetView(*it));
+ if (it == sorted.non_nulls_end - 1 || curr_value != prev_value) {
+ rank = (it - sorted.overall_begin()) + 1;
+ }
+ out_begin[*it] = rank;
+ prev_value = curr_value;
+ }
+
+ if (null_placement_ == NullPlacement::AtStart) {
+ rank = sorted.null_count();
+ for (auto it = sorted.nulls_begin; it < sorted.nulls_end; it++) {
+ out_begin[*it] = rank;
+ }
+ }
+
+ break;
+ }
+ }
+
+ *output_ = Datum(rankings);
+ return Status::OK();
+ }
+
+ ExecContext* ctx_;
+ const Array& array_;
+ const RankOptions& options_;
+ const NullPlacement null_placement_;
+ const RankOptions::Tiebreaker tiebreaker_;
+ const std::shared_ptr<DataType> physical_type_;
+ Datum* output_;
+};
+
+const FunctionDoc rank_doc(
+ "Compute numerical ranks of an array (1-based)",
+ ("This function computes a rank of the input array.\n"
+ "By default, null values are considered greater than any other value and\n"
+ "are therefore sorted at the end of the input. For floating-point types,\n"
+ "NaNs are considered greater than any other non-null value, but smaller\n"
+ "than null values. The default tiebreaker is to assign ranks in order of\n"
+ "when ties appear in the input.\n"
+ "\n"
+ "The handling of nulls, NaNs and tiebreakers can be changed in RankOptions."),
+ {"input"}, "RankOptions");
+
+class RankMetaFunction : public MetaFunction {
+ public:
+ RankMetaFunction()
+ : MetaFunction("rank", Arity::Unary(), rank_doc, GetDefaultRankOptions()) {}
+
+ Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
+ const FunctionOptions* options, ExecContext* ctx) const {
+ const RankOptions& rank_options = checked_cast<const RankOptions&>(*options);
+ switch (args[0].kind()) {
+ case Datum::ARRAY: {
+ return Rank(*args[0].make_array(), rank_options, ctx);
+ } break;
+ default:
+ break;
+ }
+ return Status::NotImplemented(
+ "Unsupported types for rank operation: "
+ "values=",
+ args[0].ToString());
+ }
+
+ private:
+ Result<Datum> Rank(const Array& array, const RankOptions& options,
+ ExecContext* ctx) const {
+ Datum output;
+ ArrayRanker ranker(ctx, array, options, &output);
+ ARROW_RETURN_NOT_OK(ranker.Run());
+ return output;
+ }
+};
+
} // namespace
Status SortChunkedArray(ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end,
@@ -1918,6 +2133,7 @@ Status SortChunkedArray(ExecContext* ctx, uint64_t* indices_begin, uint64_t* ind
void RegisterVectorSort(FunctionRegistry* registry) {
DCHECK_OK(registry->AddFunction(std::make_shared<SortIndicesMetaFunction>()));
DCHECK_OK(registry->AddFunction(std::make_shared<SelectKUnstableMetaFunction>()));
+ DCHECK_OK(registry->AddFunction(std::make_shared<RankMetaFunction>()));
}
#undef VISIT_SORTABLE_PHYSICAL_TYPES
diff --git a/cpp/src/arrow/compute/kernels/vector_sort_test.cc b/cpp/src/arrow/compute/kernels/vector_sort_test.cc
index 50add1aec4..c32b1a1e59 100644
--- a/cpp/src/arrow/compute/kernels/vector_sort_test.cc
+++ b/cpp/src/arrow/compute/kernels/vector_sort_test.cc
@@ -53,6 +53,10 @@ std::vector<NullPlacement> AllNullPlacements() {
return {NullPlacement::AtEnd, NullPlacement::AtStart};
}
+std::vector<RankOptions::Tiebreaker> AllTiebreakers() {
+ return {RankOptions::Min, RankOptions::Max, RankOptions::First, RankOptions::Dense};
+}
+
std::ostream& operator<<(std::ostream& os, NullPlacement null_placement) {
os << (null_placement == NullPlacement::AtEnd ? "AtEnd" : "AtStart");
return os;
@@ -1955,5 +1959,228 @@ INSTANTIATE_TEST_SUITE_P(AllNull, TestTableSortIndicesRandom,
testing::Combine(first_sort_keys, num_sort_keys,
testing::Values(1.0)));
+// ----------------------------------------------------------------------
+// Tests for Rank
+
+void AssertRank(const std::shared_ptr<Array>& input, SortOrder order,
+ NullPlacement null_placement, RankOptions::Tiebreaker tiebreaker,
+ const std::shared_ptr<Array>& expected) {
+ const std::vector<SortKey> sort_keys{SortKey("foo", order)};
+ RankOptions options(sort_keys, null_placement, tiebreaker);
+ ARROW_SCOPED_TRACE("options = ", options.ToString());
+ ASSERT_OK_AND_ASSIGN(auto actual, CallFunction("rank", {input}, &options));
+ ValidateOutput(actual);
+ AssertDatumsEqual(expected, actual, /*verbose=*/true);
+}
+
+void AssertRankEmpty(std::shared_ptr<DataType> type, SortOrder order,
+ NullPlacement null_placement, RankOptions::Tiebreaker tiebreaker) {
+ AssertRank(ArrayFromJSON(type, "[]"), order, null_placement, tiebreaker,
+ ArrayFromJSON(uint64(), "[]"));
+ AssertRank(ArrayFromJSON(type, "[null]"), order, null_placement, tiebreaker,
+ ArrayFromJSON(uint64(), "[1]"));
+}
+
+void AssertRankSimple(const std::shared_ptr<Array>& input, NullPlacement null_placement,
+ RankOptions::Tiebreaker tiebreaker) {
+ auto expected_asc = ArrayFromJSON(uint64(), "[3, 4, 2, 1, 5]");
+ AssertRank(input, SortOrder::Ascending, null_placement, tiebreaker, expected_asc);
+
+ auto expected_desc = ArrayFromJSON(uint64(), "[3, 2, 4, 5, 1]");
+ AssertRank(input, SortOrder::Descending, null_placement, tiebreaker, expected_desc);
+}
+
+void AssertRankAllTiebreakers(const std::shared_ptr<Array>& input) {
+ AssertRank(input, SortOrder::Ascending, NullPlacement::AtEnd, RankOptions::Min,
+ ArrayFromJSON(uint64(), "[3, 1, 4, 6, 4, 6, 1]"));
+ AssertRank(input, SortOrder::Ascending, NullPlacement::AtEnd, RankOptions::Max,
+ ArrayFromJSON(uint64(), "[3, 2, 5, 7, 5, 7, 2]"));
+ AssertRank(input, SortOrder::Ascending, NullPlacement::AtEnd, RankOptions::First,
+ ArrayFromJSON(uint64(), "[3, 1, 4, 6, 5, 7, 2]"));
+ AssertRank(input, SortOrder::Ascending, NullPlacement::AtEnd, RankOptions::Dense,
+ ArrayFromJSON(uint64(), "[2, 1, 3, 4, 3, 4, 1]"));
+
+ AssertRank(input, SortOrder::Ascending, NullPlacement::AtStart, RankOptions::Min,
+ ArrayFromJSON(uint64(), "[5, 3, 6, 1, 6, 1, 3]"));
+ AssertRank(input, SortOrder::Ascending, NullPlacement::AtStart, RankOptions::Max,
+ ArrayFromJSON(uint64(), "[5, 4, 7, 2, 7, 2, 4]"));
+ AssertRank(input, SortOrder::Ascending, NullPlacement::AtStart, RankOptions::First,
+ ArrayFromJSON(uint64(), "[5, 3, 6, 1, 7, 2, 4]"));
+ AssertRank(input, SortOrder::Ascending, NullPlacement::AtStart, RankOptions::Dense,
+ ArrayFromJSON(uint64(), "[3, 2, 4, 1, 4, 1, 2]"));
+
+ AssertRank(input, SortOrder::Descending, NullPlacement::AtEnd, RankOptions::Min,
+ ArrayFromJSON(uint64(), "[3, 4, 1, 6, 1, 6, 4]"));
+ AssertRank(input, SortOrder::Descending, NullPlacement::AtEnd, RankOptions::Max,
+ ArrayFromJSON(uint64(), "[3, 5, 2, 7, 2, 7, 5]"));
+ AssertRank(input, SortOrder::Descending, NullPlacement::AtEnd, RankOptions::First,
+ ArrayFromJSON(uint64(), "[3, 4, 1, 6, 2, 7, 5]"));
+ AssertRank(input, SortOrder::Descending, NullPlacement::AtEnd, RankOptions::Dense,
+ ArrayFromJSON(uint64(), "[2, 3, 1, 4, 1, 4, 3]"));
+
+ AssertRank(input, SortOrder::Descending, NullPlacement::AtStart, RankOptions::Min,
+ ArrayFromJSON(uint64(), "[5, 6, 3, 1, 3, 1, 6]"));
+ AssertRank(input, SortOrder::Descending, NullPlacement::AtStart, RankOptions::Max,
+ ArrayFromJSON(uint64(), "[5, 7, 4, 2, 4, 2, 7]"));
+ AssertRank(input, SortOrder::Descending, NullPlacement::AtStart, RankOptions::First,
+ ArrayFromJSON(uint64(), "[5, 6, 3, 1, 4, 2, 7]"));
+ AssertRank(input, SortOrder::Descending, NullPlacement::AtStart, RankOptions::Dense,
+ ArrayFromJSON(uint64(), "[3, 4, 2, 1, 2, 1, 4]"));
+}
+
+TEST(TestRankForReal, RankReal) {
+ for (auto real_type : ::arrow::FloatingPointTypes()) {
+ for (auto null_placement : AllNullPlacements()) {
+ for (auto tiebreaker : AllTiebreakers()) {
+ for (auto order : AllOrders()) {
+ AssertRankEmpty(real_type, order, null_placement, tiebreaker);
+ }
+
+ AssertRankSimple(ArrayFromJSON(real_type, "[2.1, 3.2, 1.0, 0.0, 5.5]"),
+ null_placement, tiebreaker);
+ }
+ }
+
+ AssertRankAllTiebreakers(
+ ArrayFromJSON(real_type, "[1.2, 0.0, 5.3, null, 5.3, null, 0.0]"));
+ }
+}
+
+TEST(TestRankForIntegral, RankIntegral) {
+ for (auto integer_type : ::arrow::IntTypes()) {
+ for (auto null_placement : AllNullPlacements()) {
+ for (auto tiebreaker : AllTiebreakers()) {
+ for (auto order : AllOrders()) {
+ AssertRankEmpty(integer_type, order, null_placement, tiebreaker);
+ }
+
+ AssertRankSimple(ArrayFromJSON(integer_type, "[2, 3, 1, 0, 5]"), null_placement,
+ tiebreaker);
+ }
+ }
+ AssertRankAllTiebreakers(ArrayFromJSON(integer_type, "[1, 0, 5, null, 5, null, 0]"));
+ }
+}
+
+TEST(TestRankForBool, RankBool) {
+ for (auto null_placement : AllNullPlacements()) {
+ for (auto tiebreaker : AllTiebreakers()) {
+ for (auto order : AllOrders()) {
+ AssertRankEmpty(boolean(), order, null_placement, tiebreaker);
+ }
+
+ auto simple_bool = ArrayFromJSON(boolean(), "[false, true]");
+ AssertRank(simple_bool, SortOrder::Ascending, null_placement, tiebreaker,
+ ArrayFromJSON(uint64(), "[1, 2]"));
+ AssertRank(simple_bool, SortOrder::Descending, null_placement, tiebreaker,
+ ArrayFromJSON(uint64(), "[2, 1]"));
+
+ auto array =
+ ArrayFromJSON(boolean(), "[true, false, true, null, true, null, false]");
+ AssertRank(ArrayFromJSON(boolean(), "[true, false, true, null, true, null, false]"),
+ SortOrder::Ascending, NullPlacement::AtEnd, RankOptions::Min,
+ ArrayFromJSON(uint64(), "[3, 1, 3, 6, 3, 6, 1]"));
+ AssertRank(ArrayFromJSON(boolean(), "[true, false, true, null, true, null, false]"),
+ SortOrder::Ascending, NullPlacement::AtEnd, RankOptions::Max,
+ ArrayFromJSON(uint64(), "[5, 2, 5, 7, 5, 7, 2]"));
+ AssertRank(ArrayFromJSON(boolean(), "[true, false, true, null, true, null, false]"),
+ SortOrder::Ascending, NullPlacement::AtEnd, RankOptions::First,
+ ArrayFromJSON(uint64(), "[3, 1, 4, 6, 5, 7, 2]"));
+ AssertRank(ArrayFromJSON(boolean(), "[true, false, true, null, true, null, false]"),
+ SortOrder::Ascending, NullPlacement::AtEnd, RankOptions::Dense,
+ ArrayFromJSON(uint64(), "[2, 1, 2, 3, 2, 3, 1]"));
+
+ AssertRank(ArrayFromJSON(boolean(), "[true, false, true, null, true, null, false]"),
+ SortOrder::Ascending, NullPlacement::AtStart, RankOptions::Min,
+ ArrayFromJSON(uint64(), "[5, 3, 5, 1, 5, 1, 3]"));
+ AssertRank(ArrayFromJSON(boolean(), "[true, false, true, null, true, null, false]"),
+ SortOrder::Ascending, NullPlacement::AtStart, RankOptions::Max,
+ ArrayFromJSON(uint64(), "[7, 4, 7, 2, 7, 2, 4]"));
+ AssertRank(ArrayFromJSON(boolean(), "[true, false, true, null, true, null, false]"),
+ SortOrder::Ascending, NullPlacement::AtStart, RankOptions::First,
+ ArrayFromJSON(uint64(), "[5, 3, 6, 1, 7, 2, 4]"));
+ AssertRank(ArrayFromJSON(boolean(), "[true, false, true, null, true, null, false]"),
+ SortOrder::Ascending, NullPlacement::AtStart, RankOptions::Dense,
+ ArrayFromJSON(uint64(), "[3, 2, 3, 1, 3, 1, 2]"));
+
+ AssertRank(ArrayFromJSON(boolean(), "[true, false, true, null, true, null, false]"),
+ SortOrder::Descending, NullPlacement::AtEnd, RankOptions::Min,
+ ArrayFromJSON(uint64(), "[1, 4, 1, 6, 1, 6, 4]"));
+ AssertRank(ArrayFromJSON(boolean(), "[true, false, true, null, true, null, false]"),
+ SortOrder::Descending, NullPlacement::AtEnd, RankOptions::Max,
+ ArrayFromJSON(uint64(), "[3, 5, 3, 7, 3, 7, 5]"));
+ AssertRank(ArrayFromJSON(boolean(), "[true, false, true, null, true, null, false]"),
+ SortOrder::Descending, NullPlacement::AtEnd, RankOptions::First,
+ ArrayFromJSON(uint64(), "[1, 4, 2, 6, 3, 7, 5]"));
+ AssertRank(ArrayFromJSON(boolean(), "[true, false, true, null, true, null, false]"),
+ SortOrder::Descending, NullPlacement::AtEnd, RankOptions::Dense,
+ ArrayFromJSON(uint64(), "[1, 2, 1, 3, 1, 3, 2]"));
+
+ AssertRank(ArrayFromJSON(boolean(), "[true, false, true, null, true, null, false]"),
+ SortOrder::Descending, NullPlacement::AtStart, RankOptions::Min,
+ ArrayFromJSON(uint64(), "[3, 6, 3, 1, 3, 1, 6]"));
+ AssertRank(ArrayFromJSON(boolean(), "[true, false, true, null, true, null, false]"),
+ SortOrder::Descending, NullPlacement::AtStart, RankOptions::Max,
+ ArrayFromJSON(uint64(), "[5, 7, 5, 2, 5, 2, 7]"));
+ AssertRank(ArrayFromJSON(boolean(), "[true, false, true, null, true, null, false]"),
+ SortOrder::Descending, NullPlacement::AtStart, RankOptions::First,
+ ArrayFromJSON(uint64(), "[3, 6, 4, 1, 5, 2, 7]"));
+ AssertRank(ArrayFromJSON(boolean(), "[true, false, true, null, true, null, false]"),
+ SortOrder::Descending, NullPlacement::AtStart, RankOptions::Dense,
+ ArrayFromJSON(uint64(), "[2, 3, 2, 1, 2, 1, 3]"));
+ }
+ }
+}
+
+TEST(TestRankForTemporal, RankTemporal) {
+ for (auto temporal_type : ::arrow::TemporalTypes()) {
+ for (auto null_placement : AllNullPlacements()) {
+ for (auto tiebreaker : AllTiebreakers()) {
+ for (auto order : AllOrders()) {
+ AssertRankEmpty(temporal_type, order, null_placement, tiebreaker);
+ }
+
+ AssertRankSimple(ArrayFromJSON(temporal_type, "[2, 3, 1, 0, 5]"), null_placement,
+ tiebreaker);
+ }
+ }
+ AssertRankAllTiebreakers(ArrayFromJSON(temporal_type, "[1, 0, 5, null, 5, null, 0]"));
+ }
+}
+
+TEST(TestRankForStrings, RankStrings) {
+ for (auto string_type : ::arrow::StringTypes()) {
+ for (auto null_placement : AllNullPlacements()) {
+ for (auto tiebreaker : AllTiebreakers()) {
+ for (auto order : AllOrders()) {
+ AssertRankEmpty(string_type, order, null_placement, tiebreaker);
+ }
+
+ AssertRankSimple(ArrayFromJSON(string_type, R"(["b", "c", "a", "", "d"])"),
+ null_placement, tiebreaker);
+ }
+ }
+ AssertRankAllTiebreakers(
+ ArrayFromJSON(string_type, R"(["a", "", "e", null, "e", null, ""])"));
+ }
+}
+
+TEST(TestRankForFixedSizeBinary, RankFixedSizeBinary) {
+ auto binary_type = fixed_size_binary(3);
+ for (auto null_placement : AllNullPlacements()) {
+ for (auto tiebreaker : AllTiebreakers()) {
+ for (auto order : AllOrders()) {
+ AssertRankEmpty(binary_type, order, null_placement, tiebreaker);
+ }
+
+ AssertRankSimple(
+ ArrayFromJSON(binary_type, R"(["bbb", "ccc", "aaa", " ", "ddd"])"),
+ null_placement, tiebreaker);
+ }
+ }
+ AssertRankAllTiebreakers(
+ ArrayFromJSON(binary_type, R"(["aaa", " ", "eee", null, "eee", null, " "])"));
+}
+
} // namespace compute
} // namespace arrow
diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
index c373fa8988..80aea63dfa 100644
--- a/docs/source/cpp/compute.rst
+++ b/docs/source/cpp/compute.rst
@@ -1665,10 +1665,13 @@ in the respective option classes.
+-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+
| partition_nth_indices | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`PartitionNthOptions` | \(3) |
+-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+
-| select_k_unstable | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`SelectKOptions` | \(4) \(5) |
+| rank | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`RankOptions` | \(4) |
+-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+
-| sort_indices | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`SortOptions` | \(1) \(4) |
+| select_k_unstable | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`SelectKOptions` | \(5) \(6) |
+-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+
+| sort_indices | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`SortOptions` | \(1) \(5) |
++-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+
+
* \(1) The output is an array of indices into the input, that define a
stable sort of the input.
@@ -1682,11 +1685,13 @@ in the respective option classes.
:func:`std::nth_element`). *N* is given in
:member:`PartitionNthOptions::pivot`.
-* \(4) The input can be an array, chunked array, record batch or
+* \(4) The output is a one-based numerical array of ranks
+
+* \(5) The input can be an array, chunked array, record batch or
table. If the input is a record batch or table, one or more sort
keys must be specified.
-* \(5) The output is an array of indices into the input, that define a
+* \(6) The output is an array of indices into the input, that define a
non-stable sort of the input.
.. _cpp-compute-vector-structural-transforms: