You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/04/28 15:45:06 UTC

[GitHub] [arrow] pitrou commented on a diff in pull request #12963: ARROW-16234: [C++] Vector Kernel for Rank

pitrou commented on code in PR #12963:
URL: https://github.com/apache/arrow/pull/12963#discussion_r861034762


##########
cpp/src/arrow/compute/api_vector.h:
##########
@@ -95,6 +95,13 @@ enum class NullPlacement {
   AtEnd,
 };
 
+enum class Tiebreaker {
+  Lowest,

Review Comment:
   I have no strong feelings on this but I'll note that different terminology is used elsewhere:
   * [Scipy](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.rankdata.html) uses "min", "max", "ordinal", "dense" respectively
   * [R](https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/rank) uses "min", "max", "first" and doesn't seem to have an equivalent for "dense"



##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -1909,6 +1909,110 @@ class SelectKUnstableMetaFunction : public MetaFunction {
   }
 };
 
+// ----------------------------------------------------------------------
+// Rank implementation
+
+const FunctionDoc rank_doc(
+    "Returns the ranking of an array",
+    ("This function computes a rank of the input array.\n"
+     "By default, Null values are considered greater than any other value and\n"

Review Comment:
   Nit: no capitalization needed here :-)
   ```suggestion
        "By default, null values are considered greater than any other value and\n"
   ```



##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -1909,6 +1909,110 @@ class SelectKUnstableMetaFunction : public MetaFunction {
   }
 };
 
+// ----------------------------------------------------------------------
+// Rank implementation
+
+const FunctionDoc rank_doc(
+    "Returns the ranking of an array",
+    ("This function computes a rank of the input array.\n"
+     "By default, Null values are considered greater than any other value and\n"
+     "are therefore sorted at the end of the input. For floating-point types,\n"
+     "NaNs are considered greater than any other non-null value, but smaller\n"
+     "than null values. The default tiebreaker is to assign ranks in order of\n"
+     "when ties appear in the input\n"
+     "\n"
+     "The handling of nulls, NaNs and tiebreakers can be changed in RankOptions."),
+    {"input"}, "RankOptions");
+
+class RankMetaFunction : public MetaFunction {
+ public:
+  RankMetaFunction()
+      : MetaFunction("rank", Arity::Unary(), &rank_doc, GetDefaultSortOptions()) {}
+
+  Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
+                            const FunctionOptions* options, ExecContext* ctx) const {
+    const RankOptions& sort_options = static_cast<const RankOptions&>(*options);

Review Comment:
   We use `checked_cast` for this which compiles to a `dynamic_cast` in debug mode.



##########
docs/source/cpp/compute.rst:
##########
@@ -1643,6 +1643,8 @@ in the respective option classes.
 +-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+
 | sort_indices          | Unary      | Boolean, Numeric, Temporal, Binary- and String-like     | UInt64            | :struct:`SortOptions`          | \(1) \(4)      |
 +-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+
+| rank                  | Unary      | Boolean, Numeric, Temporal, Binary- and String-like     | UInt64            | :struct:`RankOptions`          | \(6)           |

Review Comment:
   We would probably like to keep this table alphabetically-order in function name.



##########
cpp/src/arrow/compute/api_vector.h:
##########
@@ -95,6 +95,13 @@ enum class NullPlacement {
   AtEnd,
 };
 
+enum class Tiebreaker {

Review Comment:
   I assume this is specific to the rank function, so perhaps put this inside `RankOptions` (and then perhaps make this a simple `enum` to be able to write e.g. `RankOptions::Lowest`).



##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -1909,6 +1909,110 @@ class SelectKUnstableMetaFunction : public MetaFunction {
   }
 };
 
+// ----------------------------------------------------------------------
+// Rank implementation
+
+const FunctionDoc rank_doc(
+    "Returns the ranking of an array",
+    ("This function computes a rank of the input array.\n"
+     "By default, Null values are considered greater than any other value and\n"
+     "are therefore sorted at the end of the input. For floating-point types,\n"
+     "NaNs are considered greater than any other non-null value, but smaller\n"
+     "than null values. The default tiebreaker is to assign ranks in order of\n"
+     "when ties appear in the input\n"
+     "\n"
+     "The handling of nulls, NaNs and tiebreakers can be changed in RankOptions."),
+    {"input"}, "RankOptions");
+
+class RankMetaFunction : public MetaFunction {
+ public:
+  RankMetaFunction()
+      : MetaFunction("rank", Arity::Unary(), &rank_doc, GetDefaultSortOptions()) {}
+
+  Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
+                            const FunctionOptions* options, ExecContext* ctx) const {
+    const RankOptions& sort_options = static_cast<const RankOptions&>(*options);
+    switch (args[0].kind()) {
+      case Datum::ARRAY: {
+        return Rank(*args[0].make_array(), sort_options, ctx);
+      } break;
+      default:
+        break;
+    }
+    return Status::NotImplemented(
+        "Unsupported types for rank operation: "
+        "values=",
+        args[0].ToString());
+  }
+
+ private:
+  Result<Datum> Rank(const Array& array, const RankOptions& options,
+                     ExecContext* ctx) const {
+    ArraySortOptions array_options(options.order, options.null_placement);
+
+    ARROW_ASSIGN_OR_RAISE(auto sortIndices, CallFunction("array_sort_indices", {array},
+                                                         &array_options, ctx));
+
+    auto out_size = array.length();
+    ARROW_ASSIGN_OR_RAISE(auto rankings,
+                          MakeMutableUInt64Array(uint64(), out_size, ctx->memory_pool()));
+
+    auto* indices = sortIndices.make_array()->data()->GetValues<uint64_t>(1);
+    auto out_rankings = rankings->GetMutableValues<uint64_t>(1);
+    uint64_t rank = 0;
+    Datum prevValue, currValue;
+
+    if (options.tiebreaker == Tiebreaker::Dense) {
+      for (auto i = 0; i < out_size; i++) {
+        currValue = array.GetScalar(indices[i]).ValueOrDie();

Review Comment:
   Hmm, `GetScalar` is really going to be inefficient for this as it allocates a new `std::shared_ptr<Scalar>` every time.
   Instead, we should specialize the loop based on the actual datatype. Something like `VisitArrayDataInline` may help.



##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -1909,6 +1909,110 @@ class SelectKUnstableMetaFunction : public MetaFunction {
   }
 };
 
+// ----------------------------------------------------------------------
+// Rank implementation
+
+const FunctionDoc rank_doc(
+    "Returns the ranking of an array",
+    ("This function computes a rank of the input array.\n"
+     "By default, Null values are considered greater than any other value and\n"
+     "are therefore sorted at the end of the input. For floating-point types,\n"
+     "NaNs are considered greater than any other non-null value, but smaller\n"
+     "than null values. The default tiebreaker is to assign ranks in order of\n"
+     "when ties appear in the input\n"
+     "\n"
+     "The handling of nulls, NaNs and tiebreakers can be changed in RankOptions."),
+    {"input"}, "RankOptions");
+
+class RankMetaFunction : public MetaFunction {
+ public:
+  RankMetaFunction()
+      : MetaFunction("rank", Arity::Unary(), &rank_doc, GetDefaultSortOptions()) {}
+
+  Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
+                            const FunctionOptions* options, ExecContext* ctx) const {
+    const RankOptions& sort_options = static_cast<const RankOptions&>(*options);
+    switch (args[0].kind()) {
+      case Datum::ARRAY: {
+        return Rank(*args[0].make_array(), sort_options, ctx);
+      } break;
+      default:
+        break;
+    }
+    return Status::NotImplemented(
+        "Unsupported types for rank operation: "
+        "values=",
+        args[0].ToString());
+  }
+
+ private:
+  Result<Datum> Rank(const Array& array, const RankOptions& options,
+                     ExecContext* ctx) const {
+    ArraySortOptions array_options(options.order, options.null_placement);
+
+    ARROW_ASSIGN_OR_RAISE(auto sortIndices, CallFunction("array_sort_indices", {array},
+                                                         &array_options, ctx));
+
+    auto out_size = array.length();
+    ARROW_ASSIGN_OR_RAISE(auto rankings,
+                          MakeMutableUInt64Array(uint64(), out_size, ctx->memory_pool()));
+
+    auto* indices = sortIndices.make_array()->data()->GetValues<uint64_t>(1);
+    auto out_rankings = rankings->GetMutableValues<uint64_t>(1);
+    uint64_t rank = 0;
+    Datum prevValue, currValue;
+
+    if (options.tiebreaker == Tiebreaker::Dense) {
+      for (auto i = 0; i < out_size; i++) {
+        currValue = array.GetScalar(indices[i]).ValueOrDie();
+        if (i > 0 && currValue == prevValue) {
+        } else {
+          ++rank;
+        }
+        out_rankings[indices[i]] = rank;
+        prevValue = currValue;
+      }
+    } else if (options.tiebreaker == Tiebreaker::First) {
+      for (auto i = 0; i < out_size; i++) {
+        rank = i + 1;
+        out_rankings[indices[i]] = rank;
+      }
+    } else if (options.tiebreaker == Tiebreaker::Lowest) {
+      for (auto i = 0; i < out_size; i++) {
+        currValue = array.GetScalar(indices[i]).ValueOrDie();
+        if (i > 0 && currValue == prevValue) {
+        } else {
+          rank = i + 1;
+        }
+        out_rankings[indices[i]] = rank;
+        prevValue = currValue;
+      }
+    } else if (options.tiebreaker == Tiebreaker::Highest) {
+      auto currentTieCount = 0;
+      for (auto i = 0; i < out_size; i++) {
+        currValue = array.GetScalar(indices[i]).ValueOrDie();
+        if (i > 0 && currValue == prevValue) {
+          currentTieCount++;
+        } else {
+          currentTieCount = 0;
+        }
+        rank = i + 1;
+
+        // This can be inefficient when dealing many tied values

Review Comment:
   Why not simply loop on the array in reverse order?



##########
cpp/src/arrow/compute/api_vector.h:
##########
@@ -174,6 +181,23 @@ class ARROW_EXPORT SelectKOptions : public FunctionOptions {
   std::vector<SortKey> sort_keys;
 };
 
+/// \brief Rank options
+class ARROW_EXPORT RankOptions : public FunctionOptions {
+ public:
+  explicit RankOptions(SortOrder order = SortOrder::Ascending,
+                       NullPlacement null_placement = NullPlacement::AtEnd,
+                       Tiebreaker tiebreaker = Tiebreaker::First);

Review Comment:
   Nit: `TieBreaker tie_breaker` to keep naming consistent.



##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -1909,6 +1909,110 @@ class SelectKUnstableMetaFunction : public MetaFunction {
   }
 };
 
+// ----------------------------------------------------------------------
+// Rank implementation
+
+const FunctionDoc rank_doc(
+    "Returns the ranking of an array",
+    ("This function computes a rank of the input array.\n"
+     "By default, Null values are considered greater than any other value and\n"
+     "are therefore sorted at the end of the input. For floating-point types,\n"
+     "NaNs are considered greater than any other non-null value, but smaller\n"
+     "than null values. The default tiebreaker is to assign ranks in order of\n"
+     "when ties appear in the input\n"
+     "\n"
+     "The handling of nulls, NaNs and tiebreakers can be changed in RankOptions."),
+    {"input"}, "RankOptions");
+
+class RankMetaFunction : public MetaFunction {
+ public:
+  RankMetaFunction()
+      : MetaFunction("rank", Arity::Unary(), &rank_doc, GetDefaultSortOptions()) {}

Review Comment:
   `GetDefaultSortOptions` doesn't seem right here?



##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -1909,6 +1909,110 @@ class SelectKUnstableMetaFunction : public MetaFunction {
   }
 };
 
+// ----------------------------------------------------------------------
+// Rank implementation
+
+const FunctionDoc rank_doc(
+    "Returns the ranking of an array",
+    ("This function computes a rank of the input array.\n"
+     "By default, Null values are considered greater than any other value and\n"
+     "are therefore sorted at the end of the input. For floating-point types,\n"
+     "NaNs are considered greater than any other non-null value, but smaller\n"
+     "than null values. The default tiebreaker is to assign ranks in order of\n"
+     "when ties appear in the input\n"
+     "\n"
+     "The handling of nulls, NaNs and tiebreakers can be changed in RankOptions."),
+    {"input"}, "RankOptions");
+
+class RankMetaFunction : public MetaFunction {
+ public:
+  RankMetaFunction()
+      : MetaFunction("rank", Arity::Unary(), &rank_doc, GetDefaultSortOptions()) {}
+
+  Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
+                            const FunctionOptions* options, ExecContext* ctx) const {
+    const RankOptions& sort_options = static_cast<const RankOptions&>(*options);
+    switch (args[0].kind()) {
+      case Datum::ARRAY: {
+        return Rank(*args[0].make_array(), sort_options, ctx);
+      } break;
+      default:
+        break;
+    }
+    return Status::NotImplemented(
+        "Unsupported types for rank operation: "
+        "values=",
+        args[0].ToString());
+  }
+
+ private:
+  Result<Datum> Rank(const Array& array, const RankOptions& options,
+                     ExecContext* ctx) const {
+    ArraySortOptions array_options(options.order, options.null_placement);
+
+    ARROW_ASSIGN_OR_RAISE(auto sortIndices, CallFunction("array_sort_indices", {array},

Review Comment:
   Please try to follow the coding conventions: `lower_case` for local variables (so `sort_indices` here).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org