You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/04/29 13:48:07 UTC

[GitHub] [arrow] edponce commented on a diff in pull request #12963: ARROW-16234: [C++] Vector Kernel for Rank

edponce commented on code in PR #12963:
URL: https://github.com/apache/arrow/pull/12963#discussion_r861750714


##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -1909,6 +1909,120 @@ class SelectKUnstableMetaFunction : public MetaFunction {
   }
 };
 
+// ----------------------------------------------------------------------
+// Rank implementation
+
+const RankOptions* GetDefaultRankOptions() {
+  static const auto kDefaultRankOptions = RankOptions::Defaults();
+  return &kDefaultRankOptions;
+}

Review Comment:
   We can make `kDefaultRankOptions` a static member of `RankMetaFunction`, unless we think it will be required for other use cases. No real difference in what gets generated but it encapsulates the `RankMetaFunction`-related data.



##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -1909,6 +1909,120 @@ class SelectKUnstableMetaFunction : public MetaFunction {
   }
 };
 
+// ----------------------------------------------------------------------
+// Rank implementation
+
+const RankOptions* GetDefaultRankOptions() {
+  static const auto kDefaultRankOptions = RankOptions::Defaults();
+  return &kDefaultRankOptions;
+}
+
+const FunctionDoc rank_doc(
+    "Returns the ranking of an array",
+    ("This function computes a rank of the input array.\n"
+     "By default, null values are considered greater than any other value and\n"
+     "are therefore sorted at the end of the input. For floating-point types,\n"
+     "NaNs are considered greater than any other non-null value, but smaller\n"
+     "than null values. The default tiebreaker is to assign ranks in order of\n"
+     "when ties appear in the input\n"
+     "\n"
+     "The handling of nulls, NaNs and tiebreakers can be changed in RankOptions."),
+    {"input"}, "RankOptions");
+
+class RankMetaFunction : public MetaFunction {
+ public:
+  RankMetaFunction()

Review Comment:
   For the sakes of my own understanding, why is this a `MetaFunction` and not a `RankFunction` that invokes `array_sort_indices`?



##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -1909,6 +1909,120 @@ class SelectKUnstableMetaFunction : public MetaFunction {
   }
 };
 
+// ----------------------------------------------------------------------
+// Rank implementation
+
+const RankOptions* GetDefaultRankOptions() {
+  static const auto kDefaultRankOptions = RankOptions::Defaults();
+  return &kDefaultRankOptions;
+}
+
+const FunctionDoc rank_doc(
+    "Returns the ranking of an array",
+    ("This function computes a rank of the input array.\n"
+     "By default, null values are considered greater than any other value and\n"
+     "are therefore sorted at the end of the input. For floating-point types,\n"
+     "NaNs are considered greater than any other non-null value, but smaller\n"
+     "than null values. The default tiebreaker is to assign ranks in order of\n"
+     "when ties appear in the input\n"
+     "\n"
+     "The handling of nulls, NaNs and tiebreakers can be changed in RankOptions."),
+    {"input"}, "RankOptions");
+
+class RankMetaFunction : public MetaFunction {
+ public:
+  RankMetaFunction()
+      : MetaFunction("rank", Arity::Unary(), &rank_doc, GetDefaultRankOptions()) {}
+
+  Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
+                            const FunctionOptions* options, ExecContext* ctx) const {
+    const RankOptions& sort_options = checked_cast<const RankOptions&>(*options);
+    switch (args[0].kind()) {
+      case Datum::ARRAY: {
+        return Rank(*args[0].make_array(), sort_options, ctx);
+      } break;
+      default:
+        break;
+    }
+    return Status::NotImplemented(
+        "Unsupported types for rank operation: "
+        "values=",
+        args[0].ToString());
+  }
+
+ private:
+  Result<Datum> Rank(const Array& array, const RankOptions& options,
+                     ExecContext* ctx) const {
+    ArraySortOptions array_options(options.order, options.null_placement);
+
+    ARROW_ASSIGN_OR_RAISE(auto sort_indices, CallFunction("array_sort_indices", {array},
+                                                          &array_options, ctx));
+
+    auto out_size = array.length();
+    ARROW_ASSIGN_OR_RAISE(auto rankings,
+                          MakeMutableUInt64Array(uint64(), out_size, ctx->memory_pool()));
+
+    auto* indices = sort_indices.make_array()->data()->GetValues<uint64_t>(1);
+    auto out_rankings = rankings->GetMutableValues<uint64_t>(1);
+    uint64_t rank = 0;
+    Datum prevValue, currValue;

Review Comment:
   `prevValue` and `currValue` are not used in all the `switch-cases`, so I suggest to make them local to their corresponding `case` blocks.



##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -1909,6 +1909,120 @@ class SelectKUnstableMetaFunction : public MetaFunction {
   }
 };
 
+// ----------------------------------------------------------------------
+// Rank implementation
+
+const RankOptions* GetDefaultRankOptions() {
+  static const auto kDefaultRankOptions = RankOptions::Defaults();
+  return &kDefaultRankOptions;
+}
+
+const FunctionDoc rank_doc(
+    "Returns the ranking of an array",
+    ("This function computes a rank of the input array.\n"
+     "By default, null values are considered greater than any other value and\n"
+     "are therefore sorted at the end of the input. For floating-point types,\n"
+     "NaNs are considered greater than any other non-null value, but smaller\n"
+     "than null values. The default tiebreaker is to assign ranks in order of\n"
+     "when ties appear in the input\n"
+     "\n"
+     "The handling of nulls, NaNs and tiebreakers can be changed in RankOptions."),
+    {"input"}, "RankOptions");
+
+class RankMetaFunction : public MetaFunction {
+ public:
+  RankMetaFunction()
+      : MetaFunction("rank", Arity::Unary(), &rank_doc, GetDefaultRankOptions()) {}
+
+  Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
+                            const FunctionOptions* options, ExecContext* ctx) const {
+    const RankOptions& sort_options = checked_cast<const RankOptions&>(*options);
+    switch (args[0].kind()) {
+      case Datum::ARRAY: {
+        return Rank(*args[0].make_array(), sort_options, ctx);
+      } break;
+      default:
+        break;
+    }
+    return Status::NotImplemented(
+        "Unsupported types for rank operation: "
+        "values=",
+        args[0].ToString());
+  }
+
+ private:
+  Result<Datum> Rank(const Array& array, const RankOptions& options,
+                     ExecContext* ctx) const {
+    ArraySortOptions array_options(options.order, options.null_placement);
+
+    ARROW_ASSIGN_OR_RAISE(auto sort_indices, CallFunction("array_sort_indices", {array},
+                                                          &array_options, ctx));
+
+    auto out_size = array.length();
+    ARROW_ASSIGN_OR_RAISE(auto rankings,
+                          MakeMutableUInt64Array(uint64(), out_size, ctx->memory_pool()));

Review Comment:
   It seems the ranking will always be a UInt64Array, so docstrings should state this.



##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -1909,6 +1909,120 @@ class SelectKUnstableMetaFunction : public MetaFunction {
   }
 };
 
+// ----------------------------------------------------------------------
+// Rank implementation
+
+const RankOptions* GetDefaultRankOptions() {
+  static const auto kDefaultRankOptions = RankOptions::Defaults();
+  return &kDefaultRankOptions;
+}
+
+const FunctionDoc rank_doc(
+    "Returns the ranking of an array",
+    ("This function computes a rank of the input array.\n"
+     "By default, null values are considered greater than any other value and\n"
+     "are therefore sorted at the end of the input. For floating-point types,\n"
+     "NaNs are considered greater than any other non-null value, but smaller\n"
+     "than null values. The default tiebreaker is to assign ranks in order of\n"
+     "when ties appear in the input\n"
+     "\n"
+     "The handling of nulls, NaNs and tiebreakers can be changed in RankOptions."),
+    {"input"}, "RankOptions");
+
+class RankMetaFunction : public MetaFunction {
+ public:
+  RankMetaFunction()
+      : MetaFunction("rank", Arity::Unary(), &rank_doc, GetDefaultRankOptions()) {}
+
+  Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
+                            const FunctionOptions* options, ExecContext* ctx) const {
+    const RankOptions& sort_options = checked_cast<const RankOptions&>(*options);
+    switch (args[0].kind()) {
+      case Datum::ARRAY: {
+        return Rank(*args[0].make_array(), sort_options, ctx);
+      } break;
+      default:
+        break;
+    }
+    return Status::NotImplemented(
+        "Unsupported types for rank operation: "
+        "values=",
+        args[0].ToString());
+  }
+
+ private:
+  Result<Datum> Rank(const Array& array, const RankOptions& options,
+                     ExecContext* ctx) const {
+    ArraySortOptions array_options(options.order, options.null_placement);
+
+    ARROW_ASSIGN_OR_RAISE(auto sort_indices, CallFunction("array_sort_indices", {array},
+                                                          &array_options, ctx));
+
+    auto out_size = array.length();
+    ARROW_ASSIGN_OR_RAISE(auto rankings,
+                          MakeMutableUInt64Array(uint64(), out_size, ctx->memory_pool()));
+
+    auto* indices = sort_indices.make_array()->data()->GetValues<uint64_t>(1);
+    auto out_rankings = rankings->GetMutableValues<uint64_t>(1);
+    uint64_t rank = 0;
+    Datum prevValue, currValue;
+
+    switch (options.tiebreaker) {
+      case RankOptions::Dense:
+        for (auto i = 0; i < out_size; i++) {
+          currValue = array.GetScalar(indices[i]).ValueOrDie();
+          if (i > 0 && currValue == prevValue) {
+          } else {
+            ++rank;
+          }
+          out_rankings[indices[i]] = rank;
+          prevValue = currValue;
+        }
+        break;
+      case RankOptions::First:
+        for (auto i = 0; i < out_size; i++) {
+          rank = i + 1;

Review Comment:
   `out_size` is set to the length of the input `Array` but here it is used to index the `indices` array.
   Even though, AFAIK, these two arrays will always be of the same size, I recommend to get the size value from the indices array.



##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -1909,6 +1909,120 @@ class SelectKUnstableMetaFunction : public MetaFunction {
   }
 };
 
+// ----------------------------------------------------------------------
+// Rank implementation
+
+const RankOptions* GetDefaultRankOptions() {
+  static const auto kDefaultRankOptions = RankOptions::Defaults();
+  return &kDefaultRankOptions;
+}
+
+const FunctionDoc rank_doc(
+    "Returns the ranking of an array",
+    ("This function computes a rank of the input array.\n"
+     "By default, null values are considered greater than any other value and\n"
+     "are therefore sorted at the end of the input. For floating-point types,\n"
+     "NaNs are considered greater than any other non-null value, but smaller\n"
+     "than null values. The default tiebreaker is to assign ranks in order of\n"
+     "when ties appear in the input\n"
+     "\n"
+     "The handling of nulls, NaNs and tiebreakers can be changed in RankOptions."),
+    {"input"}, "RankOptions");
+
+class RankMetaFunction : public MetaFunction {
+ public:
+  RankMetaFunction()
+      : MetaFunction("rank", Arity::Unary(), &rank_doc, GetDefaultRankOptions()) {}
+
+  Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
+                            const FunctionOptions* options, ExecContext* ctx) const {
+    const RankOptions& sort_options = checked_cast<const RankOptions&>(*options);
+    switch (args[0].kind()) {
+      case Datum::ARRAY: {
+        return Rank(*args[0].make_array(), sort_options, ctx);
+      } break;
+      default:
+        break;
+    }
+    return Status::NotImplemented(
+        "Unsupported types for rank operation: "
+        "values=",
+        args[0].ToString());
+  }
+
+ private:
+  Result<Datum> Rank(const Array& array, const RankOptions& options,
+                     ExecContext* ctx) const {
+    ArraySortOptions array_options(options.order, options.null_placement);
+
+    ARROW_ASSIGN_OR_RAISE(auto sort_indices, CallFunction("array_sort_indices", {array},
+                                                          &array_options, ctx));
+
+    auto out_size = array.length();
+    ARROW_ASSIGN_OR_RAISE(auto rankings,
+                          MakeMutableUInt64Array(uint64(), out_size, ctx->memory_pool()));
+
+    auto* indices = sort_indices.make_array()->data()->GetValues<uint64_t>(1);
+    auto out_rankings = rankings->GetMutableValues<uint64_t>(1);
+    uint64_t rank = 0;
+    Datum prevValue, currValue;
+
+    switch (options.tiebreaker) {
+      case RankOptions::Dense:
+        for (auto i = 0; i < out_size; i++) {
+          currValue = array.GetScalar(indices[i]).ValueOrDie();
+          if (i > 0 && currValue == prevValue) {
+          } else {

Review Comment:
   There are empty `if` blocks. Invert logic so that the `if` corresponds to the current `else` path. Similarly, in other `cases`.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org