You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "pitrou (via GitHub)" <gi...@apache.org> on 2023/05/16 17:01:35 UTC

[GitHub] [arrow] pitrou commented on a diff in pull request #35280: GH-29887: [C++] Implement dictionary array sorting

pitrou commented on code in PR #35280:
URL: https://github.com/apache/arrow/pull/35280#discussion_r1195444005


##########
cpp/src/arrow/compute/kernels/vector_sort_internal.h:
##########
@@ -192,9 +192,9 @@ struct NullPartitionResult {
 // Move nulls (not null-like values) to end of array.
 //
 // `offset` is used when this is called on a chunk of a chunked array
-template <typename Partitioner>
+template <typename ArrayType, typename Partitioner>
 NullPartitionResult PartitionNullsOnly(uint64_t* indices_begin, uint64_t* indices_end,
-                                       const Array& values, int64_t offset,
+                                       const ArrayType& values, int64_t offset,

Review Comment:
   Hmm, is it necessary to pass the concrete array type? I tried to avoid combinatorial explosion here, as the `partitioner` call will generate a non-trivial amount of code.



##########
cpp/src/arrow/compute/kernels/vector_array_sort.cc:
##########
@@ -173,6 +173,109 @@ class ArrayCompareSorter {
   }
 };
 
+template <>
+class ArrayCompareSorter<DictionaryType> {
+ public:
+  Result<NullPartitionResult> operator()(uint64_t* indices_begin, uint64_t* indices_end,
+                                         const Array& array, int64_t offset,
+                                         const ArraySortOptions& options,
+                                         ExecContext* ctx) {
+    const auto& dict_array = checked_cast<const DictionaryArray&>(array);
+    // TODO: These methods should probably return a const&? They seem capable.
+    // https://github.com/apache/arrow/issues/35437
+    auto dict_values = dict_array.dictionary();
+    auto dict_indices = dict_array.indices();
+
+    // Algorithm:
+    // 1) Use the Rank function to get an exactly-equivalent-order array
+    //    of the dictionary values, but with a datatype that's friendlier to
+    //    sorting (uint64).
+    // 2) Act as if we were sorting a dictionary array with the same indices,
+    //    but with the ranks as dictionary values.
+    // 2a) Dictionary-decode the ranks by calling Take.
+    // 2b) Sort the decoded ranks. Not only those are uint64, they are dense
+    //     in a [0, k) range where k is the number of unique dictionary values.
+    //     Therefore, unless the dictionary is very large, a fast counting sort
+    //     will be used.
+    //
+    // The bottom line is that performance will usually be much better
+    // (potentially an order of magnitude faster) than by naively decoding
+    // the original dictionary and sorting the decoded version.
+
+    std::shared_ptr<Array> decoded_ranks;
+    // Skip the rank/take steps for cases with only nulls
+    if (IsAllNulls(dict_array)) {
+      ARROW_ASSIGN_OR_RAISE(decoded_ranks, MakeArrayOfNull(uint64(), dict_array.length(),
+                                                           ctx->memory_pool()));
+    } else {
+      ARROW_ASSIGN_OR_RAISE(auto ranks, RanksWithNulls(dict_values, ctx));
+
+      ARROW_ASSIGN_OR_RAISE(decoded_ranks,
+                            Take(*ranks, *dict_indices, TakeOptions::Defaults(), ctx));
+    }
+
+    DCHECK_EQ(decoded_ranks->type_id(), Type::UINT64);
+    DCHECK_EQ(decoded_ranks->length(), dict_array.length());
+    ARROW_ASSIGN_OR_RAISE(auto rank_sorter, GetArraySorter(*decoded_ranks->type()));
+
+    return rank_sorter(indices_begin, indices_end, *decoded_ranks, offset, options, ctx);
+  }
+
+ private:
+  static bool IsAllNulls(const DictionaryArray& dict_array) {
+    auto indices = dict_array.indices();
+    auto values = dict_array.dictionary();
+    if (values->null_count() == values->length() ||
+        indices->null_count() == indices->length()) {
+      return true;
+    }
+    for (int64_t i = 0; i < dict_array.length(); ++i) {

Review Comment:
   Is it actually worth doing this? This is just a performance optimization for all-null dictionary arrays, but not necessary for correctness?



##########
cpp/src/arrow/compute/kernels/vector_sort_benchmark.cc:
##########
@@ -82,6 +100,64 @@ static void ArraySortFuncInt64Benchmark(benchmark::State& state, const Runner& r
   ArraySortFuncBenchmark(state, runner, values);
 }
 
+template <typename Runner>
+static void ArraySortFuncInt64DictBenchmark(benchmark::State& state, const Runner& runner,
+                                            int64_t min, int64_t max, int32_t dict_size) {
+  RegressionArgs args(state);
+
+  const double null_proportion = args.null_proportion / 2;
+  const int64_t array_size = args.size / sizeof(int64_t);
+
+  auto rand = random::RandomArrayGenerator(kSeed);
+  auto dict_values = rand.Int64(dict_size, min, max, null_proportion);
+  auto dict_indices = rand.Int64(array_size, 0, dict_size - 1, null_proportion);
+  auto dict_array = *DictionaryArray::FromArrays(dict_indices, dict_values);
+
+  ArraySortFuncBenchmark(state, runner, dict_array);
+}
+
+template <typename Runner>
+static void ArraySortFuncStringBenchmark(benchmark::State& state, const Runner& runner,
+                                         int32_t min_length, int32_t max_length) {
+  RegressionArgs args(state);
+
+  const int64_t array_size =
+      GetStringArraySize(args.size, min_length, max_length, args.null_proportion);
+
+  auto rand = random::RandomArrayGenerator(kSeed);
+  auto values = rand.String(array_size, min_length, max_length, args.null_proportion);
+
+  ArraySortFuncBenchmark(state, runner, values);
+}
+
+template <typename Runner>
+static void ArraySortFuncStringDictBenchmark(benchmark::State& state,
+                                             const Runner& runner, int32_t min_length,
+                                             int32_t max_length, int32_t dict_size) {
+  RegressionArgs args(state);
+
+  const double null_proportion = args.null_proportion / 2;
+  const int64_t array_size = args.size / sizeof(int64_t);

Review Comment:
   This is debatable. If we want to compare against the sort speed of an equivalent decoded array, we would like the array size to match.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org