You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "westonpace (via GitHub)" <gi...@apache.org> on 2023/05/01 17:42:55 UTC

[GitHub] [arrow] westonpace commented on a diff in pull request #35280: GH-29887: [C++] Implement dictionary array sorting

westonpace commented on code in PR #35280:
URL: https://github.com/apache/arrow/pull/35280#discussion_r1181732100


##########
cpp/src/arrow/compute/kernels/vector_array_sort.cc:
##########
@@ -173,6 +173,116 @@ class ArrayCompareSorter {
   }
 };
 
+template <>
+class ArrayCompareSorter<DictionaryType> {
+ public:
+  Result<NullPartitionResult> operator()(uint64_t* indices_begin, uint64_t* indices_end,
+                                         const Array& array, int64_t offset,
+                                         const ArraySortOptions& options,
+                                         ExecContext* ctx) {
+    const auto& dict_array = checked_cast<const DictionaryArray&>(array);
+    // TODO: These methods should probably return a const&? They seem capable.

Review Comment:
   Seems reasonable. Do you want to open a separate issue for this follow up and tag that issue here?  E.g. `TODO (GH-123): These methods...`



##########
cpp/src/arrow/compute/kernels/vector_array_sort.cc:
##########
@@ -173,6 +173,116 @@ class ArrayCompareSorter {
   }
 };
 
+template <>
+class ArrayCompareSorter<DictionaryType> {
+ public:
+  Result<NullPartitionResult> operator()(uint64_t* indices_begin, uint64_t* indices_end,
+                                         const Array& array, int64_t offset,
+                                         const ArraySortOptions& options,
+                                         ExecContext* ctx) {
+    const auto& dict_array = checked_cast<const DictionaryArray&>(array);
+    // TODO: These methods should probably return a const&? They seem capable.
+    auto dict_values = dict_array.dictionary();
+    auto dict_indices = dict_array.indices();
+
+    // Algorithm:
+    // 1) Use the Rank function to get an exactly-equivalent-order array
+    //    of the dictionary values, but with a datatype that's friendlier to
+    //    sorting (uint64).
+    // 2) Act as if we were sorting a dictionary array with the same indices,
+    //    but with the ranks as dictionary values.
+    // 2a) Dictionary-decode the ranks by calling Take.
+    // 2b) Sort the decoded ranks. Not only those are uint64, they are dense
+    //     in a [0, k) range where k is the number of unique dictionary values.
+    //     Therefore, unless the dictionary is very large, a fast counting sort
+    //     will be used.
+    //
+    // The bottom line is that performance will usually be much better
+    // (potentially an order of magnitude faster) than by naively decoding
+    // the original dictionary and sorting the decoded version.
+
+    std::shared_ptr<Array> decoded_ranks;
+    // Skip the rank/take steps for cases with only nulls
+    if (IsAllNulls(dict_array)) {
+      ARROW_ASSIGN_OR_RAISE(decoded_ranks,
+                            MakeNullUInt64Array(dict_array.length(), ctx->memory_pool()));
+    } else {
+      ARROW_ASSIGN_OR_RAISE(auto ranks, RanksWithNulls(dict_values, ctx));
+
+      ARROW_ASSIGN_OR_RAISE(decoded_ranks,
+                            Take(*ranks, *dict_indices, TakeOptions::Defaults(), ctx));
+    }
+
+    DCHECK_EQ(decoded_ranks->type_id(), Type::UINT64);
+    DCHECK_EQ(decoded_ranks->length(), dict_array.length());
+    ARROW_ASSIGN_OR_RAISE(auto rank_sorter, GetArraySorter(*decoded_ranks->type()));
+
+    return rank_sorter(indices_begin, indices_end, *decoded_ranks, offset, options, ctx);
+  }
+
+ private:
+  static bool IsAllNulls(const DictionaryArray& dict_array) {
+    auto indices = dict_array.indices();
+    auto values = dict_array.dictionary();
+    if (values->null_count() == values->length() ||
+        indices->null_count() == indices->length()) {
+      return true;
+    }
+    for (int64_t i = 0; i < dict_array.length(); ++i) {
+      // TODO: Special handling for dictionary types in Array::IsValid/Null?
+      if (indices->IsValid(i) && values->IsValid(dict_array.GetValueIndex(i))) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  static Result<std::shared_ptr<Array>> MakeNullUInt64Array(int64_t length,

Review Comment:
   Use `arrow::MakeArrayOfNull` in `src/arrow/array/util.cc` instead?



##########
cpp/src/arrow/compute/kernels/vector_sort_test.cc:
##########
@@ -368,6 +368,129 @@ TEST(ArraySortIndicesFunction, Array) {
   AssertDatumsEqual(expected, actual, /*verbose=*/true);
 }
 
+TEST(ArraySortIndicesFunction, DictionaryArray) {
+  // Decoded dictionary array:
+  // (["b", "c", null, null, null, "b", "c", "c", "a"])
+
+  std::vector<std::string> expected_str = {
+      "[8, 0, 5, 1, 6, 7, 2, 3, 4]",  // SortOrder::Ascending NullPlacement::AtEnd
+      "[2, 3, 4, 8, 0, 5, 1, 6, 7]",  // SortOrder::Ascending NullPlacement::AtStart
+      "[1, 6, 7, 0, 5, 8, 2, 3, 4]",  // SortOrder::Descending NullPlacement::AtEnd
+      "[2, 3, 4, 1, 6, 7, 0, 5, 8]"   // SortOrder::Descending NullPlacement::AtStart
+  };
+
+  for (auto index_type : all_dictionary_index_types()) {

Review Comment:
   ```suggestion
     for (const auto& index_type : all_dictionary_index_types()) {
   ```



##########
cpp/src/arrow/compute/kernels/vector_array_sort.cc:
##########
@@ -173,6 +173,116 @@ class ArrayCompareSorter {
   }
 };
 
+template <>
+class ArrayCompareSorter<DictionaryType> {
+ public:
+  Result<NullPartitionResult> operator()(uint64_t* indices_begin, uint64_t* indices_end,
+                                         const Array& array, int64_t offset,
+                                         const ArraySortOptions& options,
+                                         ExecContext* ctx) {
+    const auto& dict_array = checked_cast<const DictionaryArray&>(array);
+    // TODO: These methods should probably return a const&? They seem capable.
+    auto dict_values = dict_array.dictionary();
+    auto dict_indices = dict_array.indices();
+
+    // Algorithm:
+    // 1) Use the Rank function to get an exactly-equivalent-order array
+    //    of the dictionary values, but with a datatype that's friendlier to
+    //    sorting (uint64).
+    // 2) Act as if we were sorting a dictionary array with the same indices,
+    //    but with the ranks as dictionary values.
+    // 2a) Dictionary-decode the ranks by calling Take.
+    // 2b) Sort the decoded ranks. Not only those are uint64, they are dense
+    //     in a [0, k) range where k is the number of unique dictionary values.
+    //     Therefore, unless the dictionary is very large, a fast counting sort
+    //     will be used.
+    //
+    // The bottom line is that performance will usually be much better
+    // (potentially an order of magnitude faster) than by naively decoding
+    // the original dictionary and sorting the decoded version.
+
+    std::shared_ptr<Array> decoded_ranks;
+    // Skip the rank/take steps for cases with only nulls
+    if (IsAllNulls(dict_array)) {
+      ARROW_ASSIGN_OR_RAISE(decoded_ranks,
+                            MakeNullUInt64Array(dict_array.length(), ctx->memory_pool()));
+    } else {
+      ARROW_ASSIGN_OR_RAISE(auto ranks, RanksWithNulls(dict_values, ctx));
+
+      ARROW_ASSIGN_OR_RAISE(decoded_ranks,
+                            Take(*ranks, *dict_indices, TakeOptions::Defaults(), ctx));
+    }
+
+    DCHECK_EQ(decoded_ranks->type_id(), Type::UINT64);
+    DCHECK_EQ(decoded_ranks->length(), dict_array.length());
+    ARROW_ASSIGN_OR_RAISE(auto rank_sorter, GetArraySorter(*decoded_ranks->type()));
+
+    return rank_sorter(indices_begin, indices_end, *decoded_ranks, offset, options, ctx);
+  }
+
+ private:
+  static bool IsAllNulls(const DictionaryArray& dict_array) {
+    auto indices = dict_array.indices();
+    auto values = dict_array.dictionary();
+    if (values->null_count() == values->length() ||
+        indices->null_count() == indices->length()) {
+      return true;
+    }
+    for (int64_t i = 0; i < dict_array.length(); ++i) {
+      // TODO: Special handling for dictionary types in Array::IsValid/Null?

Review Comment:
   I'm not sure what this todo is referring to.  Are you suggesting we move this logic into `DictionaryArray`?  @felipecrv has been handling special cases for `IsNull/IsValid` recently and might be interested too.



##########
cpp/src/arrow/compute/kernels/vector_sort_benchmark.cc:
##########
@@ -82,6 +86,55 @@ static void ArraySortFuncInt64Benchmark(benchmark::State& state, const Runner& r
   ArraySortFuncBenchmark(state, runner, values);
 }
 
+template <typename Runner>
+static void ArraySortFuncInt64DictBenchmark(benchmark::State& state, const Runner& runner,
+                                            int64_t min, int64_t max) {
+  RegressionArgs args(state);
+
+  const int64_t array_size = args.size / sizeof(int64_t);
+
+  auto rand = random::RandomArrayGenerator(kSeed);
+  auto dict_values = rand.Int64(kDictionarySize, min, max, args.null_proportion / 2);
+  auto dict_indices =
+      rand.Int64(array_size, 0, kDictionarySize - 1, args.null_proportion / 2);
+  auto dict_array = *DictionaryArray::FromArrays(dict_indices, dict_values);
+
+  ArraySortFuncBenchmark(state, runner, dict_array);
+}
+
+template <typename Runner>
+static void ArraySortFuncStringBenchmark(benchmark::State& state, const Runner& runner) {

Review Comment:
   I'm not sure if I'm just confused or maybe this PR needs a rebase from main but it appears there is already an `ArraySortFuncStringBenchmark` at https://github.com/apache/arrow/blob/be12888997c81b1fb7947f6284be1256edd4d3e4/cpp/src/arrow/compute/kernels/vector_sort_benchmark.cc#L103
   
   Also, the version I linked to uses the actual length of the strings instead of `sizeof(int64_t)` which I think is more consistent.  Then `args.size` is the size in bytes and items processed is the number of records.



##########
cpp/src/arrow/compute/kernels/vector_array_sort.cc:
##########
@@ -173,6 +173,116 @@ class ArrayCompareSorter {
   }
 };
 
+template <>
+class ArrayCompareSorter<DictionaryType> {
+ public:
+  Result<NullPartitionResult> operator()(uint64_t* indices_begin, uint64_t* indices_end,
+                                         const Array& array, int64_t offset,
+                                         const ArraySortOptions& options,
+                                         ExecContext* ctx) {
+    const auto& dict_array = checked_cast<const DictionaryArray&>(array);
+    // TODO: These methods should probably return a const&? They seem capable.
+    auto dict_values = dict_array.dictionary();
+    auto dict_indices = dict_array.indices();
+
+    // Algorithm:
+    // 1) Use the Rank function to get an exactly-equivalent-order array
+    //    of the dictionary values, but with a datatype that's friendlier to
+    //    sorting (uint64).
+    // 2) Act as if we were sorting a dictionary array with the same indices,
+    //    but with the ranks as dictionary values.
+    // 2a) Dictionary-decode the ranks by calling Take.
+    // 2b) Sort the decoded ranks. Not only those are uint64, they are dense
+    //     in a [0, k) range where k is the number of unique dictionary values.
+    //     Therefore, unless the dictionary is very large, a fast counting sort
+    //     will be used.
+    //
+    // The bottom line is that performance will usually be much better
+    // (potentially an order of magnitude faster) than by naively decoding
+    // the original dictionary and sorting the decoded version.
+
+    std::shared_ptr<Array> decoded_ranks;
+    // Skip the rank/take steps for cases with only nulls
+    if (IsAllNulls(dict_array)) {
+      ARROW_ASSIGN_OR_RAISE(decoded_ranks,
+                            MakeNullUInt64Array(dict_array.length(), ctx->memory_pool()));
+    } else {
+      ARROW_ASSIGN_OR_RAISE(auto ranks, RanksWithNulls(dict_values, ctx));
+
+      ARROW_ASSIGN_OR_RAISE(decoded_ranks,
+                            Take(*ranks, *dict_indices, TakeOptions::Defaults(), ctx));
+    }
+
+    DCHECK_EQ(decoded_ranks->type_id(), Type::UINT64);
+    DCHECK_EQ(decoded_ranks->length(), dict_array.length());
+    ARROW_ASSIGN_OR_RAISE(auto rank_sorter, GetArraySorter(*decoded_ranks->type()));
+
+    return rank_sorter(indices_begin, indices_end, *decoded_ranks, offset, options, ctx);
+  }
+
+ private:
+  static bool IsAllNulls(const DictionaryArray& dict_array) {
+    auto indices = dict_array.indices();
+    auto values = dict_array.dictionary();
+    if (values->null_count() == values->length() ||
+        indices->null_count() == indices->length()) {
+      return true;
+    }
+    for (int64_t i = 0; i < dict_array.length(); ++i) {
+      // TODO: Special handling for dictionary types in Array::IsValid/Null?
+      if (indices->IsValid(i) && values->IsValid(dict_array.GetValueIndex(i))) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  static Result<std::shared_ptr<Array>> MakeNullUInt64Array(int64_t length,
+                                                            MemoryPool* pool) {
+    ARROW_ASSIGN_OR_RAISE(auto bitmap, AllocateEmptyBitmap(length, pool));
+    auto data = ArrayData::Make(uint64(), length, {std::move(bitmap), nullptr},
+                                /*null_count=*/length);
+    return MakeArray(data);
+  }
+
+  static Result<std::shared_ptr<Array>> RanksWithNulls(
+      const std::shared_ptr<Array>& array, ExecContext* ctx) {
+    // Notes:
+    // * The order is always ascending here, since the goal is to produce
+    //   an exactly-equivalent-order of the dictionary values.
+    // * We're going to re-emit nulls in the output, so we can just always consider
+    //   them "at the end".  Note that choosing AtStart would merely shift other
+    //   ranks by 1 if there are any nulls...
+    RankOptions rank_options(SortOrder::Ascending, NullPlacement::AtEnd,
+                             RankOptions::Dense);
+
+    // XXX Should this support Type::NA?
+    auto data = array->data();
+    std::shared_ptr<Buffer> null_bitmap;
+    if (array->null_count() > 0) {
+      null_bitmap = array->null_bitmap();
+      data = array->data()->Copy();
+      if (data->offset > 0) {
+        ARROW_ASSIGN_OR_RAISE(null_bitmap, arrow::internal::CopyBitmap(
+                                               ctx->memory_pool(), null_bitmap->data(),
+                                               data->offset, data->length));
+      }
+      data->buffers[0] = nullptr;
+      data->null_count = 0;
+    }

Review Comment:
   I'm not sure I follow what is happening here.  Is `data` used after this if statement?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org