You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/01/29 07:02:21 UTC

[GitHub] [arrow] cyb70289 opened a new pull request #9358: ARROW-11425: [C++][Compute] Optimize quantile kernel for integers

cyb70289 opened a new pull request #9358:
URL: https://github.com/apache/arrow/pull/9358


   For integers within limited value range, a histogram approach is useful
   to reduce memory footprint and improve performance.


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] cyb70289 commented on a change in pull request #9358: ARROW-11425: [C++][Compute] Optimize quantile kernel for integers

Posted by GitBox <gi...@apache.org>.
cyb70289 commented on a change in pull request #9358:
URL: https://github.com/apache/arrow/pull/9358#discussion_r569193045



##########
File path: cpp/src/arrow/compute/kernels/aggregate_quantile.cc
##########
@@ -239,6 +229,243 @@ struct QuantileExecutor {
   }
 };
 
+// histogram approach with constant memory, only for integers within limited value range
+template <typename InType>
+struct CountQuantiler {
+  using CType = typename InType::c_type;
+
+  CType min;
+  std::vector<uint64_t> counts;  // counts[i]: # of values equals i + min
+
+  // indices to adjacent non-empty bins covering current quantile
+  struct AdjacentBins {
+    int left_index;
+    int right_index;
+    uint64_t total_count;  // accumulated counts till left_index (inclusive)
+  };
+
+  CountQuantiler(CType min, CType max) {
+    uint32_t value_range = static_cast<uint32_t>(max - min) + 1;
+    DCHECK_LT(value_range, 1 << 30);
+    this->min = min;
+    this->counts.resize(value_range);
+  }
+
+  void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    const QuantileOptions& options = QuantileState::Get(ctx);
+
+    // count values in all chunks, ignore nulls
+    const Datum& datum = batch[0];
+    const int64_t in_length = datum.length() - datum.null_count();
+    if (in_length > 0) {
+      for (auto& c : this->counts) c = 0;
+      for (const auto& array : datum.chunks()) {
+        const ArrayData& data = *array->data();
+        const CType* values = data.GetValues<CType>(1);
+        VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
+                            [&](int64_t pos, int64_t len) {
+                              for (int64_t i = 0; i < len; ++i) {
+                                ++this->counts[values[pos + i] - this->min];
+                              }
+                            });
+      }
+    }
+
+    // prepare out array
+    int64_t out_length = options.q.size();
+    if (in_length == 0) {
+      out_length = 0;  // input is empty or only contains null, return empty array
+    }
+    // out type depends on options
+    const bool is_datapoint = IsDataPoint(options);
+    const std::shared_ptr<DataType> out_type =
+        is_datapoint ? TypeTraits<InType>::type_singleton() : float64();
+    auto out_data = ArrayData::Make(out_type, out_length, 0);
+    out_data->buffers.resize(2, nullptr);
+
+    // calculate quantiles
+    if (out_length > 0) {
+      const auto out_bit_width = checked_pointer_cast<NumberType>(out_type)->bit_width();
+      KERNEL_ASSIGN_OR_RAISE(out_data->buffers[1], ctx,
+                             ctx->Allocate(out_length * out_bit_width / 8));
+
+      // find quantiles in ascending order
+      std::vector<int64_t> q_indices(out_length);
+      std::iota(q_indices.begin(), q_indices.end(), 0);
+      std::sort(q_indices.begin(), q_indices.end(),
+                [&options](int64_t left_index, int64_t right_index) {
+                  return options.q[left_index] < options.q[right_index];
+                });
+
+      AdjacentBins bins{0, 0, this->counts[0]};
+      if (is_datapoint) {
+        CType* out_buffer = out_data->template GetMutableValues<CType>(1);
+        for (int64_t i = 0; i < out_length; ++i) {
+          const int64_t q_index = q_indices[i];
+          out_buffer[q_index] = GetQuantileAtDataPoint(
+              in_length, &bins, options.q[q_index], options.interpolation);
+        }
+      } else {
+        double* out_buffer = out_data->template GetMutableValues<double>(1);
+        for (int64_t i = 0; i < out_length; ++i) {
+          const int64_t q_index = q_indices[i];
+          out_buffer[q_index] = GetQuantileByInterp(in_length, &bins, options.q[q_index],
+                                                    options.interpolation);
+        }
+      }
+    }
+
+    *out = Datum(std::move(out_data));
+  }
+
+  // return quantile located exactly at some input data point
+  CType GetQuantileAtDataPoint(int64_t in_length, AdjacentBins* bins, double q,
+                               enum QuantileOptions::Interpolation interpolation) {
+    const uint64_t datapoint_index = QuantileToDataPoint(in_length, q, interpolation);
+    while (datapoint_index >= bins->total_count &&
+           static_cast<size_t>(bins->left_index) < this->counts.size() - 1) {
+      ++bins->left_index;
+      bins->total_count += this->counts[bins->left_index];
+    }
+    DCHECK_LT(datapoint_index, bins->total_count);
+    return static_cast<CType>(bins->left_index + this->min);
+  }
+
+  // return quantile interpolated from adjacent input data points
+  double GetQuantileByInterp(int64_t in_length, AdjacentBins* bins, double q,
+                             enum QuantileOptions::Interpolation interpolation) {
+    const double index = (in_length - 1) * q;
+    const uint64_t index_floor = static_cast<uint64_t>(index);
+    const double fraction = index - index_floor;
+
+    while (index_floor >= bins->total_count &&
+           static_cast<size_t>(bins->left_index) < this->counts.size() - 1) {
+      ++bins->left_index;
+      bins->total_count += this->counts[bins->left_index];
+    }
+    DCHECK_LT(index_floor, bins->total_count);
+    const double lower_value = static_cast<double>(bins->left_index + this->min);
+
+    // quantile lies in this bin, no interpolation needed
+    if (index <= bins->total_count - 1) {
+      return lower_value;
+    }
+
+    // quantile lies across two bins, locate next bin if not already done
+    DCHECK_EQ(index_floor, bins->total_count - 1);
+    if (bins->right_index <= bins->left_index) {
+      bins->right_index = bins->left_index + 1;
+      while (static_cast<size_t>(bins->right_index) < this->counts.size() - 1 &&
+             this->counts[bins->right_index] == 0) {
+        ++bins->right_index;
+      }
+    }
+    DCHECK_LT(static_cast<size_t>(bins->right_index), this->counts.size());
+    DCHECK_GT(this->counts[bins->right_index], 0);
+    const double higher_value = static_cast<double>(bins->right_index + this->min);
+
+    if (interpolation == QuantileOptions::LINEAR) {
+      return fraction * higher_value + (1 - fraction) * lower_value;
+    } else if (interpolation == QuantileOptions::MIDPOINT) {
+      return lower_value / 2 + higher_value / 2;
+    } else {
+      DCHECK(false);
+      return NAN;
+    }
+  }
+};
+
+// histogram or sort approach per value range and size, only for integers
+template <typename InType>
+struct CountOrSortQuantiler {
+  using CType = typename InType::c_type;
+
+  void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    // cross point to benefit from histogram approach
+    // parameters estimated from ad-hoc benchmarks manually
+    static constexpr int kMinArraySize = 65536 * sizeof(int) / sizeof(CType);
+    static constexpr int kMaxValueRange = 65536;
+
+    const Datum& datum = batch[0];
+    if (datum.length() - datum.null_count() >= kMinArraySize) {

Review comment:
       Yes. It looks still desirable to find min/max as it may not cover 64k range.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] cyb70289 commented on a change in pull request #9358: ARROW-11425: [C++][Compute] Optimize quantile kernel for integers

Posted by GitBox <gi...@apache.org>.
cyb70289 commented on a change in pull request #9358:
URL: https://github.com/apache/arrow/pull/9358#discussion_r569153961



##########
File path: cpp/src/arrow/compute/kernels/aggregate_quantile.cc
##########
@@ -239,6 +229,243 @@ struct QuantileExecutor {
   }
 };
 
+// histogram approach with constant memory, only for integers within limited value range
+template <typename InType>
+struct CountQuantiler {
+  using CType = typename InType::c_type;
+
+  CType min;
+  std::vector<uint64_t> counts;  // counts[i]: # of values equals i + min
+
+  // indices to adjacent non-empty bins covering current quantile
+  struct AdjacentBins {
+    int left_index;
+    int right_index;
+    uint64_t total_count;  // accumulated counts till left_index (inclusive)
+  };
+
+  CountQuantiler(CType min, CType max) {
+    uint32_t value_range = static_cast<uint32_t>(max - min) + 1;
+    DCHECK_LT(value_range, 1 << 30);
+    this->min = min;
+    this->counts.resize(value_range);
+  }
+
+  void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    const QuantileOptions& options = QuantileState::Get(ctx);
+
+    // count values in all chunks, ignore nulls
+    const Datum& datum = batch[0];
+    const int64_t in_length = datum.length() - datum.null_count();
+    if (in_length > 0) {
+      for (auto& c : this->counts) c = 0;
+      for (const auto& array : datum.chunks()) {
+        const ArrayData& data = *array->data();
+        const CType* values = data.GetValues<CType>(1);
+        VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
+                            [&](int64_t pos, int64_t len) {
+                              for (int64_t i = 0; i < len; ++i) {
+                                ++this->counts[values[pos + i] - this->min];
+                              }
+                            });
+      }
+    }
+
+    // prepare out array
+    int64_t out_length = options.q.size();
+    if (in_length == 0) {
+      out_length = 0;  // input is empty or only contains null, return empty array
+    }
+    // out type depends on options
+    const bool is_datapoint = IsDataPoint(options);
+    const std::shared_ptr<DataType> out_type =
+        is_datapoint ? TypeTraits<InType>::type_singleton() : float64();
+    auto out_data = ArrayData::Make(out_type, out_length, 0);
+    out_data->buffers.resize(2, nullptr);
+
+    // calculate quantiles
+    if (out_length > 0) {
+      const auto out_bit_width = checked_pointer_cast<NumberType>(out_type)->bit_width();
+      KERNEL_ASSIGN_OR_RAISE(out_data->buffers[1], ctx,
+                             ctx->Allocate(out_length * out_bit_width / 8));
+
+      // find quantiles in ascending order
+      std::vector<int64_t> q_indices(out_length);
+      std::iota(q_indices.begin(), q_indices.end(), 0);
+      std::sort(q_indices.begin(), q_indices.end(),
+                [&options](int64_t left_index, int64_t right_index) {
+                  return options.q[left_index] < options.q[right_index];
+                });
+
+      AdjacentBins bins{0, 0, this->counts[0]};
+      if (is_datapoint) {
+        CType* out_buffer = out_data->template GetMutableValues<CType>(1);
+        for (int64_t i = 0; i < out_length; ++i) {
+          const int64_t q_index = q_indices[i];
+          out_buffer[q_index] = GetQuantileAtDataPoint(
+              in_length, &bins, options.q[q_index], options.interpolation);
+        }
+      } else {
+        double* out_buffer = out_data->template GetMutableValues<double>(1);
+        for (int64_t i = 0; i < out_length; ++i) {
+          const int64_t q_index = q_indices[i];
+          out_buffer[q_index] = GetQuantileByInterp(in_length, &bins, options.q[q_index],
+                                                    options.interpolation);
+        }
+      }
+    }
+
+    *out = Datum(std::move(out_data));
+  }
+
+  // return quantile located exactly at some input data point
+  CType GetQuantileAtDataPoint(int64_t in_length, AdjacentBins* bins, double q,
+                               enum QuantileOptions::Interpolation interpolation) {
+    const uint64_t datapoint_index = QuantileToDataPoint(in_length, q, interpolation);
+    while (datapoint_index >= bins->total_count &&
+           static_cast<size_t>(bins->left_index) < this->counts.size() - 1) {
+      ++bins->left_index;
+      bins->total_count += this->counts[bins->left_index];
+    }
+    DCHECK_LT(datapoint_index, bins->total_count);
+    return static_cast<CType>(bins->left_index + this->min);
+  }
+
+  // return quantile interpolated from adjacent input data points
+  double GetQuantileByInterp(int64_t in_length, AdjacentBins* bins, double q,
+                             enum QuantileOptions::Interpolation interpolation) {
+    const double index = (in_length - 1) * q;
+    const uint64_t index_floor = static_cast<uint64_t>(index);
+    const double fraction = index - index_floor;
+
+    while (index_floor >= bins->total_count &&
+           static_cast<size_t>(bins->left_index) < this->counts.size() - 1) {
+      ++bins->left_index;
+      bins->total_count += this->counts[bins->left_index];
+    }
+    DCHECK_LT(index_floor, bins->total_count);
+    const double lower_value = static_cast<double>(bins->left_index + this->min);
+
+    // quantile lies in this bin, no interpolation needed
+    if (index <= bins->total_count - 1) {
+      return lower_value;
+    }
+
+    // quantile lies across two bins, locate next bin if not already done
+    DCHECK_EQ(index_floor, bins->total_count - 1);
+    if (bins->right_index <= bins->left_index) {
+      bins->right_index = bins->left_index + 1;
+      while (static_cast<size_t>(bins->right_index) < this->counts.size() - 1 &&
+             this->counts[bins->right_index] == 0) {
+        ++bins->right_index;
+      }
+    }
+    DCHECK_LT(static_cast<size_t>(bins->right_index), this->counts.size());
+    DCHECK_GT(this->counts[bins->right_index], 0);
+    const double higher_value = static_cast<double>(bins->right_index + this->min);
+
+    if (interpolation == QuantileOptions::LINEAR) {
+      return fraction * higher_value + (1 - fraction) * lower_value;
+    } else if (interpolation == QuantileOptions::MIDPOINT) {
+      return lower_value / 2 + higher_value / 2;
+    } else {
+      DCHECK(false);
+      return NAN;
+    }
+  }
+};
+
+// histogram or sort approach per value range and size, only for integers
+template <typename InType>
+struct CountOrSortQuantiler {
+  using CType = typename InType::c_type;
+
+  void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    // cross point to benefit from histogram approach
+    // parameters estimated from ad-hoc benchmarks manually
+    static constexpr int kMinArraySize = 65536 * sizeof(int) / sizeof(CType);

Review comment:
       Thanks for pointing this out. I confused memory size and item count. It should be 64K items, regardless of CType.
   Benchmark result at https://docs.google.com/spreadsheets/d/1AmFrdLCIEaC4MaGXUDL5nJPmFuEvjDvsTqV0rDwn4GQ/edit?usp=sharing




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] cyb70289 commented on pull request #9358: ARROW-11425: [C++][Compute] Optimize quantile kernel for integers

Posted by GitBox <gi...@apache.org>.
cyb70289 commented on pull request #9358:
URL: https://github.com/apache/arrow/pull/9358#issuecomment-772311969


   Benchmark updated to include integers with `wide` and `narrow` value range.
   ```
   QuantileKernelInt32Wide/1048576/10000         1566 us         1566 us          447 bytes_per_second=638.552M/s null_percent=0.01 size=1048.58k
   QuantileKernelInt32Wide/1048576/100           1906 us         1906 us          367 bytes_per_second=524.742M/s null_percent=1 size=1048.58k
   QuantileKernelInt32Wide/1048576/10            2714 us         2714 us          258 bytes_per_second=368.515M/s null_percent=10 size=1048.58k
   QuantileKernelInt32Wide/1048576/2             2431 us         2431 us          288 bytes_per_second=411.435M/s null_percent=50 size=1048.58k
   QuantileKernelInt32Wide/1048576/1            0.627 us        0.627 us      1114130 bytes_per_second=1.52029T/s null_percent=100 size=1048.58k
   QuantileKernelInt32Wide/1048576/0             1864 us         1864 us          376 bytes_per_second=536.539M/s null_percent=0 size=1048.58k
   QuantileKernelInt32Narrow/1048576/10000        643 us          643 us         1088 bytes_per_second=1.51936G/s null_percent=0.01 size=1048.58k
   QuantileKernelInt32Narrow/1048576/100          743 us          743 us          945 bytes_per_second=1.31356G/s null_percent=1 size=1048.58k
   QuantileKernelInt32Narrow/1048576/10          1258 us         1258 us          557 bytes_per_second=795.072M/s null_percent=10 size=1048.58k
   QuantileKernelInt32Narrow/1048576/2           1799 us         1799 us          387 bytes_per_second=555.725M/s null_percent=50 size=1048.58k
   QuantileKernelInt32Narrow/1048576/1          0.644 us        0.644 us      1097860 bytes_per_second=1.47984T/s null_percent=100 size=1048.58k
   QuantileKernelInt32Narrow/1048576/0            635 us          635 us         1105 bytes_per_second=1.53909G/s null_percent=0 size=1048.58k
   QuantileKernelInt64Wide/1048576/10000          680 us          680 us         1034 bytes_per_second=1.43618G/s null_percent=0.01 size=1048.58k
   QuantileKernelInt64Wide/1048576/100            905 us          905 us          772 bytes_per_second=1104.74M/s null_percent=1 size=1048.58k
   QuantileKernelInt64Wide/1048576/10            1392 us         1392 us          505 bytes_per_second=718.177M/s null_percent=10 size=1048.58k
   QuantileKernelInt64Wide/1048576/2              892 us          892 us          782 bytes_per_second=1120.52M/s null_percent=50 size=1048.58k
   QuantileKernelInt64Wide/1048576/1            0.608 us        0.608 us      1138627 bytes_per_second=1.56729T/s null_percent=100 size=1048.58k
   QuantileKernelInt64Wide/1048576/0              762 us          762 us          916 bytes_per_second=1.28095G/s null_percent=0 size=1048.58k
   QuantileKernelInt64Narrow/1048576/10000        372 us          372 us         1887 bytes_per_second=2.62659G/s null_percent=0.01 size=1048.58k
   QuantileKernelInt64Narrow/1048576/100          416 us          416 us         1687 bytes_per_second=2.34746G/s null_percent=1 size=1048.58k
   QuantileKernelInt64Narrow/1048576/10           674 us          674 us         1040 bytes_per_second=1.44975G/s null_percent=10 size=1048.58k
   QuantileKernelInt64Narrow/1048576/2            869 us          869 us          807 bytes_per_second=1.12415G/s null_percent=50 size=1048.58k
   QuantileKernelInt64Narrow/1048576/1          0.609 us        0.609 us      1151528 bytes_per_second=1.56662T/s null_percent=100 size=1048.58k
   QuantileKernelInt64Narrow/1048576/0            368 us          368 us         1912 bytes_per_second=2.65377G/s null_percent=0 size=1048.58k
   QuantileKernelDouble/1048576/10000            1134 us         1134 us          614 bytes_per_second=882.09M/s null_percent=0.01 size=1048.58k
   QuantileKernelDouble/1048576/100              1241 us         1241 us          565 bytes_per_second=805.54M/s null_percent=1 size=1048.58k
   QuantileKernelDouble/1048576/10                924 us          924 us          747 bytes_per_second=1082.46M/s null_percent=10 size=1048.58k
   QuantileKernelDouble/1048576/2                 889 us          889 us          791 bytes_per_second=1124.79M/s null_percent=50 size=1048.58k
   QuantileKernelDouble/1048576/1               0.634 us        0.634 us      1127780 bytes_per_second=1.50509T/s null_percent=100 size=1048.58k
   QuantileKernelDouble/1048576/0                1229 us         1229 us          570 bytes_per_second=813.899M/s null_percent=0 size=1048.58k
   ```


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] cyb70289 commented on pull request #9358: ARROW-11425: [C++][Compute] Optimize quantile kernel for integers

Posted by GitBox <gi...@apache.org>.
cyb70289 commented on pull request #9358:
URL: https://github.com/apache/arrow/pull/9358#issuecomment-772353862


   Travis CI passed: https://travis-ci.org/github/cyb70289/arrow/builds/757359819


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] cyb70289 commented on pull request #9358: ARROW-11425: [C++][Compute] Optimize quantile kernel for integers

Posted by GitBox <gi...@apache.org>.
cyb70289 commented on pull request #9358:
URL: https://github.com/apache/arrow/pull/9358#issuecomment-769622477


   Tested on Xeon Gold 5218, clang-9.
   Up to 2x improvements for integers with limited value range, minor drop for 50% nulls.
   
   Before
   ```
   QuantileKernelInt32/1048576/10000        2058 us         2058 us          340 bytes_per_second=485.836M/s null_percent=0.01 size=1048.58k
   QuantileKernelInt32/1048576/100          1279 us         1279 us          551 bytes_per_second=781.752M/s null_percent=1 size=1048.58k
   QuantileKernelInt32/1048576/10           1719 us         1719 us          408 bytes_per_second=581.835M/s null_percent=10 size=1048.58k
   QuantileKernelInt32/1048576/2            1631 us         1631 us          429 bytes_per_second=613.259M/s null_percent=50 size=1048.58k
   QuantileKernelInt32/1048576/1           0.601 us        0.601 us      1164926 bytes_per_second=1.58694T/s null_percent=100 size=1048.58k
   QuantileKernelInt32/1048576/0            1864 us         1864 us          376 bytes_per_second=536.431M/s null_percent=0 size=1048.58k
   
   QuantileKernelInt64/1048576/10000         944 us          944 us          741 bytes_per_second=1059.49M/s null_percent=0.01 size=1048.58k
   QuantileKernelInt64/1048576/100          1189 us         1189 us          594 bytes_per_second=840.822M/s null_percent=1 size=1048.58k
   QuantileKernelInt64/1048576/10            889 us          889 us          786 bytes_per_second=1124.47M/s null_percent=10 size=1048.58k
   QuantileKernelInt64/1048576/2             812 us          812 us          863 bytes_per_second=1.20332G/s null_percent=50 size=1048.58k
   QuantileKernelInt64/1048576/1           0.601 us        0.601 us      1162030 bytes_per_second=1.58732T/s null_percent=100 size=1048.58k
   QuantileKernelInt64/1048576/0             935 us          935 us          749 bytes_per_second=1069.31M/s null_percent=0 size=1048.58k
   
   QuantileKernelDouble/1048576/10000       1116 us         1116 us          629 bytes_per_second=895.788M/s null_percent=0.01 size=1048.58k
   QuantileKernelDouble/1048576/100         1222 us         1222 us          573 bytes_per_second=818.343M/s null_percent=1 size=1048.58k
   QuantileKernelDouble/1048576/10           902 us          902 us          775 bytes_per_second=1108.27M/s null_percent=10 size=1048.58k
   QuantileKernelDouble/1048576/2            878 us          878 us          797 bytes_per_second=1.1117G/s null_percent=50 size=1048.58k
   QuantileKernelDouble/1048576/1          0.617 us        0.617 us      1134788 bytes_per_second=1.54598T/s null_percent=100 size=1048.58k
   QuantileKernelDouble/1048576/0           1214 us         1214 us          577 bytes_per_second=823.777M/s null_percent=0 size=1048.58k
   
   ```
   
   After
   ```
   QuantileKernelInt32/1048576/10000         641 us          641 us         1091 bytes_per_second=1.52317G/s null_percent=0.01 size=1048.58k
   QuantileKernelInt32/1048576/100           745 us          745 us          935 bytes_per_second=1.31037G/s null_percent=1 size=1048.58k
   QuantileKernelInt32/1048576/10           1277 us         1277 us          549 bytes_per_second=783.305M/s null_percent=10 size=1048.58k
   QuantileKernelInt32/1048576/2            1856 us         1856 us          377 bytes_per_second=538.889M/s null_percent=50 size=1048.58k
   QuantileKernelInt32/1048576/1           0.627 us        0.627 us      1074304 bytes_per_second=1.52125T/s null_percent=100 size=1048.58k
   QuantileKernelInt32/1048576/0             632 us          632 us         1109 bytes_per_second=1.54554G/s null_percent=0 size=1048.58k
   
   QuantileKernelInt64/1048576/10000         370 us          370 us         1888 bytes_per_second=2.6384G/s null_percent=0.01 size=1048.58k
   QuantileKernelInt64/1048576/100           414 us          414 us         1681 bytes_per_second=2.36152G/s null_percent=1 size=1048.58k
   QuantileKernelInt64/1048576/10            686 us          686 us         1021 bytes_per_second=1.42442G/s null_percent=10 size=1048.58k
   QuantileKernelInt64/1048576/2             906 us          906 us          774 bytes_per_second=1104.26M/s null_percent=50 size=1048.58k
   QuantileKernelInt64/1048576/1           0.607 us        0.607 us      1155137 bytes_per_second=1.57133T/s null_percent=100 size=1048.58k
   QuantileKernelInt64/1048576/0             364 us          364 us         1920 bytes_per_second=2.6835G/s null_percent=0 size=1048.58k
   
   QuantileKernelDouble/1048576/10000       1115 us         1115 us          629 bytes_per_second=897.137M/s null_percent=0.01 size=1048.58k
   QuantileKernelDouble/1048576/100         1244 us         1244 us          564 bytes_per_second=804.11M/s null_percent=1 size=1048.58k
   QuantileKernelDouble/1048576/10           914 us          914 us          767 bytes_per_second=1093.93M/s null_percent=10 size=1048.58k
   QuantileKernelDouble/1048576/2            900 us          900 us          778 bytes_per_second=1111.09M/s null_percent=50 size=1048.58k
   QuantileKernelDouble/1048576/1          0.618 us        0.618 us      1079708 bytes_per_second=1.54343T/s null_percent=100 size=1048.58k
   QuantileKernelDouble/1048576/0           1221 us         1221 us          573 bytes_per_second=819.062M/s null_percent=0 size=1048.58k
   ```


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou closed pull request #9358: ARROW-11425: [C++][Compute] Optimize quantile kernel for integers

Posted by GitBox <gi...@apache.org>.
pitrou closed pull request #9358:
URL: https://github.com/apache/arrow/pull/9358


   


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] github-actions[bot] commented on pull request #9358: ARROW-11425: [C++][Compute] Optimize quantile kernel for integers

Posted by GitBox <gi...@apache.org>.
github-actions[bot] commented on pull request #9358:
URL: https://github.com/apache/arrow/pull/9358#issuecomment-769620749


   https://issues.apache.org/jira/browse/ARROW-11425


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou commented on a change in pull request #9358: ARROW-11425: [C++][Compute] Optimize quantile kernel for integers

Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #9358:
URL: https://github.com/apache/arrow/pull/9358#discussion_r567881299



##########
File path: cpp/src/arrow/compute/kernels/aggregate_quantile.cc
##########
@@ -239,6 +229,243 @@ struct QuantileExecutor {
   }
 };
 
+// histogram approach with constant memory, only for integers within limited value range
+template <typename InType>
+struct CountQuantiler {
+  using CType = typename InType::c_type;
+
+  CType min;
+  std::vector<uint64_t> counts;  // counts[i]: # of values equals i + min
+
+  // indices to adjacent non-empty bins covering current quantile
+  struct AdjacentBins {
+    int left_index;
+    int right_index;
+    uint64_t total_count;  // accumulated counts till left_index (inclusive)
+  };
+
+  CountQuantiler(CType min, CType max) {
+    uint32_t value_range = static_cast<uint32_t>(max - min) + 1;
+    DCHECK_LT(value_range, 1 << 30);
+    this->min = min;
+    this->counts.resize(value_range);
+  }
+
+  void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    const QuantileOptions& options = QuantileState::Get(ctx);
+
+    // count values in all chunks, ignore nulls
+    const Datum& datum = batch[0];
+    const int64_t in_length = datum.length() - datum.null_count();
+    if (in_length > 0) {
+      for (auto& c : this->counts) c = 0;
+      for (const auto& array : datum.chunks()) {
+        const ArrayData& data = *array->data();
+        const CType* values = data.GetValues<CType>(1);
+        VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
+                            [&](int64_t pos, int64_t len) {
+                              for (int64_t i = 0; i < len; ++i) {
+                                ++this->counts[values[pos + i] - this->min];
+                              }
+                            });
+      }
+    }
+
+    // prepare out array
+    int64_t out_length = options.q.size();
+    if (in_length == 0) {
+      out_length = 0;  // input is empty or only contains null, return empty array
+    }
+    // out type depends on options
+    const bool is_datapoint = IsDataPoint(options);
+    const std::shared_ptr<DataType> out_type =
+        is_datapoint ? TypeTraits<InType>::type_singleton() : float64();
+    auto out_data = ArrayData::Make(out_type, out_length, 0);
+    out_data->buffers.resize(2, nullptr);
+
+    // calculate quantiles
+    if (out_length > 0) {
+      const auto out_bit_width = checked_pointer_cast<NumberType>(out_type)->bit_width();
+      KERNEL_ASSIGN_OR_RAISE(out_data->buffers[1], ctx,
+                             ctx->Allocate(out_length * out_bit_width / 8));
+
+      // find quantiles in ascending order
+      std::vector<int64_t> q_indices(out_length);
+      std::iota(q_indices.begin(), q_indices.end(), 0);
+      std::sort(q_indices.begin(), q_indices.end(),
+                [&options](int64_t left_index, int64_t right_index) {
+                  return options.q[left_index] < options.q[right_index];
+                });
+
+      AdjacentBins bins{0, 0, this->counts[0]};
+      if (is_datapoint) {
+        CType* out_buffer = out_data->template GetMutableValues<CType>(1);
+        for (int64_t i = 0; i < out_length; ++i) {
+          const int64_t q_index = q_indices[i];
+          out_buffer[q_index] = GetQuantileAtDataPoint(
+              in_length, &bins, options.q[q_index], options.interpolation);
+        }
+      } else {
+        double* out_buffer = out_data->template GetMutableValues<double>(1);
+        for (int64_t i = 0; i < out_length; ++i) {
+          const int64_t q_index = q_indices[i];
+          out_buffer[q_index] = GetQuantileByInterp(in_length, &bins, options.q[q_index],
+                                                    options.interpolation);
+        }
+      }
+    }
+
+    *out = Datum(std::move(out_data));
+  }
+
+  // return quantile located exactly at some input data point
+  CType GetQuantileAtDataPoint(int64_t in_length, AdjacentBins* bins, double q,
+                               enum QuantileOptions::Interpolation interpolation) {
+    const uint64_t datapoint_index = QuantileToDataPoint(in_length, q, interpolation);
+    while (datapoint_index >= bins->total_count &&
+           static_cast<size_t>(bins->left_index) < this->counts.size() - 1) {
+      ++bins->left_index;
+      bins->total_count += this->counts[bins->left_index];
+    }
+    DCHECK_LT(datapoint_index, bins->total_count);
+    return static_cast<CType>(bins->left_index + this->min);
+  }
+
+  // return quantile interpolated from adjacent input data points
+  double GetQuantileByInterp(int64_t in_length, AdjacentBins* bins, double q,
+                             enum QuantileOptions::Interpolation interpolation) {
+    const double index = (in_length - 1) * q;
+    const uint64_t index_floor = static_cast<uint64_t>(index);
+    const double fraction = index - index_floor;
+
+    while (index_floor >= bins->total_count &&
+           static_cast<size_t>(bins->left_index) < this->counts.size() - 1) {
+      ++bins->left_index;
+      bins->total_count += this->counts[bins->left_index];
+    }
+    DCHECK_LT(index_floor, bins->total_count);
+    const double lower_value = static_cast<double>(bins->left_index + this->min);
+
+    // quantile lies in this bin, no interpolation needed
+    if (index <= bins->total_count - 1) {
+      return lower_value;
+    }
+
+    // quantile lies across two bins, locate next bin if not already done
+    DCHECK_EQ(index_floor, bins->total_count - 1);
+    if (bins->right_index <= bins->left_index) {
+      bins->right_index = bins->left_index + 1;
+      while (static_cast<size_t>(bins->right_index) < this->counts.size() - 1 &&
+             this->counts[bins->right_index] == 0) {
+        ++bins->right_index;
+      }
+    }
+    DCHECK_LT(static_cast<size_t>(bins->right_index), this->counts.size());
+    DCHECK_GT(this->counts[bins->right_index], 0);
+    const double higher_value = static_cast<double>(bins->right_index + this->min);
+
+    if (interpolation == QuantileOptions::LINEAR) {
+      return fraction * higher_value + (1 - fraction) * lower_value;
+    } else if (interpolation == QuantileOptions::MIDPOINT) {
+      return lower_value / 2 + higher_value / 2;
+    } else {
+      DCHECK(false);
+      return NAN;
+    }
+  }
+};
+
+// histogram or sort approach per value range and size, only for integers
+template <typename InType>
+struct CountOrSortQuantiler {
+  using CType = typename InType::c_type;
+
+  void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    // cross point to benefit from histogram approach
+    // parameters estimated from ad-hoc benchmarks manually
+    static constexpr int kMinArraySize = 65536 * sizeof(int) / sizeof(CType);

Review comment:
       I'm not sure why the heuristic would be related to `sizeof(CType)`?

##########
File path: cpp/src/arrow/compute/kernels/aggregate_quantile.cc
##########
@@ -239,6 +229,243 @@ struct QuantileExecutor {
   }
 };
 
+// histogram approach with constant memory, only for integers within limited value range
+template <typename InType>
+struct CountQuantiler {
+  using CType = typename InType::c_type;
+
+  CType min;
+  std::vector<uint64_t> counts;  // counts[i]: # of values equals i + min
+
+  // indices to adjacent non-empty bins covering current quantile
+  struct AdjacentBins {
+    int left_index;
+    int right_index;
+    uint64_t total_count;  // accumulated counts till left_index (inclusive)
+  };
+
+  CountQuantiler(CType min, CType max) {
+    uint32_t value_range = static_cast<uint32_t>(max - min) + 1;
+    DCHECK_LT(value_range, 1 << 30);
+    this->min = min;
+    this->counts.resize(value_range);
+  }
+
+  void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    const QuantileOptions& options = QuantileState::Get(ctx);
+
+    // count values in all chunks, ignore nulls
+    const Datum& datum = batch[0];
+    const int64_t in_length = datum.length() - datum.null_count();
+    if (in_length > 0) {
+      for (auto& c : this->counts) c = 0;
+      for (const auto& array : datum.chunks()) {
+        const ArrayData& data = *array->data();
+        const CType* values = data.GetValues<CType>(1);
+        VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
+                            [&](int64_t pos, int64_t len) {
+                              for (int64_t i = 0; i < len; ++i) {
+                                ++this->counts[values[pos + i] - this->min];
+                              }
+                            });
+      }
+    }
+
+    // prepare out array
+    int64_t out_length = options.q.size();
+    if (in_length == 0) {
+      out_length = 0;  // input is empty or only contains null, return empty array
+    }
+    // out type depends on options
+    const bool is_datapoint = IsDataPoint(options);
+    const std::shared_ptr<DataType> out_type =
+        is_datapoint ? TypeTraits<InType>::type_singleton() : float64();
+    auto out_data = ArrayData::Make(out_type, out_length, 0);
+    out_data->buffers.resize(2, nullptr);
+
+    // calculate quantiles
+    if (out_length > 0) {
+      const auto out_bit_width = checked_pointer_cast<NumberType>(out_type)->bit_width();
+      KERNEL_ASSIGN_OR_RAISE(out_data->buffers[1], ctx,
+                             ctx->Allocate(out_length * out_bit_width / 8));
+
+      // find quantiles in ascending order
+      std::vector<int64_t> q_indices(out_length);
+      std::iota(q_indices.begin(), q_indices.end(), 0);
+      std::sort(q_indices.begin(), q_indices.end(),
+                [&options](int64_t left_index, int64_t right_index) {
+                  return options.q[left_index] < options.q[right_index];
+                });
+
+      AdjacentBins bins{0, 0, this->counts[0]};
+      if (is_datapoint) {
+        CType* out_buffer = out_data->template GetMutableValues<CType>(1);
+        for (int64_t i = 0; i < out_length; ++i) {
+          const int64_t q_index = q_indices[i];
+          out_buffer[q_index] = GetQuantileAtDataPoint(
+              in_length, &bins, options.q[q_index], options.interpolation);
+        }
+      } else {
+        double* out_buffer = out_data->template GetMutableValues<double>(1);
+        for (int64_t i = 0; i < out_length; ++i) {
+          const int64_t q_index = q_indices[i];
+          out_buffer[q_index] = GetQuantileByInterp(in_length, &bins, options.q[q_index],
+                                                    options.interpolation);
+        }
+      }
+    }
+
+    *out = Datum(std::move(out_data));
+  }
+
+  // return quantile located exactly at some input data point
+  CType GetQuantileAtDataPoint(int64_t in_length, AdjacentBins* bins, double q,
+                               enum QuantileOptions::Interpolation interpolation) {
+    const uint64_t datapoint_index = QuantileToDataPoint(in_length, q, interpolation);
+    while (datapoint_index >= bins->total_count &&
+           static_cast<size_t>(bins->left_index) < this->counts.size() - 1) {
+      ++bins->left_index;
+      bins->total_count += this->counts[bins->left_index];
+    }
+    DCHECK_LT(datapoint_index, bins->total_count);
+    return static_cast<CType>(bins->left_index + this->min);
+  }
+
+  // return quantile interpolated from adjacent input data points
+  double GetQuantileByInterp(int64_t in_length, AdjacentBins* bins, double q,
+                             enum QuantileOptions::Interpolation interpolation) {
+    const double index = (in_length - 1) * q;
+    const uint64_t index_floor = static_cast<uint64_t>(index);
+    const double fraction = index - index_floor;
+
+    while (index_floor >= bins->total_count &&
+           static_cast<size_t>(bins->left_index) < this->counts.size() - 1) {
+      ++bins->left_index;
+      bins->total_count += this->counts[bins->left_index];
+    }
+    DCHECK_LT(index_floor, bins->total_count);
+    const double lower_value = static_cast<double>(bins->left_index + this->min);
+
+    // quantile lies in this bin, no interpolation needed
+    if (index <= bins->total_count - 1) {
+      return lower_value;
+    }
+
+    // quantile lies across two bins, locate next bin if not already done
+    DCHECK_EQ(index_floor, bins->total_count - 1);
+    if (bins->right_index <= bins->left_index) {
+      bins->right_index = bins->left_index + 1;
+      while (static_cast<size_t>(bins->right_index) < this->counts.size() - 1 &&
+             this->counts[bins->right_index] == 0) {
+        ++bins->right_index;
+      }
+    }
+    DCHECK_LT(static_cast<size_t>(bins->right_index), this->counts.size());
+    DCHECK_GT(this->counts[bins->right_index], 0);
+    const double higher_value = static_cast<double>(bins->right_index + this->min);
+
+    if (interpolation == QuantileOptions::LINEAR) {
+      return fraction * higher_value + (1 - fraction) * lower_value;
+    } else if (interpolation == QuantileOptions::MIDPOINT) {
+      return lower_value / 2 + higher_value / 2;
+    } else {
+      DCHECK(false);
+      return NAN;
+    }
+  }
+};
+
+// histogram or sort approach per value range and size, only for integers
+template <typename InType>
+struct CountOrSortQuantiler {
+  using CType = typename InType::c_type;
+
+  void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    // cross point to benefit from histogram approach
+    // parameters estimated from ad-hoc benchmarks manually
+    static constexpr int kMinArraySize = 65536 * sizeof(int) / sizeof(CType);
+    static constexpr int kMaxValueRange = 65536;
+
+    const Datum& datum = batch[0];
+    if (datum.length() - datum.null_count() >= kMinArraySize) {

Review comment:
       Note that for int16/uint16, the max value range should always be satisfied.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] cyb70289 commented on pull request #9358: ARROW-11425: [C++][Compute] Optimize quantile kernel for integers

Posted by GitBox <gi...@apache.org>.
cyb70289 commented on pull request #9358:
URL: https://github.com/apache/arrow/pull/9358#issuecomment-769693358


   MinGW64 CI error is unrelated.
   Travis CI passed in my personal repo https://travis-ci.org/github/cyb70289/arrow/builds/756695234


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org