You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/01/29 07:02:21 UTC
[GitHub] [arrow] cyb70289 opened a new pull request #9358: ARROW-11425: [C++][Compute] Optimize quantile kernel for integers
cyb70289 opened a new pull request #9358:
URL: https://github.com/apache/arrow/pull/9358
For integers within limited value range, a histogram approach is useful
to reduce memory footprint and improve performance.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] cyb70289 commented on a change in pull request #9358: ARROW-11425: [C++][Compute] Optimize quantile kernel for integers
Posted by GitBox <gi...@apache.org>.
cyb70289 commented on a change in pull request #9358:
URL: https://github.com/apache/arrow/pull/9358#discussion_r569193045
##########
File path: cpp/src/arrow/compute/kernels/aggregate_quantile.cc
##########
@@ -239,6 +229,243 @@ struct QuantileExecutor {
}
};
+// histogram approach with constant memory, only for integers within limited value range
+template <typename InType>
+struct CountQuantiler {
+ using CType = typename InType::c_type;
+
+ CType min;
+ std::vector<uint64_t> counts; // counts[i]: # of values equals i + min
+
+ // indices to adjacent non-empty bins covering current quantile
+ struct AdjacentBins {
+ int left_index;
+ int right_index;
+ uint64_t total_count; // accumulated counts till left_index (inclusive)
+ };
+
+ CountQuantiler(CType min, CType max) {
+ uint32_t value_range = static_cast<uint32_t>(max - min) + 1;
+ DCHECK_LT(value_range, 1 << 30);
+ this->min = min;
+ this->counts.resize(value_range);
+ }
+
+ void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const QuantileOptions& options = QuantileState::Get(ctx);
+
+ // count values in all chunks, ignore nulls
+ const Datum& datum = batch[0];
+ const int64_t in_length = datum.length() - datum.null_count();
+ if (in_length > 0) {
+ for (auto& c : this->counts) c = 0;
+ for (const auto& array : datum.chunks()) {
+ const ArrayData& data = *array->data();
+ const CType* values = data.GetValues<CType>(1);
+ VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
+ [&](int64_t pos, int64_t len) {
+ for (int64_t i = 0; i < len; ++i) {
+ ++this->counts[values[pos + i] - this->min];
+ }
+ });
+ }
+ }
+
+ // prepare out array
+ int64_t out_length = options.q.size();
+ if (in_length == 0) {
+ out_length = 0; // input is empty or only contains null, return empty array
+ }
+ // out type depends on options
+ const bool is_datapoint = IsDataPoint(options);
+ const std::shared_ptr<DataType> out_type =
+ is_datapoint ? TypeTraits<InType>::type_singleton() : float64();
+ auto out_data = ArrayData::Make(out_type, out_length, 0);
+ out_data->buffers.resize(2, nullptr);
+
+ // calculate quantiles
+ if (out_length > 0) {
+ const auto out_bit_width = checked_pointer_cast<NumberType>(out_type)->bit_width();
+ KERNEL_ASSIGN_OR_RAISE(out_data->buffers[1], ctx,
+ ctx->Allocate(out_length * out_bit_width / 8));
+
+ // find quantiles in ascending order
+ std::vector<int64_t> q_indices(out_length);
+ std::iota(q_indices.begin(), q_indices.end(), 0);
+ std::sort(q_indices.begin(), q_indices.end(),
+ [&options](int64_t left_index, int64_t right_index) {
+ return options.q[left_index] < options.q[right_index];
+ });
+
+ AdjacentBins bins{0, 0, this->counts[0]};
+ if (is_datapoint) {
+ CType* out_buffer = out_data->template GetMutableValues<CType>(1);
+ for (int64_t i = 0; i < out_length; ++i) {
+ const int64_t q_index = q_indices[i];
+ out_buffer[q_index] = GetQuantileAtDataPoint(
+ in_length, &bins, options.q[q_index], options.interpolation);
+ }
+ } else {
+ double* out_buffer = out_data->template GetMutableValues<double>(1);
+ for (int64_t i = 0; i < out_length; ++i) {
+ const int64_t q_index = q_indices[i];
+ out_buffer[q_index] = GetQuantileByInterp(in_length, &bins, options.q[q_index],
+ options.interpolation);
+ }
+ }
+ }
+
+ *out = Datum(std::move(out_data));
+ }
+
+ // return quantile located exactly at some input data point
+ CType GetQuantileAtDataPoint(int64_t in_length, AdjacentBins* bins, double q,
+ enum QuantileOptions::Interpolation interpolation) {
+ const uint64_t datapoint_index = QuantileToDataPoint(in_length, q, interpolation);
+ while (datapoint_index >= bins->total_count &&
+ static_cast<size_t>(bins->left_index) < this->counts.size() - 1) {
+ ++bins->left_index;
+ bins->total_count += this->counts[bins->left_index];
+ }
+ DCHECK_LT(datapoint_index, bins->total_count);
+ return static_cast<CType>(bins->left_index + this->min);
+ }
+
+ // return quantile interpolated from adjacent input data points
+ double GetQuantileByInterp(int64_t in_length, AdjacentBins* bins, double q,
+ enum QuantileOptions::Interpolation interpolation) {
+ const double index = (in_length - 1) * q;
+ const uint64_t index_floor = static_cast<uint64_t>(index);
+ const double fraction = index - index_floor;
+
+ while (index_floor >= bins->total_count &&
+ static_cast<size_t>(bins->left_index) < this->counts.size() - 1) {
+ ++bins->left_index;
+ bins->total_count += this->counts[bins->left_index];
+ }
+ DCHECK_LT(index_floor, bins->total_count);
+ const double lower_value = static_cast<double>(bins->left_index + this->min);
+
+ // quantile lies in this bin, no interpolation needed
+ if (index <= bins->total_count - 1) {
+ return lower_value;
+ }
+
+ // quantile lies across two bins, locate next bin if not already done
+ DCHECK_EQ(index_floor, bins->total_count - 1);
+ if (bins->right_index <= bins->left_index) {
+ bins->right_index = bins->left_index + 1;
+ while (static_cast<size_t>(bins->right_index) < this->counts.size() - 1 &&
+ this->counts[bins->right_index] == 0) {
+ ++bins->right_index;
+ }
+ }
+ DCHECK_LT(static_cast<size_t>(bins->right_index), this->counts.size());
+ DCHECK_GT(this->counts[bins->right_index], 0);
+ const double higher_value = static_cast<double>(bins->right_index + this->min);
+
+ if (interpolation == QuantileOptions::LINEAR) {
+ return fraction * higher_value + (1 - fraction) * lower_value;
+ } else if (interpolation == QuantileOptions::MIDPOINT) {
+ return lower_value / 2 + higher_value / 2;
+ } else {
+ DCHECK(false);
+ return NAN;
+ }
+ }
+};
+
+// histogram or sort approach per value range and size, only for integers
+template <typename InType>
+struct CountOrSortQuantiler {
+ using CType = typename InType::c_type;
+
+ void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ // cross point to benefit from histogram approach
+ // parameters estimated from ad-hoc benchmarks manually
+ static constexpr int kMinArraySize = 65536 * sizeof(int) / sizeof(CType);
+ static constexpr int kMaxValueRange = 65536;
+
+ const Datum& datum = batch[0];
+ if (datum.length() - datum.null_count() >= kMinArraySize) {
Review comment:
Yes. It looks still desirable to find min/max as it may not cover 64k range.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] cyb70289 commented on a change in pull request #9358: ARROW-11425: [C++][Compute] Optimize quantile kernel for integers
Posted by GitBox <gi...@apache.org>.
cyb70289 commented on a change in pull request #9358:
URL: https://github.com/apache/arrow/pull/9358#discussion_r569153961
##########
File path: cpp/src/arrow/compute/kernels/aggregate_quantile.cc
##########
@@ -239,6 +229,243 @@ struct QuantileExecutor {
}
};
+// histogram approach with constant memory, only for integers within limited value range
+template <typename InType>
+struct CountQuantiler {
+ using CType = typename InType::c_type;
+
+ CType min;
+ std::vector<uint64_t> counts; // counts[i]: # of values equals i + min
+
+ // indices to adjacent non-empty bins covering current quantile
+ struct AdjacentBins {
+ int left_index;
+ int right_index;
+ uint64_t total_count; // accumulated counts till left_index (inclusive)
+ };
+
+ CountQuantiler(CType min, CType max) {
+ uint32_t value_range = static_cast<uint32_t>(max - min) + 1;
+ DCHECK_LT(value_range, 1 << 30);
+ this->min = min;
+ this->counts.resize(value_range);
+ }
+
+ void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const QuantileOptions& options = QuantileState::Get(ctx);
+
+ // count values in all chunks, ignore nulls
+ const Datum& datum = batch[0];
+ const int64_t in_length = datum.length() - datum.null_count();
+ if (in_length > 0) {
+ for (auto& c : this->counts) c = 0;
+ for (const auto& array : datum.chunks()) {
+ const ArrayData& data = *array->data();
+ const CType* values = data.GetValues<CType>(1);
+ VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
+ [&](int64_t pos, int64_t len) {
+ for (int64_t i = 0; i < len; ++i) {
+ ++this->counts[values[pos + i] - this->min];
+ }
+ });
+ }
+ }
+
+ // prepare out array
+ int64_t out_length = options.q.size();
+ if (in_length == 0) {
+ out_length = 0; // input is empty or only contains null, return empty array
+ }
+ // out type depends on options
+ const bool is_datapoint = IsDataPoint(options);
+ const std::shared_ptr<DataType> out_type =
+ is_datapoint ? TypeTraits<InType>::type_singleton() : float64();
+ auto out_data = ArrayData::Make(out_type, out_length, 0);
+ out_data->buffers.resize(2, nullptr);
+
+ // calculate quantiles
+ if (out_length > 0) {
+ const auto out_bit_width = checked_pointer_cast<NumberType>(out_type)->bit_width();
+ KERNEL_ASSIGN_OR_RAISE(out_data->buffers[1], ctx,
+ ctx->Allocate(out_length * out_bit_width / 8));
+
+ // find quantiles in ascending order
+ std::vector<int64_t> q_indices(out_length);
+ std::iota(q_indices.begin(), q_indices.end(), 0);
+ std::sort(q_indices.begin(), q_indices.end(),
+ [&options](int64_t left_index, int64_t right_index) {
+ return options.q[left_index] < options.q[right_index];
+ });
+
+ AdjacentBins bins{0, 0, this->counts[0]};
+ if (is_datapoint) {
+ CType* out_buffer = out_data->template GetMutableValues<CType>(1);
+ for (int64_t i = 0; i < out_length; ++i) {
+ const int64_t q_index = q_indices[i];
+ out_buffer[q_index] = GetQuantileAtDataPoint(
+ in_length, &bins, options.q[q_index], options.interpolation);
+ }
+ } else {
+ double* out_buffer = out_data->template GetMutableValues<double>(1);
+ for (int64_t i = 0; i < out_length; ++i) {
+ const int64_t q_index = q_indices[i];
+ out_buffer[q_index] = GetQuantileByInterp(in_length, &bins, options.q[q_index],
+ options.interpolation);
+ }
+ }
+ }
+
+ *out = Datum(std::move(out_data));
+ }
+
+ // return quantile located exactly at some input data point
+ CType GetQuantileAtDataPoint(int64_t in_length, AdjacentBins* bins, double q,
+ enum QuantileOptions::Interpolation interpolation) {
+ const uint64_t datapoint_index = QuantileToDataPoint(in_length, q, interpolation);
+ while (datapoint_index >= bins->total_count &&
+ static_cast<size_t>(bins->left_index) < this->counts.size() - 1) {
+ ++bins->left_index;
+ bins->total_count += this->counts[bins->left_index];
+ }
+ DCHECK_LT(datapoint_index, bins->total_count);
+ return static_cast<CType>(bins->left_index + this->min);
+ }
+
+ // return quantile interpolated from adjacent input data points
+ double GetQuantileByInterp(int64_t in_length, AdjacentBins* bins, double q,
+ enum QuantileOptions::Interpolation interpolation) {
+ const double index = (in_length - 1) * q;
+ const uint64_t index_floor = static_cast<uint64_t>(index);
+ const double fraction = index - index_floor;
+
+ while (index_floor >= bins->total_count &&
+ static_cast<size_t>(bins->left_index) < this->counts.size() - 1) {
+ ++bins->left_index;
+ bins->total_count += this->counts[bins->left_index];
+ }
+ DCHECK_LT(index_floor, bins->total_count);
+ const double lower_value = static_cast<double>(bins->left_index + this->min);
+
+ // quantile lies in this bin, no interpolation needed
+ if (index <= bins->total_count - 1) {
+ return lower_value;
+ }
+
+ // quantile lies across two bins, locate next bin if not already done
+ DCHECK_EQ(index_floor, bins->total_count - 1);
+ if (bins->right_index <= bins->left_index) {
+ bins->right_index = bins->left_index + 1;
+ while (static_cast<size_t>(bins->right_index) < this->counts.size() - 1 &&
+ this->counts[bins->right_index] == 0) {
+ ++bins->right_index;
+ }
+ }
+ DCHECK_LT(static_cast<size_t>(bins->right_index), this->counts.size());
+ DCHECK_GT(this->counts[bins->right_index], 0);
+ const double higher_value = static_cast<double>(bins->right_index + this->min);
+
+ if (interpolation == QuantileOptions::LINEAR) {
+ return fraction * higher_value + (1 - fraction) * lower_value;
+ } else if (interpolation == QuantileOptions::MIDPOINT) {
+ return lower_value / 2 + higher_value / 2;
+ } else {
+ DCHECK(false);
+ return NAN;
+ }
+ }
+};
+
+// histogram or sort approach per value range and size, only for integers
+template <typename InType>
+struct CountOrSortQuantiler {
+ using CType = typename InType::c_type;
+
+ void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ // cross point to benefit from histogram approach
+ // parameters estimated from ad-hoc benchmarks manually
+ static constexpr int kMinArraySize = 65536 * sizeof(int) / sizeof(CType);
Review comment:
Thanks for pointing this out. I confused memory size and item count. It should be 64K items, regardless of CType.
Benchmark result at https://docs.google.com/spreadsheets/d/1AmFrdLCIEaC4MaGXUDL5nJPmFuEvjDvsTqV0rDwn4GQ/edit?usp=sharing
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] cyb70289 commented on pull request #9358: ARROW-11425: [C++][Compute] Optimize quantile kernel for integers
Posted by GitBox <gi...@apache.org>.
cyb70289 commented on pull request #9358:
URL: https://github.com/apache/arrow/pull/9358#issuecomment-772311969
Benchmark updated to include integers with `wide` and `narrow` value range.
```
QuantileKernelInt32Wide/1048576/10000 1566 us 1566 us 447 bytes_per_second=638.552M/s null_percent=0.01 size=1048.58k
QuantileKernelInt32Wide/1048576/100 1906 us 1906 us 367 bytes_per_second=524.742M/s null_percent=1 size=1048.58k
QuantileKernelInt32Wide/1048576/10 2714 us 2714 us 258 bytes_per_second=368.515M/s null_percent=10 size=1048.58k
QuantileKernelInt32Wide/1048576/2 2431 us 2431 us 288 bytes_per_second=411.435M/s null_percent=50 size=1048.58k
QuantileKernelInt32Wide/1048576/1 0.627 us 0.627 us 1114130 bytes_per_second=1.52029T/s null_percent=100 size=1048.58k
QuantileKernelInt32Wide/1048576/0 1864 us 1864 us 376 bytes_per_second=536.539M/s null_percent=0 size=1048.58k
QuantileKernelInt32Narrow/1048576/10000 643 us 643 us 1088 bytes_per_second=1.51936G/s null_percent=0.01 size=1048.58k
QuantileKernelInt32Narrow/1048576/100 743 us 743 us 945 bytes_per_second=1.31356G/s null_percent=1 size=1048.58k
QuantileKernelInt32Narrow/1048576/10 1258 us 1258 us 557 bytes_per_second=795.072M/s null_percent=10 size=1048.58k
QuantileKernelInt32Narrow/1048576/2 1799 us 1799 us 387 bytes_per_second=555.725M/s null_percent=50 size=1048.58k
QuantileKernelInt32Narrow/1048576/1 0.644 us 0.644 us 1097860 bytes_per_second=1.47984T/s null_percent=100 size=1048.58k
QuantileKernelInt32Narrow/1048576/0 635 us 635 us 1105 bytes_per_second=1.53909G/s null_percent=0 size=1048.58k
QuantileKernelInt64Wide/1048576/10000 680 us 680 us 1034 bytes_per_second=1.43618G/s null_percent=0.01 size=1048.58k
QuantileKernelInt64Wide/1048576/100 905 us 905 us 772 bytes_per_second=1104.74M/s null_percent=1 size=1048.58k
QuantileKernelInt64Wide/1048576/10 1392 us 1392 us 505 bytes_per_second=718.177M/s null_percent=10 size=1048.58k
QuantileKernelInt64Wide/1048576/2 892 us 892 us 782 bytes_per_second=1120.52M/s null_percent=50 size=1048.58k
QuantileKernelInt64Wide/1048576/1 0.608 us 0.608 us 1138627 bytes_per_second=1.56729T/s null_percent=100 size=1048.58k
QuantileKernelInt64Wide/1048576/0 762 us 762 us 916 bytes_per_second=1.28095G/s null_percent=0 size=1048.58k
QuantileKernelInt64Narrow/1048576/10000 372 us 372 us 1887 bytes_per_second=2.62659G/s null_percent=0.01 size=1048.58k
QuantileKernelInt64Narrow/1048576/100 416 us 416 us 1687 bytes_per_second=2.34746G/s null_percent=1 size=1048.58k
QuantileKernelInt64Narrow/1048576/10 674 us 674 us 1040 bytes_per_second=1.44975G/s null_percent=10 size=1048.58k
QuantileKernelInt64Narrow/1048576/2 869 us 869 us 807 bytes_per_second=1.12415G/s null_percent=50 size=1048.58k
QuantileKernelInt64Narrow/1048576/1 0.609 us 0.609 us 1151528 bytes_per_second=1.56662T/s null_percent=100 size=1048.58k
QuantileKernelInt64Narrow/1048576/0 368 us 368 us 1912 bytes_per_second=2.65377G/s null_percent=0 size=1048.58k
QuantileKernelDouble/1048576/10000 1134 us 1134 us 614 bytes_per_second=882.09M/s null_percent=0.01 size=1048.58k
QuantileKernelDouble/1048576/100 1241 us 1241 us 565 bytes_per_second=805.54M/s null_percent=1 size=1048.58k
QuantileKernelDouble/1048576/10 924 us 924 us 747 bytes_per_second=1082.46M/s null_percent=10 size=1048.58k
QuantileKernelDouble/1048576/2 889 us 889 us 791 bytes_per_second=1124.79M/s null_percent=50 size=1048.58k
QuantileKernelDouble/1048576/1 0.634 us 0.634 us 1127780 bytes_per_second=1.50509T/s null_percent=100 size=1048.58k
QuantileKernelDouble/1048576/0 1229 us 1229 us 570 bytes_per_second=813.899M/s null_percent=0 size=1048.58k
```
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] cyb70289 commented on pull request #9358: ARROW-11425: [C++][Compute] Optimize quantile kernel for integers
Posted by GitBox <gi...@apache.org>.
cyb70289 commented on pull request #9358:
URL: https://github.com/apache/arrow/pull/9358#issuecomment-772353862
Travis CI passed: https://travis-ci.org/github/cyb70289/arrow/builds/757359819
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] cyb70289 commented on pull request #9358: ARROW-11425: [C++][Compute] Optimize quantile kernel for integers
Posted by GitBox <gi...@apache.org>.
cyb70289 commented on pull request #9358:
URL: https://github.com/apache/arrow/pull/9358#issuecomment-769622477
Tested on Xeon Gold 5218, clang-9.
Up to 2x improvements for integers with limited value range, minor drop for 50% nulls.
Before
```
QuantileKernelInt32/1048576/10000 2058 us 2058 us 340 bytes_per_second=485.836M/s null_percent=0.01 size=1048.58k
QuantileKernelInt32/1048576/100 1279 us 1279 us 551 bytes_per_second=781.752M/s null_percent=1 size=1048.58k
QuantileKernelInt32/1048576/10 1719 us 1719 us 408 bytes_per_second=581.835M/s null_percent=10 size=1048.58k
QuantileKernelInt32/1048576/2 1631 us 1631 us 429 bytes_per_second=613.259M/s null_percent=50 size=1048.58k
QuantileKernelInt32/1048576/1 0.601 us 0.601 us 1164926 bytes_per_second=1.58694T/s null_percent=100 size=1048.58k
QuantileKernelInt32/1048576/0 1864 us 1864 us 376 bytes_per_second=536.431M/s null_percent=0 size=1048.58k
QuantileKernelInt64/1048576/10000 944 us 944 us 741 bytes_per_second=1059.49M/s null_percent=0.01 size=1048.58k
QuantileKernelInt64/1048576/100 1189 us 1189 us 594 bytes_per_second=840.822M/s null_percent=1 size=1048.58k
QuantileKernelInt64/1048576/10 889 us 889 us 786 bytes_per_second=1124.47M/s null_percent=10 size=1048.58k
QuantileKernelInt64/1048576/2 812 us 812 us 863 bytes_per_second=1.20332G/s null_percent=50 size=1048.58k
QuantileKernelInt64/1048576/1 0.601 us 0.601 us 1162030 bytes_per_second=1.58732T/s null_percent=100 size=1048.58k
QuantileKernelInt64/1048576/0 935 us 935 us 749 bytes_per_second=1069.31M/s null_percent=0 size=1048.58k
QuantileKernelDouble/1048576/10000 1116 us 1116 us 629 bytes_per_second=895.788M/s null_percent=0.01 size=1048.58k
QuantileKernelDouble/1048576/100 1222 us 1222 us 573 bytes_per_second=818.343M/s null_percent=1 size=1048.58k
QuantileKernelDouble/1048576/10 902 us 902 us 775 bytes_per_second=1108.27M/s null_percent=10 size=1048.58k
QuantileKernelDouble/1048576/2 878 us 878 us 797 bytes_per_second=1.1117G/s null_percent=50 size=1048.58k
QuantileKernelDouble/1048576/1 0.617 us 0.617 us 1134788 bytes_per_second=1.54598T/s null_percent=100 size=1048.58k
QuantileKernelDouble/1048576/0 1214 us 1214 us 577 bytes_per_second=823.777M/s null_percent=0 size=1048.58k
```
After
```
QuantileKernelInt32/1048576/10000 641 us 641 us 1091 bytes_per_second=1.52317G/s null_percent=0.01 size=1048.58k
QuantileKernelInt32/1048576/100 745 us 745 us 935 bytes_per_second=1.31037G/s null_percent=1 size=1048.58k
QuantileKernelInt32/1048576/10 1277 us 1277 us 549 bytes_per_second=783.305M/s null_percent=10 size=1048.58k
QuantileKernelInt32/1048576/2 1856 us 1856 us 377 bytes_per_second=538.889M/s null_percent=50 size=1048.58k
QuantileKernelInt32/1048576/1 0.627 us 0.627 us 1074304 bytes_per_second=1.52125T/s null_percent=100 size=1048.58k
QuantileKernelInt32/1048576/0 632 us 632 us 1109 bytes_per_second=1.54554G/s null_percent=0 size=1048.58k
QuantileKernelInt64/1048576/10000 370 us 370 us 1888 bytes_per_second=2.6384G/s null_percent=0.01 size=1048.58k
QuantileKernelInt64/1048576/100 414 us 414 us 1681 bytes_per_second=2.36152G/s null_percent=1 size=1048.58k
QuantileKernelInt64/1048576/10 686 us 686 us 1021 bytes_per_second=1.42442G/s null_percent=10 size=1048.58k
QuantileKernelInt64/1048576/2 906 us 906 us 774 bytes_per_second=1104.26M/s null_percent=50 size=1048.58k
QuantileKernelInt64/1048576/1 0.607 us 0.607 us 1155137 bytes_per_second=1.57133T/s null_percent=100 size=1048.58k
QuantileKernelInt64/1048576/0 364 us 364 us 1920 bytes_per_second=2.6835G/s null_percent=0 size=1048.58k
QuantileKernelDouble/1048576/10000 1115 us 1115 us 629 bytes_per_second=897.137M/s null_percent=0.01 size=1048.58k
QuantileKernelDouble/1048576/100 1244 us 1244 us 564 bytes_per_second=804.11M/s null_percent=1 size=1048.58k
QuantileKernelDouble/1048576/10 914 us 914 us 767 bytes_per_second=1093.93M/s null_percent=10 size=1048.58k
QuantileKernelDouble/1048576/2 900 us 900 us 778 bytes_per_second=1111.09M/s null_percent=50 size=1048.58k
QuantileKernelDouble/1048576/1 0.618 us 0.618 us 1079708 bytes_per_second=1.54343T/s null_percent=100 size=1048.58k
QuantileKernelDouble/1048576/0 1221 us 1221 us 573 bytes_per_second=819.062M/s null_percent=0 size=1048.58k
```
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] pitrou closed pull request #9358: ARROW-11425: [C++][Compute] Optimize quantile kernel for integers
Posted by GitBox <gi...@apache.org>.
pitrou closed pull request #9358:
URL: https://github.com/apache/arrow/pull/9358
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on pull request #9358: ARROW-11425: [C++][Compute] Optimize quantile kernel for integers
Posted by GitBox <gi...@apache.org>.
github-actions[bot] commented on pull request #9358:
URL: https://github.com/apache/arrow/pull/9358#issuecomment-769620749
https://issues.apache.org/jira/browse/ARROW-11425
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] pitrou commented on a change in pull request #9358: ARROW-11425: [C++][Compute] Optimize quantile kernel for integers
Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #9358:
URL: https://github.com/apache/arrow/pull/9358#discussion_r567881299
##########
File path: cpp/src/arrow/compute/kernels/aggregate_quantile.cc
##########
@@ -239,6 +229,243 @@ struct QuantileExecutor {
}
};
+// histogram approach with constant memory, only for integers within limited value range
+template <typename InType>
+struct CountQuantiler {
+ using CType = typename InType::c_type;
+
+ CType min;
+ std::vector<uint64_t> counts; // counts[i]: # of values equals i + min
+
+ // indices to adjacent non-empty bins covering current quantile
+ struct AdjacentBins {
+ int left_index;
+ int right_index;
+ uint64_t total_count; // accumulated counts till left_index (inclusive)
+ };
+
+ CountQuantiler(CType min, CType max) {
+ uint32_t value_range = static_cast<uint32_t>(max - min) + 1;
+ DCHECK_LT(value_range, 1 << 30);
+ this->min = min;
+ this->counts.resize(value_range);
+ }
+
+ void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const QuantileOptions& options = QuantileState::Get(ctx);
+
+ // count values in all chunks, ignore nulls
+ const Datum& datum = batch[0];
+ const int64_t in_length = datum.length() - datum.null_count();
+ if (in_length > 0) {
+ for (auto& c : this->counts) c = 0;
+ for (const auto& array : datum.chunks()) {
+ const ArrayData& data = *array->data();
+ const CType* values = data.GetValues<CType>(1);
+ VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
+ [&](int64_t pos, int64_t len) {
+ for (int64_t i = 0; i < len; ++i) {
+ ++this->counts[values[pos + i] - this->min];
+ }
+ });
+ }
+ }
+
+ // prepare out array
+ int64_t out_length = options.q.size();
+ if (in_length == 0) {
+ out_length = 0; // input is empty or only contains null, return empty array
+ }
+ // out type depends on options
+ const bool is_datapoint = IsDataPoint(options);
+ const std::shared_ptr<DataType> out_type =
+ is_datapoint ? TypeTraits<InType>::type_singleton() : float64();
+ auto out_data = ArrayData::Make(out_type, out_length, 0);
+ out_data->buffers.resize(2, nullptr);
+
+ // calculate quantiles
+ if (out_length > 0) {
+ const auto out_bit_width = checked_pointer_cast<NumberType>(out_type)->bit_width();
+ KERNEL_ASSIGN_OR_RAISE(out_data->buffers[1], ctx,
+ ctx->Allocate(out_length * out_bit_width / 8));
+
+ // find quantiles in ascending order
+ std::vector<int64_t> q_indices(out_length);
+ std::iota(q_indices.begin(), q_indices.end(), 0);
+ std::sort(q_indices.begin(), q_indices.end(),
+ [&options](int64_t left_index, int64_t right_index) {
+ return options.q[left_index] < options.q[right_index];
+ });
+
+ AdjacentBins bins{0, 0, this->counts[0]};
+ if (is_datapoint) {
+ CType* out_buffer = out_data->template GetMutableValues<CType>(1);
+ for (int64_t i = 0; i < out_length; ++i) {
+ const int64_t q_index = q_indices[i];
+ out_buffer[q_index] = GetQuantileAtDataPoint(
+ in_length, &bins, options.q[q_index], options.interpolation);
+ }
+ } else {
+ double* out_buffer = out_data->template GetMutableValues<double>(1);
+ for (int64_t i = 0; i < out_length; ++i) {
+ const int64_t q_index = q_indices[i];
+ out_buffer[q_index] = GetQuantileByInterp(in_length, &bins, options.q[q_index],
+ options.interpolation);
+ }
+ }
+ }
+
+ *out = Datum(std::move(out_data));
+ }
+
+ // return quantile located exactly at some input data point
+ CType GetQuantileAtDataPoint(int64_t in_length, AdjacentBins* bins, double q,
+ enum QuantileOptions::Interpolation interpolation) {
+ const uint64_t datapoint_index = QuantileToDataPoint(in_length, q, interpolation);
+ while (datapoint_index >= bins->total_count &&
+ static_cast<size_t>(bins->left_index) < this->counts.size() - 1) {
+ ++bins->left_index;
+ bins->total_count += this->counts[bins->left_index];
+ }
+ DCHECK_LT(datapoint_index, bins->total_count);
+ return static_cast<CType>(bins->left_index + this->min);
+ }
+
+ // return quantile interpolated from adjacent input data points
+ double GetQuantileByInterp(int64_t in_length, AdjacentBins* bins, double q,
+ enum QuantileOptions::Interpolation interpolation) {
+ const double index = (in_length - 1) * q;
+ const uint64_t index_floor = static_cast<uint64_t>(index);
+ const double fraction = index - index_floor;
+
+ while (index_floor >= bins->total_count &&
+ static_cast<size_t>(bins->left_index) < this->counts.size() - 1) {
+ ++bins->left_index;
+ bins->total_count += this->counts[bins->left_index];
+ }
+ DCHECK_LT(index_floor, bins->total_count);
+ const double lower_value = static_cast<double>(bins->left_index + this->min);
+
+ // quantile lies in this bin, no interpolation needed
+ if (index <= bins->total_count - 1) {
+ return lower_value;
+ }
+
+ // quantile lies across two bins, locate next bin if not already done
+ DCHECK_EQ(index_floor, bins->total_count - 1);
+ if (bins->right_index <= bins->left_index) {
+ bins->right_index = bins->left_index + 1;
+ while (static_cast<size_t>(bins->right_index) < this->counts.size() - 1 &&
+ this->counts[bins->right_index] == 0) {
+ ++bins->right_index;
+ }
+ }
+ DCHECK_LT(static_cast<size_t>(bins->right_index), this->counts.size());
+ DCHECK_GT(this->counts[bins->right_index], 0);
+ const double higher_value = static_cast<double>(bins->right_index + this->min);
+
+ if (interpolation == QuantileOptions::LINEAR) {
+ return fraction * higher_value + (1 - fraction) * lower_value;
+ } else if (interpolation == QuantileOptions::MIDPOINT) {
+ return lower_value / 2 + higher_value / 2;
+ } else {
+ DCHECK(false);
+ return NAN;
+ }
+ }
+};
+
+// histogram or sort approach per value range and size, only for integers
+template <typename InType>
+struct CountOrSortQuantiler {
+ using CType = typename InType::c_type;
+
+ void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ // cross point to benefit from histogram approach
+ // parameters estimated from ad-hoc benchmarks manually
+ static constexpr int kMinArraySize = 65536 * sizeof(int) / sizeof(CType);
Review comment:
I'm not sure why the heuristic would be related to `sizeof(CType)`?
##########
File path: cpp/src/arrow/compute/kernels/aggregate_quantile.cc
##########
@@ -239,6 +229,243 @@ struct QuantileExecutor {
}
};
+// histogram approach with constant memory, only for integers within limited value range
+template <typename InType>
+struct CountQuantiler {
+ using CType = typename InType::c_type;
+
+ CType min;
+ std::vector<uint64_t> counts; // counts[i]: # of values equals i + min
+
+ // indices to adjacent non-empty bins covering current quantile
+ struct AdjacentBins {
+ int left_index;
+ int right_index;
+ uint64_t total_count; // accumulated counts till left_index (inclusive)
+ };
+
+ CountQuantiler(CType min, CType max) {
+ uint32_t value_range = static_cast<uint32_t>(max - min) + 1;
+ DCHECK_LT(value_range, 1 << 30);
+ this->min = min;
+ this->counts.resize(value_range);
+ }
+
+ void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const QuantileOptions& options = QuantileState::Get(ctx);
+
+ // count values in all chunks, ignore nulls
+ const Datum& datum = batch[0];
+ const int64_t in_length = datum.length() - datum.null_count();
+ if (in_length > 0) {
+ for (auto& c : this->counts) c = 0;
+ for (const auto& array : datum.chunks()) {
+ const ArrayData& data = *array->data();
+ const CType* values = data.GetValues<CType>(1);
+ VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
+ [&](int64_t pos, int64_t len) {
+ for (int64_t i = 0; i < len; ++i) {
+ ++this->counts[values[pos + i] - this->min];
+ }
+ });
+ }
+ }
+
+ // prepare out array
+ int64_t out_length = options.q.size();
+ if (in_length == 0) {
+ out_length = 0; // input is empty or only contains null, return empty array
+ }
+ // out type depends on options
+ const bool is_datapoint = IsDataPoint(options);
+ const std::shared_ptr<DataType> out_type =
+ is_datapoint ? TypeTraits<InType>::type_singleton() : float64();
+ auto out_data = ArrayData::Make(out_type, out_length, 0);
+ out_data->buffers.resize(2, nullptr);
+
+ // calculate quantiles
+ if (out_length > 0) {
+ const auto out_bit_width = checked_pointer_cast<NumberType>(out_type)->bit_width();
+ KERNEL_ASSIGN_OR_RAISE(out_data->buffers[1], ctx,
+ ctx->Allocate(out_length * out_bit_width / 8));
+
+ // find quantiles in ascending order
+ std::vector<int64_t> q_indices(out_length);
+ std::iota(q_indices.begin(), q_indices.end(), 0);
+ std::sort(q_indices.begin(), q_indices.end(),
+ [&options](int64_t left_index, int64_t right_index) {
+ return options.q[left_index] < options.q[right_index];
+ });
+
+ AdjacentBins bins{0, 0, this->counts[0]};
+ if (is_datapoint) {
+ CType* out_buffer = out_data->template GetMutableValues<CType>(1);
+ for (int64_t i = 0; i < out_length; ++i) {
+ const int64_t q_index = q_indices[i];
+ out_buffer[q_index] = GetQuantileAtDataPoint(
+ in_length, &bins, options.q[q_index], options.interpolation);
+ }
+ } else {
+ double* out_buffer = out_data->template GetMutableValues<double>(1);
+ for (int64_t i = 0; i < out_length; ++i) {
+ const int64_t q_index = q_indices[i];
+ out_buffer[q_index] = GetQuantileByInterp(in_length, &bins, options.q[q_index],
+ options.interpolation);
+ }
+ }
+ }
+
+ *out = Datum(std::move(out_data));
+ }
+
+ // return quantile located exactly at some input data point
+ CType GetQuantileAtDataPoint(int64_t in_length, AdjacentBins* bins, double q,
+ enum QuantileOptions::Interpolation interpolation) {
+ const uint64_t datapoint_index = QuantileToDataPoint(in_length, q, interpolation);
+ while (datapoint_index >= bins->total_count &&
+ static_cast<size_t>(bins->left_index) < this->counts.size() - 1) {
+ ++bins->left_index;
+ bins->total_count += this->counts[bins->left_index];
+ }
+ DCHECK_LT(datapoint_index, bins->total_count);
+ return static_cast<CType>(bins->left_index + this->min);
+ }
+
+ // return quantile interpolated from adjacent input data points
+ double GetQuantileByInterp(int64_t in_length, AdjacentBins* bins, double q,
+ enum QuantileOptions::Interpolation interpolation) {
+ const double index = (in_length - 1) * q;
+ const uint64_t index_floor = static_cast<uint64_t>(index);
+ const double fraction = index - index_floor;
+
+ while (index_floor >= bins->total_count &&
+ static_cast<size_t>(bins->left_index) < this->counts.size() - 1) {
+ ++bins->left_index;
+ bins->total_count += this->counts[bins->left_index];
+ }
+ DCHECK_LT(index_floor, bins->total_count);
+ const double lower_value = static_cast<double>(bins->left_index + this->min);
+
+ // quantile lies in this bin, no interpolation needed
+ if (index <= bins->total_count - 1) {
+ return lower_value;
+ }
+
+ // quantile lies across two bins, locate next bin if not already done
+ DCHECK_EQ(index_floor, bins->total_count - 1);
+ if (bins->right_index <= bins->left_index) {
+ bins->right_index = bins->left_index + 1;
+ while (static_cast<size_t>(bins->right_index) < this->counts.size() - 1 &&
+ this->counts[bins->right_index] == 0) {
+ ++bins->right_index;
+ }
+ }
+ DCHECK_LT(static_cast<size_t>(bins->right_index), this->counts.size());
+ DCHECK_GT(this->counts[bins->right_index], 0);
+ const double higher_value = static_cast<double>(bins->right_index + this->min);
+
+ if (interpolation == QuantileOptions::LINEAR) {
+ return fraction * higher_value + (1 - fraction) * lower_value;
+ } else if (interpolation == QuantileOptions::MIDPOINT) {
+ return lower_value / 2 + higher_value / 2;
+ } else {
+ DCHECK(false);
+ return NAN;
+ }
+ }
+};
+
+// histogram or sort approach per value range and size, only for integers
+template <typename InType>
+struct CountOrSortQuantiler {
+ using CType = typename InType::c_type;
+
+ void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ // cross point to benefit from histogram approach
+ // parameters estimated from ad-hoc benchmarks manually
+ static constexpr int kMinArraySize = 65536 * sizeof(int) / sizeof(CType);
+ static constexpr int kMaxValueRange = 65536;
+
+ const Datum& datum = batch[0];
+ if (datum.length() - datum.null_count() >= kMinArraySize) {
Review comment:
Note that for int16/uint16, the max value range should always be satisfied.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] cyb70289 commented on pull request #9358: ARROW-11425: [C++][Compute] Optimize quantile kernel for integers
Posted by GitBox <gi...@apache.org>.
cyb70289 commented on pull request #9358:
URL: https://github.com/apache/arrow/pull/9358#issuecomment-769693358
MinGW64 CI error is unrelated.
Travis CI passed in my personal repo https://travis-ci.org/github/cyb70289/arrow/builds/756695234
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org