You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/10/19 14:16:11 UTC

[GitHub] [arrow] cyb70289 commented on a change in pull request #8466: ARROW-10304: [C++][Compute] Optimize variance kernel for integers

cyb70289 commented on a change in pull request #8466:
URL: https://github.com/apache/arrow/pull/8466#discussion_r507784895



##########
File path: cpp/src/arrow/compute/kernels/aggregate_var_std.cc
##########
@@ -57,6 +63,54 @@ struct VarStdState {
     this->m2 = m2;
   }
 
+  // int32/16/8: textbook one pass algorithm with integer arithmetic
+  template <typename T = ArrowType>
+  enable_if_t<is_integer_type<T>::value && (sizeof(CType) <= 4)> Consume(
+      const ArrayType& array) {
+    // max number of elements that sum will not overflow int64 (2Gi int32 elements)
+    // for uint32:    0 <= sum < 2^63 (int64 >= 0)
+    // for int32: -2^62 <= sum < 2^62
+    constexpr int64_t max_length = 1ULL << (63 - sizeof(CType) * 8);
+
+    int64_t start_index = 0;
+    int64_t valid_count = array.length() - array.null_count();
+
+    while (valid_count > 0) {
+      // process in chunks that overflow will never happen
+      const auto slice = array.Slice(start_index, max_length);
+      const int64_t count = slice->length() - slice->null_count();
+      start_index += max_length;
+      valid_count -= count;
+
+      if (count > 0) {
+        int64_t sum = 0;
+        int128_t square_sum = 0;
+        VisitArrayDataInline<ArrowType>(
+            *slice->data(),
+            [&sum, &square_sum](CType value) {
+              sum += value;
+              square_sum += static_cast<uint64_t>(value) * value;
+            },
+            []() {});
+
+        const double mean = static_cast<double>(sum) / count;
+        // calculate m2 = square_sum - sum * sum / count
+        // decompose `sum * sum / count` into integers and fractions
+        const int128_t sum_square = static_cast<int128_t>(sum) * sum;
+        const int128_t integers = sum_square / count;
+        const double fractions = static_cast<double>(sum_square % count) / count;

Review comment:
       `fractions` will be a floating number less than 1. Do you mean changing this line to
   `const double fractions = static_cast<double>(sum_square - integers * count) / count;` ?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org