You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/10/19 12:37:47 UTC

[GitHub] [arrow] pitrou commented on a change in pull request #8466: ARROW-10304: [C++][Compute] Optimize variance kernel for integers

pitrou commented on a change in pull request #8466:
URL: https://github.com/apache/arrow/pull/8466#discussion_r507711304



##########
File path: cpp/src/arrow/compute/kernels/aggregate_var_std.cc
##########
@@ -57,6 +63,54 @@ struct VarStdState {
     this->m2 = m2;
   }
 
+  // int32/16/8: textbook one pass algorithm with integer arithmetic
+  template <typename T = ArrowType>
+  enable_if_t<is_integer_type<T>::value && (sizeof(CType) <= 4)> Consume(
+      const ArrayType& array) {
+    // max number of elements that sum will not overflow int64 (2Gi int32 elements)
+    // for uint32:    0 <= sum < 2^63 (int64 >= 0)
+    // for int32: -2^62 <= sum < 2^62
+    constexpr int64_t max_length = 1ULL << (63 - sizeof(CType) * 8);
+
+    int64_t start_index = 0;
+    int64_t valid_count = array.length() - array.null_count();
+
+    while (valid_count > 0) {
+      // process in chunks that overflow will never happen
+      const auto slice = array.Slice(start_index, max_length);
+      const int64_t count = slice->length() - slice->null_count();
+      start_index += max_length;
+      valid_count -= count;
+
+      if (count > 0) {
+        int64_t sum = 0;
+        int128_t square_sum = 0;
+        VisitArrayDataInline<ArrowType>(
+            *slice->data(),
+            [&sum, &square_sum](CType value) {
+              sum += value;
+              square_sum += static_cast<uint64_t>(value) * value;
+            },
+            []() {});
+
+        const double mean = static_cast<double>(sum) / count;
+        // calculate m2 = square_sum - sum * sum / count
+        // decompose `sum * sum / count` into integers and fractions
+        const int128_t sum_square = static_cast<int128_t>(sum) * sum;
+        const int128_t integers = sum_square / count;
+        const double fractions = static_cast<double>(sum_square % count) / count;

Review comment:
       Or simply `sum_square - integers * count`?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org