You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by ah...@apache.org on 2023/09/26 16:24:18 UTC

[commons-statistics] 02/03: Refactor two-pass algorithm to SumOfSquaredDeviations

This is an automated email from the ASF dual-hosted git repository.

aherbert pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-statistics.git

commit a0a258ed5016c59a35ab3a9f6878cadf44798c04
Author: Alex Herbert <ah...@apache.org>
AuthorDate: Tue Sep 26 17:20:16 2023 +0100

    Refactor two-pass algorithm to SumOfSquaredDeviations
    
    Allows reuse in a StandardDeviation statistic
---
 .../descriptive/SumOfSquaredDeviations.java        | 96 +++++++++++++++++-----
 .../commons/statistics/descriptive/Variance.java   | 67 +++------------
 2 files changed, 89 insertions(+), 74 deletions(-)

diff --git a/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/SumOfSquaredDeviations.java b/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/SumOfSquaredDeviations.java
index 895b25f..a06f645 100644
--- a/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/SumOfSquaredDeviations.java
+++ b/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/SumOfSquaredDeviations.java
@@ -46,30 +46,83 @@ package org.apache.commons.statistics.descriptive;
  * because the parallel implementation of {@link java.util.stream.Stream#collect Stream.collect()}
  * provides the necessary partitioning, isolation, and merging of results for
  * safe and efficient parallel execution.
+ *
+ * <p>References:
+ * <ul>
+ *   <li>Chan, Golub, Levesque (1983)
+ *       Algorithms for Computing the Sample Variance,
+ *       American Statistician, vol. 37, no. 3, pp. 242-247.
+ * </ul>
  */
 class SumOfSquaredDeviations extends FirstMoment {
     /** Sum of squared deviations of the values that have been added. */
-    private double squaredDevSum;
+    private double sumSquaredDev;
 
     /**
-     * Create a SumOfSquaredDeviations instance.
+     * Create an instance.
      */
     SumOfSquaredDeviations() {
         // No-op
     }
 
     /**
-     * Create a SumOfSquaredDeviations instance with the given sum of
-     * squared deviations and a FirstMoment instance.
+     * Create an instance with the given sum of
+     * squared deviations and first moment.
      *
-     * @param squaredDevSum Sum of squared deviations.
-     * @param mean Mean of values.
+     * @param sumSquaredDev Sum of squared deviations.
+     * @param m1 First moment.
      * @param n Number of values.
-     * @param nonFiniteValue Sum of values.
+     * @param nonFiniteValue Running sum of values seen so far.
+     */
+    SumOfSquaredDeviations(double sumSquaredDev, double m1, long n, double nonFiniteValue) {
+        super(m1, n, nonFiniteValue);
+        this.sumSquaredDev = sumSquaredDev;
+    }
+
+    /**
+     * Returns a {@code SumOfSquaredDeviations} instance that has the variance of all input values, or {@code NaN}
+     * if:
+     * <ul>
+     *     <li>the input array is empty,</li>
+     *     <li>any of the values is {@code NaN},</li>
+     *     <li>an infinite value of either sign is encountered, or</li>
+     *     <li>the sum of the squared deviations from the mean is infinite</li>
+     * </ul>
+     *
+     * <p>Note: {@code SumOfSquaredDeviations} computed using
+     * {@link SumOfSquaredDeviations#accept SumOfSquaredDeviations.accept()} may be different
+     * from this instance.
+     *
+     * <p>See {@link SumOfSquaredDeviations} for details on the computing algorithm.
+     *
+     * @param values Values.
+     * @return {@code SumOfSquaredDeviations} instance.
      */
-    SumOfSquaredDeviations(double squaredDevSum, double mean, long n, double nonFiniteValue) {
-        super(mean, n, nonFiniteValue);
-        this.squaredDevSum = squaredDevSum;
+    static SumOfSquaredDeviations of(double... values) {
+        // "Corrected two-pass algorithm"
+        // See: Chan et al (1983) Equation 1.7
+
+        final double m1 = FirstMoment.of(values).getFirstMoment();
+        if (!Double.isFinite(m1)) {
+            return new SumOfSquaredDeviations(Math.abs(m1), m1, values.length, m1);
+        }
+        double s = 0;
+        double ss = 0;
+        for (final double x : values) {
+            final double dx = x - m1;
+            s += dx;
+            ss += dx * dx;
+        }
+        final long n = values.length;
+        // The sum of squared deviations is ss - (s * s / n).
+        // The second term ideally should be zero; in practice it is a good approximation
+        // of the error in the first term.
+        // To prevent sumSquaredDev from spuriously attaining a NaN value
+        // when ss is infinite, assign it an infinite value which is its intended value.
+        final double sumSquaredDev = ss == Double.POSITIVE_INFINITY ?
+            Double.POSITIVE_INFINITY :
+            ss - (s * s / n);
+        return new SumOfSquaredDeviations(sumSquaredDev, m1, n, s + (m1 * n));
     }
 
     /**
@@ -78,8 +131,10 @@ class SumOfSquaredDeviations extends FirstMoment {
      */
     @Override
     public void accept(double value) {
+        // "Updating one-pass algorithm"
+        // See: Chan et al (1983) Equation 1.3b
         super.accept(value);
-        squaredDevSum += (n - 1) * dev * nDev;
+        sumSquaredDev += (n - 1) * dev * nDev;
     }
 
     /**
@@ -87,8 +142,8 @@ class SumOfSquaredDeviations extends FirstMoment {
      *
      * @return {@code SumOfSquaredDeviations} of all values seen so far.
      */
-    public double getSumOfSquaredDeviations() {
-        return Double.isFinite(getFirstMoment()) ? squaredDevSum : Double.NaN;
+    double getSumOfSquaredDeviations() {
+        return Double.isFinite(getFirstMoment()) ? sumSquaredDev : Double.NaN;
     }
 
     /**
@@ -97,15 +152,16 @@ class SumOfSquaredDeviations extends FirstMoment {
      * @param other Another {@code SumOfSquaredDeviations} to be combined.
      * @return {@code this} instance after combining {@code other}.
      */
-    public SumOfSquaredDeviations combine(SumOfSquaredDeviations other) {
-        final long oldN = n;
-        final long otherN = other.n;
-        if (oldN == 0) {
-            squaredDevSum = other.squaredDevSum;
-        } else if (otherN != 0) {
+    SumOfSquaredDeviations combine(SumOfSquaredDeviations other) {
+        final long m = other.n;
+        if (n == 0) {
+            sumSquaredDev = other.sumSquaredDev;
+        } else if (m != 0) {
+            // "Updating one-pass algorithm"
+            // See: Chan et al (1983) Equation 1.5b (modified for the mean)
             final double diffOfMean = other.getFirstMoment() - m1;
             final double sqDiffOfMean = diffOfMean * diffOfMean;
-            squaredDevSum += other.squaredDevSum + sqDiffOfMean * (((double) oldN * otherN) / ((double) oldN + otherN));
+            sumSquaredDev += other.sumSquaredDev + sqDiffOfMean * (((double) n * m) / ((double) n + m));
         }
         super.combine(other);
         return this;
diff --git a/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/Variance.java b/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/Variance.java
index 2f9efe8..21b10fe 100644
--- a/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/Variance.java
+++ b/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/Variance.java
@@ -110,30 +110,7 @@ public abstract class Variance implements DoubleStatistic, DoubleStatisticAccumu
      * @return {@code Variance} instance.
      */
     public static Variance of(double... values) {
-        final double mean = FirstMoment.of(values).getFirstMoment();
-        if (!Double.isFinite(mean)) {
-            return StorelessSampleVariance.create(Math.abs(mean), mean, values.length, mean);
-        }
-        double accum = 0.0;
-        double dev;
-        double accum2 = 0.0;
-        double squaredDevSum;
-        for (final double value : values) {
-            dev = value - mean;
-            accum += dev * dev;
-            accum2 += dev;
-        }
-        final double accum2Squared = accum2 * accum2;
-        final long n = values.length;
-        // The sum of squared deviations is accum - (accum2Squared / n).
-        // To prevent squaredDevSum from spuriously attaining a NaN value
-        // when accum is infinite, assign it an infinite value which is its intended value.
-        if (accum == Double.POSITIVE_INFINITY) {
-            squaredDevSum = Double.POSITIVE_INFINITY;
-        } else {
-            squaredDevSum = accum - (accum2Squared / n);
-        }
-        return StorelessSampleVariance.create(squaredDevSum, mean, n, accum2 + (mean * n));
+        return new StorelessSampleVariance(SumOfSquaredDeviations.of(values));
     }
 
     /**
@@ -169,63 +146,45 @@ public abstract class Variance implements DoubleStatistic, DoubleStatisticAccumu
          * An instance of {@link SumOfSquaredDeviations}, which is used to
          * compute the variance.
          */
-        private final SumOfSquaredDeviations squaredDeviationSum;
+        private final SumOfSquaredDeviations ss;
 
         /**
-         * Creates a StorelessVariance instance with the sum of squared
-         * deviations from the mean.
+         * Creates an instance with the sum of squared deviations from the mean.
          *
-         * @param squaredDevSum Sum of squared deviations.
-         * @param mean Mean of values.
-         * @param n Number of values.
-         * @param sumOfValues Sum of values.
+         * @param ss Sum of squared deviations.
          */
-        private StorelessSampleVariance(double squaredDevSum, double mean, long n, double sumOfValues) {
-            squaredDeviationSum = new SumOfSquaredDeviations(squaredDevSum, mean, n, sumOfValues);
+        StorelessSampleVariance(SumOfSquaredDeviations ss) {
+            this.ss = ss;
         }
 
         /**
-         * Create a SumOfSquaredDeviations instance.
+         * Create an instance.
          */
         StorelessSampleVariance() {
-            squaredDeviationSum = new SumOfSquaredDeviations();
-        }
-
-        /**
-         * Creates a StorelessVariance instance with the sum of squared
-         * deviations from the mean.
-         *
-         * @param squaredDevSum Sum of squared deviations.
-         * @param mean Mean of values.
-         * @param n Number of values.
-         * @param sumOfValues Sum of values.
-         * @return A StorelessVariance instance.
-         */
-        static StorelessSampleVariance create(double squaredDevSum, double mean, long n, double sumOfValues) {
-            return new StorelessSampleVariance(squaredDevSum, mean, n, sumOfValues);
+            this(new SumOfSquaredDeviations());
         }
 
         @Override
         public void accept(double value) {
-            squaredDeviationSum.accept(value);
+            ss.accept(value);
         }
 
         @Override
         public double getAsDouble() {
-            final double sumOfSquaredDev = squaredDeviationSum.getSumOfSquaredDeviations();
-            final double n = squaredDeviationSum.n;
+            final double sumOfSquaredDev = ss.getSumOfSquaredDeviations();
+            final long n = ss.n;
             if (n == 0) {
                 return Double.NaN;
             } else if (n == 1 && Double.isFinite(sumOfSquaredDev)) {
                 return 0;
             }
-            return sumOfSquaredDev / (n - 1);
+            return sumOfSquaredDev / (n - 1.0);
         }
 
         @Override
         public Variance combine(Variance other) {
             final StorelessSampleVariance that = (StorelessSampleVariance) other;
-            squaredDeviationSum.combine(that.squaredDeviationSum);
+            ss.combine(that.ss);
             return this;
         }
     }