You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by ah...@apache.org on 2023/09/26 16:24:18 UTC
[commons-statistics] 02/03: Refactor two-pass algorithm to SumOfSquaredDeviations
This is an automated email from the ASF dual-hosted git repository.
aherbert pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-statistics.git
commit a0a258ed5016c59a35ab3a9f6878cadf44798c04
Author: Alex Herbert <ah...@apache.org>
AuthorDate: Tue Sep 26 17:20:16 2023 +0100
Refactor two-pass algorithm to SumOfSquaredDeviations
Allows reuse in a StandardDeviation statistic
---
.../descriptive/SumOfSquaredDeviations.java | 96 +++++++++++++++++-----
.../commons/statistics/descriptive/Variance.java | 67 +++------------
2 files changed, 89 insertions(+), 74 deletions(-)
diff --git a/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/SumOfSquaredDeviations.java b/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/SumOfSquaredDeviations.java
index 895b25f..a06f645 100644
--- a/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/SumOfSquaredDeviations.java
+++ b/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/SumOfSquaredDeviations.java
@@ -46,30 +46,83 @@ package org.apache.commons.statistics.descriptive;
* because the parallel implementation of {@link java.util.stream.Stream#collect Stream.collect()}
* provides the necessary partitioning, isolation, and merging of results for
* safe and efficient parallel execution.
+ *
+ * <p>References:
+ * <ul>
+ * <li>Chan, Golub, Levesque (1983)
+ * Algorithms for Computing the Sample Variance,
+ * American Statistician, vol. 37, no. 3, pp. 242-247.
+ * </ul>
*/
class SumOfSquaredDeviations extends FirstMoment {
/** Sum of squared deviations of the values that have been added. */
- private double squaredDevSum;
+ private double sumSquaredDev;
/**
- * Create a SumOfSquaredDeviations instance.
+ * Create an instance.
*/
SumOfSquaredDeviations() {
// No-op
}
/**
- * Create a SumOfSquaredDeviations instance with the given sum of
- * squared deviations and a FirstMoment instance.
+ * Create an instance with the given sum of
+ * squared deviations and first moment.
*
- * @param squaredDevSum Sum of squared deviations.
- * @param mean Mean of values.
+ * @param sumSquaredDev Sum of squared deviations.
+ * @param m1 First moment.
* @param n Number of values.
- * @param nonFiniteValue Sum of values.
+ * @param nonFiniteValue Running sum of values seen so far.
+ */
+ SumOfSquaredDeviations(double sumSquaredDev, double m1, long n, double nonFiniteValue) {
+ super(m1, n, nonFiniteValue);
+ this.sumSquaredDev = sumSquaredDev;
+ }
+
+ /**
+ * Returns a {@code SumOfSquaredDeviations} instance that has the variance of all input values, or {@code NaN}
+ * if:
+ * <ul>
+ * <li>the input array is empty,</li>
+ * <li>any of the values is {@code NaN},</li>
+ * <li>an infinite value of either sign is encountered, or</li>
+ * <li>the sum of the squared deviations from the mean is infinite</li>
+ * </ul>
+ *
+ * <p>Note: {@code SumOfSquaredDeviations} computed using
+ * {@link SumOfSquaredDeviations#accept SumOfSquaredDeviations.accept()} may be different
+ * from this instance.
+ *
+ * <p>See {@link SumOfSquaredDeviations} for details on the computing algorithm.
+ *
+ * @param values Values.
+ * @return {@code SumOfSquaredDeviations} instance.
*/
- SumOfSquaredDeviations(double squaredDevSum, double mean, long n, double nonFiniteValue) {
- super(mean, n, nonFiniteValue);
- this.squaredDevSum = squaredDevSum;
+ static SumOfSquaredDeviations of(double... values) {
+ // "Corrected two-pass algorithm"
+ // See: Chan et al (1983) Equation 1.7
+
+ final double m1 = FirstMoment.of(values).getFirstMoment();
+ if (!Double.isFinite(m1)) {
+ return new SumOfSquaredDeviations(Math.abs(m1), m1, values.length, m1);
+ }
+ double s = 0;
+ double ss = 0;
+ for (final double x : values) {
+ final double dx = x - m1;
+ s += dx;
+ ss += dx * dx;
+ }
+ final long n = values.length;
+ // The sum of squared deviations is ss - (s * s / n).
+ // The second term ideally should be zero; in practice it is a good approximation
+ // of the error in the first term.
+ // To prevent sumSquaredDev from spuriously attaining a NaN value
+ // when ss is infinite, assign it an infinite value which is its intended value.
+ final double sumSquaredDev = ss == Double.POSITIVE_INFINITY ?
+ Double.POSITIVE_INFINITY :
+ ss - (s * s / n);
+ return new SumOfSquaredDeviations(sumSquaredDev, m1, n, s + (m1 * n));
}
/**
@@ -78,8 +131,10 @@ class SumOfSquaredDeviations extends FirstMoment {
*/
@Override
public void accept(double value) {
+ // "Updating one-pass algorithm"
+ // See: Chan et al (1983) Equation 1.3b
super.accept(value);
- squaredDevSum += (n - 1) * dev * nDev;
+ sumSquaredDev += (n - 1) * dev * nDev;
}
/**
@@ -87,8 +142,8 @@ class SumOfSquaredDeviations extends FirstMoment {
*
* @return {@code SumOfSquaredDeviations} of all values seen so far.
*/
- public double getSumOfSquaredDeviations() {
- return Double.isFinite(getFirstMoment()) ? squaredDevSum : Double.NaN;
+ double getSumOfSquaredDeviations() {
+ return Double.isFinite(getFirstMoment()) ? sumSquaredDev : Double.NaN;
}
/**
@@ -97,15 +152,16 @@ class SumOfSquaredDeviations extends FirstMoment {
* @param other Another {@code SumOfSquaredDeviations} to be combined.
* @return {@code this} instance after combining {@code other}.
*/
- public SumOfSquaredDeviations combine(SumOfSquaredDeviations other) {
- final long oldN = n;
- final long otherN = other.n;
- if (oldN == 0) {
- squaredDevSum = other.squaredDevSum;
- } else if (otherN != 0) {
+ SumOfSquaredDeviations combine(SumOfSquaredDeviations other) {
+ final long m = other.n;
+ if (n == 0) {
+ sumSquaredDev = other.sumSquaredDev;
+ } else if (m != 0) {
+ // "Updating one-pass algorithm"
+ // See: Chan et al (1983) Equation 1.5b (modified for the mean)
final double diffOfMean = other.getFirstMoment() - m1;
final double sqDiffOfMean = diffOfMean * diffOfMean;
- squaredDevSum += other.squaredDevSum + sqDiffOfMean * (((double) oldN * otherN) / ((double) oldN + otherN));
+ sumSquaredDev += other.sumSquaredDev + sqDiffOfMean * (((double) n * m) / ((double) n + m));
}
super.combine(other);
return this;
diff --git a/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/Variance.java b/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/Variance.java
index 2f9efe8..21b10fe 100644
--- a/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/Variance.java
+++ b/commons-statistics-descriptive/src/main/java/org/apache/commons/statistics/descriptive/Variance.java
@@ -110,30 +110,7 @@ public abstract class Variance implements DoubleStatistic, DoubleStatisticAccumu
* @return {@code Variance} instance.
*/
public static Variance of(double... values) {
- final double mean = FirstMoment.of(values).getFirstMoment();
- if (!Double.isFinite(mean)) {
- return StorelessSampleVariance.create(Math.abs(mean), mean, values.length, mean);
- }
- double accum = 0.0;
- double dev;
- double accum2 = 0.0;
- double squaredDevSum;
- for (final double value : values) {
- dev = value - mean;
- accum += dev * dev;
- accum2 += dev;
- }
- final double accum2Squared = accum2 * accum2;
- final long n = values.length;
- // The sum of squared deviations is accum - (accum2Squared / n).
- // To prevent squaredDevSum from spuriously attaining a NaN value
- // when accum is infinite, assign it an infinite value which is its intended value.
- if (accum == Double.POSITIVE_INFINITY) {
- squaredDevSum = Double.POSITIVE_INFINITY;
- } else {
- squaredDevSum = accum - (accum2Squared / n);
- }
- return StorelessSampleVariance.create(squaredDevSum, mean, n, accum2 + (mean * n));
+ return new StorelessSampleVariance(SumOfSquaredDeviations.of(values));
}
/**
@@ -169,63 +146,45 @@ public abstract class Variance implements DoubleStatistic, DoubleStatisticAccumu
* An instance of {@link SumOfSquaredDeviations}, which is used to
* compute the variance.
*/
- private final SumOfSquaredDeviations squaredDeviationSum;
+ private final SumOfSquaredDeviations ss;
/**
- * Creates a StorelessVariance instance with the sum of squared
- * deviations from the mean.
+ * Creates an instance with the sum of squared deviations from the mean.
*
- * @param squaredDevSum Sum of squared deviations.
- * @param mean Mean of values.
- * @param n Number of values.
- * @param sumOfValues Sum of values.
+ * @param ss Sum of squared deviations.
*/
- private StorelessSampleVariance(double squaredDevSum, double mean, long n, double sumOfValues) {
- squaredDeviationSum = new SumOfSquaredDeviations(squaredDevSum, mean, n, sumOfValues);
+ StorelessSampleVariance(SumOfSquaredDeviations ss) {
+ this.ss = ss;
}
/**
- * Create a SumOfSquaredDeviations instance.
+ * Create an instance.
*/
StorelessSampleVariance() {
- squaredDeviationSum = new SumOfSquaredDeviations();
- }
-
- /**
- * Creates a StorelessVariance instance with the sum of squared
- * deviations from the mean.
- *
- * @param squaredDevSum Sum of squared deviations.
- * @param mean Mean of values.
- * @param n Number of values.
- * @param sumOfValues Sum of values.
- * @return A StorelessVariance instance.
- */
- static StorelessSampleVariance create(double squaredDevSum, double mean, long n, double sumOfValues) {
- return new StorelessSampleVariance(squaredDevSum, mean, n, sumOfValues);
+ this(new SumOfSquaredDeviations());
}
@Override
public void accept(double value) {
- squaredDeviationSum.accept(value);
+ ss.accept(value);
}
@Override
public double getAsDouble() {
- final double sumOfSquaredDev = squaredDeviationSum.getSumOfSquaredDeviations();
- final double n = squaredDeviationSum.n;
+ final double sumOfSquaredDev = ss.getSumOfSquaredDeviations();
+ final long n = ss.n;
if (n == 0) {
return Double.NaN;
} else if (n == 1 && Double.isFinite(sumOfSquaredDev)) {
return 0;
}
- return sumOfSquaredDev / (n - 1);
+ return sumOfSquaredDev / (n - 1.0);
}
@Override
public Variance combine(Variance other) {
final StorelessSampleVariance that = (StorelessSampleVariance) other;
- squaredDeviationSum.combine(that.squaredDeviationSum);
+ ss.combine(that.ss);
return this;
}
}