You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by ps...@apache.org on 2014/05/05 01:19:43 UTC
svn commit: r1592430 - in /commons/proper/math/trunk/src: main/java/org/apache/commons/math3/stat/inference/ site/xdoc/userguide/ test/java/org/apache/commons/math3/stat/inference/

Author: psteitz
Date: Sun May  4 23:19:43 2014
New Revision: 1592430

URL: http://svn.apache.org/r1592430
Log:
Added TestUtils convenience methods and updated user guide for Kolmogorov-Smirnov tests.  JIRA: MATH-437.

Modified:
    commons/proper/math/trunk/src/main/java/org/apache/commons/math3/stat/inference/TestUtils.java
    commons/proper/math/trunk/src/site/xdoc/userguide/stat.xml
    commons/proper/math/trunk/src/test/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTestTest.java
    commons/proper/math/trunk/src/test/java/org/apache/commons/math3/stat/inference/TestUtilsTest.java

Modified: commons/proper/math/trunk/src/main/java/org/apache/commons/math3/stat/inference/TestUtils.java
URL: http://svn.apache.org/viewvc/commons/proper/math/trunk/src/main/java/org/apache/commons/math3/stat/inference/TestUtils.java?rev=1592430&r1=1592429&r2=1592430&view=diff
==============================================================================
--- commons/proper/math/trunk/src/main/java/org/apache/commons/math3/stat/inference/TestUtils.java (original)
+++ commons/proper/math/trunk/src/main/java/org/apache/commons/math3/stat/inference/TestUtils.java Sun May  4 23:19:43 2014
@@ -17,8 +17,11 @@
 package org.apache.commons.math3.stat.inference;
 
 import java.util.Collection;
+
+import org.apache.commons.math3.distribution.RealDistribution;
 import org.apache.commons.math3.exception.ConvergenceException;
 import org.apache.commons.math3.exception.DimensionMismatchException;
+import org.apache.commons.math3.exception.InsufficientDataException;
 import org.apache.commons.math3.exception.MaxCountExceededException;
 import org.apache.commons.math3.exception.NoDataException;
 import org.apache.commons.math3.exception.NotPositiveException;
@@ -50,6 +53,9 @@ public class TestUtils  {
     /** Singleton G-Test instance. */
     private static final GTest G_TEST = new GTest();
 
+    /** Singleton K-S test instance. */
+    private static final KolmogorovSmirnovTest KS_TEST = new KolmogorovSmirnovTest();
+
     /**
      * Prevent instantiation.
      */
@@ -449,6 +455,94 @@ public class TestUtils  {
         return G_TEST.gTestDataSetsComparison(observed1, observed2, alpha);
     }
 
+    /**
+     * Computes the one-sample Kolmogorov-Smirnov D-statistic of {@code data}
+     * against {@code dist} by delegating to the shared test instance.
+     *
+     * @see org.apache.commons.math3.stat.inference.KolmogorovSmirnovTest#kolmogorovSmirnovStatistic(RealDistribution, double[])
+     * @since 3.3
+     */
+    public static double kolmogorovSmirnovStatistic(RealDistribution dist, double[] data)
+            throws InsufficientDataException, NullArgumentException {
+        final double dStatistic = KS_TEST.kolmogorovSmirnovStatistic(dist, data);
+        return dStatistic;
+    }
+
+    /**
+     * Evaluates the one-sample Kolmogorov-Smirnov test of {@code data} against
+     * {@code dist} using the shared test instance.
+     *
+     * @see org.apache.commons.math3.stat.inference.KolmogorovSmirnovTest#kolmogorovSmirnovTest(RealDistribution, double[])
+     * @since 3.3
+     */
+    public static double kolmogorovSmirnovTest(RealDistribution dist, double[] data)
+            throws InsufficientDataException, NullArgumentException {
+        final double outcome = KS_TEST.kolmogorovSmirnovTest(dist, data);
+        return outcome;
+    }
+
+    /**
+     * Evaluates the one-sample Kolmogorov-Smirnov test of {@code data} against
+     * {@code dist} using the shared test instance; the boolean flag is handed
+     * to the delegate unchanged.
+     *
+     * @see org.apache.commons.math3.stat.inference.KolmogorovSmirnovTest#kolmogorovSmirnovTest(RealDistribution, double[], boolean)
+     * @since 3.3
+     */
+    public static double kolmogorovSmirnovTest(RealDistribution dist, double[] data, boolean strict)
+            throws InsufficientDataException, NullArgumentException {
+        final double pValue = KS_TEST.kolmogorovSmirnovTest(dist, data, strict);
+        return pValue;
+    }
+
+    /**
+     * Evaluates the one-sample Kolmogorov-Smirnov test of {@code data} against
+     * {@code dist} using the shared test instance, forwarding {@code alpha}
+     * unchanged and returning the delegate's boolean verdict.
+     *
+     * @see org.apache.commons.math3.stat.inference.KolmogorovSmirnovTest#kolmogorovSmirnovTest(RealDistribution, double[], double)
+     * @since 3.3
+     */
+    public static boolean kolmogorovSmirnovTest(RealDistribution dist, double[] data, double alpha)
+            throws InsufficientDataException, NullArgumentException {
+        final boolean verdict = KS_TEST.kolmogorovSmirnovTest(dist, data, alpha);
+        return verdict;
+    }
+
+    /**
+     * Computes the two-sample Kolmogorov-Smirnov D-statistic for the samples
+     * {@code x} and {@code y} by delegating to the shared test instance.
+     *
+     * @see org.apache.commons.math3.stat.inference.KolmogorovSmirnovTest#kolmogorovSmirnovStatistic(double[], double[])
+     * @since 3.3
+     */
+    public static double kolmogorovSmirnovStatistic(double[] x, double[] y)
+            throws InsufficientDataException, NullArgumentException {
+        final double dStatistic = KS_TEST.kolmogorovSmirnovStatistic(x, y);
+        return dStatistic;
+    }
+
+    /**
+     * Computes the p-value of the two-sample Kolmogorov-Smirnov test for the
+     * samples {@code x} and {@code y} by delegating to the shared test instance.
+     *
+     * @see org.apache.commons.math3.stat.inference.KolmogorovSmirnovTest#kolmogorovSmirnovTest(double[], double[])
+     * @since 3.3
+     */
+    public static double kolmogorovSmirnovTest(double[] x, double[] y)
+            throws InsufficientDataException, NullArgumentException {
+        return KS_TEST.kolmogorovSmirnovTest(x, y);
+    }
+
+    /**
+     * Computes the p-value of the two-sample Kolmogorov-Smirnov test for the
+     * samples {@code x} and {@code y} using the shared test instance;
+     * {@code strict} is handed to the delegate unchanged.
+     *
+     * @see org.apache.commons.math3.stat.inference.KolmogorovSmirnovTest#kolmogorovSmirnovTest(double[], double[], boolean)
+     * @since 3.3
+     */
+    public static double kolmogorovSmirnovTest(double[] x, double[] y, boolean strict)
+            throws InsufficientDataException, NullArgumentException {
+        final double pValue = KS_TEST.kolmogorovSmirnovTest(x, y, strict);
+        return pValue;
+    }
+
+    /**
+     * Computes the exact p-value for a two-sample Kolmogorov-Smirnov statistic
+     * by delegating to the shared test instance.
+     * <p>
+     * The two sample sizes are forwarded in the order in which they are
+     * received, so the first size given here is the first size seen by the
+     * delegate.
+     * </p>
+     *
+     * @param d the D-statistic
+     * @param m size of the first sample
+     * @param n size of the second sample
+     * @param strict whether the strict form of the inequality is used
+     * @return the value returned by the delegate's {@code exactP}
+     * @see org.apache.commons.math3.stat.inference.KolmogorovSmirnovTest#exactP(double, int, int, boolean)
+     * @since 3.3
+     */
+    public static double exactP(double d, int m, int n, boolean strict) {
+        return KS_TEST.exactP(d, m, n, strict);
+    }
+
+    /**
+     * Computes an approximate two-sample Kolmogorov-Smirnov p-value by
+     * delegating to the shared test instance; all arguments are forwarded
+     * unchanged.
+     *
+     * @see org.apache.commons.math3.stat.inference.KolmogorovSmirnovTest#approximateP(double, int, int)
+     * @since 3.3
+     */
+    public static double approximateP(double d, int n, int m) {
+        final double pValue = KS_TEST.approximateP(d, n, m);
+        return pValue;
+    }
+
+    /**
+     * Estimates a two-sample Kolmogorov-Smirnov p-value with the delegate's
+     * Monte Carlo procedure; all arguments are forwarded unchanged to the
+     * shared test instance.
+     *
+     * @see org.apache.commons.math3.stat.inference.KolmogorovSmirnovTest#monteCarloP(double, int, int, boolean, int)
+     * @since 3.3
+     */
+    public static double monteCarloP(double d, int n, int m, boolean strict, int iterations) {
+        final double pValue = KS_TEST.monteCarloP(d, n, m, strict, iterations);
+        return pValue;
+    }
+
+
     // CHECKSTYLE: resume JavadocMethodCheck
 
 }

Modified: commons/proper/math/trunk/src/site/xdoc/userguide/stat.xml
URL: http://svn.apache.org/viewvc/commons/proper/math/trunk/src/site/xdoc/userguide/stat.xml?rev=1592430&r1=1592429&r2=1592430&view=diff
==============================================================================
--- commons/proper/math/trunk/src/site/xdoc/userguide/stat.xml (original)
+++ commons/proper/math/trunk/src/site/xdoc/userguide/stat.xml Sun May  4 23:19:43 2014
@@ -837,7 +837,8 @@ new KendallsCorrelation().correlation(x,
           <a href="http://www.cas.lancs.ac.uk/glossary_v1.1/hyptest.html#pvalue">
           p-values</a> associated with <code>t-</code>,
           <code>Chi-Square</code>, <code>G</code>, <code>One-Way ANOVA</code>, <code>Mann-Whitney U</code>,
-          and <code>Wilcoxon signed rank</code> tests. The respective test classes are
+          <code>Wilcoxon signed rank</code>, and <code>Kolmogorov-Smirnov</code> tests.
+          The respective test classes are
           <a href="../apidocs/org/apache/commons/math3/stat/inference/TTest.html">
           TTest</a>,
           <a href="../apidocs/org/apache/commons/math3/stat/inference/ChiSquareTest.html">
@@ -849,16 +850,18 @@ new KendallsCorrelation().correlation(x,
           <a href="../apidocs/org/apache/commons/math3/stat/inference/MannWhitneyUTest.html">
           MannWhitneyUTest</a>,
           <a href="../apidocs/org/apache/commons/math3/stat/inference/WilcoxonSignedRankTest.html">
-          WilcoxonSignedRankTest</a> and
+          WilcoxonSignedRankTest</a>,
           <a href="../apidocs/org/apache/commons/math3/stat/inference/BinomialTest.html">
-          BinomialTest</a>.                    
+          BinomialTest</a> and
+          <a href="../apidocs/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTest.html">
+          KolmogorovSmirnovTest</a>.                  
           The <a href="../apidocs/org/apache/commons/math3/stat/inference/TestUtils.html">
           TestUtils</a> class provides static methods to get test instances or
           to compute test statistics directly.  The examples below all use the
           static methods in <code>TestUtils</code> to execute tests.  To get
           test object instances, either use e.g., <code>TestUtils.getTTest()</code>
           or use the implementation constructors directly, e.g. <code>new TTest()</code>.
-        </p>
+       </p>
         <p>
           <strong>Implementation Notes</strong>
           <ul>
@@ -899,6 +902,24 @@ new KendallsCorrelation().correlation(x,
            (resp. critical values) by 2.</li>
            <li>Degrees of freedom for G- and chi-square tests are integral values, based on the
            number of observed or expected counts (number of observed counts - 1).</li>
+           <li> The Kolmogorov-Smirnov test uses a statistic based on the maximum deviation of
+           the empirical distribution of sample data points from the distribution expected
+           under the null hypothesis. Specifically, what is computed is
+           \(D_n=\sup_x |F_n(x)-F(x)|\), where \(F\) is the expected distribution and
+           \(F_n\) is the empirical distribution of the \(n\) sample data points.  Both
+           one-sample tests against a <code>RealDistribution</code> and two-sample tests
+           (comparing two empirical distributions) are supported.  For one-sample tests,
+           the distribution of \(D_n\) is estimated using the method in 
+           <a href="http://www.jstatsoft.org/v08/i18/">Evaluating Kolmogorov's Distribution</a> by
+           George Marsaglia, Wai Wan Tsang, and Jingbo Wang, with quick decisions in some cases 
+           for extreme values using the method described in
+           <a href="http://www.jstatsoft.org/v39/i11/"> Computing the Two-Sided Kolmogorov-Smirnov
+           Distribution</a> by Richard Simard and Pierre L'Ecuyer.  In the 2-sample case, estimation
+           by default depends on the number of data points.  For small samples, the distribution
+           is computed exactly; for moderately large samples a Monte Carlo procedure is used, and
+           for large samples a numerical approximation of the Kolmogorov distribution is used.
+           Methods to perform each type of p-value estimation are also exposed directly.  See
+           the class javadoc for details.</li>
           </ul>
           </p>
           <p>
@@ -1179,6 +1200,46 @@ TestUtils.oneWayAnovaTest(classes, 0.01)
                                           // true means reject null hypothesis
           </source>
           </dd>
+          <br/>
+          <dt><strong>Kolmogorov-Smirnov tests</strong></dt>
+          <br/>
+          <dd>Given a double[] array <code>sample</code> of values, to evaluate the
+          null hypothesis that the values are drawn from a unit normal distribution
+          <source>
+final NormalDistribution unitNormal = new NormalDistribution(0d, 1d);
+TestUtils.kolmogorovSmirnovTest(unitNormal, sample, false)
+          </source>
+          returns the p-value and
+          <source>
+TestUtils.kolmogorovSmirnovStatistic(unitNormal, sample)
+          </source>
+          returns the D-statistic.
+          <br/>
+          If <code>x</code> and <code>y</code> are double arrays, to evaluate the null hypothesis that
+          <code>x</code> and <code>y</code> are drawn from the same underlying distribution,
+          use
+          <source>
+TestUtils.kolmogorovSmirnovStatistic(x, y)
+          </source>
+          to compute the D-statistic and 
+          <source>
+TestUtils.kolmogorovSmirnovTest(x, y)
+          </source>
+          for the p-value associated with the null hypothesis that <code>x</code> and
+          <code>y</code> come from the same distribution. By default, here and above strict
+          inequality is used in the null hypothesis - i.e., we evaluate \(H_0 : D_{n,m} > d \).
+          To make the inequality non-strict, pass <code>false</code> as an additional last argument,
+          e.g. <code>TestUtils.kolmogorovSmirnovTest(x, y, false)</code>. For large samples, this parameter makes no difference.
+          <br/>
+          To force exact computation of the p-value (overriding the selection of estimation
+          method), first compute the d-statistic and then use the <code>exactP</code> method
+          <source>
+final double d = TestUtils.kolmogorovSmirnovStatistic(x, y);
+TestUtils.exactP(d, x.length, y.length, false)
+          </source>
+          assuming that the non-strict form of the null hypothesis is desired. Note, however,
+          that exact computation for anything but very small samples takes a very long time.    
+          </dd>
         </dl>
         </p>
       </subsection>

Modified: commons/proper/math/trunk/src/test/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTestTest.java
URL: http://svn.apache.org/viewvc/commons/proper/math/trunk/src/test/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTestTest.java?rev=1592430&r1=1592429&r2=1592430&view=diff
==============================================================================
--- commons/proper/math/trunk/src/test/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTestTest.java (original)
+++ commons/proper/math/trunk/src/test/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTestTest.java Sun May  4 23:19:43 2014
@@ -32,10 +32,10 @@ import org.junit.Test;
  */
 public class KolmogorovSmirnovTestTest {
 
-    private static final double TOLERANCE = 10e-10;
+    protected static final double TOLERANCE = 10e-10;
 
     // Random N(0,1) values generated using R rnorm
-    private final double[] gaussian = {
+    protected static final double[] gaussian = {
         0.26055895, -0.63665233, 1.51221323, 0.61246988, -0.03013003, -1.73025682, -0.51435805, 0.70494168, 0.18242945,
         0.94734336, -0.04286604, -0.37931719, -1.07026403, -2.05861425, 0.11201862, 0.71400136, -0.52122185,
         -0.02478725, -1.86811649, -1.79907688, 0.15046279, 1.32390193, 1.55889719, 1.83149171, -0.03948003,
@@ -51,7 +51,7 @@ public class KolmogorovSmirnovTestTest {
     };
 
     // Random N(0, 1.6) values generated using R rnorm
-    private final double[] gaussian2 = {
+    protected static final double[] gaussian2 = {
         2.88041498038308, -0.632349445671017, 0.402121295225571, 0.692626364613243, 1.30693446815426,
         -0.714176317131286, -0.233169206599583, 1.09113298322107, -1.53149079994305, 1.23259966205809,
         1.01389927412503, 0.0143898711497477, -0.512813545447559, 2.79364360835469, 0.662008875538092,
@@ -75,7 +75,7 @@ public class KolmogorovSmirnovTestTest {
     };
 
     // Random uniform (0, 1) generated using R runif
-    private final double[] uniform = {
+    protected static final double[] uniform = {
         0.7930305, 0.6424382, 0.8747699, 0.7156518, 0.1845909, 0.2022326, 0.4877206, 0.8928752, 0.2293062, 0.4222006,
         0.1610459, 0.2830535, 0.9946345, 0.7329499, 0.26411126, 0.87958133, 0.29827437, 0.39185988, 0.38351185,
         0.36359611, 0.48646472, 0.05577866, 0.56152250, 0.52672013, 0.13171783, 0.95864085, 0.03060207, 0.33514887,

Modified: commons/proper/math/trunk/src/test/java/org/apache/commons/math3/stat/inference/TestUtilsTest.java
URL: http://svn.apache.org/viewvc/commons/proper/math/trunk/src/test/java/org/apache/commons/math3/stat/inference/TestUtilsTest.java?rev=1592430&r1=1592429&r2=1592430&view=diff
==============================================================================
--- commons/proper/math/trunk/src/test/java/org/apache/commons/math3/stat/inference/TestUtilsTest.java (original)
+++ commons/proper/math/trunk/src/test/java/org/apache/commons/math3/stat/inference/TestUtilsTest.java Sun May  4 23:19:43 2014
@@ -19,6 +19,7 @@ package org.apache.commons.math3.stat.in
 import java.util.ArrayList;
 import java.util.List;
 
+import org.apache.commons.math3.distribution.NormalDistribution;
 import org.apache.commons.math3.exception.DimensionMismatchException;
 import org.apache.commons.math3.exception.NotPositiveException;
 import org.apache.commons.math3.exception.NotStrictlyPositiveException;
@@ -528,4 +529,30 @@ public class TestUtilsTest {
         Assert.assertEquals(FastMath.sqrt(5734.343), TestUtils.rootLogLikelihoodRatio(1000, 1000, 1000, 100000), 0.001);
         Assert.assertEquals(FastMath.sqrt(5714.932), TestUtils.rootLogLikelihoodRatio(1000, 1000, 1000, 99000), 0.001);
     }
+    
+    @Test
+    public void testKSOneSample() throws Exception {
+        // Reuse the N(0,1) fixture and tolerance declared in KolmogorovSmirnovTestTest.
+        final double tolerance = KolmogorovSmirnovTestTest.TOLERANCE;
+        final double[] gaussianSample = KolmogorovSmirnovTestTest.gaussian;
+        final NormalDistribution standardNormal = new NormalDistribution(0d, 1d);
+
+        final double actualTestValue = TestUtils.kolmogorovSmirnovTest(standardNormal, gaussianSample);
+        Assert.assertEquals(0.3172069207622391, actualTestValue, tolerance);
+
+        final double actualStatistic = TestUtils.kolmogorovSmirnovStatistic(standardNormal, gaussianSample);
+        Assert.assertEquals(0.0932947561266756, actualStatistic, tolerance);
+    }
+    
+    @Test
+    public void testKSTwoSample() throws Exception {
+        final double[] first = {6, 7, 9, 13, 19, 21, 22, 23, 24};
+        final double[] second = {10, 11, 12, 16, 20, 27, 28, 32, 44, 54};
+        final double tolerance = KolmogorovSmirnovTestTest.TOLERANCE;
+        final double expectedP = 0.105577085453247;
+
+        // Value from the two-sample convenience method with strict == false.
+        final double pFromTest = TestUtils.kolmogorovSmirnovTest(first, second, false);
+        Assert.assertEquals(expectedP, pFromTest, tolerance);
+
+        final double d = TestUtils.kolmogorovSmirnovStatistic(first, second);
+        Assert.assertEquals(0.5, d, tolerance);
+
+        // exactP applied to the statistic must give the same value.
+        final double pFromExact = TestUtils.exactP(d, first.length, second.length, false);
+        Assert.assertEquals(expectedP, pFromExact, tolerance);
+    }
 }