You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by tn...@apache.org on 2015/06/21 19:39:41 UTC

[math] [MATH-1236] Improve performance of calculating the two-sample Kolmogorov-Smirnov test statistic. Thanks to Otmar Ertl.

Repository: commons-math
Updated Branches:
  refs/heads/master 75c2b24c6 -> 276e22858


[MATH-1236] Improve performance of calculating the two-sample Kolmogorov-Smirnov test statistic. Thanks to Otmar Ertl.


Project: http://git-wip-us.apache.org/repos/asf/commons-math/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-math/commit/276e2285
Tree: http://git-wip-us.apache.org/repos/asf/commons-math/tree/276e2285
Diff: http://git-wip-us.apache.org/repos/asf/commons-math/diff/276e2285

Branch: refs/heads/master
Commit: 276e22858cdd56f5725e0c2f7a3522df99967973
Parents: 75c2b24
Author: Thomas Neidhart <th...@gmail.com>
Authored: Sun Jun 21 19:39:23 2015 +0200
Committer: Thomas Neidhart <th...@gmail.com>
Committed: Sun Jun 21 19:39:23 2015 +0200

----------------------------------------------------------------------
 src/changes/changes.xml                         |  4 ++
 .../stat/inference/KolmogorovSmirnovTest.java   | 52 +++++---------------
 .../inference/KolmogorovSmirnovTestTest.java    | 12 +++++
 3 files changed, 28 insertions(+), 40 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/commons-math/blob/276e2285/src/changes/changes.xml
----------------------------------------------------------------------
diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index 13193e5..cb25351 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -54,6 +54,10 @@ If the output is not quite correct, check for invisible trailing spaces!
     </release>
 
     <release version="4.0" date="XXXX-XX-XX" description="">
+      <action dev="tn" type="fix" issue="MATH-1232" due-to="Otmar Ertl"> <!-- backported to 3.6 -->
+        Improved performance of calculating the two-sample Kolmogorov-Smirnov
+        test statistic.
+      </action>
       <action dev="erans" type="fix" issue="MATH-1231">
         Lifted unnecessary restriction on constructor's argument of
         "MicrosphereInterpolator" (package "o.a.c.m.analysis.interpolation").

http://git-wip-us.apache.org/repos/asf/commons-math/blob/276e2285/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java b/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java
index 6144ff7..a664cda 100644
--- a/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java
+++ b/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java
@@ -294,58 +294,30 @@ public class KolmogorovSmirnovTest {
         final int n = sx.length;
         final int m = sy.length;
 
+        int rankX = 0;
+        int rankY = 0;
+
         // Find the max difference between cdf_x and cdf_y
         double supD = 0d;
-        // First walk x points
-        for (int i = 0; i < n; i++) {
-            final double x_i = sx[i];
-            // ties can be safely ignored
-            if (i > 0 && x_i == sx[i-1]) {
-                continue;
+        do {
+            double z = Double.compare(sx[rankX], sy[rankY]) <= 0 ? sx[rankX] : sy[rankY];
+            while(rankX < n && Double.compare(sx[rankX], z) == 0) {
+                rankX += 1;
             }
-            final double cdf_x = edf(x_i, sx);
-            final double cdf_y = edf(x_i, sy);
-            final double curD = FastMath.abs(cdf_x - cdf_y);
-            if (curD > supD) {
-                supD = curD;
+            while(rankY < m && Double.compare(sy[rankY], z) == 0) {
+                rankY += 1;
             }
-        }
-        // Now look at y
-        for (int i = 0; i < m; i++) {
-            final double y_i = sy[i];
-            // ties can be safely ignored
-            if (i > 0 && y_i == sy[i-1]) {
-                continue;
-            }
-            final double cdf_x = edf(y_i, sx);
-            final double cdf_y = edf(y_i, sy);
+            final double cdf_x = rankX / (double) n;
+            final double cdf_y = rankY / (double) m;
             final double curD = FastMath.abs(cdf_x - cdf_y);
             if (curD > supD) {
                 supD = curD;
             }
-        }
+        } while(rankX < n && rankY < m);
         return supD;
     }
 
     /**
-     * Computes the empirical distribution function.
-     *
-     * @param x the given x
-     * @param samples the observations
-     * @return the empirical distribution function \(F_n(x)\)
-     */
-    private double edf(final double x, final double[] samples) {
-        final int n = samples.length;
-        int index = Arrays.binarySearch(samples, x);
-        if (index >= 0) {
-            while(index < (n - 1) && samples[index+1] == x) {
-                ++index;
-            }
-        }
-        return index >= 0 ? (index + 1d) / n : (-index - 1d) / n;
-    }
-
-    /**
      * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a
      * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
      * evaluating the null hypothesis that {@code data} conforms to {@code distribution}.

http://git-wip-us.apache.org/repos/asf/commons-math/blob/276e2285/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java b/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java
index 0a73139..3b1c10b 100644
--- a/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java
+++ b/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java
@@ -17,6 +17,8 @@
 
 package org.apache.commons.math4.stat.inference;
 
+import java.util.Arrays;
+
 import org.apache.commons.math4.distribution.NormalDistribution;
 import org.apache.commons.math4.distribution.UniformRealDistribution;
 import org.apache.commons.math4.random.Well19937c;
@@ -365,6 +367,16 @@ public class KolmogorovSmirnovTestTest {
 
     }
 
+    @Test
+    public void testTwoSamplesAllEqual() {
+        final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
+        for (int i = 2; i < 30; ++i) {
+            double[] values = new double[i];
+            Arrays.fill(values, i);
+            Assert.assertEquals(0., test.kolmogorovSmirnovStatistic(values, values), 0.);
+        }
+    }
+
     /**
      * Verifies the inequality exactP(criticalValue, n, m, true) < alpha < exactP(criticalValue, n,
      * m, false).