You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by tn...@apache.org on 2015/06/21 19:39:41 UTC
[math] [MATH-1236] Improve performance of calculating the two-sample
Kolmogorov-Smirnov test statistic. Thanks to Otmar Ertl.
Repository: commons-math
Updated Branches:
refs/heads/master 75c2b24c6 -> 276e22858
[MATH-1236] Improve performance of calculating the two-sample Kolmogorov-Smirnov test statistic. Thanks to Otmar Ertl.
Project: http://git-wip-us.apache.org/repos/asf/commons-math/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-math/commit/276e2285
Tree: http://git-wip-us.apache.org/repos/asf/commons-math/tree/276e2285
Diff: http://git-wip-us.apache.org/repos/asf/commons-math/diff/276e2285
Branch: refs/heads/master
Commit: 276e22858cdd56f5725e0c2f7a3522df99967973
Parents: 75c2b24
Author: Thomas Neidhart <th...@gmail.com>
Authored: Sun Jun 21 19:39:23 2015 +0200
Committer: Thomas Neidhart <th...@gmail.com>
Committed: Sun Jun 21 19:39:23 2015 +0200
----------------------------------------------------------------------
src/changes/changes.xml | 4 ++
.../stat/inference/KolmogorovSmirnovTest.java | 52 +++++---------------
.../inference/KolmogorovSmirnovTestTest.java | 12 +++++
3 files changed, 28 insertions(+), 40 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/commons-math/blob/276e2285/src/changes/changes.xml
----------------------------------------------------------------------
diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index 13193e5..cb25351 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -54,6 +54,10 @@ If the output is not quite correct, check for invisible trailing spaces!
</release>
<release version="4.0" date="XXXX-XX-XX" description="">
+ <action dev="tn" type="fix" issue="MATH-1232" due-to="Otmar Ertl"> <!-- backported to 3.6 -->
+ Improved performance of calculating the two-sample Kolmogorov-Smirnov
+ test statistic.
+ </action>
<action dev="erans" type="fix" issue="MATH-1231">
Lifted unnecessary restriction on constructor's argument of
"MicrosphereInterpolator" (package "o.a.c.m.analysis.interpolation").
http://git-wip-us.apache.org/repos/asf/commons-math/blob/276e2285/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java b/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java
index 6144ff7..a664cda 100644
--- a/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java
+++ b/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java
@@ -294,58 +294,30 @@ public class KolmogorovSmirnovTest {
final int n = sx.length;
final int m = sy.length;
+ int rankX = 0;
+ int rankY = 0;
+
// Find the max difference between cdf_x and cdf_y
double supD = 0d;
- // First walk x points
- for (int i = 0; i < n; i++) {
- final double x_i = sx[i];
- // ties can be safely ignored
- if (i > 0 && x_i == sx[i-1]) {
- continue;
+ do {
+ double z = Double.compare(sx[rankX], sy[rankY]) <= 0 ? sx[rankX] : sy[rankY];
+ while(rankX < n && Double.compare(sx[rankX], z) == 0) {
+ rankX += 1;
}
- final double cdf_x = edf(x_i, sx);
- final double cdf_y = edf(x_i, sy);
- final double curD = FastMath.abs(cdf_x - cdf_y);
- if (curD > supD) {
- supD = curD;
+ while(rankY < m && Double.compare(sy[rankY], z) == 0) {
+ rankY += 1;
}
- }
- // Now look at y
- for (int i = 0; i < m; i++) {
- final double y_i = sy[i];
- // ties can be safely ignored
- if (i > 0 && y_i == sy[i-1]) {
- continue;
- }
- final double cdf_x = edf(y_i, sx);
- final double cdf_y = edf(y_i, sy);
+ final double cdf_x = rankX / (double) n;
+ final double cdf_y = rankY / (double) m;
final double curD = FastMath.abs(cdf_x - cdf_y);
if (curD > supD) {
supD = curD;
}
- }
+ } while(rankX < n && rankY < m);
return supD;
}
/**
- * Computes the empirical distribution function.
- *
- * @param x the given x
- * @param samples the observations
- * @return the empirical distribution function \(F_n(x)\)
- */
- private double edf(final double x, final double[] samples) {
- final int n = samples.length;
- int index = Arrays.binarySearch(samples, x);
- if (index >= 0) {
- while(index < (n - 1) && samples[index+1] == x) {
- ++index;
- }
- }
- return index >= 0 ? (index + 1d) / n : (-index - 1d) / n;
- }
-
- /**
* Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a
* href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
* evaluating the null hypothesis that {@code data} conforms to {@code distribution}.
http://git-wip-us.apache.org/repos/asf/commons-math/blob/276e2285/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java b/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java
index 0a73139..3b1c10b 100644
--- a/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java
+++ b/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java
@@ -17,6 +17,8 @@
package org.apache.commons.math4.stat.inference;
+import java.util.Arrays;
+
import org.apache.commons.math4.distribution.NormalDistribution;
import org.apache.commons.math4.distribution.UniformRealDistribution;
import org.apache.commons.math4.random.Well19937c;
@@ -365,6 +367,16 @@ public class KolmogorovSmirnovTestTest {
}
+ @Test
+ public void testTwoSamplesAllEqual() {
+ final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
+ for (int i = 2; i < 30; ++i) {
+ double[] values = new double[i];
+ Arrays.fill(values, i);
+ Assert.assertEquals(0., test.kolmogorovSmirnovStatistic(values, values), 0.);
+ }
+ }
+
/**
* Verifies the inequality exactP(criticalValue, n, m, true) < alpha < exactP(criticalValue, n,
* m, false).