You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by oe...@apache.org on 2015/09/16 20:36:40 UTC
[1/2] [math] MATH-1274: representation of Kolmogorov-Smirnov
statistic as integral value
Repository: commons-math
Updated Branches:
refs/heads/MATH_3_X 1cdaba9d5 -> 1c9c43c1d
refs/heads/master b189817a3 -> fb7e1e265
MATH-1274: representation of Kolmogorov-Smirnov statistic as integral
value
Project: http://git-wip-us.apache.org/repos/asf/commons-math/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-math/commit/fb7e1e26
Tree: http://git-wip-us.apache.org/repos/asf/commons-math/tree/fb7e1e26
Diff: http://git-wip-us.apache.org/repos/asf/commons-math/diff/fb7e1e26
Branch: refs/heads/master
Commit: fb7e1e265dd9e560b3a3127a6593b6602f60026c
Parents: b189817
Author: Otmar Ertl <ot...@gmail.com>
Authored: Wed Sep 16 20:18:06 2015 +0200
Committer: Otmar Ertl <ot...@gmail.com>
Committed: Wed Sep 16 20:18:06 2015 +0200
----------------------------------------------------------------------
src/changes/changes.xml | 3 +
.../stat/inference/KolmogorovSmirnovTest.java | 138 ++++++++++++++-----
2 files changed, 109 insertions(+), 32 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/commons-math/blob/fb7e1e26/src/changes/changes.xml
----------------------------------------------------------------------
diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index 6ffe639..7f04ede 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -54,6 +54,9 @@ If the output is not quite correct, check for invisible trailing spaces!
</release>
<release version="4.0" date="XXXX-XX-XX" description="">
+ <action dev="oertl" type="update" issue="MATH-1274"> <!-- backported to 3.6 -->
+ Representation of Kolmogorov-Smirnov statistic as integral value.
+ </action>
<action dev="erans" type="add" issue="MATH-1270"> <!-- backported to 3.6 -->
Various SOFM visualizations (in package "o.a.c.m.ml.neuralnet.twod.util"):
Unified distance matrix, hit histogram, smoothed data histograms,
http://git-wip-us.apache.org/repos/asf/commons-math/blob/fb7e1e26/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java b/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java
index 7137bfe..0ae189e 100644
--- a/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java
+++ b/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java
@@ -22,7 +22,6 @@ import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
-import org.apache.commons.math4.util.Precision;
import org.apache.commons.math4.distribution.RealDistribution;
import org.apache.commons.math4.exception.InsufficientDataException;
import org.apache.commons.math4.exception.MathArithmeticException;
@@ -250,10 +249,10 @@ public class KolmogorovSmirnovTest {
if (hasTies(x, y)) {
return exactP(x, y, strict);
}
- return exactP(kolmogorovSmirnovStatistic(x, y), x.length, y.length, strict);
+ return integralExactP(integralKolmogorovSmirnovStatistic(x, y) + ((strict)?1l:0l), x.length, y.length);
}
if (lengthProduct < LARGE_SAMPLE_PRODUCT) {
- return monteCarloP(kolmogorovSmirnovStatistic(x, y), x.length, y.length, strict, MONTE_CARLO_ITERATIONS);
+ return integralMonteCarloP(integralKolmogorovSmirnovStatistic(x, y) + ((strict)?1l:0l), x.length, y.length, MONTE_CARLO_ITERATIONS);
}
return approximateP(kolmogorovSmirnovStatistic(x, y), x.length, y.length);
}
@@ -292,6 +291,25 @@ public class KolmogorovSmirnovTest {
* @throws NullArgumentException if either {@code x} or {@code y} is null
*/
public double kolmogorovSmirnovStatistic(double[] x, double[] y) {
+ return integralKolmogorovSmirnovStatistic(x, y)/((double)(x.length * (long)y.length));
+ }
+
+ /**
+ * Computes the two-sample Kolmogorov-Smirnov test statistic, \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\)
+ * where \(n\) is the length of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the
+ * empirical distribution that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\)
+ * is the empirical distribution of the {@code y} values. Finally \(n m D_{n,m}\) is returned
+ * as long value.
+ *
+ * @param x first sample
+ * @param y second sample
+ * @return test statistic \(n m D_{n,m}\) used to evaluate the null hypothesis that {@code x} and
+ * {@code y} represent samples from the same underlying distribution
+ * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
+ * least 2
+ * @throws NullArgumentException if either {@code x} or {@code y} is null
+ */
+ private long integralKolmogorovSmirnovStatistic(double[] x, double[] y) {
checkArray(x);
checkArray(y);
// Copy and sort the sample arrays
@@ -304,23 +322,26 @@ public class KolmogorovSmirnovTest {
int rankX = 0;
int rankY = 0;
+ long curD = 0l;
// Find the max difference between cdf_x and cdf_y
- double supD = 0d;
+ long supD = 0l;
do {
double z = Double.compare(sx[rankX], sy[rankY]) <= 0 ? sx[rankX] : sy[rankY];
while(rankX < n && Double.compare(sx[rankX], z) == 0) {
rankX += 1;
+ curD += m;
}
while(rankY < m && Double.compare(sy[rankY], z) == 0) {
rankY += 1;
+ curD -= n;
}
- final double cdf_x = rankX / (double) n;
- final double cdf_y = rankY / (double) m;
- final double curD = FastMath.abs(cdf_x - cdf_y);
if (curD > supD) {
supD = curD;
}
+ else if (-curD > supD) {
+ supD = -curD;
+ }
} while(rankX < n && rankY < m);
return supD;
}
@@ -864,6 +885,32 @@ public class KolmogorovSmirnovTest {
}
/**
+ * Given a d-statistic in the range [0, 1] and the two sample sizes n and m,
+ * an integral d-statistic in the range [0, n*m] is calculated, that can be used for
+ * comparison with other integral d-statistics. Depending whether {@code strict} is
+ * {@code true} or not, the returned value divided by (n*m) is greater than
+ * (resp greater than or equal to) the given d value (allowing some tolerance).
+ *
+ * @param d a d-statistic in the range [0, 1]
+ * @param n first sample size
+ * @param m second sample size
+ * @param strict whether the returned value divided by (n*m) is allowed to be equal to d
+ * @return the integral d-statistic in the range [0, n*m]
+ */
+ private static long calculateIntegralD(double d, int n, int m, boolean strict) {
+ final double tol = 1e-12; // d-values within tol of one another are considered equal
+ long nm = n * (long)m;
+ long upperBound = (long)FastMath.ceil((d - tol) * nm);
+ long lowerBound = (long)FastMath.floor((d + tol) * nm);
+ if (strict && lowerBound == upperBound) {
+ return upperBound + 1l;
+ }
+ else {
+ return upperBound;
+ }
+ }
+
+ /**
* Computes \(P(D_{n,m} > d)\) if {@code strict} is {@code true}; otherwise \(P(D_{n,m} \ge
* d)\), where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic. See
* {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\).
@@ -888,11 +935,28 @@ public class KolmogorovSmirnovTest {
* greater than (resp. greater than or equal to) {@code d}
*/
public double exactP(double d, int n, int m, boolean strict) {
+ return integralExactP(calculateIntegralD(d, n, m, strict), n, m);
+ }
+
+ /**
+ * Computes \(P(D_{n,m} >= d/(n*m))\), where \(D_{n,m}\) is the
+ * 2-sample Kolmogorov-Smirnov statistic.
+ * <p>
+ * Here d is the D-statistic represented as long value.
+ * The real D-statistic is obtained by dividing d by n*m.
+ * See also {@link #exactP(double, int, int, boolean)}.
+ *
+ * @param d integral D-statistic
+ * @param n first sample size
+ * @param m second sample size
+ * @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\)
+ * greater than or equal to {@code d/(n*m)}
+ */
+ private double integralExactP(long d, int n, int m) {
Iterator<int[]> combinationsIterator = CombinatoricsUtils.combinationsIterator(n + m, n);
long tail = 0;
final double[] nSet = new double[n];
final double[] mSet = new double[m];
- final double tol = 1e-12; // d-values within tol of one another are considered equal
while (combinationsIterator.hasNext()) {
// Generate an n-set
final int[] nSetI = combinationsIterator.next();
@@ -906,9 +970,8 @@ public class KolmogorovSmirnovTest {
mSet[k++] = i;
}
}
- final double curD = kolmogorovSmirnovStatistic(nSet, mSet);
- final int order = Precision.compareTo(curD, d, tol);
- if (order > 0 || (order == 0 && !strict)) {
+ final long curD = integralKolmogorovSmirnovStatistic(nSet, mSet);
+ if (curD >= d) {
tail++;
}
}
@@ -937,7 +1000,7 @@ public class KolmogorovSmirnovTest {
* @return p-value
*/
public double exactP(double[] x, double[] y, boolean strict) {
- final double d = kolmogorovSmirnovStatistic(x, y);
+ final long d = integralKolmogorovSmirnovStatistic(x, y);
final int n = x.length;
final int m = y.length;
@@ -952,7 +1015,6 @@ public class KolmogorovSmirnovTest {
long tail = 0;
final double[] nSet = new double[n];
final double[] mSet = new double[m];
- final double tol = 1e-12; // d-values within tol of one another are considered equal
while (combinationsIterator.hasNext()) {
// Generate an n-set
final int[] nSetI = combinationsIterator.next();
@@ -967,9 +1029,8 @@ public class KolmogorovSmirnovTest {
mSet[k++] = universe[i];
}
}
- final double curD = kolmogorovSmirnovStatistic(nSet, mSet);
- final int order = Precision.compareTo(curD, d, tol);
- if (order > 0 || (order == 0 && !strict)) {
+ final long curD = integralKolmogorovSmirnovStatistic(nSet, mSet);
+ if (curD > d || (curD == d && !strict)) {
tail++;
}
}
@@ -1042,36 +1103,49 @@ public class KolmogorovSmirnovTest {
*/
public double monteCarloP(final double d, final int n, final int m, final boolean strict,
final int iterations) {
+ return integralMonteCarloP(calculateIntegralD(d, n, m, strict), n, m, iterations);
+ }
+
+ /**
+ * Uses Monte Carlo simulation to approximate \(P(D_{n,m} >= d/(n*m))\) where \(D_{n,m}\) is the
+ * 2-sample Kolmogorov-Smirnov statistic.
+ * <p>
+ * Here d is the D-statistic represented as long value.
+ * The real D-statistic is obtained by dividing d by n*m.
+ * See also {@link #monteCarloP(double, int, int, boolean, int)}.
+ *
+ * @param d integral D-statistic
+ * @param n first sample size
+ * @param m second sample size
+ * @param iterations number of random partitions to generate
+ * @return proportion of randomly generated m-n partitions of m + n that result in \(D_{n,m}\)
+ * greater than or equal to {@code d/(n*m))}
+ */
+ private double integralMonteCarloP(final long d, final int n, final int m, final int iterations) {
// ensure that nn is always the max of (n, m) to require fewer random numbers
final int nn = FastMath.max(n, m);
final int mm = FastMath.min(n, m);
final int sum = nn + mm;
- final double tol = 1e-12; // d-values within tol of one another are considered equal
int tail = 0;
final boolean b[] = new boolean[sum];
for (int i = 0; i < iterations; i++) {
fillBooleanArrayRandomlyWithFixedNumberTrueValues(b, nn, rng);
- int rankN = b[0] ? 1 : 0;
- int rankM = b[0] ? 0 : 1;
- boolean previous = b[0];
- for(int j = 1; j < b.length; ++j) {
- if (b[j] != previous) {
- final double cdf_n = rankN / (double) nn;
- final double cdf_m = rankM / (double) mm;
- final double curD = FastMath.abs(cdf_n - cdf_m);
- final int order = Precision.compareTo(curD, d, tol);
- if (order > 0 || (order == 0 && !strict)) {
+ long curD = 0l;
+ for(int j = 0; j < b.length; ++j) {
+ if (b[j]) {
+ curD += mm;
+ if (curD >= d) {
tail++;
break;
}
- }
- previous = b[j];
- if (b[j]) {
- rankN++;
} else {
- rankM++;
+ curD -= nn;
+ if (curD <= -d) {
+ tail++;
+ break;
+ }
}
}
}
[2/2] [math] MATH-1274: representation of Kolmogorov-Smirnov
statistic as integral value
Posted by oe...@apache.org.
MATH-1274: representation of Kolmogorov-Smirnov statistic as integral
value
Project: http://git-wip-us.apache.org/repos/asf/commons-math/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-math/commit/1c9c43c1
Tree: http://git-wip-us.apache.org/repos/asf/commons-math/tree/1c9c43c1
Diff: http://git-wip-us.apache.org/repos/asf/commons-math/diff/1c9c43c1
Branch: refs/heads/MATH_3_X
Commit: 1c9c43c1d4bb76d7e47cdfc9c6681a38305a95e4
Parents: 1cdaba9
Author: Otmar Ertl <ot...@gmail.com>
Authored: Wed Sep 16 20:34:46 2015 +0200
Committer: Otmar Ertl <ot...@gmail.com>
Committed: Wed Sep 16 20:34:46 2015 +0200
----------------------------------------------------------------------
src/changes/changes.xml | 3 +
.../stat/inference/KolmogorovSmirnovTest.java | 195 ++++++++++++++++---
2 files changed, 170 insertions(+), 28 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/commons-math/blob/1c9c43c1/src/changes/changes.xml
----------------------------------------------------------------------
diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index 2d1fecd..d1eaf45 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -51,6 +51,9 @@ If the output is not quite correct, check for invisible trailing spaces!
</properties>
<body>
<release version="3.6" date="XXXX-XX-XX" description="">
+ <action dev="oertl" type="update" issue="MATH-1274">
+ Representation of Kolmogorov-Smirnov statistic as integral value.
+ </action>
<action dev="luc" type="add" issue="MATH-1273" due-to="Qualtagh">
Added negative zero support in FastMath.pow.
</action>
http://git-wip-us.apache.org/repos/asf/commons-math/blob/1c9c43c1/src/main/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTest.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTest.java b/src/main/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTest.java
index 74bede9..eac1546 100644
--- a/src/main/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTest.java
+++ b/src/main/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTest.java
@@ -21,7 +21,6 @@ import java.math.BigDecimal;
import java.util.Arrays;
import java.util.Iterator;
-import org.apache.commons.math3.util.Precision;
import org.apache.commons.math3.distribution.RealDistribution;
import org.apache.commons.math3.exception.InsufficientDataException;
import org.apache.commons.math3.exception.MathArithmeticException;
@@ -220,7 +219,10 @@ public class KolmogorovSmirnovTest {
* {@value #SMALL_SAMPLE_PRODUCT}), the exact distribution is used to compute the p-value. This
* is accomplished by enumerating all partitions of the combined sample into two subsamples of
* the respective sample sizes, computing \(D_{n,m}\) for each partition and returning the
- * proportion of partitions that give \(D\) values exceeding the observed value.</li>
+ * proportion of partitions that give \(D\) values exceeding the observed value. In the very
+ * small sample case, if there are ties in the data, the actual sample values (including ties)
+ * are used in generating the partitions (which are basically multi-set partitions in this
+ * case).</li>
* <li>For mid-size samples (product of sample sizes greater than or equal to
* {@value #SMALL_SAMPLE_PRODUCT} but less than {@value #LARGE_SAMPLE_PRODUCT}), Monte Carlo
* simulation is used to compute the p-value. The simulation randomly generates partitions and
@@ -243,10 +245,10 @@ public class KolmogorovSmirnovTest {
public double kolmogorovSmirnovTest(double[] x, double[] y, boolean strict) {
final long lengthProduct = (long) x.length * y.length;
if (lengthProduct < SMALL_SAMPLE_PRODUCT) {
- return exactP(kolmogorovSmirnovStatistic(x, y), x.length, y.length, strict);
+ return integralExactP(integralKolmogorovSmirnovStatistic(x, y) + ((strict)?1l:0l), x.length, y.length);
}
if (lengthProduct < LARGE_SAMPLE_PRODUCT) {
- return monteCarloP(kolmogorovSmirnovStatistic(x, y), x.length, y.length, strict, MONTE_CARLO_ITERATIONS);
+ return integralMonteCarloP(integralKolmogorovSmirnovStatistic(x, y) + ((strict)?1l:0l), x.length, y.length, MONTE_CARLO_ITERATIONS);
}
return approximateP(kolmogorovSmirnovStatistic(x, y), x.length, y.length);
}
@@ -285,6 +287,25 @@ public class KolmogorovSmirnovTest {
* @throws NullArgumentException if either {@code x} or {@code y} is null
*/
public double kolmogorovSmirnovStatistic(double[] x, double[] y) {
+ return integralKolmogorovSmirnovStatistic(x, y)/((double)(x.length * (long)y.length));
+ }
+
+ /**
+ * Computes the two-sample Kolmogorov-Smirnov test statistic, \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\)
+ * where \(n\) is the length of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the
+ * empirical distribution that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\)
+ * is the empirical distribution of the {@code y} values. Finally \(n m D_{n,m}\) is returned
+ * as long value.
+ *
+ * @param x first sample
+ * @param y second sample
+ * @return test statistic \(n m D_{n,m}\) used to evaluate the null hypothesis that {@code x} and
+ * {@code y} represent samples from the same underlying distribution
+ * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
+ * least 2
+ * @throws NullArgumentException if either {@code x} or {@code y} is null
+ */
+ private long integralKolmogorovSmirnovStatistic(double[] x, double[] y) {
checkArray(x);
checkArray(y);
// Copy and sort the sample arrays
@@ -297,23 +318,26 @@ public class KolmogorovSmirnovTest {
int rankX = 0;
int rankY = 0;
+ long curD = 0l;
// Find the max difference between cdf_x and cdf_y
- double supD = 0d;
+ long supD = 0l;
do {
double z = Double.compare(sx[rankX], sy[rankY]) <= 0 ? sx[rankX] : sy[rankY];
while(rankX < n && Double.compare(sx[rankX], z) == 0) {
rankX += 1;
+ curD += m;
}
while(rankY < m && Double.compare(sy[rankY], z) == 0) {
rankY += 1;
+ curD -= n;
}
- final double cdf_x = rankX / (double) n;
- final double cdf_y = rankY / (double) m;
- final double curD = FastMath.abs(cdf_x - cdf_y);
if (curD > supD) {
supD = curD;
}
+ else if (-curD > supD) {
+ supD = -curD;
+ }
} while(rankX < n && rankY < m);
return supD;
}
@@ -858,6 +882,32 @@ public class KolmogorovSmirnovTest {
}
/**
+ * Given a d-statistic in the range [0, 1] and the two sample sizes n and m,
+ * an integral d-statistic in the range [0, n*m] is calculated, that can be used for
+ * comparison with other integral d-statistics. Depending whether {@code strict} is
+ * {@code true} or not, the returned value divided by (n*m) is greater than
+ * (resp greater than or equal to) the given d value (allowing some tolerance).
+ *
+ * @param d a d-statistic in the range [0, 1]
+ * @param n first sample size
+ * @param m second sample size
+ * @param strict whether the returned value divided by (n*m) is allowed to be equal to d
+ * @return the integral d-statistic in the range [0, n*m]
+ */
+ private static long calculateIntegralD(double d, int n, int m, boolean strict) {
+ final double tol = 1e-12; // d-values within tol of one another are considered equal
+ long nm = n * (long)m;
+ long upperBound = (long)FastMath.ceil((d - tol) * nm);
+ long lowerBound = (long)FastMath.floor((d + tol) * nm);
+ if (strict && lowerBound == upperBound) {
+ return upperBound + 1l;
+ }
+ else {
+ return upperBound;
+ }
+ }
+
+ /**
* Computes \(P(D_{n,m} > d)\) if {@code strict} is {@code true}; otherwise \(P(D_{n,m} \ge
* d)\), where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic. See
* {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\).
@@ -882,11 +932,28 @@ public class KolmogorovSmirnovTest {
* greater than (resp. greater than or equal to) {@code d}
*/
public double exactP(double d, int n, int m, boolean strict) {
+ return integralExactP(calculateIntegralD(d, n, m, strict), n, m);
+ }
+
+ /**
+ * Computes \(P(D_{n,m} >= d/(n*m))\), where \(D_{n,m}\) is the
+ * 2-sample Kolmogorov-Smirnov statistic.
+ * <p>
+ * Here d is the D-statistic represented as long value.
+ * The real D-statistic is obtained by dividing d by n*m.
+ * See also {@link #exactP(double, int, int, boolean)}.
+ *
+ * @param d integral D-statistic
+ * @param n first sample size
+ * @param m second sample size
+ * @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\)
+ * greater than or equal to {@code d/(n*m)}
+ */
+ private double integralExactP(long d, int n, int m) {
Iterator<int[]> combinationsIterator = CombinatoricsUtils.combinationsIterator(n + m, n);
long tail = 0;
final double[] nSet = new double[n];
final double[] mSet = new double[m];
- final double tol = 1e-12; // d-values within tol of one another are considered equal
while (combinationsIterator.hasNext()) {
// Generate an n-set
final int[] nSetI = combinationsIterator.next();
@@ -900,9 +967,67 @@ public class KolmogorovSmirnovTest {
mSet[k++] = i;
}
}
- final double curD = kolmogorovSmirnovStatistic(nSet, mSet);
- final int order = Precision.compareTo(curD, d, tol);
- if (order > 0 || (order == 0 && !strict)) {
+ final long curD = integralKolmogorovSmirnovStatistic(nSet, mSet);
+ if (curD >= d) {
+ tail++;
+ }
+ }
+ return (double) tail / (double) CombinatoricsUtils.binomialCoefficient(n + m, n);
+ }
+
+ /**
+ * Computes the exact p value for a two-sample Kolmogorov-Smirnov test with
+ * {@code x} and {@code y} as samples, possibly containing ties. This method
+ * uses the same implementation as {@link #exactP(double, int, int, boolean)}
+ * with the exception that it examines partitions of the combined sample,
+ * preserving ties in the data. What is returned is the exact probability
+ * that a random partition of the combined dataset into a subset of size
+ * {@code x.length} and another of size {@code y.length} yields a \(D\)
+ * value greater than (resp greater than or equal to) \(D(x,y)\).
+ * <p>
+ * This method should not be used on large samples (a good rule of thumb is
+ * to keep the product of the sample sizes less than
+ * {@link #SMALL_SAMPLE_PRODUCT} when using this method). If the data do
+ * not contain ties, {@link #exactP(double[], double[], boolean)} should be
+ * used instead of this method.</p>
+ *
+ * @param x first sample
+ * @param y second sample
+ * @param strict whether or not the inequality in the null hypothesis is strict
+ * @return p-value
+ */
+ public double exactP(double[] x, double[] y, boolean strict) {
+ final long d = integralKolmogorovSmirnovStatistic(x, y);
+ final int n = x.length;
+ final int m = y.length;
+
+ // Concatenate x and y into universe, preserving ties in the data
+ final double[] universe = new double[n + m];
+ System.arraycopy(x, 0, universe, 0, n);
+ System.arraycopy(y, 0, universe, n, m);
+
+ // Iterate over all n, m partitions of the n + m elements in the universe,
+ // Computing D for each one
+ Iterator<int[]> combinationsIterator = CombinatoricsUtils.combinationsIterator(n + m, n);
+ long tail = 0;
+ final double[] nSet = new double[n];
+ final double[] mSet = new double[m];
+ while (combinationsIterator.hasNext()) {
+ // Generate an n-set
+ final int[] nSetI = combinationsIterator.next();
+ // Copy the elements of the universe in the n-set to nSet
+ // and the others to mSet
+ int j = 0;
+ int k = 0;
+ for (int i = 0; i < n + m; i++) {
+ if (j < n && nSetI[j] == i) {
+ nSet[j++] = universe[i];
+ } else {
+ mSet[k++] = universe[i];
+ }
+ }
+ final long curD = integralKolmogorovSmirnovStatistic(nSet, mSet);
+ if (curD > d || (curD == d && !strict)) {
tail++;
}
}
@@ -974,35 +1099,49 @@ public class KolmogorovSmirnovTest {
*/
public double monteCarloP(final double d, final int n, final int m, final boolean strict,
final int iterations) {
+ return integralMonteCarloP(calculateIntegralD(d, n, m, strict), n, m, iterations);
+ }
+
+ /**
+ * Uses Monte Carlo simulation to approximate \(P(D_{n,m} >= d/(n*m))\) where \(D_{n,m}\) is the
+ * 2-sample Kolmogorov-Smirnov statistic.
+ * <p>
+ * Here d is the D-statistic represented as long value.
+ * The real D-statistic is obtained by dividing d by n*m.
+ * See also {@link #monteCarloP(double, int, int, boolean, int)}.
+ *
+ * @param d integral D-statistic
+ * @param n first sample size
+ * @param m second sample size
+ * @param iterations number of random partitions to generate
+ * @return proportion of randomly generated m-n partitions of m + n that result in \(D_{n,m}\)
+ * greater than or equal to {@code d/(n*m))}
+ */
+ private double integralMonteCarloP(final long d, final int n, final int m, final int iterations) {
+
// ensure that nn is always the max of (n, m) to require fewer random numbers
final int nn = FastMath.max(n, m);
final int mm = FastMath.min(n, m);
final int sum = nn + mm;
- final double tol = 1e-12; // d-values within tol of one another are considered equal
int tail = 0;
final boolean b[] = new boolean[sum];
for (int i = 0; i < iterations; i++) {
fillBooleanArrayRandomlyWithFixedNumberTrueValues(b, nn, rng);
- int rankN = b[0] ? 1 : 0;
- int rankM = b[0] ? 0 : 1;
- boolean previous = b[0];
- for(int j = 1; j < b.length; ++j) {
- if (b[j] != previous) {
- final double cdf_n = rankN / (double) nn;
- final double cdf_m = rankM / (double) mm;
- final double curD = FastMath.abs(cdf_n - cdf_m);
- final int order = Precision.compareTo(curD, d, tol);
- if (order > 0 || (order == 0 && !strict)) {
+ long curD = 0l;
+ for(int j = 0; j < b.length; ++j) {
+ if (b[j]) {
+ curD += mm;
+ if (curD >= d) {
tail++;
break;
}
- }
- previous = b[j];
- if (b[j]) {
- rankN++;
} else {
- rankM++;
+ curD -= nn;
+ if (curD <= -d) {
+ tail++;
+ break;
+ }
}
}
}