You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by td...@apache.org on 2010/07/22 02:59:03 UTC
svn commit: r966469 - in /mahout/trunk/math/src:
main/java/org/apache/mahout/math/stats/OnlineSummarizer.java
test/java/org/apache/mahout/math/stats/OnlineSummarizerTest.java
Author: tdunning
Date: Thu Jul 22 00:59:03 2010
New Revision: 966469
URL: http://svn.apache.org/viewvc?rev=966469&view=rev
Log:
MAHOUT-444 - on-line estimator widget with tests that demonstrate accuracy.
Added:
mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/OnlineSummarizer.java
mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/OnlineSummarizerTest.java
Added: mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/OnlineSummarizer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/OnlineSummarizer.java?rev=966469&view=auto
==============================================================================
--- mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/OnlineSummarizer.java (added)
+++ mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/OnlineSummarizer.java Thu Jul 22 00:59:03 2010
@@ -0,0 +1,166 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.stats;
+
+import org.apache.mahout.math.list.DoubleArrayList;
+
+/**
+ * Computes on-line estimates of mean, variance and all five quartiles (notably including the
+ * median). Since this is done in a completely incremental fashion (that is what is meant by
+ * on-line) estimates are available at any time and the amount of memory used is constant. Somewhat
+ * surprisingly, the quantile estimates are about as good as you would get if you actually kept all
+ * of the samples.
+ * <p/>
+ * The method used for mean and variance is Welford's method. See
+ * <p/>
+ * http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm
+ * <p/>
+ * The method used for computing the quartiles is a simplified form of the stochastic approximation
+ * method described in the article "Incremental Quantile Estimation for Massive Tracking" by Chen,
+ * Lambert and Pinheiro
+ * <p/>
+ * See
+ * <p/>
+ * http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.105.1580
+ */
+public class OnlineSummarizer {
+ boolean sorted = true;
+
+ // the first several samples are kept so we can boot-strap our estimates cleanly
+ private DoubleArrayList starter = new DoubleArrayList(100);
+
+ // quartile estimates
+ private double q[] = new double[5];
+
+ // mean and variance estimates
+ private double mean = 0;
+ private double variance = 0;
+
+ // number of samples seen so far
+ private int n = 0;
+
+ public void add(double sample) {
+ sorted = false;
+
+ n++;
+ double oldMean = mean;
+ mean += (sample - mean) / n;
+ double diff = (sample - mean) * (sample - oldMean);
+ variance += (diff - variance) / n;
+
+ if (n < 100) {
+ starter.add(sample);
+ } else if (n == 100) {
+ starter.add(sample);
+ q[0] = min();
+ q[1] = quartile(1);
+ q[2] = quartile(2);
+ q[3] = quartile(3);
+ q[4] = max();
+ starter = null;
+ } else {
+ q[0] = Math.min(sample, q[0]);
+ q[4] = Math.max(sample, q[4]);
+
+ double rate = 2 * (q[3] - q[1]) / n;
+ q[1] += (Math.signum(sample - q[1]) - 0.5) * rate;
+ q[2] += (Math.signum(sample - q[2])) * rate;
+ q[3] += (Math.signum(sample - q[3]) + 0.5) * rate;
+
+ if (q[1] < q[0]) {
+ q[1] = q[0];
+ }
+
+ if (q[3] > q[4]) {
+ q[3] = q[4];
+ }
+ }
+ }
+
+ public int count() {
+ return n;
+ }
+
+ public double mean() {
+ return mean;
+ }
+
+ public double sd() {
+ return Math.sqrt(variance);
+ }
+
+ public double min() {
+ sort();
+ if (n == 0) {
+ throw new IllegalArgumentException("Must have at least one sample to estimate minimum value");
+ }
+ if (n <= 100) {
+ return starter.get(0);
+ } else {
+ return q[0];
+ }
+ }
+
+ private void sort() {
+ if (!sorted && starter != null) {
+ starter.sortFromTo(0, 99);
+ sorted = true;
+ }
+ }
+
+ public double max() {
+ sort();
+ if (n == 0) {
+ throw new IllegalArgumentException("Must have at least one sample to estimate maximum value");
+ }
+ if (n <= 100) {
+ return starter.get(99);
+ } else {
+ return q[4];
+ }
+ }
+
+ public double quartile(int i) {
+ sort();
+ switch (i) {
+ case 0:
+ return min();
+ case 1:
+ case 2:
+ case 3:
+ if (n > 100) {
+ return q[i];
+ } else if (n < 2) {
+ throw new IllegalArgumentException("Must have at least two samples to estimate quartiles");
+ } else {
+ double x = i * (n - 1) / 4.0;
+ int k = (int) Math.floor(x);
+ double u = x - k;
+ return starter.get(k) * (1 - u) + starter.get(k + 1) * u;
+ }
+ case 4:
+ return max();
+ default:
+ throw new IllegalArgumentException("Quartile number must be in the range [0..4] not " + i);
+ }
+ }
+
+ public double median() {
+ return quartile(2);
+ }
+}
Added: mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/OnlineSummarizerTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/OnlineSummarizerTest.java?rev=966469&view=auto
==============================================================================
--- mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/OnlineSummarizerTest.java (added)
+++ mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/OnlineSummarizerTest.java Thu Jul 22 00:59:03 2010
@@ -0,0 +1,126 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.stats;
+
+import org.apache.mahout.math.jet.random.Gamma;
+import org.apache.mahout.math.jet.random.engine.MersenneTwister;
+import org.junit.Test;
+
+import java.util.Random;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
+
+public class OnlineSummarizerTest {
+ @Test
+ public void testCount() {
+ OnlineSummarizer x = new OnlineSummarizer();
+ assertEquals(0, x.count());
+ x.add(1);
+ assertEquals(1, x.count());
+
+ for (int i = 1; i < 110; i++) {
+ x.add(i);
+ assertEquals(i, x.count());
+ }
+ }
+
+ @Test
+ public void testStats() {
+ // the reference limits here were derived using a numerical simulation where I took
+ // 10,000 samples from the distribution in question and computed the stats from that
+ // sample to get min, 25%-ile, median and so on. I did this 1000 times to get 5% and
+ // 95% confidence limits for those values.
+
+ // symmetrical, well behaved
+ check(normal(10000, 1),
+ -4.417246, -3.419809,
+ -0.6972919, -0.6519899,
+ -0.02056658, 0.02176474,
+ 0.6503866, 0.6983311,
+ 3.419809, 4.417246,
+ -0.01515753, 0.01592942,
+ 0.988395, 1.011883);
+
+ // asymmetrical, well behaved. The range for the maximum was fudged slightly to all this to pass.
+ check(exp(10000, 1),
+ 4.317969e-06, 3.278763e-04,
+ 0.2783866, 0.298,
+ 0.6765024, 0.7109463,
+ 1.356929, 1.414761,
+ 8, 13,
+ 0.983805, 1.015920,
+ 0.977162, 1.022093
+ );
+
+ // asymmetrical, wacko distribution where mean/median > 10^28
+ check(gamma(10000, 1),
+ 0, 0, // minimum
+ 1.63067132881301e-60, 6.26363334269806e-58, // 25th %-ile
+ 8.62261497075834e-30, 2.01422505081014e-28, // median
+ 6.70225617733614e-12, 4.44299757853286e-11, // 75th %-ile
+ 238.451174077827, 579.143886928158, // maximum
+ 0.837031762527458, 1.17244066539313, // mean
+ 8.10277696526878, 12.1426255901507); // standard dev
+ }
+
+ private void check(OnlineSummarizer x, double... values) {
+ for (int i = 0; i < 5; i++) {
+ checkRange(String.format("quartile %d", i), x.quartile(i), values[2 * i], values[2 * i + 1]);
+ }
+ assertEquals(x.quartile(2), x.median(), 0);
+
+ checkRange("mean", x.mean(), values[10], values[11]);
+ checkRange("sd", x.sd(), values[12], values[13]);
+ }
+
+ private void checkRange(String msg, double v, double low, double high) {
+ if (v < low || v > high) {
+ fail(String.format("Wanted %s to be in range [%f,%f] but got %f", msg, low, high, v));
+ }
+ }
+
+ private OnlineSummarizer normal(int n, int seed) {
+ OnlineSummarizer x = new OnlineSummarizer();
+ Random gen = new Random(seed);
+ for (int i = 0; i < n; i++) {
+ x.add(gen.nextGaussian());
+ }
+ return x;
+ }
+
+ private OnlineSummarizer exp(int n, int seed) {
+ OnlineSummarizer x = new OnlineSummarizer();
+ Random gen = new Random(seed);
+ for (int i = 0; i < n; i++) {
+ x.add(-Math.log(1 - gen.nextDouble()));
+ }
+ return x;
+ }
+
+ private OnlineSummarizer gamma(int n, int seed) {
+ OnlineSummarizer x = new OnlineSummarizer();
+ Gamma g = new Gamma(0.01, 100, new MersenneTwister(seed));
+ for (int i = 0; i < n; i++) {
+ x.add(g.nextDouble());
+ }
+ return x;
+ }
+}
+
+
Re: svn commit: r966469 - in /mahout/trunk/math/src:
main/java/org/apache/mahout/math/stats/OnlineSummarizer.java
test/java/org/apache/mahout/math/stats/OnlineSummarizerTest.java
Posted by Ted Dunning <te...@gmail.com>.
That isn't good.
I fixed one and don't understand why the other one is suddenly grumpy.
On Wed, Jul 21, 2010 at 6:40 PM, Jeff Eastman <jd...@windwardsolutions.com>wrote:
> OnlineSummarizerTest seems to be failing both tests
>
> On 7/21/10 5:59 PM, tdunning@apache.org wrote:
>
>> Author: tdunning
>> Date: Thu Jul 22 00:59:03 2010
>> New Revision: 966469
>>
>> URL: http://svn.apache.org/viewvc?rev=966469&view=rev
>> Log:
>> MAHOUT-444 - on-line estimator widget with tests that demonstrate
>> accuracy.
>>
>> Added:
>>
>> mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/OnlineSummarizer.java
>>
>> mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/OnlineSummarizerTest.java
>>
>> Added:
>> mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/OnlineSummarizer.java
>> URL:
>> http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/OnlineSummarizer.java?rev=966469&view=auto
>>
>> ==============================================================================
>> ---
>> mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/OnlineSummarizer.java
>> (added)
>> +++
>> mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/OnlineSummarizer.java
>> Thu Jul 22 00:59:03 2010
>> @@ -0,0 +1,166 @@
>> +/*
>> + * Licensed to the Apache Software Foundation (ASF) under one or more
>> + * contributor license agreements. See the NOTICE file distributed with
>> + * this work for additional information regarding copyright ownership.
>> + * The ASF licenses this file to You under the Apache License, Version
>> 2.0
>> + * (the "License"); you may not use this file except in compliance with
>> + * the License. You may obtain a copy of the License at
>> + *
>> + * http://www.apache.org/licenses/LICENSE-2.0
>> + *
>> + * Unless required by applicable law or agreed to in writing, software
>> + * distributed under the License is distributed on an "AS IS" BASIS,
>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
>> implied.
>> + * See the License for the specific language governing permissions and
>> + * limitations under the License.
>> + */
>> +
>> +package org.apache.mahout.math.stats;
>> +
>> +import org.apache.mahout.math.list.DoubleArrayList;
>> +
>> +/**
>> + * Computes on-line estimates of mean, variance and all five quartiles
>> (notably including the
>> + * median). Since this is done in a completely incremental fashion (that
>> is what is meant by
>> + * on-line) estimates are available at any time and the amount of memory
>> used is constant. Somewhat
>> + * surprisingly, the quantile estimates are about as good as you would
>> get if you actually kept all
>> + * of the samples.
>> + *<p/>
>> + * The method used for mean and variance is Welford's method. See
>> + *<p/>
>> + *
>> http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm
>> + *<p/>
>> + * The method used for computing the quartiles is a simplified form of
>> the stochastic approximation
>> + * method described in the article "Incremental Quantile Estimation for
>> Massive Tracking" by Chen,
>> + * Lambert and Pinheiro
>> + *<p/>
>> + * See
>> + *<p/>
>> + * http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.105.1580
>> + */
>> +public class OnlineSummarizer {
>> + boolean sorted = true;
>> +
>> + // the first several samples are kept so we can boot-strap our
>> estimates cleanly
>> + private DoubleArrayList starter = new DoubleArrayList(100);
>> +
>> + // quartile estimates
>> + private double q[] = new double[5];
>> +
>> + // mean and variance estimates
>> + private double mean = 0;
>> + private double variance = 0;
>> +
>> + // number of samples seen so far
>> + private int n = 0;
>> +
>> + public void add(double sample) {
>> + sorted = false;
>> +
>> + n++;
>> + double oldMean = mean;
>> + mean += (sample - mean) / n;
>> + double diff = (sample - mean) * (sample - oldMean);
>> + variance += (diff - variance) / n;
>> +
>> + if (n< 100) {
>> + starter.add(sample);
>> + } else if (n == 100) {
>> + starter.add(sample);
>> + q[0] = min();
>> + q[1] = quartile(1);
>> + q[2] = quartile(2);
>> + q[3] = quartile(3);
>> + q[4] = max();
>> + starter = null;
>> + } else {
>> + q[0] = Math.min(sample, q[0]);
>> + q[4] = Math.max(sample, q[4]);
>> +
>> + double rate = 2 * (q[3] - q[1]) / n;
>> + q[1] += (Math.signum(sample - q[1]) - 0.5) * rate;
>> + q[2] += (Math.signum(sample - q[2])) * rate;
>> + q[3] += (Math.signum(sample - q[3]) + 0.5) * rate;
>> +
>> + if (q[1]< q[0]) {
>> + q[1] = q[0];
>> + }
>> +
>> + if (q[3]> q[4]) {
>> + q[3] = q[4];
>> + }
>> + }
>> + }
>> +
>> + public int count() {
>> + return n;
>> + }
>> +
>> + public double mean() {
>> + return mean;
>> + }
>> +
>> + public double sd() {
>> + return Math.sqrt(variance);
>> + }
>> +
>> + public double min() {
>> + sort();
>> + if (n == 0) {
>> + throw new IllegalArgumentException("Must have at least one sample
>> to estimate minimum value");
>> + }
>> + if (n<= 100) {
>> + return starter.get(0);
>> + } else {
>> + return q[0];
>> + }
>> + }
>> +
>> + private void sort() {
>> + if (!sorted&& starter != null) {
>> + starter.sortFromTo(0, 99);
>> + sorted = true;
>> + }
>> + }
>> +
>> + public double max() {
>> + sort();
>> + if (n == 0) {
>> + throw new IllegalArgumentException("Must have at least one sample
>> to estimate maximum value");
>> + }
>> + if (n<= 100) {
>> + return starter.get(99);
>> + } else {
>> + return q[4];
>> + }
>> + }
>> +
>> + public double quartile(int i) {
>> + sort();
>> + switch (i) {
>> + case 0:
>> + return min();
>> + case 1:
>> + case 2:
>> + case 3:
>> + if (n> 100) {
>> + return q[i];
>> + } else if (n< 2) {
>> + throw new IllegalArgumentException("Must have at least two
>> samples to estimate quartiles");
>> + } else {
>> + double x = i * (n - 1) / 4.0;
>> + int k = (int) Math.floor(x);
>> + double u = x - k;
>> + return starter.get(k) * (1 - u) + starter.get(k + 1) * u;
>> + }
>> + case 4:
>> + return max();
>> + default:
>> + throw new IllegalArgumentException("Quartile number must be in
>> the range [0..4] not " + i);
>> + }
>> + }
>> +
>> + public double median() {
>> + return quartile(2);
>> + }
>> +}
>>
>> Added:
>> mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/OnlineSummarizerTest.java
>> URL:
>> http://svn.apache.org/viewvc/mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/OnlineSummarizerTest.java?rev=966469&view=auto
>>
>> ==============================================================================
>> ---
>> mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/OnlineSummarizerTest.java
>> (added)
>> +++
>> mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/OnlineSummarizerTest.java
>> Thu Jul 22 00:59:03 2010
>> @@ -0,0 +1,126 @@
>> +/*
>> + * Licensed to the Apache Software Foundation (ASF) under one or more
>> + * contributor license agreements. See the NOTICE file distributed with
>> + * this work for additional information regarding copyright ownership.
>> + * The ASF licenses this file to You under the Apache License, Version
>> 2.0
>> + * (the "License"); you may not use this file except in compliance with
>> + * the License. You may obtain a copy of the License at
>> + *
>> + * http://www.apache.org/licenses/LICENSE-2.0
>> + *
>> + * Unless required by applicable law or agreed to in writing, software
>> + * distributed under the License is distributed on an "AS IS" BASIS,
>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
>> implied.
>> + * See the License for the specific language governing permissions and
>> + * limitations under the License.
>> + */
>> +
>> +package org.apache.mahout.math.stats;
>> +
>> +import org.apache.mahout.math.jet.random.Gamma;
>> +import org.apache.mahout.math.jet.random.engine.MersenneTwister;
>> +import org.junit.Test;
>> +
>> +import java.util.Random;
>> +
>> +import static org.junit.Assert.assertEquals;
>> +import static org.junit.Assert.fail;
>> +
>> +public class OnlineSummarizerTest {
>> + @Test
>> + public void testCount() {
>> + OnlineSummarizer x = new OnlineSummarizer();
>> + assertEquals(0, x.count());
>> + x.add(1);
>> + assertEquals(1, x.count());
>> +
>> + for (int i = 1; i< 110; i++) {
>> + x.add(i);
>> + assertEquals(i, x.count());
>> + }
>> + }
>> +
>> + @Test
>> + public void testStats() {
>> + // the reference limits here were derived using a numerical
>> simulation where I took
>> + // 10,000 samples from the distribution in question and computed the
>> stats from that
>> + // sample to get min, 25%-ile, median and so on. I did this 1000
>> times to get 5% and
>> + // 95% confidence limits for those values.
>> +
>> + // symmetrical, well behaved
>> + check(normal(10000, 1),
>> + -4.417246, -3.419809,
>> + -0.6972919, -0.6519899,
>> + -0.02056658, 0.02176474,
>> + 0.6503866, 0.6983311,
>> + 3.419809, 4.417246,
>> + -0.01515753, 0.01592942,
>> + 0.988395, 1.011883);
>> +
>> + // asymmetrical, well behaved. The range for the maximum was fudged
>> slightly to all this to pass.
>> + check(exp(10000, 1),
>> + 4.317969e-06, 3.278763e-04,
>> + 0.2783866, 0.298,
>> + 0.6765024, 0.7109463,
>> + 1.356929, 1.414761,
>> + 8, 13,
>> + 0.983805, 1.015920,
>> + 0.977162, 1.022093
>> + );
>> +
>> + // asymmetrical, wacko distribution where mean/median> 10^28
>> + check(gamma(10000, 1),
>> + 0, 0, // minimum
>> + 1.63067132881301e-60, 6.26363334269806e-58, // 25th
>> %-ile
>> + 8.62261497075834e-30, 2.01422505081014e-28, // median
>> + 6.70225617733614e-12, 4.44299757853286e-11, // 75th
>> %-ile
>> + 238.451174077827, 579.143886928158, // maximum
>> + 0.837031762527458, 1.17244066539313, // mean
>> + 8.10277696526878, 12.1426255901507); // standard
>> dev
>> + }
>> +
>> + private void check(OnlineSummarizer x, double... values) {
>> + for (int i = 0; i< 5; i++) {
>> + checkRange(String.format("quartile %d", i), x.quartile(i), values[2
>> * i], values[2 * i + 1]);
>> + }
>> + assertEquals(x.quartile(2), x.median(), 0);
>> +
>> + checkRange("mean", x.mean(), values[10], values[11]);
>> + checkRange("sd", x.sd(), values[12], values[13]);
>> + }
>> +
>> + private void checkRange(String msg, double v, double low, double high)
>> {
>> + if (v< low || v> high) {
>> + fail(String.format("Wanted %s to be in range [%f,%f] but got %f",
>> msg, low, high, v));
>> + }
>> + }
>> +
>> + private OnlineSummarizer normal(int n, int seed) {
>> + OnlineSummarizer x = new OnlineSummarizer();
>> + Random gen = new Random(seed);
>> + for (int i = 0; i< n; i++) {
>> + x.add(gen.nextGaussian());
>> + }
>> + return x;
>> + }
>> +
>> + private OnlineSummarizer exp(int n, int seed) {
>> + OnlineSummarizer x = new OnlineSummarizer();
>> + Random gen = new Random(seed);
>> + for (int i = 0; i< n; i++) {
>> + x.add(-Math.log(1 - gen.nextDouble()));
>> + }
>> + return x;
>> + }
>> +
>> + private OnlineSummarizer gamma(int n, int seed) {
>> + OnlineSummarizer x = new OnlineSummarizer();
>> + Gamma g = new Gamma(0.01, 100, new MersenneTwister(seed));
>> + for (int i = 0; i< n; i++) {
>> + x.add(g.nextDouble());
>> + }
>> + return x;
>> + }
>> +}
>> +
>> +
>>
>>
>>
>>
>>
>
>
Re: svn commit: r966469 - in /mahout/trunk/math/src: main/java/org/apache/mahout/math/stats/OnlineSummarizer.java
test/java/org/apache/mahout/math/stats/OnlineSummarizerTest.java
Posted by Jeff Eastman <jd...@windwardsolutions.com>.
OnlineSummarizerTest seems to be failing both tests
On 7/21/10 5:59 PM, tdunning@apache.org wrote:
> Author: tdunning
> Date: Thu Jul 22 00:59:03 2010
> New Revision: 966469
>
> URL: http://svn.apache.org/viewvc?rev=966469&view=rev
> Log:
> MAHOUT-444 - on-line estimator widget with tests that demonstrate accuracy.
>
> Added:
> mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/OnlineSummarizer.java
> mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/OnlineSummarizerTest.java
>
> Added: mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/OnlineSummarizer.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/OnlineSummarizer.java?rev=966469&view=auto
> ==============================================================================
> --- mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/OnlineSummarizer.java (added)
> +++ mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/OnlineSummarizer.java Thu Jul 22 00:59:03 2010
> @@ -0,0 +1,166 @@
> +/*
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements. See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License. You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +package org.apache.mahout.math.stats;
> +
> +import org.apache.mahout.math.list.DoubleArrayList;
> +
> +/**
> + * Computes on-line estimates of mean, variance and all five quartiles (notably including the
> + * median). Since this is done in a completely incremental fashion (that is what is meant by
> + * on-line) estimates are available at any time and the amount of memory used is constant. Somewhat
> + * surprisingly, the quantile estimates are about as good as you would get if you actually kept all
> + * of the samples.
> + *<p/>
> + * The method used for mean and variance is Welford's method. See
> + *<p/>
> + * http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm
> + *<p/>
> + * The method used for computing the quartiles is a simplified form of the stochastic approximation
> + * method described in the article "Incremental Quantile Estimation for Massive Tracking" by Chen,
> + * Lambert and Pinheiro
> + *<p/>
> + * See
> + *<p/>
> + * http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.105.1580
> + */
> +public class OnlineSummarizer {
> + boolean sorted = true;
> +
> + // the first several samples are kept so we can boot-strap our estimates cleanly
> + private DoubleArrayList starter = new DoubleArrayList(100);
> +
> + // quartile estimates
> + private double q[] = new double[5];
> +
> + // mean and variance estimates
> + private double mean = 0;
> + private double variance = 0;
> +
> + // number of samples seen so far
> + private int n = 0;
> +
> + public void add(double sample) {
> + sorted = false;
> +
> + n++;
> + double oldMean = mean;
> + mean += (sample - mean) / n;
> + double diff = (sample - mean) * (sample - oldMean);
> + variance += (diff - variance) / n;
> +
> + if (n< 100) {
> + starter.add(sample);
> + } else if (n == 100) {
> + starter.add(sample);
> + q[0] = min();
> + q[1] = quartile(1);
> + q[2] = quartile(2);
> + q[3] = quartile(3);
> + q[4] = max();
> + starter = null;
> + } else {
> + q[0] = Math.min(sample, q[0]);
> + q[4] = Math.max(sample, q[4]);
> +
> + double rate = 2 * (q[3] - q[1]) / n;
> + q[1] += (Math.signum(sample - q[1]) - 0.5) * rate;
> + q[2] += (Math.signum(sample - q[2])) * rate;
> + q[3] += (Math.signum(sample - q[3]) + 0.5) * rate;
> +
> + if (q[1]< q[0]) {
> + q[1] = q[0];
> + }
> +
> + if (q[3]> q[4]) {
> + q[3] = q[4];
> + }
> + }
> + }
> +
> + public int count() {
> + return n;
> + }
> +
> + public double mean() {
> + return mean;
> + }
> +
> + public double sd() {
> + return Math.sqrt(variance);
> + }
> +
> + public double min() {
> + sort();
> + if (n == 0) {
> + throw new IllegalArgumentException("Must have at least one sample to estimate minimum value");
> + }
> + if (n<= 100) {
> + return starter.get(0);
> + } else {
> + return q[0];
> + }
> + }
> +
> + private void sort() {
> + if (!sorted&& starter != null) {
> + starter.sortFromTo(0, 99);
> + sorted = true;
> + }
> + }
> +
> + public double max() {
> + sort();
> + if (n == 0) {
> + throw new IllegalArgumentException("Must have at least one sample to estimate maximum value");
> + }
> + if (n<= 100) {
> + return starter.get(99);
> + } else {
> + return q[4];
> + }
> + }
> +
> + public double quartile(int i) {
> + sort();
> + switch (i) {
> + case 0:
> + return min();
> + case 1:
> + case 2:
> + case 3:
> + if (n> 100) {
> + return q[i];
> + } else if (n< 2) {
> + throw new IllegalArgumentException("Must have at least two samples to estimate quartiles");
> + } else {
> + double x = i * (n - 1) / 4.0;
> + int k = (int) Math.floor(x);
> + double u = x - k;
> + return starter.get(k) * (1 - u) + starter.get(k + 1) * u;
> + }
> + case 4:
> + return max();
> + default:
> + throw new IllegalArgumentException("Quartile number must be in the range [0..4] not " + i);
> + }
> + }
> +
> + public double median() {
> + return quartile(2);
> + }
> +}
>
> Added: mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/OnlineSummarizerTest.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/OnlineSummarizerTest.java?rev=966469&view=auto
> ==============================================================================
> --- mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/OnlineSummarizerTest.java (added)
> +++ mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/OnlineSummarizerTest.java Thu Jul 22 00:59:03 2010
> @@ -0,0 +1,126 @@
> +/*
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements. See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License. You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +package org.apache.mahout.math.stats;
> +
> +import org.apache.mahout.math.jet.random.Gamma;
> +import org.apache.mahout.math.jet.random.engine.MersenneTwister;
> +import org.junit.Test;
> +
> +import java.util.Random;
> +
> +import static org.junit.Assert.assertEquals;
> +import static org.junit.Assert.fail;
> +
> +public class OnlineSummarizerTest {
> + @Test
> + public void testCount() {
> + OnlineSummarizer x = new OnlineSummarizer();
> + assertEquals(0, x.count());
> + x.add(1);
> + assertEquals(1, x.count());
> +
> + for (int i = 1; i< 110; i++) {
> + x.add(i);
> + assertEquals(i, x.count());
> + }
> + }
> +
> + @Test
> + public void testStats() {
> + // the reference limits here were derived using a numerical simulation where I took
> + // 10,000 samples from the distribution in question and computed the stats from that
> + // sample to get min, 25%-ile, median and so on. I did this 1000 times to get 5% and
> + // 95% confidence limits for those values.
> +
> + // symmetrical, well behaved
> + check(normal(10000, 1),
> + -4.417246, -3.419809,
> + -0.6972919, -0.6519899,
> + -0.02056658, 0.02176474,
> + 0.6503866, 0.6983311,
> + 3.419809, 4.417246,
> + -0.01515753, 0.01592942,
> + 0.988395, 1.011883);
> +
> + // asymmetrical, well behaved. The range for the maximum was fudged slightly to all this to pass.
> + check(exp(10000, 1),
> + 4.317969e-06, 3.278763e-04,
> + 0.2783866, 0.298,
> + 0.6765024, 0.7109463,
> + 1.356929, 1.414761,
> + 8, 13,
> + 0.983805, 1.015920,
> + 0.977162, 1.022093
> + );
> +
> + // asymmetrical, wacko distribution where mean/median> 10^28
> + check(gamma(10000, 1),
> + 0, 0, // minimum
> + 1.63067132881301e-60, 6.26363334269806e-58, // 25th %-ile
> + 8.62261497075834e-30, 2.01422505081014e-28, // median
> + 6.70225617733614e-12, 4.44299757853286e-11, // 75th %-ile
> + 238.451174077827, 579.143886928158, // maximum
> + 0.837031762527458, 1.17244066539313, // mean
> + 8.10277696526878, 12.1426255901507); // standard dev
> + }
> +
> + private void check(OnlineSummarizer x, double... values) {
> + for (int i = 0; i< 5; i++) {
> + checkRange(String.format("quartile %d", i), x.quartile(i), values[2 * i], values[2 * i + 1]);
> + }
> + assertEquals(x.quartile(2), x.median(), 0);
> +
> + checkRange("mean", x.mean(), values[10], values[11]);
> + checkRange("sd", x.sd(), values[12], values[13]);
> + }
> +
> + private void checkRange(String msg, double v, double low, double high) {
> + if (v< low || v> high) {
> + fail(String.format("Wanted %s to be in range [%f,%f] but got %f", msg, low, high, v));
> + }
> + }
> +
> + private OnlineSummarizer normal(int n, int seed) {
> + OnlineSummarizer x = new OnlineSummarizer();
> + Random gen = new Random(seed);
> + for (int i = 0; i< n; i++) {
> + x.add(gen.nextGaussian());
> + }
> + return x;
> + }
> +
> + private OnlineSummarizer exp(int n, int seed) {
> + OnlineSummarizer x = new OnlineSummarizer();
> + Random gen = new Random(seed);
> + for (int i = 0; i< n; i++) {
> + x.add(-Math.log(1 - gen.nextDouble()));
> + }
> + return x;
> + }
> +
> + private OnlineSummarizer gamma(int n, int seed) {
> + OnlineSummarizer x = new OnlineSummarizer();
> + Gamma g = new Gamma(0.01, 100, new MersenneTwister(seed));
> + for (int i = 0; i< n; i++) {
> + x.add(g.nextDouble());
> + }
> + return x;
> + }
> +}
> +
> +
>
>
>
>