You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by gs...@apache.org on 2009/12/18 00:22:41 UTC
svn commit: r891983 [17/47] - in /lucene/mahout/trunk: ./ core/
core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/
core/src/main/java/org/apache/mahout/clustering/
core/src/main/java/org/apache/mahout/clustering/canopy/
core/src/main/java/org/a...
Added: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/KnownDoubleQuantileEstimator.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/KnownDoubleQuantileEstimator.java?rev=891983&view=auto
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/KnownDoubleQuantileEstimator.java (added)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/KnownDoubleQuantileEstimator.java Thu Dec 17 23:22:16 2009
@@ -0,0 +1,292 @@
+/*
+Copyright � 1999 CERN - European Organization for Nuclear Research.
+Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
+is hereby granted without fee, provided that the above copyright notice appear in all copies and
+that both that copyright notice and this permission notice appear in supporting documentation.
+CERN makes no representations about the suitability of this software for any purpose.
+It is provided "as is" without expressed or implied warranty.
+*/
+package org.apache.mahout.math.jet.stat.quantile;
+
+import org.apache.mahout.math.jet.math.Arithmetic;
+import org.apache.mahout.math.jet.random.engine.RandomEngine;
+import org.apache.mahout.math.jet.random.sampling.RandomSamplingAssistant;
+import org.apache.mahout.math.list.DoubleArrayList;
+
+/**
+ * Approximate quantile finding algorithm for known <tt>N</tt> requiring only one pass and little main memory; computes
+ * quantiles over a sequence of <tt>double</tt> elements.
+ *
+ * <p>Needs as input the following parameters:<p> <dt>1. <tt>N</tt> - the number of values of the data sequence over
+ * which quantiles are to be determined. <dt>2. <tt>quantiles</tt> - the number of quantiles to be computed. <dt>3.
+ * <tt>epsilon</tt> - the allowed approximation error on quantiles. The approximation guarantee of this algorithm is
+ * explicit.
+ *
+ * <p>It is also possible to couple the approximation algorithm with random sampling to further reduce memory
+ * requirements. With sampling, the approximation guarantees are explicit but probabilistic, i.e. they apply with
+ * respect to a (user controlled) confidence parameter "delta".
+ *
+ * <dt>4. <tt>delta</tt> - the probability allowed that the approximation error fails to be smaller than epsilon. Set
+ * <tt>delta</tt> to zero for explicit non probabilistic guarantees.
+ *
+ * You usually don't instantiate quantile finders by using the constructor. Instead use the factory
+ * <tt>QuantileFinderFactor</tt> to do so. It will set up the right parametrization for you.
+ *
+ * <p>After Gurmeet Singh Manku, Sridhar Rajagopalan and Bruce G. Lindsay, Approximate Medians and other Quantiles in
+ * One Pass and with Limited Memory, Proc. of the 1998 ACM SIGMOD Int. Conf. on Management of Data, Paper available <A
+ * HREF="http://www-cad.eecs.berkeley.edu/~manku"> here</A>.
+ *
+ * @see QuantileFinderFactory
+ */
+class KnownDoubleQuantileEstimator extends DoubleQuantileEstimator {
+
+ protected double beta; //correction factor for phis
+
+ protected boolean weHadMoreThanOneEmptyBuffer;
+
+ protected RandomSamplingAssistant samplingAssistant;
+ protected final double samplingRate; // see method sampleNextElement()
+ protected final long N; // see method sampleNextElement()
+
+ /**
+ * Constructs an approximate quantile finder with b buffers, each having k elements.
+ *
+ * @param b the number of buffers
+ * @param k the number of elements per buffer
+ * @param N the total number of elements over which quantiles are to be computed.
+ * @param samplingRate 1.0 --> all elements are consumed. 10.0 --> Consumes one random element from successive blocks
+ * of 10 elements each. Etc.
+ * @param generator a uniform random number generator.
+ */
+ KnownDoubleQuantileEstimator(int b, int k, long N, double samplingRate, RandomEngine generator) {
+ this.samplingRate = samplingRate;
+ this.N = N;
+
+ if (this.samplingRate <= 1.0) {
+ this.samplingAssistant = null;
+ } else {
+ this.samplingAssistant = new RandomSamplingAssistant(Arithmetic.floor(N / samplingRate), N, generator);
+ }
+
+ setUp(b, k);
+ this.clear();
+ }
+
+ /**
+ * @param missingInfinities the number of infinities to fill.
+ * @param buffer the buffer into which the infinities shall be filled.
+ */
+ protected void addInfinities(int missingInfinities, DoubleBuffer buffer) {
+ RandomSamplingAssistant oldAssistant = this.samplingAssistant;
+ this.samplingAssistant = null; // switch off sampler
+ //double[] infinities = new double[missingInfinities];
+
+ boolean even = true;
+ for (int i = 0; i < missingInfinities; i++) {
+ if (even) {
+ buffer.values.add(Double.MAX_VALUE);
+ } else {
+ buffer.values.add(-Double.MAX_VALUE);
+ }
+
+ //if (even) {infinities[i]=Double.MAX_VALUE;}
+ //else {infinities[i]=-Double.MAX_VALUE;}
+
+ //if (even) {this.add(Double.MAX_VALUE);}
+ //else {this.add(-Double.MAX_VALUE);}
+ even = !even;
+ }
+
+ //buffer.values.addAllOfFromTo(new DoubleArrayList(infinities),0,missingInfinities-1);
+
+ //this.totalElementsFilled -= infinities;
+
+ this.samplingAssistant = oldAssistant; // switch on sampler again
+ }
+
+ /** Not yet commented. */
+ @Override
+ protected DoubleBuffer[] buffersToCollapse() {
+ int minLevel = bufferSet._getMinLevelOfFullOrPartialBuffers();
+ return bufferSet._getFullOrPartialBuffersWithLevel(minLevel);
+ }
+
+ /**
+ * Removes all elements from the receiver. The receiver will be empty after this call returns, and its memory
+ * requirements will be close to zero.
+ */
+ @Override
+ public void clear() {
+ super.clear();
+ this.beta = 1.0;
+ this.weHadMoreThanOneEmptyBuffer = false;
+ //this.setSamplingRate(samplingRate,N);
+
+ RandomSamplingAssistant assist = this.samplingAssistant;
+ if (assist != null) {
+ this.samplingAssistant =
+ new RandomSamplingAssistant(Arithmetic.floor(N / samplingRate), N, assist.getRandomGenerator());
+ }
+ }
+
+ /**
+ * Returns a deep copy of the receiver.
+ *
+ * @return a deep copy of the receiver.
+ */
+ @Override
+ public Object clone() {
+ KnownDoubleQuantileEstimator copy = (KnownDoubleQuantileEstimator) super.clone();
+ if (this.samplingAssistant != null) {
+ copy.samplingAssistant = (RandomSamplingAssistant) copy.samplingAssistant.clone();
+ }
+ return copy;
+ }
+
+ /** Not yet commented. */
+ @Override
+ protected void newBuffer() {
+ int numberOfEmptyBuffers = this.bufferSet._getNumberOfEmptyBuffers();
+ //DoubleBuffer[] emptyBuffers = this.bufferSet._getEmptyBuffers();
+ if (numberOfEmptyBuffers == 0) {
+ throw new IllegalStateException("Oops, no empty buffer.");
+ }
+
+ this.currentBufferToFill = this.bufferSet._getFirstEmptyBuffer();
+ if (numberOfEmptyBuffers == 1 && !this.weHadMoreThanOneEmptyBuffer) {
+ this.currentBufferToFill.level(this.bufferSet._getMinLevelOfFullOrPartialBuffers());
+ } else {
+ this.weHadMoreThanOneEmptyBuffer = true;
+ this.currentBufferToFill.level(0);
+ /*
+ for (int i=0; i<emptyBuffers.length; i++) {
+ emptyBuffers[i].level = 0;
+ }
+ */
+ }
+ //currentBufferToFill.state = DoubleBuffer.PARTIAL;
+ this.currentBufferToFill.weight(1);
+ }
+
+ /** Not yet commented. */
+ @Override
+ protected void postCollapse(DoubleBuffer[] toCollapse) {
+ this.weHadMoreThanOneEmptyBuffer = false;
+ }
+
+ /**
+ */
+ @Override
+ protected DoubleArrayList preProcessPhis(DoubleArrayList phis) {
+ if (beta > 1.0) {
+ phis = phis.copy();
+ for (int i = phis.size(); --i >= 0;) {
+ phis.set(i, (2 * phis.get(i) + beta - 1) / (2 * beta));
+ }
+ }
+ return phis;
+ }
+
+ /**
+ * Computes the specified quantile elements over the values previously added.
+ *
+ * @param phis the quantiles for which elements are to be computed. Each phi must be in the interval [0.0,1.0].
+ * <tt>phis</tt> must be sorted ascending.
+ * @return the approximate quantile elements.
+ */
+ @Override
+ public DoubleArrayList quantileElements(DoubleArrayList phis) {
+ /*
+ * The KNOWN quantile finder reads off quantiles from FULL buffers only.
+ * Since there might be a partially full buffer, this method first satisfies this constraint by temporarily filling a few +infinity, -infinity elements to make up a full block.
+ * This is in full conformance with the explicit approximation guarantees.
+ *
+ * For those of you working on online apps:
+ * The approximation guarantees are given for computing quantiles AFTER N elements have been filled, not for intermediate displays.
+ * If you have one thread filling and another thread displaying concurrently, you will note that in the very beginning the infinities will dominate the display.
+ * This could confuse users, because, of course, they don't expect any infinities, even if they "disappear" after a short while.
+ * To prevent panic exclude phi's close to zero or one in the early phases of processing.
+ */
+ DoubleBuffer partial = this.bufferSet._getPartialBuffer();
+ int missingValues = 0;
+ if (partial != null) { // any auxiliary infinities needed?
+ missingValues = bufferSet.k() - partial.size();
+ if (missingValues <= 0) {
+ throw new IllegalStateException("Oops! illegal missing values.");
+ }
+
+ //log.info("adding "+missingValues+" infinity elements...");
+ this.addInfinities(missingValues, partial);
+
+ //determine beta (N + Infinity values = beta * N)
+ this.beta = (this.totalElementsFilled + missingValues) / (double) this.totalElementsFilled;
+ } else {
+ this.beta = 1.0;
+ }
+
+ DoubleArrayList quantileElements = super.quantileElements(phis);
+
+ // restore state we were in before.
+ // remove the temporarily added infinities.
+ if (partial != null) {
+ removeInfinitiesFrom(missingValues, partial);
+ }
+
+ // now you can continue filling the remaining values, if any.
+ return quantileElements;
+ }
+
+ /**
+ * Reading off quantiles requires to fill some +infinity, -infinity values to make a partial buffer become full.
+ *
+ * This method removes the infinities which were previously temporarily added to a partial buffer. Removing them is
+ * necessary if we want to continue filling more elements. Precondition: the buffer is sorted ascending.
+ *
+ * @param infinities the number of infinities previously filled.
+ * @param buffer the buffer into which the infinities were filled.
+ */
+ protected void removeInfinitiesFrom(int infinities, DoubleBuffer buffer) {
+ int plusInf = 0;
+ int minusInf = 0;
+ // count them (this is not very clever but it's safe)
+ boolean even = true;
+ for (int i = 0; i < infinities; i++) {
+ if (even) {
+ plusInf++;
+ } else {
+ minusInf++;
+ }
+ even = !even;
+ }
+
+ buffer.values.removeFromTo(buffer.size() - plusInf, buffer.size() - 1);
+ buffer.values.removeFromTo(0, minusInf - 1);
+ //this.totalElementsFilled -= infinities;
+ }
+
+ /** Not yet commented. */
+ @Override
+ protected boolean sampleNextElement() {
+ if (samplingAssistant == null) {
+ return true;
+ }
+
+ /*
+ * This is a KNOWN N quantile finder!
+ * One should not try to fill more than N elements,
+ * because otherwise we can't give explicit approximation guarantees anymore.
+ * Use an UNKNOWN quantile finder instead if your app may fill more than N elements.
+ *
+ * However, to make this class meaningful even under wired use cases, we actually do allow to fill more than N elements (without explicit approx. guarantees, of course).
+ * Normally, elements beyond N will not get sampled because the sampler is exhausted.
+ * Therefore the histogram will no more change no matter how much you fill.
+ * This might not be what the user expects.
+ * Therefore we use a new (unexhausted) sampler with the same parametrization.
+ *
+ * If you want this class to ignore any elements beyong N, then comment the following line.
+ */
+ //if ((totalElementsFilled-1) % N == 0) setSamplingRate(samplingRate, N); // delete if appropriate
+
+ return samplingAssistant.sampleNextElement();
+ }
+}
Propchange: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/KnownDoubleQuantileEstimator.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/QuantileCalc.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/QuantileCalc.java?rev=891983&view=auto
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/QuantileCalc.java (added)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/QuantileCalc.java Thu Dec 17 23:22:16 2009
@@ -0,0 +1,494 @@
+/*
+Copyright 1999 CERN - European Organization for Nuclear Research.
+Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
+is hereby granted without fee, provided that the above copyright notice appear in all copies and
+that both that copyright notice and this permission notice appear in supporting documentation.
+CERN makes no representations about the suitability of this software for any purpose.
+It is provided "as is" without expressed or implied warranty.
+*/
+package org.apache.mahout.math.jet.stat.quantile;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/** Computes b and k vor various parameters. */
+class QuantileCalc {
+
+ private static final Logger log = LoggerFactory.getLogger(QuantileCalc.class);
+
+ private QuantileCalc() {
+ }
+
+ /**
+ * Efficiently computes the binomial coefficient, often also referred to as "n over k" or "n choose k". The binomial
+ * coefficient is defined as n!/((n-k)!*k!). Tries to avoid numeric overflows.
+ *
+ * @return the binomial coefficient.
+ */
+ public static double binomial(long n, long k) {
+ if (k == 0 || k == n) {
+ return 1.0;
+ }
+
+ // since binomial(n,k)==binomial(n,n-k), we can enforce the faster variant,
+ // which is also the variant minimizing number overflows.
+ if (k > n / 2.0) {
+ k = n - k;
+ }
+
+ double binomial = 1.0;
+ long N = n - k + 1;
+ for (long i = k; i > 0;) {
+ binomial *= ((double) N++) / (double) (i--);
+ }
+ return binomial;
+ }
+
+ /**
+ * Returns the smallest <code>long >= value</code>. <dt>Examples: <code>1.0 -> 1, 1.2 -> 2, 1.9 -> 2</code>. This
+ * method is safer than using (long) Math.ceil(value), because of possible rounding error.
+ */
+ public static long ceiling(double value) {
+ return Math.round(Math.ceil(value));
+ }
+
+ /**
+ * Computes the number of buffers and number of values per buffer such that quantiles can be determined with an
+ * approximation error no more than epsilon with a certain probability.
+ *
+ * Assumes that quantiles are to be computed over N values. The required sampling rate is computed and stored in the
+ * first element of the provided <tt>returnSamplingRate</tt> array, which, therefore must be at least of length 1.
+ *
+ * @param N the number of values over which quantiles shall be computed (e.g <tt>10^6</tt>).
+ * @param epsilon the approximation error which is guaranteed not to be exceeded (e.g. <tt>0.001</tt>)
+ * (<tt>0 <= epsilon <= 1</tt>). To get exact result, set <tt>epsilon=0.0</tt>;
+ * @param delta the probability that the approximation error is more than than epsilon (e.g.
+ * <tt>0.0001</tt>) (<tt>0 <= delta <= 1</tt>). To avoid probabilistic answers, set
+ * <tt>delta=0.0</tt>.
+ * @param quantiles the number of quantiles to be computed (e.g. <tt>100</tt>) (<tt>quantiles >= 1</tt>).
+ * If unknown in advance, set this number large, e.g. <tt>quantiles >= 10000</tt>.
+ * @param returnSamplingRate a <tt>double[1]</tt> where the sampling rate is to be filled in.
+ * @return <tt>long[2]</tt> - <tt>long[0]</tt>=the number of buffers, <tt>long[1]</tt>=the number of elements per
+ * buffer, <tt>returnSamplingRate[0]</tt>=the required sampling rate.
+ */
+ public static long[] known_N_compute_B_and_K(long N, double epsilon, double delta, int quantiles,
+ double[] returnSamplingRate) {
+ if (delta > 0.0) {
+ return known_N_compute_B_and_K_slow(N, epsilon, delta, quantiles, returnSamplingRate);
+ }
+ returnSamplingRate[0] = 1.0;
+ return known_N_compute_B_and_K_quick(N, epsilon);
+ }
+
+ /**
+ * Computes the number of buffers and number of values per buffer such that quantiles can be determined with a
+ * <b>guaranteed</b> approximation error no more than epsilon. Assumes that quantiles are to be computed over N
+ * values.
+ *
+ * @param N the anticipated number of values over which quantiles shall be determined.
+ * @param epsilon the approximation error which is guaranteed not to be exceeded (e.g. <tt>0.001</tt>) (<tt>0 <=
+ * epsilon <= 1</tt>). To get exact result, set <tt>epsilon=0.0</tt>;
+ * @return <tt>long[2]</tt> - <tt>long[0]</tt>=the number of buffers, <tt>long[1]</tt>=the number of elements per
+ * buffer.
+ */
+ protected static long[] known_N_compute_B_and_K_quick(long N, double epsilon) {
+ if (epsilon <= 0.0) {
+ // no way around exact quantile search
+ long[] result = new long[2];
+ result[0] = 1;
+ result[1] = N;
+ return result;
+ }
+
+ int maxBuffers = 50;
+ int maxHeight = 50;
+ double N_double = (double) N;
+ double c = N_double * epsilon * 2.0;
+ int[] heightMaximums = new int[maxBuffers - 1];
+
+ // for each b, determine maximum height, i.e. the height for which x<=0 and x is a maximum
+ // with x = binomial(b+h-2, h-1) - binomial(b+h-3, h-3) + binomial(b+h-3, h-2) - N * epsilon * 2.0
+ for (int b = 2; b <= maxBuffers; b++) {
+ int h = 3;
+
+ while (h <= maxHeight && // skip heights until x<=0
+ (h - 2) * ((double) Math.round(binomial(b + h - 2, h - 1))) -
+ ((double) Math.round(binomial(b + h - 3, h - 3))) +
+ ((double) Math.round(binomial(b + h - 3, h - 2))) - c
+ > 0.0
+ ) {
+ h++;
+ }
+ //from now on x is monotonically growing...
+ while (h <= maxHeight && // skip heights until x>0
+ (h - 2) * ((double) Math.round(binomial(b + h - 2, h - 1))) -
+ ((double) Math.round(binomial(b + h - 3, h - 3))) +
+ ((double) Math.round(binomial(b + h - 3, h - 2))) - c
+ <= 0.0
+ ) {
+ h++;
+ }
+ h--; //go back to last height
+
+ // was x>0 or did we loop without finding anything?
+ int hMax;
+ if (h >= maxHeight &&
+ (h - 2) * ((double) Math.round(binomial(b + h - 2, h - 1))) -
+ ((double) Math.round(binomial(b + h - 3, h - 3))) +
+ ((double) Math.round(binomial(b + h - 3, h - 2))) - c
+ > 0.0) {
+ hMax = Integer.MIN_VALUE;
+ } else {
+ hMax = h;
+ }
+
+ heightMaximums[b - 2] = hMax; //safe some space
+ } //end for
+
+
+ // for each b, determine the smallest k satisfying the constraints, i.e.
+ // for each b, determine kMin, with kMin = N/binomial(b+hMax-2,hMax-1)
+ long[] kMinimums = new long[maxBuffers - 1];
+ for (int b = 2; b <= maxBuffers; b++) {
+ int h = heightMaximums[b - 2];
+ long kMin = Long.MAX_VALUE;
+ if (h > Integer.MIN_VALUE) {
+ double value = ((double) Math.round(binomial(b + h - 2, h - 1)));
+ long tmpK = ceiling(N_double / value);
+ if (tmpK <= kMin) {
+ kMin = tmpK;
+ }
+ }
+ kMinimums[b - 2] = kMin;
+ }
+
+ // from all b's, determine b that minimizes b*kMin
+ long multMin = Long.MAX_VALUE;
+ int minB = -1;
+ for (int b = 2; b <= maxBuffers; b++) {
+ if (kMinimums[b - 2] < Long.MAX_VALUE) {
+ long mult = ((long) b) * kMinimums[b - 2];
+ if (mult < multMin) {
+ multMin = mult;
+ minB = b;
+ }
+ }
+ }
+
+ long b, k;
+ if (minB != -1) { // epsilon large enough?
+ b = minB;
+ k = kMinimums[minB - 2];
+ } else { // epsilon is very small or zero.
+ b = 1; // the only possible solution without violating the
+ k = N; // approximation guarantees is exact quantile search.
+ }
+
+ long[] result = new long[2];
+ result[0] = b;
+ result[1] = k;
+ return result;
+ }
+
+ /**
+ * Computes the number of buffers and number of values per buffer such that quantiles can be determined with an
+ * approximation error no more than epsilon with a certain probability. Assumes that quantiles are to be computed over
+ * N values. The required sampling rate is computed and stored in the first element of the provided
+ * <tt>returnSamplingRate</tt> array, which, therefore must be at least of length 1.
+ *
+ * @param N the anticipated number of values over which quantiles shall be computed (e.g 10^6).
+ * @param epsilon the approximation error which is guaranteed not to be exceeded (e.g. <tt>0.001</tt>)
+ * (<tt>0 <= epsilon <= 1</tt>). To get exact result, set <tt>epsilon=0.0</tt>;
+ * @param delta the probability that the approximation error is more than than epsilon (e.g.
+ * <tt>0.0001</tt>) (<tt>0 <= delta <= 1</tt>). To avoid probabilistic answers, set
+ * <tt>delta=0.0</tt>.
+ * @param quantiles the number of quantiles to be computed (e.g. <tt>100</tt>) (<tt>quantiles >= 1</tt>).
+ * If unknown in advance, set this number large, e.g. <tt>quantiles >= 10000</tt>.
+ * @param returnSamplingRate a <tt>double[1]</tt> where the sampling rate is to be filled in.
+ * @return <tt>long[2]</tt> - <tt>long[0]</tt>=the number of buffers, <tt>long[1]</tt>=the number of elements per
+ * buffer, <tt>returnSamplingRate[0]</tt>=the required sampling rate.
+ */
+ protected static long[] known_N_compute_B_and_K_slow(long N, double epsilon, double delta, int quantiles,
+ double[] returnSamplingRate) {
+ // delta can be set to zero, i.e., all quantiles should be approximate with probability 1
+ if (epsilon <= 0.0) {
+ // no way around exact quantile search
+ long[] result = new long[2];
+ result[0] = 1;
+ result[1] = N;
+ returnSamplingRate[0] = 1.0;
+ return result;
+ }
+
+
+ int maxBuffers = 50;
+ int maxHeight = 50;
+ double N_double = N;
+
+ // One possibility is to use one buffer of size N
+ //
+ long ret_b = 1;
+ long ret_k = N;
+ double sampling_rate = 1.0;
+ long memory = N;
+
+
+ // Otherwise, there are at least two buffers (b >= 2)
+ // and the height of the tree is at least three (h >= 3)
+ //
+ // We restrict the search for b and h to MAX_BINOM, a large enough value for
+ // practical values of epsilon >= 0.001 and delta >= 0.00001
+ //
+ double logarithm = Math.log(2.0 * quantiles / delta);
+ double c = 2.0 * epsilon * N_double;
+ for (long b = 2; b < maxBuffers; b++) {
+ for (long h = 3; h < maxHeight; h++) {
+ double binomial = binomial(b + h - 2, h - 1);
+ long tmp = ceiling(N_double / binomial);
+ if ((b * tmp < memory) &&
+ ((h - 2) * binomial - binomial(b + h - 3, h - 3) + binomial(b + h - 3, h - 2)
+ <= c)) {
+ ret_k = tmp;
+ ret_b = b;
+ memory = ret_k * b;
+ sampling_rate = 1.0;
+ }
+ if (delta > 0.0) {
+ double t = (h - 2) * binomial(b + h - 2, h - 1) - binomial(b + h - 3, h - 3) + binomial(b + h - 3, h - 2);
+ double u = logarithm / epsilon;
+ double v = binomial(b + h - 2, h - 1);
+ double w = logarithm / (2.0 * epsilon * epsilon);
+
+ // From our SIGMOD 98 paper, we have two equantions to satisfy:
+ // t <= u * alpha/(1-alpha)^2
+ // kv >= w/(1-alpha)^2
+ //
+ // Denoting 1/(1-alpha) by x,
+ // we see that the first inequality is equivalent to
+ // t/u <= x^2 - x
+ // which is satisfied by x >= 0.5 + 0.5 * sqrt (1 + 4t/u)
+ // Plugging in this value into second equation yields
+ // k >= wx^2/v
+
+ double x = 0.5 + 0.5 * Math.sqrt(1.0 + 4.0 * t / u);
+ long k = ceiling(w * x * x / v);
+ if (b * k < memory) {
+ ret_k = k;
+ ret_b = b;
+ memory = b * k;
+ sampling_rate = N_double * 2.0 * epsilon * epsilon / logarithm;
+ }
+ }
+ }
+ }
+
+ long[] result = new long[2];
+ result[0] = ret_b;
+ result[1] = ret_k;
+ returnSamplingRate[0] = sampling_rate;
+ return result;
+ }
+
+ public static void main(String[] args) {
+ test_B_and_K_Calculation(args);
+ }
+
+ /** Computes b and k for different parameters. */
+ public static void test_B_and_K_Calculation(String[] args) {
+ boolean known_N;
+ if (args == null) {
+ known_N = false;
+ } else {
+ known_N = Boolean.valueOf(args[0]);
+ }
+
+ int[] quantiles = {1, 1000};
+
+ long[] sizes = {100000, 1000000, 10000000, 1000000000};
+
+ double[] deltas = {0.0, 0.001, 0.0001, 0.00001};
+
+ double[] epsilons = {0.0, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0000001};
+
+
+ if (!known_N) {
+ sizes = new long[]{0};
+ }
+ log.info("\n\n");
+ if (known_N) {
+ log.info("Computing b's and k's for KNOWN N");
+ } else {
+ log.info("Computing b's and k's for UNKNOWN N");
+ }
+ log.info("mem [elements/1024]");
+ log.info("***********************************");
+
+ for (int p : quantiles) {
+ log.info("------------------------------");
+ log.info("computing for p = " + p);
+ for (long N : sizes) {
+ log.info(" ------------------------------");
+ log.info(" computing for N = " + N);
+ for (double delta : deltas) {
+ log.info(" ------------------------------");
+ log.info(" computing for delta = " + delta);
+ for (double epsilon : epsilons) {
+ double[] returnSamplingRate = new double[1];
+ long[] result;
+ if (known_N) {
+ result = known_N_compute_B_and_K(N, epsilon, delta, p, returnSamplingRate);
+ } else {
+ result = unknown_N_compute_B_and_K(epsilon, delta, p);
+ }
+
+ long b = result[0];
+ long k = result[1];
+ log.info(" (e,d,N,p)=(" + epsilon + ',' + delta + ',' + N + ',' + p + ") --> ");
+ log.info("(b,k,mem");
+ if (known_N) {
+ log.info(",sampling");
+ }
+ log.info(")=(" + b + ',' + k + ',' + (b * k / 1024));
+ if (known_N) {
+ log.info("," + returnSamplingRate[0]);
+ }
+ log.info(")");
+ }
+ }
+ }
+ }
+
+ }
+
+ /**
+ * Computes the number of buffers and number of values per buffer such that quantiles can be determined with an
+ * approximation error no more than epsilon with a certain probability.
+ *
+ * @param epsilon the approximation error which is guaranteed not to be exceeded (e.g. <tt>0.001</tt>) (<tt>0 <=
+ * epsilon <= 1</tt>). To get exact results, set <tt>epsilon=0.0</tt>;
+ * @param delta the probability that the approximation error is more than than epsilon (e.g. <tt>0.0001</tt>)
+ * (<tt>0 <= delta <= 1</tt>). To get exact results, set <tt>delta=0.0</tt>.
+ * @param quantiles the number of quantiles to be computed (e.g. <tt>100</tt>) (<tt>quantiles >= 1</tt>). If
+ * unknown in advance, set this number large, e.g. <tt>quantiles >= 10000</tt>.
+ * @return <tt>long[3]</tt> - <tt>long[0]</tt>=the number of buffers, <tt>long[1]</tt>=the number of elements per
+ * buffer, <tt>long[2]</tt>=the tree height where sampling shall start.
+ */
+ public static long[] unknown_N_compute_B_and_K(double epsilon, double delta, int quantiles) {
+ // delta can be set to zero, i.e., all quantiles should be approximate with probability 1
+ if (epsilon <= 0.0 || delta <= 0.0) {
+ // no way around exact quantile search
+ long[] result = new long[3];
+ result[0] = 1;
+ result[1] = Long.MAX_VALUE;
+ result[2] = Long.MAX_VALUE;
+ return result;
+ }
+
+ int max_b = 50;
+ int max_h = 50;
+ int max_H = 50;
+ int max_Iterations = 2;
+
+ long best_b = Long.MAX_VALUE;
+ long best_k = Long.MAX_VALUE;
+ long best_h = Long.MAX_VALUE;
+ long best_memory = Long.MAX_VALUE;
+
+ double pow = Math.pow(2.0, max_H);
+ double logDelta = Math.log(2.0 / (delta / quantiles)) / (2.0 * epsilon * epsilon);
+ //double logDelta = Math.log(2.0/(quantiles*delta)) / (2.0*epsilon*epsilon);
+
+ while (best_b == Long.MAX_VALUE && max_Iterations-- > 0) { //until we find a solution
+ // identify that combination of b and h that minimizes b*k.
+ // exhaustive search.
+ for (int b = 2; b <= max_b; b++) {
+ for (int h = 2; h <= max_h; h++) {
+ double Ld = binomial(b + h - 2, h - 1);
+ double Ls = binomial(b + h - 3, h - 1);
+
+ // now we have k>=c*(1-alpha)^-2.
+ // let's compute c.
+ //double c = Math.log(2.0/(delta/quantiles)) / (2.0*epsilon*epsilon*Math.min(Ld, 8.0*Ls/3.0));
+ double c = logDelta / Math.min(Ld, 8.0 * Ls / 3.0);
+
+ // now we have k>=d/alpha.
+ // let's compute d.
+ double beta = Ld / Ls;
+ double cc = (beta - 2.0) * (max_H - 2.0) / (beta + pow - 2.0);
+ double d = (h + 3 + cc) / (2.0 * epsilon);
+
+ /*
+ double d = (Ld*(h+max_H-1.0) + Ls*((h+1)*pow - 2.0*(h+max_H))) / (Ld + Ls*(pow-2.0));
+ d = (d + 2.0) / (2.0*epsilon);
+ */
+
+ // now we have c*(1-alpha)^-2 == d/alpha.
+ // we solve this equation for alpha yielding two solutions
+ // alpha_1,2 = (c + 2*d +- Sqrt(c*c + 4*c*d))/(2*d)
+ double f = c * c + 4.0 * c * d;
+ if (f < 0.0) {
+ continue;
+ } // non real solution to equation
+ double root = Math.sqrt(f);
+ double alpha_one = (c + 2.0 * d + root) / (2.0 * d);
+ double alpha_two = (c + 2.0 * d - root) / (2.0 * d);
+
+ // any alpha must satisfy 0<alpha<1 to yield valid solutions
+ boolean alpha_one_OK = false;
+ if (0.0 < alpha_one && alpha_one < 1.0) {
+ alpha_one_OK = true;
+ }
+ boolean alpha_two_OK = false;
+ if (0.0 < alpha_two && alpha_two < 1.0) {
+ alpha_two_OK = true;
+ }
+ if (alpha_one_OK || alpha_two_OK) {
+ double alpha = alpha_one;
+ if (alpha_one_OK && alpha_two_OK) {
+ // take the alpha that minimizes d/alpha
+ alpha = Math.max(alpha_one, alpha_two);
+ } else if (alpha_two_OK) {
+ alpha = alpha_two;
+ }
+
+ // now we have k=Ceiling(Max(d/alpha, (h+1)/(2*epsilon)))
+ long k = ceiling(Math.max(d / alpha, (h + 1) / (2.0 * epsilon)));
+ if (k > 0) { // valid solution?
+ long memory = b * k;
+ if (memory < best_memory) {
+ // found a solution requiring less memory
+ best_k = k;
+ best_b = b;
+ best_h = h;
+ best_memory = memory;
+ }
+ }
+ }
+ } //end for h
+ } //end for b
+
+ if (best_b == Long.MAX_VALUE) {
+ log.info("Warning: Computing b and k looks like a lot of work!");
+ // no solution found so far. very unlikely. Anyway, try again.
+ max_b *= 2;
+ max_h *= 2;
+ max_H *= 2;
+ }
+ } //end while
+
+ long[] result = new long[3];
+ if (best_b == Long.MAX_VALUE) {
+ // no solution found.
+ // no way around exact quantile search.
+ result[0] = 1;
+ result[1] = Long.MAX_VALUE;
+ result[2] = Long.MAX_VALUE;
+ } else {
+ result[0] = best_b;
+ result[1] = best_k;
+ result[2] = best_h;
+ }
+
+ return result;
+ }
+}
Propchange: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/QuantileCalc.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/QuantileFinderFactory.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/QuantileFinderFactory.java?rev=891983&view=auto
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/QuantileFinderFactory.java (added)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/QuantileFinderFactory.java Thu Dec 17 23:22:16 2009
@@ -0,0 +1,648 @@
+/*
+Copyright � 1999 CERN - European Organization for Nuclear Research.
+Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
+is hereby granted without fee, provided that the above copyright notice appear in all copies and
+that both that copyright notice and this permission notice appear in supporting documentation.
+CERN makes no representations about the suitability of this software for any purpose.
+It is provided "as is" without expressed or implied warranty.
+*/
+package org.apache.mahout.math.jet.stat.quantile;
+
+//import cern.it.util.Doubles;
+
+import org.apache.mahout.math.jet.math.Arithmetic;
+import org.apache.mahout.math.jet.random.engine.RandomEngine;
+import org.apache.mahout.math.list.DoubleArrayList;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+/**
+ * Factory constructing exact and approximate quantile finders for both known and unknown <tt>N</tt>.
+ * Also see {@link hep.aida.bin.QuantileBin1D}, demonstrating how this package can be used.
+ *
+ * The approx. algorithms compute approximate quantiles of large data sequences in a single pass.
+ * The approximation guarantees are explicit, and apply for arbitrary value distributions and arrival distributions of the data sequence.
+ * The main memory requirements are smaller than for any other known technique by an order of magnitude.
+ *
+ * <p>The approx. algorithms are primarily intended to help applications scale.
+ * When faced with a large data sequences, traditional methods either need very large memories or time consuming disk based sorting.
+ * In constrast, the approx. algorithms can deal with > 10^10 values without disk based sorting.
+ *
+ * <p>All classes can be seen from various angles, for example as
+ * <dt>1. Algorithm to compute quantiles.
+ * <dt>2. 1-dim-equi-depth histogram.
+ * <dt>3. 1-dim-histogram arbitrarily rebinnable in real-time.
+ * <dt>4. A space efficient MultiSet data structure using lossy compression.
+ * <dt>5. A space efficient value preserving bin of a 2-dim or d-dim histogram.
+ * <dt>(All subject to an accuracy specified by the user.)
+ *
+ * <p>Use methods <tt>newXXX(...)</tt> to get new instances of one of the following quantile finders.
+ *
+ * <p><b>1. Exact quantile finding algorithm for known and unknown <tt>N</tt> requiring large main memory.</b></p>
+ * The folkore algorithm: Keeps all elements in main memory, sorts the list, then picks the quantiles.
+ *
+ *
+ *
+ *
+ * <p><p><b>2. Approximate quantile finding algorithm for known <tt>N</tt> requiring only one pass and little main memory.</b></p>
+ *
+ * <p>Needs as input the following parameters:<p>
+ * <dt>1. <tt>N</tt> - the number of values of the data sequence over which quantiles are to be determined.
+ * <dt>2. <tt>quantiles</tt> - the number of quantiles to be computed. If unknown in advance, set this number large, e.g. <tt>quantiles >= 10000</tt>.
+ * <dt>3. <tt>epsilon</tt> - the allowed approximation error on quantiles. The approximation guarantee of this algorithm is explicit.
+ *
+ * <p>It is also possible to couple the approximation algorithm with random sampling to further reduce memory requirements.
+ * With sampling, the approximation guarantees are explicit but probabilistic, i.e. they apply with respect to a (user controlled) confidence parameter "delta".
+ *
+ * <dt>4. <tt>delta</tt> - the probability allowed that the approximation error fails to be smaller than epsilon. Set <tt>delta</tt> to zero for explicit non probabilistic guarantees.
+ *
+ * <p>After Gurmeet Singh Manku, Sridhar Rajagopalan and Bruce G. Lindsay,
+ * Approximate Medians and other Quantiles in One Pass and with Limited Memory,
+ * Proc. of the 1998 ACM SIGMOD Int. Conf. on Management of Data,
+ * Paper available <A HREF="http://www-cad.eecs.berkeley.edu/~manku/papers/quantiles.ps.gz"> here</A>.
+ *
+ *
+ *
+ *
+ * <p><p><b>3. Approximate quantile finding algorithm for unknown <tt>N</tt> requiring only one pass and little main memory.</b></p>
+ * This algorithm requires at most two times the memory of a corresponding approx. quantile finder knowing <tt>N</tt>.
+ *
+ * <p>Needs as input the following parameters:<p>
+ * <dt>2. <tt>quantiles</tt> - the number of quantiles to be computed. If unknown in advance, set this number large, e.g. <tt>quantiles >= 1000</tt>.
+ * <dt>2. <tt>epsilon</tt> - the allowed approximation error on quantiles. The approximation guarantee of this algorithm is explicit.
+ *
+ * <p>It is also possible to couple the approximation algorithm with random sampling to further reduce memory requirements.
+ * With sampling, the approximation guarantees are explicit but probabilistic, i.e. they apply with respect to a (user controlled) confidence parameter "delta".
+ *
+ * <dt>3. <tt>delta</tt> - the probability allowed that the approximation error fails to be smaller than epsilon. Set <tt>delta</tt> to zero for explicit non probabilistic guarantees.
+ *
+ * <p>After Gurmeet Singh Manku, Sridhar Rajagopalan and Bruce G. Lindsay,
+ * Random Sampling Techniques for Space Efficient Online Computation of Order Statistics of Large Datasets.
+ * Proc. of the 1999 ACM SIGMOD Int. Conf. on Management of Data,
+ * Paper available <A HREF="http://www-cad.eecs.berkeley.edu/~manku/papers/unknown.ps.gz"> here</A>.
+ *
+ * <p><b>Example usage:</b>
+ *
+ *<pre>
+ * _TODO_
+ *</pre><p>
+ *
+ *
+ * @see KnownDoubleQuantileEstimator
+ * @see UnknownDoubleQuantileEstimator
+ */
+
+/** @deprecated until unit tests are in place. Until this time, this class/interface is unsupported. */
+@Deprecated
+public class QuantileFinderFactory {
+
+ private static final Logger log = LoggerFactory.getLogger(QuantileFinderFactory.class);
+
+ /** Make this class non instantiable. Let still allow others to inherit. */
+ private QuantileFinderFactory() {
+ }
+
+ /**
+ * Computes the number of buffers and number of values per buffer such that quantiles can be determined with an
+ * approximation error no more than epsilon with a certain probability.
+ *
+ * Assumes that quantiles are to be computed over N values. The required sampling rate is computed and stored in the
+ * first element of the provided <tt>returnSamplingRate</tt> array, which, therefore must be at least of length 1.
+ *
+ * @param N the number of values over which quantiles shall be computed (e.g <tt>10^6</tt>).
+ * @param epsilon the approximation error which is guaranteed not to be exceeded (e.g. <tt>0.001</tt>)
+ * (<tt>0 <= epsilon <= 1</tt>). To get exact result, set <tt>epsilon=0.0</tt>;
+ * @param delta the probability that the approximation error is more than than epsilon (e.g.
+ * <tt>0.0001</tt>) (<tt>0 <= delta <= 1</tt>). To avoid probabilistic answers, set
+ * <tt>delta=0.0</tt>.
+ * @param quantiles the number of quantiles to be computed (e.g. <tt>100</tt>) (<tt>quantiles >= 1</tt>).
+ * If unknown in advance, set this number large, e.g. <tt>quantiles >= 10000</tt>.
+ * @param returnSamplingRate output parameter, a <tt>double[1]</tt> where the sampling rate is to be filled in.
+ * @return <tt>long[2]</tt> - <tt>long[0]</tt>=the number of buffers, <tt>long[1]</tt>=the number of elements per
+ * buffer, <tt>returnSamplingRate[0]</tt>=the required sampling rate.
+ */
+ public static long[] known_N_compute_B_and_K(long N, double epsilon, double delta, int quantiles,
+ double[] returnSamplingRate) {
+ returnSamplingRate[0] = 1.0;
+ if (epsilon <= 0.0) {
+ // no way around exact quantile search
+ long[] result = new long[2];
+ result[0] = 1;
+ result[1] = N;
+ return result;
+ }
+ if (epsilon >= 1.0 || delta >= 1.0) {
+ // can make any error we wish
+ long[] result = new long[2];
+ result[0] = 2;
+ result[1] = 1;
+ return result;
+ }
+
+ if (delta > 0.0) {
+ return known_N_compute_B_and_K_slow(N, epsilon, delta, quantiles, returnSamplingRate);
+ }
+ return known_N_compute_B_and_K_quick(N, epsilon);
+ }
+
+ /**
+ * Computes the number of buffers and number of values per buffer such that quantiles can be determined with a
+ * <b>guaranteed</b> approximation error no more than epsilon. Assumes that quantiles are to be computed over N
+ * values.
+ *
+ * @param N the anticipated number of values over which quantiles shall be determined.
+ * @param epsilon the approximation error which is guaranteed not to be exceeded (e.g. <tt>0.001</tt>) (<tt>0 <=
+ * epsilon <= 1</tt>). To get exact result, set <tt>epsilon=0.0</tt>;
+ * @return <tt>long[2]</tt> - <tt>long[0]</tt>=the number of buffers, <tt>long[1]</tt>=the number of elements per
+ * buffer.
+ */
+ protected static long[] known_N_compute_B_and_K_quick(long N, double epsilon) {
+ int maxBuffers = 50;
+ int maxHeight = 50;
+ double N_double = (double) N;
+ double c = N_double * epsilon * 2.0;
+ int[] heightMaximums = new int[maxBuffers - 1];
+
+ // for each b, determine maximum height, i.e. the height for which x<=0 and x is a maximum
+ // with x = binomial(b+h-2, h-1) - binomial(b+h-3, h-3) + binomial(b+h-3, h-2) - N * epsilon * 2.0
+ for (int b = 2; b <= maxBuffers; b++) {
+ int h = 3;
+
+ while (h <= maxHeight && // skip heights until x<=0
+ (h - 2) * (Arithmetic.binomial(b + h - 2, h - 1)) -
+ (Arithmetic.binomial(b + h - 3, h - 3)) +
+ (Arithmetic.binomial(b + h - 3, h - 2)) - c
+ > 0.0
+ ) {
+ h++;
+ }
+ //from now on x is monotonically growing...
+ while (h <= maxHeight && // skip heights until x>0
+ (h - 2) * (Arithmetic.binomial(b + h - 2, h - 1)) -
+ (Arithmetic.binomial(b + h - 3, h - 3)) +
+ (Arithmetic.binomial(b + h - 3, h - 2)) - c
+ <= 0.0
+ ) {
+ h++;
+ }
+ h--; //go back to last height
+
+ // was x>0 or did we loop without finding anything?
+ int hMax;
+ if (h >= maxHeight &&
+ (h - 2) * (Arithmetic.binomial(b + h - 2, h - 1)) -
+ (Arithmetic.binomial(b + h - 3, h - 3)) +
+ (Arithmetic.binomial(b + h - 3, h - 2)) - c
+ > 0.0) {
+ hMax = Integer.MIN_VALUE;
+ } else {
+ hMax = h;
+ }
+
+ heightMaximums[b - 2] = hMax; //safe some space
+ } //end for
+
+
+ // for each b, determine the smallest k satisfying the constraints, i.e.
+ // for each b, determine kMin, with kMin = N/binomial(b+hMax-2,hMax-1)
+ long[] kMinimums = new long[maxBuffers - 1];
+ for (int b = 2; b <= maxBuffers; b++) {
+ int h = heightMaximums[b - 2];
+ long kMin = Long.MAX_VALUE;
+ if (h > Integer.MIN_VALUE) {
+ double value = (Arithmetic.binomial(b + h - 2, h - 1));
+ long tmpK = (long) (Math.ceil(N_double / value));
+ if (tmpK <= kMin) {
+ kMin = tmpK;
+ }
+ }
+ kMinimums[b - 2] = kMin;
+ }
+
+ // from all b's, determine b that minimizes b*kMin
+ long multMin = Long.MAX_VALUE;
+ int minB = -1;
+ for (int b = 2; b <= maxBuffers; b++) {
+ if (kMinimums[b - 2] < Long.MAX_VALUE) {
+ long mult = ((long) b) * kMinimums[b - 2];
+ if (mult < multMin) {
+ multMin = mult;
+ minB = b;
+ }
+ }
+ }
+
+ long b, k;
+ if (minB != -1) { // epsilon large enough?
+ b = minB;
+ k = kMinimums[minB - 2];
+ } else { // epsilon is very small or zero.
+ b = 1; // the only possible solution without violating the
+ k = N; // approximation guarantees is exact quantile search.
+ }
+
+ long[] result = new long[2];
+ result[0] = b;
+ result[1] = k;
+ return result;
+ }
+
+ /**
+ * Computes the number of buffers and number of values per buffer such that quantiles can be determined with an
+ * approximation error no more than epsilon with a certain probability. Assumes that quantiles are to be computed over
+ * N values. The required sampling rate is computed and stored in the first element of the provided
+ * <tt>returnSamplingRate</tt> array, which, therefore must be at least of length 1.
+ *
+ * @param N the anticipated number of values over which quantiles shall be computed (e.g 10^6).
+ * @param epsilon the approximation error which is guaranteed not to be exceeded (e.g. <tt>0.001</tt>)
+ * (<tt>0 <= epsilon <= 1</tt>). To get exact result, set <tt>epsilon=0.0</tt>;
+ * @param delta the probability that the approximation error is more than than epsilon (e.g.
+ * <tt>0.0001</tt>) (<tt>0 <= delta <= 1</tt>). To avoid probabilistic answers, set
+ * <tt>delta=0.0</tt>.
+ * @param quantiles the number of quantiles to be computed (e.g. <tt>100</tt>) (<tt>quantiles >= 1</tt>).
+ * If unknown in advance, set this number large, e.g. <tt>quantiles >= 10000</tt>.
+ * @param returnSamplingRate a <tt>double[1]</tt> where the sampling rate is to be filled in.
+ * @return <tt>long[2]</tt> - <tt>long[0]</tt>=the number of buffers, <tt>long[1]</tt>=the number of elements per
+ * buffer, <tt>returnSamplingRate[0]</tt>=the required sampling rate.
+ */
+ protected static long[] known_N_compute_B_and_K_slow(long N, double epsilon, double delta, int quantiles,
+ double[] returnSamplingRate) {
+ int maxBuffers = 50;
+ int maxHeight = 50;
+ double N_double = N;
+
+ // One possibility is to use one buffer of size N
+ //
+ long ret_b = 1;
+ long ret_k = N;
+ double sampling_rate = 1.0;
+ long memory = N;
+
+
+ // Otherwise, there are at least two buffers (b >= 2)
+ // and the height of the tree is at least three (h >= 3)
+ //
+ // We restrict the search for b and h to MAX_BINOM, a large enough value for
+ // practical values of epsilon >= 0.001 and delta >= 0.00001
+ //
+ double logarithm = Math.log(2.0 * quantiles / delta);
+ double c = 2.0 * epsilon * N_double;
+ for (long b = 2; b < maxBuffers; b++) {
+ for (long h = 3; h < maxHeight; h++) {
+ double binomial = Arithmetic.binomial(b + h - 2, h - 1);
+ long tmp = (long) Math.ceil(N_double / binomial);
+ if ((b * tmp < memory) &&
+ ((h - 2) * binomial - Arithmetic.binomial(b + h - 3, h - 3) + Arithmetic.binomial(b + h - 3, h - 2)
+ <= c)) {
+ ret_k = tmp;
+ ret_b = b;
+ memory = ret_k * b;
+ sampling_rate = 1.0;
+ }
+ if (delta > 0.0) {
+ double t = (h - 2) * Arithmetic.binomial(b + h - 2, h - 1) - Arithmetic.binomial(b + h - 3, h - 3) +
+ Arithmetic.binomial(b + h - 3, h - 2);
+ double u = logarithm / epsilon;
+ double v = Arithmetic.binomial(b + h - 2, h - 1);
+ double w = logarithm / (2.0 * epsilon * epsilon);
+
+ // From our SIGMOD 98 paper, we have two equantions to satisfy:
+ // t <= u * alpha/(1-alpha)^2
+ // kv >= w/(1-alpha)^2
+ //
+ // Denoting 1/(1-alpha) by x,
+ // we see that the first inequality is equivalent to
+ // t/u <= x^2 - x
+ // which is satisfied by x >= 0.5 + 0.5 * sqrt (1 + 4t/u)
+ // Plugging in this value into second equation yields
+ // k >= wx^2/v
+
+ double x = 0.5 + 0.5 * Math.sqrt(1.0 + 4.0 * t / u);
+ long k = (long) Math.ceil(w * x * x / v);
+ if (b * k < memory) {
+ ret_k = k;
+ ret_b = b;
+ memory = b * k;
+ sampling_rate = N_double * 2.0 * epsilon * epsilon / logarithm;
+ }
+ }
+ }
+ }
+
+ long[] result = new long[2];
+ result[0] = ret_b;
+ result[1] = ret_k;
+ returnSamplingRate[0] = sampling_rate;
+ return result;
+ }
+
+ /**
+ * Returns a quantile finder that minimizes the amount of memory needed under the user provided constraints.
+ *
+ * Many applications don't know in advance over how many elements quantiles are to be computed. However, some of them
+ * can give an upper limit, which will assist the factory in choosing quantile finders with minimal memory
+ * requirements. For example if you select values from a database and fill them into histograms, then you probably
+ * don't know how many values you will fill, but you probably do know that you will fill at most <tt>S</tt> elements,
+ * the size of your database.
+ *
+ * @param known_N specifies whether the number of elements over which quantiles are to be computed is known or not.
+ * @param N if <tt>known_N==true</tt>, the number of elements over which quantiles are to be computed. if
+ * <tt>known_N==false</tt>, the upper limit on the number of elements over which quantiles are to be
+ * computed. If such an upper limit is a-priori unknown, then set <tt>N = Long.MAX_VALUE</tt>.
+ * @param epsilon the approximation error which is guaranteed not to be exceeded (e.g. <tt>0.001</tt>) (<tt>0 <=
+ * epsilon <= 1</tt>). To get exact result, set <tt>epsilon=0.0</tt>;
+ * @param delta the probability that the approximation error is more than than epsilon (e.g. 0.0001) (0 <=
+ * delta <= 1). To avoid probabilistic answers, set <tt>delta=0.0</tt>.
+ * @param quantiles the number of quantiles to be computed (e.g. <tt>100</tt>) (<tt>quantiles >= 1</tt>). If
+ * unknown in advance, set this number large, e.g. <tt>quantiles >= 10000</tt>.
+ * @param generator a uniform random number generator. Set this parameter to <tt>null</tt> to use a default
+ * generator.
+ * @return the quantile finder minimizing memory requirements under the given constraints.
+ */
+ public static DoubleQuantileFinder newDoubleQuantileFinder(boolean known_N, long N, double epsilon, double delta,
+ int quantiles, RandomEngine generator) {
+ //boolean known_N = true;
+ //if (N==Long.MAX_VALUE) known_N = false;
+ // check parameters.
+ // if they are illegal, keep quite and return an exact finder.
+ if (epsilon <= 0.0 || N < 1000) {
+ return new ExactDoubleQuantileFinder();
+ }
+ if (epsilon > 1) {
+ epsilon = 1;
+ }
+ if (delta < 0) {
+ delta = 0;
+ }
+ if (delta > 1) {
+ delta = 1;
+ }
+ if (quantiles < 1) {
+ quantiles = 1;
+ }
+ if (quantiles > N) {
+ N = quantiles;
+ }
+
+ //KnownDoubleQuantileEstimator finder;
+ if (known_N) {
+ double[] samplingRate = new double[1];
+ long[] resultKnown = known_N_compute_B_and_K(N, epsilon, delta, quantiles, samplingRate);
+ long b = resultKnown[0];
+ long k = resultKnown[1];
+ if (b == 1) {
+ return new ExactDoubleQuantileFinder();
+ }
+ return new KnownDoubleQuantileEstimator((int) b, (int) k, N, samplingRate[0], generator);
+ } else {
+ long[] resultUnknown = unknown_N_compute_B_and_K(epsilon, delta, quantiles);
+ long b1 = resultUnknown[0];
+ long k1 = resultUnknown[1];
+ long h1 = resultUnknown[2];
+ double preComputeEpsilon = -1.0;
+ if (resultUnknown[3] == 1) {
+ preComputeEpsilon = epsilon;
+ }
+
+ //if (N==Long.MAX_VALUE) { // no maximum N provided by user.
+
+ // if (true) fixes bug reported by LarryPeranich@fairisaac.com
+ //if (true) { // no maximum N provided by user.
+ if (b1 == 1) {
+ return new ExactDoubleQuantileFinder();
+ }
+ return new UnknownDoubleQuantileEstimator((int) b1, (int) k1, (int) h1, preComputeEpsilon, generator);
+ //}
+
+ /*
+ // determine whether UnknownFinder or KnownFinder with maximum N requires less memory.
+ double[] samplingRate = new double[1];
+
+ // IMPORTANT: for known finder, switch sampling off (delta == 0) !!!
+ // with knownN-sampling we can only guarantee the errors if the input sequence has EXACTLY N elements.
+ // with knownN-no sampling we can also guarantee the errors for sequences SMALLER than N elements.
+ long[] resultKnown = known_N_compute_B_and_K(N, epsilon, 0, quantiles, samplingRate);
+
+ long b2 = resultKnown[0];
+ long k2 = resultKnown[1];
+
+ if (b2 * k2 < b1 * k1) { // the KnownFinder is smaller
+ if (b2 == 1) return new ExactDoubleQuantileFinder();
+ return new KnownDoubleQuantileEstimator((int) b2, (int) k2, N, samplingRate[0], generator);
+ }
+
+ // the UnknownFinder is smaller
+ if (b1 == 1) return new ExactDoubleQuantileFinder();
+ return new UnknownDoubleQuantileEstimator((int) b1, (int) k1, (int) h1, preComputeEpsilon, generator);
+ */
+ }
+ }
+
+ /**
+ * Convenience method that computes phi's for equi-depth histograms. This is simply a list of numbers with <tt>i /
+ * (double)quantiles</tt> for <tt>i={1,2,...,quantiles-1}</tt>.
+ *
+ * @return the equi-depth phi's
+ */
+ public static org.apache.mahout.math.list.DoubleArrayList newEquiDepthPhis(int quantiles) {
+ DoubleArrayList phis =
+ new DoubleArrayList(quantiles - 1);
+ for (int i = 1; i <= quantiles - 1; i++) {
+ phis.add(i / (double) quantiles);
+ }
+ return phis;
+ }
+
+ /**
+ * Computes the number of buffers and number of values per buffer such that quantiles can be determined with an
+ * approximation error no more than epsilon with a certain probability.
+ *
+ * @param epsilon the approximation error which is guaranteed not to be exceeded (e.g. <tt>0.001</tt>) (<tt>0 <=
+ * epsilon <= 1</tt>). To get exact results, set <tt>epsilon=0.0</tt>;
+ * @param delta the probability that the approximation error is more than than epsilon (e.g. <tt>0.0001</tt>)
+ * (<tt>0 <= delta <= 1</tt>). To get exact results, set <tt>delta=0.0</tt>.
+ * @param quantiles the number of quantiles to be computed (e.g. <tt>100</tt>) (<tt>quantiles >= 1</tt>). If
+ * unknown in advance, set this number large, e.g. <tt>quantiles >= 10000</tt>.
+ * @return <tt>long[4]</tt> - <tt>long[0]</tt>=the number of buffers, <tt>long[1]</tt>=the number of elements per
+ * buffer, <tt>long[2]</tt>=the tree height where sampling shall start, <tt>long[3]==1</tt> if precomputing is
+ * better, otherwise 0;
+ */
+ public static long[] unknown_N_compute_B_and_K(double epsilon, double delta, int quantiles) {
+ return unknown_N_compute_B_and_K_raw(epsilon, delta, quantiles);
+ // move stuff from _raw(..) here and delete _raw(...)
+
+ /*
+ long[] result_1 = unknown_N_compute_B_and_K_raw(epsilon,delta,quantiles);
+ long b1 = result_1[0];
+ long k1 = result_1[1];
+
+
+ int quantilesToPrecompute = (int) Doubles.ceiling(1.0 / epsilon);
+
+ if (quantiles>quantilesToPrecompute) {
+ // try if precomputing quantiles requires less memory.
+ long[] result_2 = unknown_N_compute_B_and_K_raw(epsilon/2.0,delta,quantilesToPrecompute);
+
+ long b2 = result_2[0];
+ long k2 = result_2[1];
+ if (b2*k2 < b1*k1) {
+ result_2[3] = 1; //precomputation is better
+ result_1 = result_2;
+ }
+ }
+ return result_1;
+ */
+ }
+
+ /**
+ * Computes the number of buffers and number of values per buffer such that quantiles can be determined with an
+ * approximation error no more than epsilon with a certain probability. <b>You never need to call this method.</b> It
+ * is only for curious users wanting to gain some insight into the workings of the algorithms.
+ *
+ * @param epsilon the approximation error which is guaranteed not to be exceeded (e.g. <tt>0.001</tt>) (<tt>0 <=
+ * epsilon <= 1</tt>). To get exact result, set <tt>epsilon=0.0</tt>;
+ * @param delta the probability that the approximation error is more than than epsilon (e.g. <tt>0.0001</tt>)
+ * (<tt>0 <= delta <= 1</tt>). To get exact results, set <tt>delta=0.0</tt>.
+ * @param quantiles the number of quantiles to be computed (e.g. <tt>100</tt>) (<tt>quantiles >= 1</tt>). If
+ * unknown in advance, set this number large, e.g. <tt>quantiles >= 10000</tt>.
+ * @return <tt>long[4]</tt> - <tt>long[0]</tt>=the number of buffers, <tt>long[1]</tt>=the number of elements per
+ * buffer, <tt>long[2]</tt>=the tree height where sampling shall start, <tt>long[3]==1</tt> if precomputing is
+ * better, otherwise 0;
+ */
+ protected static long[] unknown_N_compute_B_and_K_raw(double epsilon, double delta, int quantiles) {
+ // delta can be set to zero, i.e., all quantiles should be approximate with probability 1
+ if (epsilon <= 0.0) {
+ long[] result = new long[4];
+ result[0] = 1;
+ result[1] = Long.MAX_VALUE;
+ result[2] = Long.MAX_VALUE;
+ result[3] = 0;
+ return result;
+ }
+ if (epsilon >= 1.0 || delta >= 1.0) {
+ // can make any error we wish
+ long[] result = new long[4];
+ result[0] = 2;
+ result[1] = 1;
+ result[2] = 3;
+ result[3] = 0;
+ return result;
+ }
+ if (delta <= 0.0) {
+ // no way around exact quantile search
+ long[] result = new long[4];
+ result[0] = 1;
+ result[1] = Long.MAX_VALUE;
+ result[2] = Long.MAX_VALUE;
+ result[3] = 0;
+ return result;
+ }
+
+ int max_b = 50;
+ int max_h = 50;
+ int max_H = 50;
+ int max_Iterations = 2;
+
+ long best_b = Long.MAX_VALUE;
+ long best_k = Long.MAX_VALUE;
+ long best_h = Long.MAX_VALUE;
+ long best_memory = Long.MAX_VALUE;
+
+ double pow = Math.pow(2.0, max_H);
+ double logDelta = Math.log(2.0 / (delta / quantiles)) / (2.0 * epsilon * epsilon);
+ //double logDelta = Math.log(2.0/(quantiles*delta)) / (2.0*epsilon*epsilon);
+
+ while (best_b == Long.MAX_VALUE && max_Iterations-- > 0) { //until we find a solution
+ // identify that combination of b and h that minimizes b*k.
+ // exhaustive search.
+ for (int b = 2; b <= max_b; b++) {
+ for (int h = 2; h <= max_h; h++) {
+ double Ld = Arithmetic.binomial(b + h - 2, h - 1);
+ double Ls = Arithmetic.binomial(b + h - 3, h - 1);
+
+ // now we have k>=c*(1-alpha)^-2.
+ // let's compute c.
+ //double c = Math.log(2.0/(delta/quantiles)) / (2.0*epsilon*epsilon*Math.min(Ld, 8.0*Ls/3.0));
+ double c = logDelta / Math.min(Ld, 8.0 * Ls / 3.0);
+
+ // now we have k>=d/alpha.
+ // let's compute d.
+ double beta = Ld / Ls;
+ double cc = (beta - 2.0) * (max_H - 2.0) / (beta + pow - 2.0);
+ double d = (h + 3 + cc) / (2.0 * epsilon);
+
+ /*
+ double d = (Ld*(h+max_H-1.0) + Ls*((h+1)*pow - 2.0*(h+max_H))) / (Ld + Ls*(pow-2.0));
+ d = (d + 2.0) / (2.0*epsilon);
+ */
+
+ // now we have c*(1-alpha)^-2 == d/alpha.
+ // we solve this equation for alpha yielding two solutions
+ // alpha_1,2 = (c + 2*d +- Sqrt(c*c + 4*c*d))/(2*d)
+ double f = c * c + 4.0 * c * d;
+ if (f < 0.0) {
+ continue;
+ } // non real solution to equation
+ double root = Math.sqrt(f);
+ double alpha_one = (c + 2.0 * d + root) / (2.0 * d);
+ double alpha_two = (c + 2.0 * d - root) / (2.0 * d);
+
+ // any alpha must satisfy 0<alpha<1 to yield valid solutions
+ boolean alpha_one_OK = false;
+ if (0.0 < alpha_one && alpha_one < 1.0) {
+ alpha_one_OK = true;
+ }
+ boolean alpha_two_OK = false;
+ if (0.0 < alpha_two && alpha_two < 1.0) {
+ alpha_two_OK = true;
+ }
+ if (alpha_one_OK || alpha_two_OK) {
+ double alpha = alpha_one;
+ if (alpha_one_OK && alpha_two_OK) {
+ // take the alpha that minimizes d/alpha
+ alpha = Math.max(alpha_one, alpha_two);
+ } else if (alpha_two_OK) {
+ alpha = alpha_two;
+ }
+
+ // now we have k=Ceiling(Max(d/alpha, (h+1)/(2*epsilon)))
+ long k = (long) Math.ceil(Math.max(d / alpha, (h + 1) / (2.0 * epsilon)));
+ if (k > 0) { // valid solution?
+ long memory = b * k;
+ if (memory < best_memory) {
+ // found a solution requiring less memory
+ best_k = k;
+ best_b = b;
+ best_h = h;
+ best_memory = memory;
+ }
+ }
+ }
+ } //end for h
+ } //end for b
+
+ if (best_b == Long.MAX_VALUE) {
+ log.warn("Computing b and k looks like a lot of work!");
+ // no solution found so far. very unlikely. Anyway, try again.
+ max_b *= 2;
+ max_h *= 2;
+ max_H *= 2;
+ }
+ } //end while
+
+ long[] result = new long[4];
+ result[3] = 0;
+ if (best_b == Long.MAX_VALUE) {
+ // no solution found.
+ // no way around exact quantile search.
+ result[0] = 1;
+ result[1] = Long.MAX_VALUE;
+ result[2] = Long.MAX_VALUE;
+ } else {
+ result[0] = best_b;
+ result[1] = best_k;
+ result[2] = best_h;
+ }
+
+ return result;
+ }
+}
Propchange: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/QuantileFinderFactory.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/UnknownDoubleQuantileEstimator.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/UnknownDoubleQuantileEstimator.java?rev=891983&view=auto
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/UnknownDoubleQuantileEstimator.java (added)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/UnknownDoubleQuantileEstimator.java Thu Dec 17 23:22:16 2009
@@ -0,0 +1,188 @@
+/*
+Copyright � 1999 CERN - European Organization for Nuclear Research.
+Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
+is hereby granted without fee, provided that the above copyright notice appear in all copies and
+that both that copyright notice and this permission notice appear in supporting documentation.
+CERN makes no representations about the suitability of this software for any purpose.
+It is provided "as is" without expressed or implied warranty.
+*/
+package org.apache.mahout.math.jet.stat.quantile;
+
+import org.apache.mahout.math.jet.random.engine.RandomEngine;
+import org.apache.mahout.math.jet.random.sampling.WeightedRandomSampler;
+import org.apache.mahout.math.list.DoubleArrayList;
+import org.apache.mahout.math.list.ObjectArrayList;
+
+import java.util.Comparator;
+
+/**
+ * Approximate quantile finding algorithm for unknown <tt>N</tt> requiring only one pass and little main memory;
+ * computes quantiles over a sequence of <tt>double</tt> elements. This algorithm requires at most two times the memory
+ * of a corresponding approx. quantile finder knowing <tt>N</tt>.
+ *
+ * <p>Needs as input the following parameters:<p> <dt>1. <tt>quantiles</tt> - the number of quantiles to be computed.
+ * <dt>2. <tt>epsilon</tt> - the allowed approximation error on quantiles. The approximation guarantee of this algorithm
+ * is explicit.
+ *
+ * <p>It is also possible to couple the approximation algorithm with random sampling to further reduce memory
+ * requirements. With sampling, the approximation guarantees are explicit but probabilistic, i.e. they apply with
+ * respect to a (user controlled) confidence parameter "delta".
+ *
+ * <dt>3. <tt>delta</tt> - the probability allowed that the approximation error fails to be smaller than epsilon. Set
+ * <tt>delta</tt> to zero for explicit non probabilistic guarantees.
+ *
+ * You usually don't instantiate quantile finders by using the constructor. Instead use the factory
+ * <tt>QuantileFinderFactor</tt> to do so. It will set up the right parametrization for you.
+ *
+ * <p>After Gurmeet Singh Manku, Sridhar Rajagopalan and Bruce G. Lindsay, Random Sampling Techniques for Space
+ * Efficient Online Computation of Order Statistics of Large Datasets. Accepted for Proc. of the 1999 ACM SIGMOD Int.
+ * Conf. on Management of Data, Paper (soon) available <A HREF="http://www-cad.eecs.berkeley.edu/~manku"> here</A>.
+ *
+ * @see QuantileFinderFactory
+ */
+class UnknownDoubleQuantileEstimator extends DoubleQuantileEstimator {
+
+ private int currentTreeHeight;
+ private final int treeHeightStartingSampling;
+ private WeightedRandomSampler sampler;
+ private final double precomputeEpsilon;
+
+ /**
+ * Constructs an approximate quantile finder with b buffers, each having k elements.
+ *
+ * @param b the number of buffers
+ * @param k the number of elements per buffer
+ * @param h the tree height at which sampling shall start.
+ * @param precomputeEpsilon the epsilon for which quantiles shall be precomputed; set this value <=0.0 if nothing
+ * shall be precomputed.
+ * @param generator a uniform random number generator.
+ */
+ UnknownDoubleQuantileEstimator(int b, int k, int h, double precomputeEpsilon, RandomEngine generator) {
+ this.sampler = new WeightedRandomSampler(1, generator);
+ setUp(b, k);
+ this.treeHeightStartingSampling = h;
+ this.precomputeEpsilon = precomputeEpsilon;
+ this.clear();
+ }
+
+ /** Not yet commented. */
+ @Override
+ protected DoubleBuffer[] buffersToCollapse() {
+ DoubleBuffer[] fullBuffers = bufferSet._getFullOrPartialBuffers();
+
+ sortAscendingByLevel(fullBuffers);
+
+ // if there is only one buffer at the lowest level, then increase its level so that there are at least two at the lowest level.
+ int minLevel = fullBuffers[1].level();
+ if (fullBuffers[0].level() < minLevel) {
+ fullBuffers[0].level(minLevel);
+ }
+
+ return bufferSet._getFullOrPartialBuffersWithLevel(minLevel);
+ }
+
+ /**
+ * Removes all elements from the receiver. The receiver will be empty after this call returns, and its memory
+ * requirements will be close to zero.
+ */
+ @Override
+ public synchronized void clear() {
+ super.clear();
+ this.currentTreeHeight = 1;
+ this.sampler.setWeight(1);
+ }
+
+ /**
+ * Returns a deep copy of the receiver.
+ *
+ * @return a deep copy of the receiver.
+ */
+ @Override
+ public Object clone() {
+ UnknownDoubleQuantileEstimator copy = (UnknownDoubleQuantileEstimator) super.clone();
+ if (this.sampler != null) {
+ copy.sampler = (WeightedRandomSampler) copy.sampler.clone();
+ }
+ return copy;
+ }
+
+ /** Not yet commented. */
+ @Override
+ protected void newBuffer() {
+ currentBufferToFill = bufferSet._getFirstEmptyBuffer();
+ if (currentBufferToFill == null) {
+ throw new IllegalStateException("Oops, no empty buffer.");
+ }
+
+ currentBufferToFill.level(currentTreeHeight - 1);
+ currentBufferToFill.weight(sampler.getWeight());
+ }
+
+ /** Not yet commented. */
+ @Override
+ protected void postCollapse(DoubleBuffer[] toCollapse) {
+ if (toCollapse.length == bufferSet.b()) { //delta for unknown finder
+ currentTreeHeight++;
+ if (currentTreeHeight >= treeHeightStartingSampling) {
+ sampler.setWeight(sampler.getWeight() * 2);
+ }
+ }
+ }
+
+ /**
+ * Computes the specified quantile elements over the values previously added.
+ *
+ * @param phis the quantiles for which elements are to be computed. Each phi must be in the interval (0.0,1.0].
+ * <tt>phis</tt> must be sorted ascending.
+ * @return the approximate quantile elements.
+ */
+ @Override
+ public DoubleArrayList quantileElements(DoubleArrayList phis) {
+ if (precomputeEpsilon <= 0.0) {
+ return super.quantileElements(phis);
+ }
+
+ int quantilesToPrecompute = (int) Utils.epsilonCeiling(1.0 / precomputeEpsilon);
+
+ //select that quantile from the precomputed set that corresponds to a position closest to phi.
+ phis = phis.copy();
+ double e = precomputeEpsilon;
+ for (int index = phis.size(); --index >= 0;) {
+ double phi = phis.get(index);
+ int i = (int) Math.round(((2.0 * phi / e) - 1.0) / 2.0); // finds closest
+ i = Math.min(quantilesToPrecompute - 1, Math.max(0, i));
+ double augmentedPhi = (e / 2.0) * (1 + 2 * i);
+ phis.set(index, augmentedPhi);
+ }
+
+ return super.quantileElements(phis);
+ }
+
+ /** Not yet commented. */
+ @Override
+ protected boolean sampleNextElement() {
+ return sampler.sampleNextElement();
+ }
+
+ /** To do. This could faster be done without sorting (min and second min). */
+ private static void sortAscendingByLevel(DoubleBuffer[] fullBuffers) {
+ new ObjectArrayList(fullBuffers).quickSortFromTo(0, fullBuffers.length - 1,
+ new Comparator<Object>() {
+ @Override
+ public int compare(Object o1, Object o2) {
+ int l1 = ((Buffer) o1).level();
+ int l2 = ((Buffer) o2).level();
+ return l1 < l2 ? -1 : l1 == l2 ? 0 : 1;
+ }
+ }
+ );
+ }
+
+ /** Returns a String representation of the receiver. */
+ public String toString() {
+ StringBuffer buf = new StringBuffer(super.toString());
+ buf.setLength(buf.length() - 1);
+ return buf + ", h=" + currentTreeHeight + ", hStartSampling=" + treeHeightStartingSampling +
+ ", precomputeEpsilon=" + precomputeEpsilon + ')';
+ }
+}
Propchange: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/UnknownDoubleQuantileEstimator.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/Utils.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/Utils.java?rev=891983&view=auto
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/Utils.java (added)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/Utils.java Thu Dec 17 23:22:16 2009
@@ -0,0 +1,22 @@
+/*
+Copyright � 1999 CERN - European Organization for Nuclear Research.
+Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
+is hereby granted without fee, provided that the above copyright notice appear in all copies and
+that both that copyright notice and this permission notice appear in supporting documentation.
+CERN makes no representations about the suitability of this software for any purpose.
+It is provided "as is" without expressed or implied warranty.
+*/
+package org.apache.mahout.math.jet.stat.quantile;
+
+/** Holds some utility methods shared by different quantile finding implementations. */
+class Utils {
+
+ private Utils() {
+ }
+
+ /** Similar to Math.ceil(value), but adjusts small numerical rounding errors +- epsilon. */
+ public static long epsilonCeiling(double value) {
+ double epsilon = 0.0000001;
+ return (long) Math.ceil(value - epsilon);
+ }
+}
Propchange: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/Utils.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/package.html
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/package.html?rev=891983&view=auto
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/package.html (added)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/package.html Thu Dec 17 23:22:16 2009
@@ -0,0 +1,30 @@
+<HTML>
+<BODY>
+Scalable algorithms and data structures to compute approximate quantiles over very large data sequences.
+The approximation guarantees are explicit, and apply for arbitrary value distributions and arrival distributions of the
+dataset.
+The main memory requirements are smaller than for any other known technique by an order of magnitude.
+
+<p>The approx. algorithms are primarily intended to help applications scale.
+ When faced with a large data sequence, traditional methods either need very large memories or time consuming disk
+ based sorting.
+ In constrast, the approx. algorithms can deal with > 10^10 values without disk based sorting.
+
+<p>All classes can be seen from various angles, for example as
+ <dt>1. Algorithm to compute quantiles.
+ <dt>2. 1-dim-equi-depth histogram.
+ <dt>3. 1-dim-histogram arbitrarily rebinnable in real-time.
+ <dt>4. A space efficient MultiSet data structure using lossy compression.
+ <dt>5. A space efficient value preserving bin of a 2-dim or d-dim histogram.
+ <dt>(All subject to an accuracy specified by the user.)
+
+ Have a look at the documentation of class {@link org.apache.mahout.math.jet.stat.quantile.QuantileFinderFactory} and the
+ interface
+ {@link org.apache.mahout.math.jet.stat.quantile.DoubleQuantileFinder} to learn more.
+ Most users will never need to know more than how to use these.
+ Actual implementations of the <tt>QuantileFinder</tt> interface are hidden.
+ They are indirectly constructed via the the factory.
+ <br>
+ Also see {@link hep.aida.bin.QuantileBin1D}, demonstrating how this package can be used.
+</BODY>
+</HTML>
Propchange: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/package.html
------------------------------------------------------------------------------
svn:eol-style = native