You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by gs...@apache.org on 2009/12/18 00:22:41 UTC

svn commit: r891983 [17/47] - in /lucene/mahout/trunk: ./ core/ core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ core/src/main/java/org/apache/mahout/clustering/ core/src/main/java/org/apache/mahout/clustering/canopy/ core/src/main/java/org/a...

Added: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/KnownDoubleQuantileEstimator.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/KnownDoubleQuantileEstimator.java?rev=891983&view=auto
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/KnownDoubleQuantileEstimator.java (added)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/KnownDoubleQuantileEstimator.java Thu Dec 17 23:22:16 2009
@@ -0,0 +1,292 @@
+/*
+Copyright � 1999 CERN - European Organization for Nuclear Research.
+Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose 
+is hereby granted without fee, provided that the above copyright notice appear in all copies and 
+that both that copyright notice and this permission notice appear in supporting documentation. 
+CERN makes no representations about the suitability of this software for any purpose. 
+It is provided "as is" without expressed or implied warranty.
+*/
+package org.apache.mahout.math.jet.stat.quantile;
+
+import org.apache.mahout.math.jet.math.Arithmetic;
+import org.apache.mahout.math.jet.random.engine.RandomEngine;
+import org.apache.mahout.math.jet.random.sampling.RandomSamplingAssistant;
+import org.apache.mahout.math.list.DoubleArrayList;
+
+/**
+ * Approximate quantile finding algorithm for known <tt>N</tt> requiring only one pass and little main memory; computes
+ * quantiles over a sequence of <tt>double</tt> elements.
+ *
+ * <p>Needs as input the following parameters:<p> <dt>1. <tt>N</tt> - the number of values of the data sequence over
+ * which quantiles are to be determined. <dt>2. <tt>quantiles</tt> - the number of quantiles to be computed. <dt>3.
+ * <tt>epsilon</tt> - the allowed approximation error on quantiles. The approximation guarantee of this algorithm is
+ * explicit.
+ *
+ * <p>It is also possible to couple the approximation algorithm with random sampling to further reduce memory
+ * requirements. With sampling, the approximation guarantees are explicit but probabilistic, i.e. they apply with
+ * respect to a (user controlled) confidence parameter "delta".
+ *
+ * <dt>4. <tt>delta</tt> - the probability allowed that the approximation error fails to be smaller than epsilon. Set
+ * <tt>delta</tt> to zero for explicit non probabilistic guarantees.
+ *
+ * You usually don't instantiate quantile finders by using the constructor. Instead use the factory
+ * <tt>QuantileFinderFactor</tt> to do so. It will set up the right parametrization for you.
+ *
+ * <p>After Gurmeet Singh Manku, Sridhar Rajagopalan and Bruce G. Lindsay, Approximate Medians and other Quantiles in
+ * One Pass and with Limited Memory, Proc. of the 1998 ACM SIGMOD Int. Conf. on Management of Data, Paper available <A
+ * HREF="http://www-cad.eecs.berkeley.edu/~manku"> here</A>.
+ *
+ * @see QuantileFinderFactory
+ */
+class KnownDoubleQuantileEstimator extends DoubleQuantileEstimator {
+
+  protected double beta; //correction factor for phis
+
+  protected boolean weHadMoreThanOneEmptyBuffer;
+
+  protected RandomSamplingAssistant samplingAssistant;
+  protected final double samplingRate; // see method sampleNextElement()
+  protected final long N; // see method sampleNextElement()
+
+  /**
+   * Constructs an approximate quantile finder with b buffers, each having k elements.
+   *
+   * @param b            the number of buffers
+   * @param k            the number of elements per buffer
+   * @param N            the total number of elements over which quantiles are to be computed.
+   * @param samplingRate 1.0 --> all elements are consumed. 10.0 --> Consumes one random element from successive blocks
+   *                     of 10 elements each. Etc.
+   * @param generator    a uniform random number generator.
+   */
+  KnownDoubleQuantileEstimator(int b, int k, long N, double samplingRate, RandomEngine generator) {
+    this.samplingRate = samplingRate;
+    this.N = N;
+
+    if (this.samplingRate <= 1.0) {
+      this.samplingAssistant = null;
+    } else {
+      this.samplingAssistant = new RandomSamplingAssistant(Arithmetic.floor(N / samplingRate), N, generator);
+    }
+
+    setUp(b, k);
+    this.clear();
+  }
+
+  /**
+   * @param missingInfinities the number of infinities to fill.
+   * @param buffer            the buffer into which the infinities shall be filled.
+   */
+  protected void addInfinities(int missingInfinities, DoubleBuffer buffer) {
+    RandomSamplingAssistant oldAssistant = this.samplingAssistant;
+    this.samplingAssistant = null; // switch off sampler
+    //double[] infinities = new double[missingInfinities];
+
+    boolean even = true;
+    for (int i = 0; i < missingInfinities; i++) {
+      if (even) {
+        buffer.values.add(Double.MAX_VALUE);
+      } else {
+        buffer.values.add(-Double.MAX_VALUE);
+      }
+
+      //if (even) {infinities[i]=Double.MAX_VALUE;}
+      //else    {infinities[i]=-Double.MAX_VALUE;}
+
+      //if (even) {this.add(Double.MAX_VALUE);}
+      //else    {this.add(-Double.MAX_VALUE);}
+      even = !even;
+    }
+
+    //buffer.values.addAllOfFromTo(new DoubleArrayList(infinities),0,missingInfinities-1);
+
+    //this.totalElementsFilled -= infinities;
+
+    this.samplingAssistant = oldAssistant; // switch on sampler again
+  }
+
+  /** Not yet commented. */
+  @Override
+  protected DoubleBuffer[] buffersToCollapse() {
+    int minLevel = bufferSet._getMinLevelOfFullOrPartialBuffers();
+    return bufferSet._getFullOrPartialBuffersWithLevel(minLevel);
+  }
+
+  /**
+   * Removes all elements from the receiver.  The receiver will be empty after this call returns, and its memory
+   * requirements will be close to zero.
+   */
+  @Override
+  public void clear() {
+    super.clear();
+    this.beta = 1.0;
+    this.weHadMoreThanOneEmptyBuffer = false;
+    //this.setSamplingRate(samplingRate,N);
+
+    RandomSamplingAssistant assist = this.samplingAssistant;
+    if (assist != null) {
+      this.samplingAssistant =
+          new RandomSamplingAssistant(Arithmetic.floor(N / samplingRate), N, assist.getRandomGenerator());
+    }
+  }
+
+  /**
+   * Returns a deep copy of the receiver.
+   *
+   * @return a deep copy of the receiver.
+   */
+  @Override
+  public Object clone() {
+    KnownDoubleQuantileEstimator copy = (KnownDoubleQuantileEstimator) super.clone();
+    if (this.samplingAssistant != null) {
+      copy.samplingAssistant = (RandomSamplingAssistant) copy.samplingAssistant.clone();
+    }
+    return copy;
+  }
+
+  /** Not yet commented. */
+  @Override
+  protected void newBuffer() {
+    int numberOfEmptyBuffers = this.bufferSet._getNumberOfEmptyBuffers();
+    //DoubleBuffer[] emptyBuffers = this.bufferSet._getEmptyBuffers();
+    if (numberOfEmptyBuffers == 0) {
+      throw new IllegalStateException("Oops, no empty buffer.");
+    }
+
+    this.currentBufferToFill = this.bufferSet._getFirstEmptyBuffer();
+    if (numberOfEmptyBuffers == 1 && !this.weHadMoreThanOneEmptyBuffer) {
+      this.currentBufferToFill.level(this.bufferSet._getMinLevelOfFullOrPartialBuffers());
+    } else {
+      this.weHadMoreThanOneEmptyBuffer = true;
+      this.currentBufferToFill.level(0);
+      /*
+      for (int i=0; i<emptyBuffers.length; i++) {
+        emptyBuffers[i].level = 0;
+      }
+      */
+    }
+    //currentBufferToFill.state = DoubleBuffer.PARTIAL;
+    this.currentBufferToFill.weight(1);
+  }
+
+  /** Not yet commented. */
+  @Override
+  protected void postCollapse(DoubleBuffer[] toCollapse) {
+    this.weHadMoreThanOneEmptyBuffer = false;
+  }
+
+  /**
+   */
+  @Override
+  protected DoubleArrayList preProcessPhis(DoubleArrayList phis) {
+    if (beta > 1.0) {
+      phis = phis.copy();
+      for (int i = phis.size(); --i >= 0;) {
+        phis.set(i, (2 * phis.get(i) + beta - 1) / (2 * beta));
+      }
+    }
+    return phis;
+  }
+
+  /**
+   * Computes the specified quantile elements over the values previously added.
+   *
+   * @param phis the quantiles for which elements are to be computed. Each phi must be in the interval [0.0,1.0].
+   *             <tt>phis</tt> must be sorted ascending.
+   * @return the approximate quantile elements.
+   */
+  @Override
+  public DoubleArrayList quantileElements(DoubleArrayList phis) {
+    /*
+  * The KNOWN quantile finder reads off quantiles from FULL buffers only.
+  * Since there might be a partially full buffer, this method first satisfies this constraint by temporarily filling a few +infinity, -infinity elements to make up a full block.
+  * This is in full conformance with the explicit approximation guarantees.
+   *
+   * For those of you working on online apps:
+    * The approximation guarantees are given for computing quantiles AFTER N elements have been filled, not for intermediate displays.
+  * If you have one thread filling and another thread displaying concurrently, you will note that in the very beginning the infinities will dominate the display.
+   * This could confuse users, because, of course, they don't expect any infinities, even if they "disappear" after a short while.
+   * To prevent panic exclude phi's close to zero or one in the early phases of processing.
+    */
+    DoubleBuffer partial = this.bufferSet._getPartialBuffer();
+    int missingValues = 0;
+    if (partial != null) { // any auxiliary infinities needed?
+      missingValues = bufferSet.k() - partial.size();
+      if (missingValues <= 0) {
+        throw new IllegalStateException("Oops! illegal missing values.");
+      }
+
+      //log.info("adding "+missingValues+" infinity elements...");
+      this.addInfinities(missingValues, partial);
+
+      //determine beta (N + Infinity values = beta * N)
+      this.beta = (this.totalElementsFilled + missingValues) / (double) this.totalElementsFilled;
+    } else {
+      this.beta = 1.0;
+    }
+
+    DoubleArrayList quantileElements = super.quantileElements(phis);
+
+    // restore state we were in before.
+    // remove the temporarily added infinities.
+    if (partial != null) {
+      removeInfinitiesFrom(missingValues, partial);
+    }
+
+    // now you can continue filling the remaining values, if any.
+    return quantileElements;
+  }
+
+  /**
+   * Reading off quantiles requires to fill some +infinity, -infinity values to make a partial buffer become full.
+   *
+   * This method removes the infinities which were previously temporarily added to a partial buffer. Removing them is
+   * necessary if we want to continue filling more elements. Precondition: the buffer is sorted ascending.
+   *
+   * @param infinities the number of infinities previously filled.
+   * @param buffer     the buffer into which the infinities were filled.
+   */
+  protected void removeInfinitiesFrom(int infinities, DoubleBuffer buffer) {
+    int plusInf = 0;
+    int minusInf = 0;
+    // count them (this is not very clever but it's safe)
+    boolean even = true;
+    for (int i = 0; i < infinities; i++) {
+      if (even) {
+        plusInf++;
+      } else {
+        minusInf++;
+      }
+      even = !even;
+    }
+
+    buffer.values.removeFromTo(buffer.size() - plusInf, buffer.size() - 1);
+    buffer.values.removeFromTo(0, minusInf - 1);
+    //this.totalElementsFilled -= infinities;
+  }
+
+  /** Not yet commented. */
+  @Override
+  protected boolean sampleNextElement() {
+    if (samplingAssistant == null) {
+      return true;
+    }
+
+    /*
+    * This is a KNOWN N quantile finder!
+    * One should not try to fill more than N elements,
+    * because otherwise we can't give explicit approximation guarantees anymore.
+    * Use an UNKNOWN quantile finder instead if your app may fill more than N elements.
+    *
+    * However, to make this class meaningful even under wired use cases, we actually do allow to fill more than N elements (without explicit approx. guarantees, of course).
+    * Normally, elements beyond N will not get sampled because the sampler is exhausted.
+    * Therefore the histogram will no more change no matter how much you fill.
+    * This might not be what the user expects.
+    * Therefore we use a new (unexhausted) sampler with the same parametrization.
+    *
+    * If you want this class to ignore any elements beyong N, then comment the following line.
+    */
+    //if ((totalElementsFilled-1) % N == 0) setSamplingRate(samplingRate, N); // delete if appropriate
+
+    return samplingAssistant.sampleNextElement();
+  }
+}

Propchange: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/KnownDoubleQuantileEstimator.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/QuantileCalc.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/QuantileCalc.java?rev=891983&view=auto
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/QuantileCalc.java (added)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/QuantileCalc.java Thu Dec 17 23:22:16 2009
@@ -0,0 +1,494 @@
+/*
+Copyright 1999 CERN - European Organization for Nuclear Research.
+Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose 
+is hereby granted without fee, provided that the above copyright notice appear in all copies and 
+that both that copyright notice and this permission notice appear in supporting documentation. 
+CERN makes no representations about the suitability of this software for any purpose. 
+It is provided "as is" without expressed or implied warranty.
+*/
+package org.apache.mahout.math.jet.stat.quantile;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/** Computes b and k vor various parameters. */
+class QuantileCalc {
+
+  private static final Logger log = LoggerFactory.getLogger(QuantileCalc.class);
+
+  private QuantileCalc() {
+  }
+
+  /**
+   * Efficiently computes the binomial coefficient, often also referred to as "n over k" or "n choose k". The binomial
+   * coefficient is defined as n!/((n-k)!*k!). Tries to avoid numeric overflows.
+   *
+   * @return the binomial coefficient.
+   */
+  public static double binomial(long n, long k) {
+    if (k == 0 || k == n) {
+      return 1.0;
+    }
+
+    // since binomial(n,k)==binomial(n,n-k), we can enforce the faster variant,
+    // which is also the variant minimizing number overflows.
+    if (k > n / 2.0) {
+      k = n - k;
+    }
+
+    double binomial = 1.0;
+    long N = n - k + 1;
+    for (long i = k; i > 0;) {
+      binomial *= ((double) N++) / (double) (i--);
+    }
+    return binomial;
+  }
+
+  /**
+   * Returns the smallest <code>long &gt;= value</code>. <dt>Examples: <code>1.0 -> 1, 1.2 -> 2, 1.9 -> 2</code>. This
+   * method is safer than using (long) Math.ceil(value), because of possible rounding error.
+   */
+  public static long ceiling(double value) {
+    return Math.round(Math.ceil(value));
+  }
+
+  /**
+   * Computes the number of buffers and number of values per buffer such that quantiles can be determined with an
+   * approximation error no more than epsilon with a certain probability.
+   *
+   * Assumes that quantiles are to be computed over N values. The required sampling rate is computed and stored in the
+   * first element of the provided <tt>returnSamplingRate</tt> array, which, therefore must be at least of length 1.
+   *
+   * @param N                  the number of values over which quantiles shall be computed (e.g <tt>10^6</tt>).
+   * @param epsilon            the approximation error which is guaranteed not to be exceeded (e.g. <tt>0.001</tt>)
+   *                           (<tt>0 &lt;= epsilon &lt;= 1</tt>). To get exact result, set <tt>epsilon=0.0</tt>;
+   * @param delta              the probability that the approximation error is more than than epsilon (e.g.
+   *                           <tt>0.0001</tt>) (<tt>0 &lt;= delta &lt;= 1</tt>). To avoid probabilistic answers, set
+   *                           <tt>delta=0.0</tt>.
+   * @param quantiles          the number of quantiles to be computed (e.g. <tt>100</tt>) (<tt>quantiles &gt;= 1</tt>).
+   *                           If unknown in advance, set this number large, e.g. <tt>quantiles &gt;= 10000</tt>.
+   * @param returnSamplingRate a <tt>double[1]</tt> where the sampling rate is to be filled in.
+   * @return <tt>long[2]</tt> - <tt>long[0]</tt>=the number of buffers, <tt>long[1]</tt>=the number of elements per
+   *         buffer, <tt>returnSamplingRate[0]</tt>=the required sampling rate.
+   */
+  public static long[] known_N_compute_B_and_K(long N, double epsilon, double delta, int quantiles,
+                                               double[] returnSamplingRate) {
+    if (delta > 0.0) {
+      return known_N_compute_B_and_K_slow(N, epsilon, delta, quantiles, returnSamplingRate);
+    }
+    returnSamplingRate[0] = 1.0;
+    return known_N_compute_B_and_K_quick(N, epsilon);
+  }
+
+  /**
+   * Computes the number of buffers and number of values per buffer such that quantiles can be determined with a
+   * <b>guaranteed</b> approximation error no more than epsilon. Assumes that quantiles are to be computed over N
+   * values.
+   *
+   * @param N       the anticipated number of values over which quantiles shall be determined.
+   * @param epsilon the approximation error which is guaranteed not to be exceeded (e.g. <tt>0.001</tt>) (<tt>0 &lt;=
+   *                epsilon &lt;= 1</tt>). To get exact result, set <tt>epsilon=0.0</tt>;
+   * @return <tt>long[2]</tt> - <tt>long[0]</tt>=the number of buffers, <tt>long[1]</tt>=the number of elements per
+   *         buffer.
+   */
+  protected static long[] known_N_compute_B_and_K_quick(long N, double epsilon) {
+    if (epsilon <= 0.0) {
+      // no way around exact quantile search
+      long[] result = new long[2];
+      result[0] = 1;
+      result[1] = N;
+      return result;
+    }
+
+    int maxBuffers = 50;
+    int maxHeight = 50;
+    double N_double = (double) N;
+    double c = N_double * epsilon * 2.0;
+    int[] heightMaximums = new int[maxBuffers - 1];
+
+    // for each b, determine maximum height, i.e. the height for which x<=0 and x is a maximum
+    // with x = binomial(b+h-2, h-1) - binomial(b+h-3, h-3) + binomial(b+h-3, h-2) - N * epsilon * 2.0
+    for (int b = 2; b <= maxBuffers; b++) {
+      int h = 3;
+
+      while (h <= maxHeight && // skip heights until x<=0
+          (h - 2) * ((double) Math.round(binomial(b + h - 2, h - 1))) -
+              ((double) Math.round(binomial(b + h - 3, h - 3))) +
+              ((double) Math.round(binomial(b + h - 3, h - 2))) - c
+              > 0.0
+          ) {
+        h++;
+      }
+      //from now on x is monotonically growing...
+      while (h <= maxHeight && // skip heights until x>0
+          (h - 2) * ((double) Math.round(binomial(b + h - 2, h - 1))) -
+              ((double) Math.round(binomial(b + h - 3, h - 3))) +
+              ((double) Math.round(binomial(b + h - 3, h - 2))) - c
+              <= 0.0
+          ) {
+        h++;
+      }
+      h--; //go back to last height
+
+      // was x>0 or did we loop without finding anything?
+      int hMax;
+      if (h >= maxHeight &&
+          (h - 2) * ((double) Math.round(binomial(b + h - 2, h - 1))) -
+              ((double) Math.round(binomial(b + h - 3, h - 3))) +
+              ((double) Math.round(binomial(b + h - 3, h - 2))) - c
+              > 0.0) {
+        hMax = Integer.MIN_VALUE;
+      } else {
+        hMax = h;
+      }
+
+      heightMaximums[b - 2] = hMax; //safe some space
+    } //end for
+
+
+    // for each b, determine the smallest k satisfying the constraints, i.e.
+    // for each b, determine kMin, with kMin = N/binomial(b+hMax-2,hMax-1)
+    long[] kMinimums = new long[maxBuffers - 1];
+    for (int b = 2; b <= maxBuffers; b++) {
+      int h = heightMaximums[b - 2];
+      long kMin = Long.MAX_VALUE;
+      if (h > Integer.MIN_VALUE) {
+        double value = ((double) Math.round(binomial(b + h - 2, h - 1)));
+        long tmpK = ceiling(N_double / value);
+        if (tmpK <= kMin) {
+          kMin = tmpK;
+        }
+      }
+      kMinimums[b - 2] = kMin;
+    }
+
+    // from all b's, determine b that minimizes b*kMin
+    long multMin = Long.MAX_VALUE;
+    int minB = -1;
+    for (int b = 2; b <= maxBuffers; b++) {
+      if (kMinimums[b - 2] < Long.MAX_VALUE) {
+        long mult = ((long) b) * kMinimums[b - 2];
+        if (mult < multMin) {
+          multMin = mult;
+          minB = b;
+        }
+      }
+    }
+
+    long b, k;
+    if (minB != -1) { // epsilon large enough?
+      b = minB;
+      k = kMinimums[minB - 2];
+    } else {     // epsilon is very small or zero.
+      b = 1; // the only possible solution without violating the
+      k = N; // approximation guarantees is exact quantile search.
+    }
+
+    long[] result = new long[2];
+    result[0] = b;
+    result[1] = k;
+    return result;
+  }
+
+  /**
+   * Computes the number of buffers and number of values per buffer such that quantiles can be determined with an
+   * approximation error no more than epsilon with a certain probability. Assumes that quantiles are to be computed over
+   * N values. The required sampling rate is computed and stored in the first element of the provided
+   * <tt>returnSamplingRate</tt> array, which, therefore must be at least of length 1.
+   *
+   * @param N                  the anticipated number of values over which quantiles shall be computed (e.g 10^6).
+   * @param epsilon            the approximation error which is guaranteed not to be exceeded (e.g. <tt>0.001</tt>)
+   *                           (<tt>0 &lt;= epsilon &lt;= 1</tt>). To get exact result, set <tt>epsilon=0.0</tt>;
+   * @param delta              the probability that the approximation error is more than than epsilon (e.g.
+   *                           <tt>0.0001</tt>) (<tt>0 &lt;= delta &lt;= 1</tt>). To avoid probabilistic answers, set
+   *                           <tt>delta=0.0</tt>.
+   * @param quantiles          the number of quantiles to be computed (e.g. <tt>100</tt>) (<tt>quantiles &gt;= 1</tt>).
+   *                           If unknown in advance, set this number large, e.g. <tt>quantiles &gt;= 10000</tt>.
+   * @param returnSamplingRate a <tt>double[1]</tt> where the sampling rate is to be filled in.
+   * @return <tt>long[2]</tt> - <tt>long[0]</tt>=the number of buffers, <tt>long[1]</tt>=the number of elements per
+   *         buffer, <tt>returnSamplingRate[0]</tt>=the required sampling rate.
+   */
+  protected static long[] known_N_compute_B_and_K_slow(long N, double epsilon, double delta, int quantiles,
+                                                       double[] returnSamplingRate) {
+    // delta can be set to zero, i.e., all quantiles should be approximate with probability 1
+    if (epsilon <= 0.0) {
+      // no way around exact quantile search
+      long[] result = new long[2];
+      result[0] = 1;
+      result[1] = N;
+      returnSamplingRate[0] = 1.0;
+      return result;
+    }
+
+
+    int maxBuffers = 50;
+    int maxHeight = 50;
+    double N_double = N;
+
+    // One possibility is to use one buffer of size N
+    //
+    long ret_b = 1;
+    long ret_k = N;
+    double sampling_rate = 1.0;
+    long memory = N;
+
+
+    // Otherwise, there are at least two buffers (b >= 2)
+    // and the height of the tree is at least three (h >= 3)
+    //
+    // We restrict the search for b and h to MAX_BINOM, a large enough value for
+    // practical values of    epsilon >= 0.001   and    delta >= 0.00001
+    //
+    double logarithm = Math.log(2.0 * quantiles / delta);
+    double c = 2.0 * epsilon * N_double;
+    for (long b = 2; b < maxBuffers; b++) {
+      for (long h = 3; h < maxHeight; h++) {
+        double binomial = binomial(b + h - 2, h - 1);
+        long tmp = ceiling(N_double / binomial);
+        if ((b * tmp < memory) &&
+            ((h - 2) * binomial - binomial(b + h - 3, h - 3) + binomial(b + h - 3, h - 2)
+                <= c)) {
+          ret_k = tmp;
+          ret_b = b;
+          memory = ret_k * b;
+          sampling_rate = 1.0;
+        }
+        if (delta > 0.0) {
+          double t = (h - 2) * binomial(b + h - 2, h - 1) - binomial(b + h - 3, h - 3) + binomial(b + h - 3, h - 2);
+          double u = logarithm / epsilon;
+          double v = binomial(b + h - 2, h - 1);
+          double w = logarithm / (2.0 * epsilon * epsilon);
+
+          // From our SIGMOD 98 paper, we have two equantions to satisfy:
+          // t  <= u * alpha/(1-alpha)^2
+          // kv >= w/(1-alpha)^2
+          //
+          // Denoting 1/(1-alpha)    by x,
+          // we see that the first inequality is equivalent to
+          // t/u <= x^2 - x
+          // which is satisfied by x >= 0.5 + 0.5 * sqrt (1 + 4t/u)
+          // Plugging in this value into second equation yields
+          // k >= wx^2/v
+
+          double x = 0.5 + 0.5 * Math.sqrt(1.0 + 4.0 * t / u);
+          long k = ceiling(w * x * x / v);
+          if (b * k < memory) {
+            ret_k = k;
+            ret_b = b;
+            memory = b * k;
+            sampling_rate = N_double * 2.0 * epsilon * epsilon / logarithm;
+          }
+        }
+      }
+    }
+
+    long[] result = new long[2];
+    result[0] = ret_b;
+    result[1] = ret_k;
+    returnSamplingRate[0] = sampling_rate;
+    return result;
+  }
+
+  public static void main(String[] args) {
+    test_B_and_K_Calculation(args);
+  }
+
+  /** Computes b and k for different parameters. */
+  public static void test_B_and_K_Calculation(String[] args) {
+    boolean known_N;
+    if (args == null) {
+      known_N = false;
+    } else {
+      known_N = Boolean.valueOf(args[0]);
+    }
+
+    int[] quantiles = {1, 1000};
+
+    long[] sizes = {100000, 1000000, 10000000, 1000000000};
+
+    double[] deltas = {0.0, 0.001, 0.0001, 0.00001};
+
+    double[] epsilons = {0.0, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0000001};
+
+
+    if (!known_N) {
+      sizes = new long[]{0};
+    }
+    log.info("\n\n");
+    if (known_N) {
+      log.info("Computing b's and k's for KNOWN N");
+    } else {
+      log.info("Computing b's and k's for UNKNOWN N");
+    }
+    log.info("mem [elements/1024]");
+    log.info("***********************************");
+
+    for (int p : quantiles) {
+      log.info("------------------------------");
+      log.info("computing for p = " + p);
+      for (long N : sizes) {
+        log.info("   ------------------------------");
+        log.info("   computing for N = " + N);
+        for (double delta : deltas) {
+          log.info("      ------------------------------");
+          log.info("      computing for delta = " + delta);
+          for (double epsilon : epsilons) {
+            double[] returnSamplingRate = new double[1];
+            long[] result;
+            if (known_N) {
+              result = known_N_compute_B_and_K(N, epsilon, delta, p, returnSamplingRate);
+            } else {
+              result = unknown_N_compute_B_and_K(epsilon, delta, p);
+            }
+
+            long b = result[0];
+            long k = result[1];
+            log.info("         (e,d,N,p)=(" + epsilon + ',' + delta + ',' + N + ',' + p + ") --> ");
+            log.info("(b,k,mem");
+            if (known_N) {
+              log.info(",sampling");
+            }
+            log.info(")=(" + b + ',' + k + ',' + (b * k / 1024));
+            if (known_N) {
+              log.info("," + returnSamplingRate[0]);
+            }
+            log.info(")");
+          }
+        }
+      }
+    }
+
+  }
+
+  /**
+   * Computes the number of buffers and number of values per buffer such that quantiles can be determined with an
+   * approximation error no more than epsilon with a certain probability.
+   *
+   * @param epsilon   the approximation error which is guaranteed not to be exceeded (e.g. <tt>0.001</tt>) (<tt>0 &lt;=
+   *                  epsilon &lt;= 1</tt>). To get exact results, set <tt>epsilon=0.0</tt>;
+   * @param delta     the probability that the approximation error is more than than epsilon (e.g. <tt>0.0001</tt>)
+   *                  (<tt>0 &lt;= delta &lt;= 1</tt>). To get exact results, set <tt>delta=0.0</tt>.
+   * @param quantiles the number of quantiles to be computed (e.g. <tt>100</tt>) (<tt>quantiles &gt;= 1</tt>). If
+   *                  unknown in advance, set this number large, e.g. <tt>quantiles &gt;= 10000</tt>.
+   * @return <tt>long[3]</tt> - <tt>long[0]</tt>=the number of buffers, <tt>long[1]</tt>=the number of elements per
+   *         buffer, <tt>long[2]</tt>=the tree height where sampling shall start.
+   */
+  public static long[] unknown_N_compute_B_and_K(double epsilon, double delta, int quantiles) {
+    // delta can be set to zero, i.e., all quantiles should be approximate with probability 1
+    if (epsilon <= 0.0 || delta <= 0.0) {
+      // no way around exact quantile search
+      long[] result = new long[3];
+      result[0] = 1;
+      result[1] = Long.MAX_VALUE;
+      result[2] = Long.MAX_VALUE;
+      return result;
+    }
+
+    int max_b = 50;
+    int max_h = 50;
+    int max_H = 50;
+    int max_Iterations = 2;
+
+    long best_b = Long.MAX_VALUE;
+    long best_k = Long.MAX_VALUE;
+    long best_h = Long.MAX_VALUE;
+    long best_memory = Long.MAX_VALUE;
+
+    double pow = Math.pow(2.0, max_H);
+    double logDelta = Math.log(2.0 / (delta / quantiles)) / (2.0 * epsilon * epsilon);
+    //double logDelta =  Math.log(2.0/(quantiles*delta)) / (2.0*epsilon*epsilon);
+
+    while (best_b == Long.MAX_VALUE && max_Iterations-- > 0) { //until we find a solution
+      // identify that combination of b and h that minimizes b*k.
+      // exhaustive search.
+      for (int b = 2; b <= max_b; b++) {
+        for (int h = 2; h <= max_h; h++) {
+          double Ld = binomial(b + h - 2, h - 1);
+          double Ls = binomial(b + h - 3, h - 1);
+
+          // now we have k>=c*(1-alpha)^-2.
+          // let's compute c.
+          //double c = Math.log(2.0/(delta/quantiles)) / (2.0*epsilon*epsilon*Math.min(Ld, 8.0*Ls/3.0));
+          double c = logDelta / Math.min(Ld, 8.0 * Ls / 3.0);
+
+          // now we have k>=d/alpha.
+          // let's compute d.
+          double beta = Ld / Ls;
+          double cc = (beta - 2.0) * (max_H - 2.0) / (beta + pow - 2.0);
+          double d = (h + 3 + cc) / (2.0 * epsilon);
+
+          /*
+          double d = (Ld*(h+max_H-1.0)  +  Ls*((h+1)*pow - 2.0*(h+max_H)))   /   (Ld + Ls*(pow-2.0));
+          d = (d + 2.0) / (2.0*epsilon);
+          */
+
+          // now we have c*(1-alpha)^-2 == d/alpha.
+          // we solve this equation for alpha yielding two solutions
+          // alpha_1,2 = (c + 2*d  +-  Sqrt(c*c + 4*c*d))/(2*d)
+          double f = c * c + 4.0 * c * d;
+          if (f < 0.0) {
+            continue;
+          } // non real solution to equation
+          double root = Math.sqrt(f);
+          double alpha_one = (c + 2.0 * d + root) / (2.0 * d);
+          double alpha_two = (c + 2.0 * d - root) / (2.0 * d);
+
+          // any alpha must satisfy 0<alpha<1 to yield valid solutions
+          boolean alpha_one_OK = false;
+          if (0.0 < alpha_one && alpha_one < 1.0) {
+            alpha_one_OK = true;
+          }
+          boolean alpha_two_OK = false;
+          if (0.0 < alpha_two && alpha_two < 1.0) {
+            alpha_two_OK = true;
+          }
+          if (alpha_one_OK || alpha_two_OK) {
+            double alpha = alpha_one;
+            if (alpha_one_OK && alpha_two_OK) {
+              // take the alpha that minimizes d/alpha
+              alpha = Math.max(alpha_one, alpha_two);
+            } else if (alpha_two_OK) {
+              alpha = alpha_two;
+            }
+
+            // now we have k=Ceiling(Max(d/alpha, (h+1)/(2*epsilon)))
+            long k = ceiling(Math.max(d / alpha, (h + 1) / (2.0 * epsilon)));
+            if (k > 0) { // valid solution?
+              long memory = b * k;
+              if (memory < best_memory) {
+                // found a solution requiring less memory
+                best_k = k;
+                best_b = b;
+                best_h = h;
+                best_memory = memory;
+              }
+            }
+          }
+        } //end for h
+      } //end for b
+
+      if (best_b == Long.MAX_VALUE) {
+        log.info("Warning: Computing b and k looks like a lot of work!");
+        // no solution found so far. very unlikely. Anyway, try again.
+        max_b *= 2;
+        max_h *= 2;
+        max_H *= 2;
+      }
+    } //end while
+
+    long[] result = new long[3];
+    if (best_b == Long.MAX_VALUE) {
+      // no solution found.
+      // no way around exact quantile search.
+      result[0] = 1;
+      result[1] = Long.MAX_VALUE;
+      result[2] = Long.MAX_VALUE;
+    } else {
+      result[0] = best_b;
+      result[1] = best_k;
+      result[2] = best_h;
+    }
+
+    return result;
+  }
+}

Propchange: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/QuantileCalc.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/QuantileFinderFactory.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/QuantileFinderFactory.java?rev=891983&view=auto
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/QuantileFinderFactory.java (added)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/QuantileFinderFactory.java Thu Dec 17 23:22:16 2009
@@ -0,0 +1,648 @@
+/*
+Copyright � 1999 CERN - European Organization for Nuclear Research.
+Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose 
+is hereby granted without fee, provided that the above copyright notice appear in all copies and 
+that both that copyright notice and this permission notice appear in supporting documentation. 
+CERN makes no representations about the suitability of this software for any purpose. 
+It is provided "as is" without expressed or implied warranty.
+*/
+package org.apache.mahout.math.jet.stat.quantile;
+
+//import cern.it.util.Doubles;
+
+import org.apache.mahout.math.jet.math.Arithmetic;
+import org.apache.mahout.math.jet.random.engine.RandomEngine;
+import org.apache.mahout.math.list.DoubleArrayList;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+/**
+ * Factory constructing exact and approximate quantile finders for both known and unknown <tt>N</tt>.
+ * Also see {@link hep.aida.bin.QuantileBin1D}, demonstrating how this package can be used.
+ *
+ * The approx. algorithms compute approximate quantiles of large data sequences in a single pass.
+ * The approximation guarantees are explicit, and apply for arbitrary value distributions and arrival distributions of the data sequence.
+ * The main memory requirements are smaller than for any other known technique by an order of magnitude.
+ *
+ * <p>The approx. algorithms are primarily intended to help applications scale.
+ * When faced with a large data sequences, traditional methods either need very large memories or time consuming disk based sorting.
+ * In constrast, the approx. algorithms can deal with > 10^10 values without disk based sorting.
+ *
+ * <p>All classes can be seen from various angles, for example as
+ * <dt>1. Algorithm to compute quantiles.
+ * <dt>2. 1-dim-equi-depth histogram.
+ * <dt>3. 1-dim-histogram arbitrarily rebinnable in real-time.
+ * <dt>4. A space efficient MultiSet data structure using lossy compression.
+ * <dt>5. A space efficient value preserving bin of a 2-dim or d-dim histogram.
+ * <dt>(All subject to an accuracy specified by the user.)
+ *
+ * <p>Use methods <tt>newXXX(...)</tt> to get new instances of one of the following quantile finders.
+ *
+ * <p><b>1. Exact quantile finding algorithm for known and unknown <tt>N</tt> requiring large main memory.</b></p>
+ * The folkore algorithm: Keeps all elements in main memory, sorts the list, then picks the quantiles.
+ *
+ *
+ *
+ *
+ * <p><p><b>2. Approximate quantile finding algorithm for known <tt>N</tt> requiring only one pass and little main memory.</b></p>
+ *
+ * <p>Needs as input the following parameters:<p>
+ * <dt>1. <tt>N</tt> - the number of values of the data sequence over which quantiles are to be determined.
+ * <dt>2. <tt>quantiles</tt> - the number of quantiles to be computed. If unknown in advance, set this number large, e.g. <tt>quantiles &gt;= 10000</tt>.
+ * <dt>3. <tt>epsilon</tt> - the allowed approximation error on quantiles. The approximation guarantee of this algorithm is explicit.
+ *
+ * <p>It is also possible to couple the approximation algorithm with random sampling to further reduce memory requirements. 
+ * With sampling, the approximation guarantees are explicit but probabilistic, i.e. they apply with respect to a (user controlled) confidence parameter "delta".
+ *
+ * <dt>4. <tt>delta</tt> - the probability allowed that the approximation error fails to be smaller than epsilon. Set <tt>delta</tt> to zero for explicit non probabilistic guarantees.
+ *
+ * <p>After Gurmeet Singh Manku, Sridhar Rajagopalan and Bruce G. Lindsay, 
+ * Approximate Medians and other Quantiles in One Pass and with Limited Memory,
+ * Proc. of the 1998 ACM SIGMOD Int. Conf. on Management of Data,
+ * Paper available <A HREF="http://www-cad.eecs.berkeley.edu/~manku/papers/quantiles.ps.gz"> here</A>.
+ *
+ *
+ *
+ *
+ * <p><p><b>3. Approximate quantile finding algorithm for unknown <tt>N</tt> requiring only one pass and little main memory.</b></p>
+ * This algorithm requires at most two times the memory of a corresponding approx. quantile finder knowing <tt>N</tt>.
+ *
+ * <p>Needs as input the following parameters:<p>
+ * <dt>2. <tt>quantiles</tt> - the number of quantiles to be computed. If unknown in advance, set this number large, e.g. <tt>quantiles &gt;= 1000</tt>.
+ * <dt>2. <tt>epsilon</tt> - the allowed approximation error on quantiles. The approximation guarantee of this algorithm is explicit.
+ *
+ * <p>It is also possible to couple the approximation algorithm with random sampling to further reduce memory requirements. 
+ * With sampling, the approximation guarantees are explicit but probabilistic, i.e. they apply with respect to a (user controlled) confidence parameter "delta".
+ *
+ * <dt>3. <tt>delta</tt> - the probability allowed that the approximation error fails to be smaller than epsilon. Set <tt>delta</tt> to zero for explicit non probabilistic guarantees.
+ *
+ * <p>After Gurmeet Singh Manku, Sridhar Rajagopalan and Bruce G. Lindsay,
+ * Random Sampling Techniques for Space Efficient Online Computation of Order Statistics of Large Datasets.
+ * Proc. of the 1999 ACM SIGMOD Int. Conf. on Management of Data,
+ * Paper available <A HREF="http://www-cad.eecs.berkeley.edu/~manku/papers/unknown.ps.gz"> here</A>.
+ *
+ * <p><b>Example usage:</b>
+ *
+ *<pre>
+ * _TODO_
+ *</pre><p>
+ *
+ *
+ * @see KnownDoubleQuantileEstimator
+ * @see UnknownDoubleQuantileEstimator
+ */
+
+/** @deprecated until unit tests are in place.  Until this time, this class/interface is unsupported. */
+@Deprecated
+public class QuantileFinderFactory {
+
+  private static final Logger log = LoggerFactory.getLogger(QuantileFinderFactory.class);
+
+  /** Make this class non instantiable. Let still allow others to inherit. */
+  private QuantileFinderFactory() {
+  }
+
+  /**
+   * Computes the number of buffers and number of values per buffer such that quantiles can be determined with an
+   * approximation error no more than epsilon with a certain probability.
+   *
+   * Assumes that quantiles are to be computed over N values. The required sampling rate is computed and stored in the
+   * first element of the provided <tt>returnSamplingRate</tt> array, which, therefore must be at least of length 1.
+   *
+   * @param N                  the number of values over which quantiles shall be computed (e.g <tt>10^6</tt>).
+   * @param epsilon            the approximation error which is guaranteed not to be exceeded (e.g. <tt>0.001</tt>)
+   *                           (<tt>0 &lt;= epsilon &lt;= 1</tt>). To get exact result, set <tt>epsilon=0.0</tt>;
+   * @param delta              the probability that the approximation error is more than than epsilon (e.g.
+   *                           <tt>0.0001</tt>) (<tt>0 &lt;= delta &lt;= 1</tt>). To avoid probabilistic answers, set
+   *                           <tt>delta=0.0</tt>.
+   * @param quantiles          the number of quantiles to be computed (e.g. <tt>100</tt>) (<tt>quantiles &gt;= 1</tt>).
+   *                           If unknown in advance, set this number large, e.g. <tt>quantiles &gt;= 10000</tt>.
+   * @param returnSamplingRate output parameter, a <tt>double[1]</tt> where the sampling rate is to be filled in.
+   * @return <tt>long[2]</tt> - <tt>long[0]</tt>=the number of buffers, <tt>long[1]</tt>=the number of elements per
+   *         buffer, <tt>returnSamplingRate[0]</tt>=the required sampling rate.
+   */
+  public static long[] known_N_compute_B_and_K(long N, double epsilon, double delta, int quantiles,
+                                               double[] returnSamplingRate) {
+    returnSamplingRate[0] = 1.0;
+    if (epsilon <= 0.0) {
+      // no way around exact quantile search
+      long[] result = new long[2];
+      result[0] = 1;
+      result[1] = N;
+      return result;
+    }
+    if (epsilon >= 1.0 || delta >= 1.0) {
+      // can make any error we wish
+      long[] result = new long[2];
+      result[0] = 2;
+      result[1] = 1;
+      return result;
+    }
+
+    if (delta > 0.0) {
+      return known_N_compute_B_and_K_slow(N, epsilon, delta, quantiles, returnSamplingRate);
+    }
+    return known_N_compute_B_and_K_quick(N, epsilon);
+  }
+
+  /**
+   * Computes the number of buffers and number of values per buffer such that quantiles can be determined with a
+   * <b>guaranteed</b> approximation error no more than epsilon. Assumes that quantiles are to be computed over N
+   * values.
+   *
+   * @param N       the anticipated number of values over which quantiles shall be determined.
+   * @param epsilon the approximation error which is guaranteed not to be exceeded (e.g. <tt>0.001</tt>) (<tt>0 &lt;=
+   *                epsilon &lt;= 1</tt>). To get exact result, set <tt>epsilon=0.0</tt>;
+   * @return <tt>long[2]</tt> - <tt>long[0]</tt>=the number of buffers, <tt>long[1]</tt>=the number of elements per
+   *         buffer.
+   */
+  protected static long[] known_N_compute_B_and_K_quick(long N, double epsilon) {
+    int maxBuffers = 50;
+    int maxHeight = 50;
+    double N_double = (double) N;
+    double c = N_double * epsilon * 2.0;
+    int[] heightMaximums = new int[maxBuffers - 1];
+
+    // for each b, determine maximum height, i.e. the height for which x<=0 and x is a maximum
+    // with x = binomial(b+h-2, h-1) - binomial(b+h-3, h-3) + binomial(b+h-3, h-2) - N * epsilon * 2.0
+    for (int b = 2; b <= maxBuffers; b++) {
+      int h = 3;
+
+      while (h <= maxHeight && // skip heights until x<=0
+          (h - 2) * (Arithmetic.binomial(b + h - 2, h - 1)) -
+              (Arithmetic.binomial(b + h - 3, h - 3)) +
+              (Arithmetic.binomial(b + h - 3, h - 2)) - c
+              > 0.0
+          ) {
+        h++;
+      }
+      //from now on x is monotonically growing...
+      while (h <= maxHeight && // skip heights until x>0
+          (h - 2) * (Arithmetic.binomial(b + h - 2, h - 1)) -
+              (Arithmetic.binomial(b + h - 3, h - 3)) +
+              (Arithmetic.binomial(b + h - 3, h - 2)) - c
+              <= 0.0
+          ) {
+        h++;
+      }
+      h--; //go back to last height
+
+      // was x>0 or did we loop without finding anything?
+      int hMax;
+      if (h >= maxHeight &&
+          (h - 2) * (Arithmetic.binomial(b + h - 2, h - 1)) -
+              (Arithmetic.binomial(b + h - 3, h - 3)) +
+              (Arithmetic.binomial(b + h - 3, h - 2)) - c
+              > 0.0) {
+        hMax = Integer.MIN_VALUE;
+      } else {
+        hMax = h;
+      }
+
+      heightMaximums[b - 2] = hMax; //safe some space
+    } //end for
+
+
+    // for each b, determine the smallest k satisfying the constraints, i.e.
+    // for each b, determine kMin, with kMin = N/binomial(b+hMax-2,hMax-1)
+    long[] kMinimums = new long[maxBuffers - 1];
+    for (int b = 2; b <= maxBuffers; b++) {
+      int h = heightMaximums[b - 2];
+      long kMin = Long.MAX_VALUE;
+      if (h > Integer.MIN_VALUE) {
+        double value = (Arithmetic.binomial(b + h - 2, h - 1));
+        long tmpK = (long) (Math.ceil(N_double / value));
+        if (tmpK <= kMin) {
+          kMin = tmpK;
+        }
+      }
+      kMinimums[b - 2] = kMin;
+    }
+
+    // from all b's, determine b that minimizes b*kMin
+    long multMin = Long.MAX_VALUE;
+    int minB = -1;
+    for (int b = 2; b <= maxBuffers; b++) {
+      if (kMinimums[b - 2] < Long.MAX_VALUE) {
+        long mult = ((long) b) * kMinimums[b - 2];
+        if (mult < multMin) {
+          multMin = mult;
+          minB = b;
+        }
+      }
+    }
+
+    long b, k;
+    if (minB != -1) { // epsilon large enough?
+      b = minB;
+      k = kMinimums[minB - 2];
+    } else {     // epsilon is very small or zero.
+      b = 1; // the only possible solution without violating the
+      k = N; // approximation guarantees is exact quantile search.
+    }
+
+    long[] result = new long[2];
+    result[0] = b;
+    result[1] = k;
+    return result;
+  }
+
+  /**
+   * Computes the number of buffers and number of values per buffer such that quantiles can be determined with an
+   * approximation error no more than epsilon with a certain probability. Assumes that quantiles are to be computed over
+   * N values. The required sampling rate is computed and stored in the first element of the provided
+   * <tt>returnSamplingRate</tt> array, which, therefore must be at least of length 1.
+   *
+   * @param N                  the anticipated number of values over which quantiles shall be computed (e.g 10^6).
+   * @param epsilon            the approximation error which is guaranteed not to be exceeded (e.g. <tt>0.001</tt>)
+   *                           (<tt>0 &lt;= epsilon &lt;= 1</tt>). To get exact result, set <tt>epsilon=0.0</tt>;
+   * @param delta              the probability that the approximation error is more than than epsilon (e.g.
+   *                           <tt>0.0001</tt>) (<tt>0 &lt;= delta &lt;= 1</tt>). To avoid probabilistic answers, set
+   *                           <tt>delta=0.0</tt>.
+   * @param quantiles          the number of quantiles to be computed (e.g. <tt>100</tt>) (<tt>quantiles &gt;= 1</tt>).
+   *                           If unknown in advance, set this number large, e.g. <tt>quantiles &gt;= 10000</tt>.
+   * @param returnSamplingRate a <tt>double[1]</tt> where the sampling rate is to be filled in.
+   * @return <tt>long[2]</tt> - <tt>long[0]</tt>=the number of buffers, <tt>long[1]</tt>=the number of elements per
+   *         buffer, <tt>returnSamplingRate[0]</tt>=the required sampling rate.
+   */
+  protected static long[] known_N_compute_B_and_K_slow(long N, double epsilon, double delta, int quantiles,
+                                                       double[] returnSamplingRate) {
+    int maxBuffers = 50;
+    int maxHeight = 50;
+    double N_double = N;
+
+    // One possibility is to use one buffer of size N
+    //
+    long ret_b = 1;
+    long ret_k = N;
+    double sampling_rate = 1.0;
+    long memory = N;
+
+
+    // Otherwise, there are at least two buffers (b >= 2)
+    // and the height of the tree is at least three (h >= 3)
+    //
+    // We restrict the search for b and h to MAX_BINOM, a large enough value for
+    // practical values of    epsilon >= 0.001   and    delta >= 0.00001
+    //
+    double logarithm = Math.log(2.0 * quantiles / delta);
+    double c = 2.0 * epsilon * N_double;
+    for (long b = 2; b < maxBuffers; b++) {
+      for (long h = 3; h < maxHeight; h++) {
+        double binomial = Arithmetic.binomial(b + h - 2, h - 1);
+        long tmp = (long) Math.ceil(N_double / binomial);
+        if ((b * tmp < memory) &&
+            ((h - 2) * binomial - Arithmetic.binomial(b + h - 3, h - 3) + Arithmetic.binomial(b + h - 3, h - 2)
+                <= c)) {
+          ret_k = tmp;
+          ret_b = b;
+          memory = ret_k * b;
+          sampling_rate = 1.0;
+        }
+        if (delta > 0.0) {
+          double t = (h - 2) * Arithmetic.binomial(b + h - 2, h - 1) - Arithmetic.binomial(b + h - 3, h - 3) +
+              Arithmetic.binomial(b + h - 3, h - 2);
+          double u = logarithm / epsilon;
+          double v = Arithmetic.binomial(b + h - 2, h - 1);
+          double w = logarithm / (2.0 * epsilon * epsilon);
+
+          // From our SIGMOD 98 paper, we have two equantions to satisfy:
+          // t  <= u * alpha/(1-alpha)^2
+          // kv >= w/(1-alpha)^2
+          //
+          // Denoting 1/(1-alpha)    by x,
+          // we see that the first inequality is equivalent to
+          // t/u <= x^2 - x
+          // which is satisfied by x >= 0.5 + 0.5 * sqrt (1 + 4t/u)
+          // Plugging in this value into second equation yields
+          // k >= wx^2/v
+
+          double x = 0.5 + 0.5 * Math.sqrt(1.0 + 4.0 * t / u);
+          long k = (long) Math.ceil(w * x * x / v);
+          if (b * k < memory) {
+            ret_k = k;
+            ret_b = b;
+            memory = b * k;
+            sampling_rate = N_double * 2.0 * epsilon * epsilon / logarithm;
+          }
+        }
+      }
+    }
+
+    long[] result = new long[2];
+    result[0] = ret_b;
+    result[1] = ret_k;
+    returnSamplingRate[0] = sampling_rate;
+    return result;
+  }
+
+  /**
+   * Returns a quantile finder that minimizes the amount of memory needed under the user provided constraints.
+   *
+   * Many applications don't know in advance over how many elements quantiles are to be computed. However, some of them
+   * can give an upper limit, which will assist the factory in choosing quantile finders with minimal memory
+   * requirements. For example if you select values from a database and fill them into histograms, then you probably
+   * don't know how many values you will fill, but you probably do know that you will fill at most <tt>S</tt> elements,
+   * the size of your database.
+   *
+   * @param known_N   specifies whether the number of elements over which quantiles are to be computed is known or not.
+   * @param N         if <tt>known_N==true</tt>, the number of elements over which quantiles are to be computed. if
+   *                  <tt>known_N==false</tt>, the upper limit on the number of elements over which quantiles are to be
+   *                  computed. If such an upper limit is a-priori unknown, then set <tt>N = Long.MAX_VALUE</tt>.
+   * @param epsilon   the approximation error which is guaranteed not to be exceeded (e.g. <tt>0.001</tt>) (<tt>0 &lt;=
+   *                  epsilon &lt;= 1</tt>). To get exact result, set <tt>epsilon=0.0</tt>;
+   * @param delta     the probability that the approximation error is more than than epsilon (e.g. 0.0001) (0 &lt;=
+   *                  delta &lt;= 1). To avoid probabilistic answers, set <tt>delta=0.0</tt>.
+   * @param quantiles the number of quantiles to be computed (e.g. <tt>100</tt>) (<tt>quantiles &gt;= 1</tt>). If
+   *                  unknown in advance, set this number large, e.g. <tt>quantiles &gt;= 10000</tt>.
+   * @param generator a uniform random number generator. Set this parameter to <tt>null</tt> to use a default
+   *                  generator.
+   * @return the quantile finder minimizing memory requirements under the given constraints.
+   */
+  public static DoubleQuantileFinder newDoubleQuantileFinder(boolean known_N, long N, double epsilon, double delta,
+                                                             int quantiles, RandomEngine generator) {
+    //boolean known_N = true;
+    //if (N==Long.MAX_VALUE) known_N = false;
+    // check parameters.
+    // if they are illegal, keep quite and return an exact finder.
+    if (epsilon <= 0.0 || N < 1000) {
+      return new ExactDoubleQuantileFinder();
+    }
+    if (epsilon > 1) {
+      epsilon = 1;
+    }
+    if (delta < 0) {
+      delta = 0;
+    }
+    if (delta > 1) {
+      delta = 1;
+    }
+    if (quantiles < 1) {
+      quantiles = 1;
+    }
+    if (quantiles > N) {
+      N = quantiles;
+    }
+
+    //KnownDoubleQuantileEstimator finder;
+    if (known_N) {
+      double[] samplingRate = new double[1];
+      long[] resultKnown = known_N_compute_B_and_K(N, epsilon, delta, quantiles, samplingRate);
+      long b = resultKnown[0];
+      long k = resultKnown[1];
+      if (b == 1) {
+        return new ExactDoubleQuantileFinder();
+      }
+      return new KnownDoubleQuantileEstimator((int) b, (int) k, N, samplingRate[0], generator);
+    } else {
+      long[] resultUnknown = unknown_N_compute_B_and_K(epsilon, delta, quantiles);
+      long b1 = resultUnknown[0];
+      long k1 = resultUnknown[1];
+      long h1 = resultUnknown[2];
+      double preComputeEpsilon = -1.0;
+      if (resultUnknown[3] == 1) {
+        preComputeEpsilon = epsilon;
+      }
+
+      //if (N==Long.MAX_VALUE) { // no maximum N provided by user.
+
+      // if (true) fixes bug reported by LarryPeranich@fairisaac.com
+      //if (true) { // no maximum N provided by user.
+      if (b1 == 1) {
+        return new ExactDoubleQuantileFinder();
+      }
+      return new UnknownDoubleQuantileEstimator((int) b1, (int) k1, (int) h1, preComputeEpsilon, generator);
+      //}
+
+      /*
+      // determine whether UnknownFinder or KnownFinder with maximum N requires less memory.
+      double[] samplingRate = new double[1];
+
+      // IMPORTANT: for known finder, switch sampling off (delta == 0) !!!
+      // with knownN-sampling we can only guarantee the errors if the input sequence has EXACTLY N elements.
+      // with knownN-no sampling we can also guarantee the errors for sequences SMALLER than N elements.
+      long[] resultKnown = known_N_compute_B_and_K(N, epsilon, 0, quantiles, samplingRate);
+
+      long b2 = resultKnown[0];
+      long k2 = resultKnown[1];
+
+      if (b2 * k2 < b1 * k1) { // the KnownFinder is smaller
+        if (b2 == 1) return new ExactDoubleQuantileFinder();
+        return new KnownDoubleQuantileEstimator((int) b2, (int) k2, N, samplingRate[0], generator);
+      }
+
+      // the UnknownFinder is smaller
+      if (b1 == 1) return new ExactDoubleQuantileFinder();
+      return new UnknownDoubleQuantileEstimator((int) b1, (int) k1, (int) h1, preComputeEpsilon, generator);
+       */
+    }
+  }
+
+  /**
+   * Convenience method that computes phi's for equi-depth histograms. This is simply a list of numbers with <tt>i /
+   * (double)quantiles</tt> for <tt>i={1,2,...,quantiles-1}</tt>.
+   *
+   * @return the equi-depth phi's
+   */
+  public static org.apache.mahout.math.list.DoubleArrayList newEquiDepthPhis(int quantiles) {
+    DoubleArrayList phis =
+        new DoubleArrayList(quantiles - 1);
+    for (int i = 1; i <= quantiles - 1; i++) {
+      phis.add(i / (double) quantiles);
+    }
+    return phis;
+  }
+
+  /**
+   * Computes the number of buffers and number of values per buffer such that quantiles can be determined with an
+   * approximation error no more than epsilon with a certain probability.
+   *
+   * @param epsilon   the approximation error which is guaranteed not to be exceeded (e.g. <tt>0.001</tt>) (<tt>0 &lt;=
+   *                  epsilon &lt;= 1</tt>). To get exact results, set <tt>epsilon=0.0</tt>;
+   * @param delta     the probability that the approximation error is more than than epsilon (e.g. <tt>0.0001</tt>)
+   *                  (<tt>0 &lt;= delta &lt;= 1</tt>). To get exact results, set <tt>delta=0.0</tt>.
+   * @param quantiles the number of quantiles to be computed (e.g. <tt>100</tt>) (<tt>quantiles &gt;= 1</tt>). If
+   *                  unknown in advance, set this number large, e.g. <tt>quantiles &gt;= 10000</tt>.
+   * @return <tt>long[4]</tt> - <tt>long[0]</tt>=the number of buffers, <tt>long[1]</tt>=the number of elements per
+   *         buffer, <tt>long[2]</tt>=the tree height where sampling shall start, <tt>long[3]==1</tt> if precomputing is
+   *         better, otherwise 0;
+   */
+  public static long[] unknown_N_compute_B_and_K(double epsilon, double delta, int quantiles) {
+    return unknown_N_compute_B_and_K_raw(epsilon, delta, quantiles);
+    // move stuff from _raw(..) here and delete _raw(...)
+
+    /*
+    long[] result_1 = unknown_N_compute_B_and_K_raw(epsilon,delta,quantiles);
+    long b1 = result_1[0];
+    long k1 = result_1[1];
+
+
+    int quantilesToPrecompute = (int) Doubles.ceiling(1.0 / epsilon);
+
+    if (quantiles>quantilesToPrecompute) {
+      // try if precomputing quantiles requires less memory.
+      long[] result_2 = unknown_N_compute_B_and_K_raw(epsilon/2.0,delta,quantilesToPrecompute);
+
+      long b2 = result_2[0];
+      long k2 = result_2[1];
+      if (b2*k2 < b1*k1) {
+        result_2[3] = 1; //precomputation is better
+        result_1 = result_2;
+      }
+    }
+    return result_1;
+    */
+  }
+
+  /**
+   * Computes the number of buffers and number of values per buffer such that quantiles can be determined with an
+   * approximation error no more than epsilon with a certain probability. <b>You never need to call this method.</b> It
+   * is only for curious users wanting to gain some insight into the workings of the algorithms.
+   *
+   * @param epsilon   the approximation error which is guaranteed not to be exceeded (e.g. <tt>0.001</tt>) (<tt>0 &lt;=
+   *                  epsilon &lt;= 1</tt>). To get exact result, set <tt>epsilon=0.0</tt>;
+   * @param delta     the probability that the approximation error is more than than epsilon (e.g. <tt>0.0001</tt>)
+   *                  (<tt>0 &lt;= delta &lt;= 1</tt>). To get exact results, set <tt>delta=0.0</tt>.
+   * @param quantiles the number of quantiles to be computed (e.g. <tt>100</tt>) (<tt>quantiles &gt;= 1</tt>). If
+   *                  unknown in advance, set this number large, e.g. <tt>quantiles &gt;= 10000</tt>.
+   * @return <tt>long[4]</tt> - <tt>long[0]</tt>=the number of buffers, <tt>long[1]</tt>=the number of elements per
+   *         buffer, <tt>long[2]</tt>=the tree height where sampling shall start, <tt>long[3]==1</tt> if precomputing is
+   *         better, otherwise 0;
+   */
+  protected static long[] unknown_N_compute_B_and_K_raw(double epsilon, double delta, int quantiles) {
+    // delta can be set to zero, i.e., all quantiles should be approximate with probability 1
+    if (epsilon <= 0.0) {
+      long[] result = new long[4];
+      result[0] = 1;
+      result[1] = Long.MAX_VALUE;
+      result[2] = Long.MAX_VALUE;
+      result[3] = 0;
+      return result;
+    }
+    if (epsilon >= 1.0 || delta >= 1.0) {
+      // can make any error we wish
+      long[] result = new long[4];
+      result[0] = 2;
+      result[1] = 1;
+      result[2] = 3;
+      result[3] = 0;
+      return result;
+    }
+    if (delta <= 0.0) {
+      // no way around exact quantile search
+      long[] result = new long[4];
+      result[0] = 1;
+      result[1] = Long.MAX_VALUE;
+      result[2] = Long.MAX_VALUE;
+      result[3] = 0;
+      return result;
+    }
+
+    int max_b = 50;
+    int max_h = 50;
+    int max_H = 50;
+    int max_Iterations = 2;
+
+    long best_b = Long.MAX_VALUE;
+    long best_k = Long.MAX_VALUE;
+    long best_h = Long.MAX_VALUE;
+    long best_memory = Long.MAX_VALUE;
+
+    double pow = Math.pow(2.0, max_H);
+    double logDelta = Math.log(2.0 / (delta / quantiles)) / (2.0 * epsilon * epsilon);
+    //double logDelta =  Math.log(2.0/(quantiles*delta)) / (2.0*epsilon*epsilon);
+
+    while (best_b == Long.MAX_VALUE && max_Iterations-- > 0) { //until we find a solution
+      // identify that combination of b and h that minimizes b*k.
+      // exhaustive search.
+      for (int b = 2; b <= max_b; b++) {
+        for (int h = 2; h <= max_h; h++) {
+          double Ld = Arithmetic.binomial(b + h - 2, h - 1);
+          double Ls = Arithmetic.binomial(b + h - 3, h - 1);
+
+          // now we have k>=c*(1-alpha)^-2.
+          // let's compute c.
+          //double c = Math.log(2.0/(delta/quantiles)) / (2.0*epsilon*epsilon*Math.min(Ld, 8.0*Ls/3.0));
+          double c = logDelta / Math.min(Ld, 8.0 * Ls / 3.0);
+
+          // now we have k>=d/alpha.
+          // let's compute d.
+          double beta = Ld / Ls;
+          double cc = (beta - 2.0) * (max_H - 2.0) / (beta + pow - 2.0);
+          double d = (h + 3 + cc) / (2.0 * epsilon);
+
+          /*
+          double d = (Ld*(h+max_H-1.0)  +  Ls*((h+1)*pow - 2.0*(h+max_H)))   /   (Ld + Ls*(pow-2.0));
+          d = (d + 2.0) / (2.0*epsilon);
+          */
+
+          // now we have c*(1-alpha)^-2 == d/alpha.
+          // we solve this equation for alpha yielding two solutions
+          // alpha_1,2 = (c + 2*d  +-  Sqrt(c*c + 4*c*d))/(2*d)
+          double f = c * c + 4.0 * c * d;
+          if (f < 0.0) {
+            continue;
+          } // non real solution to equation
+          double root = Math.sqrt(f);
+          double alpha_one = (c + 2.0 * d + root) / (2.0 * d);
+          double alpha_two = (c + 2.0 * d - root) / (2.0 * d);
+
+          // any alpha must satisfy 0<alpha<1 to yield valid solutions
+          boolean alpha_one_OK = false;
+          if (0.0 < alpha_one && alpha_one < 1.0) {
+            alpha_one_OK = true;
+          }
+          boolean alpha_two_OK = false;
+          if (0.0 < alpha_two && alpha_two < 1.0) {
+            alpha_two_OK = true;
+          }
+          if (alpha_one_OK || alpha_two_OK) {
+            double alpha = alpha_one;
+            if (alpha_one_OK && alpha_two_OK) {
+              // take the alpha that minimizes d/alpha
+              alpha = Math.max(alpha_one, alpha_two);
+            } else if (alpha_two_OK) {
+              alpha = alpha_two;
+            }
+
+            // now we have k=Ceiling(Max(d/alpha, (h+1)/(2*epsilon)))
+            long k = (long) Math.ceil(Math.max(d / alpha, (h + 1) / (2.0 * epsilon)));
+            if (k > 0) { // valid solution?
+              long memory = b * k;
+              if (memory < best_memory) {
+                // found a solution requiring less memory
+                best_k = k;
+                best_b = b;
+                best_h = h;
+                best_memory = memory;
+              }
+            }
+          }
+        } //end for h
+      } //end for b
+
+      if (best_b == Long.MAX_VALUE) {
+        log.warn("Computing b and k looks like a lot of work!");
+        // no solution found so far. very unlikely. Anyway, try again.
+        max_b *= 2;
+        max_h *= 2;
+        max_H *= 2;
+      }
+    } //end while
+
+    long[] result = new long[4];
+    result[3] = 0;
+    if (best_b == Long.MAX_VALUE) {
+      // no solution found.
+      // no way around exact quantile search.
+      result[0] = 1;
+      result[1] = Long.MAX_VALUE;
+      result[2] = Long.MAX_VALUE;
+    } else {
+      result[0] = best_b;
+      result[1] = best_k;
+      result[2] = best_h;
+    }
+
+    return result;
+  }
+}

Propchange: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/QuantileFinderFactory.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/UnknownDoubleQuantileEstimator.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/UnknownDoubleQuantileEstimator.java?rev=891983&view=auto
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/UnknownDoubleQuantileEstimator.java (added)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/UnknownDoubleQuantileEstimator.java Thu Dec 17 23:22:16 2009
@@ -0,0 +1,188 @@
+/*
+Copyright � 1999 CERN - European Organization for Nuclear Research.
+Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose 
+is hereby granted without fee, provided that the above copyright notice appear in all copies and 
+that both that copyright notice and this permission notice appear in supporting documentation. 
+CERN makes no representations about the suitability of this software for any purpose. 
+It is provided "as is" without expressed or implied warranty.
+*/
+package org.apache.mahout.math.jet.stat.quantile;
+
+import org.apache.mahout.math.jet.random.engine.RandomEngine;
+import org.apache.mahout.math.jet.random.sampling.WeightedRandomSampler;
+import org.apache.mahout.math.list.DoubleArrayList;
+import org.apache.mahout.math.list.ObjectArrayList;
+
+import java.util.Comparator;
+
+/**
+ * Approximate quantile finding algorithm for unknown <tt>N</tt> requiring only one pass and little main memory;
+ * computes quantiles over a sequence of <tt>double</tt> elements. This algorithm requires at most two times the memory
+ * of a corresponding approx. quantile finder knowing <tt>N</tt>.
+ *
+ * <p>Needs as input the following parameters:<p> <dt>1. <tt>quantiles</tt> - the number of quantiles to be computed.
+ * <dt>2. <tt>epsilon</tt> - the allowed approximation error on quantiles. The approximation guarantee of this algorithm
+ * is explicit.
+ *
+ * <p>It is also possible to couple the approximation algorithm with random sampling to further reduce memory
+ * requirements. With sampling, the approximation guarantees are explicit but probabilistic, i.e. they apply with
+ * respect to a (user controlled) confidence parameter "delta".
+ *
+ * <dt>3. <tt>delta</tt> - the probability allowed that the approximation error fails to be smaller than epsilon. Set
+ * <tt>delta</tt> to zero for explicit non probabilistic guarantees.
+ *
+ * You usually don't instantiate quantile finders by using the constructor. Instead use the factory
+ * <tt>QuantileFinderFactor</tt> to do so. It will set up the right parametrization for you.
+ *
+ * <p>After Gurmeet Singh Manku, Sridhar Rajagopalan and Bruce G. Lindsay, Random Sampling Techniques for Space
+ * Efficient Online Computation of Order Statistics of Large Datasets. Accepted for Proc. of the 1999 ACM SIGMOD Int.
+ * Conf. on Management of Data, Paper (soon) available <A HREF="http://www-cad.eecs.berkeley.edu/~manku"> here</A>.
+ *
+ * @see QuantileFinderFactory
+ */
+class UnknownDoubleQuantileEstimator extends DoubleQuantileEstimator {
+
+  private int currentTreeHeight;
+  private final int treeHeightStartingSampling;
+  private WeightedRandomSampler sampler;
+  private final double precomputeEpsilon;
+
+  /**
+   * Constructs an approximate quantile finder with b buffers, each having k elements.
+   *
+   * @param b                 the number of buffers
+   * @param k                 the number of elements per buffer
+   * @param h                 the tree height at which sampling shall start.
+   * @param precomputeEpsilon the epsilon for which quantiles shall be precomputed; set this value <=0.0 if nothing
+   *                          shall be precomputed.
+   * @param generator         a uniform random number generator.
+   */
+  UnknownDoubleQuantileEstimator(int b, int k, int h, double precomputeEpsilon, RandomEngine generator) {
+    this.sampler = new WeightedRandomSampler(1, generator);
+    setUp(b, k);
+    this.treeHeightStartingSampling = h;
+    this.precomputeEpsilon = precomputeEpsilon;
+    this.clear();
+  }
+
+  /** Not yet commented. */
+  @Override
+  protected DoubleBuffer[] buffersToCollapse() {
+    DoubleBuffer[] fullBuffers = bufferSet._getFullOrPartialBuffers();
+
+    sortAscendingByLevel(fullBuffers);
+
+    // if there is only one buffer at the lowest level, then increase its level so that there are at least two at the lowest level.
+    int minLevel = fullBuffers[1].level();
+    if (fullBuffers[0].level() < minLevel) {
+      fullBuffers[0].level(minLevel);
+    }
+
+    return bufferSet._getFullOrPartialBuffersWithLevel(minLevel);
+  }
+
+  /**
+   * Removes all elements from the receiver.  The receiver will be empty after this call returns, and its memory
+   * requirements will be close to zero.
+   */
+  @Override
+  public synchronized void clear() {
+    super.clear();
+    this.currentTreeHeight = 1;
+    this.sampler.setWeight(1);
+  }
+
+  /**
+   * Returns a deep copy of the receiver.
+   *
+   * @return a deep copy of the receiver.
+   */
+  @Override
+  public Object clone() {
+    UnknownDoubleQuantileEstimator copy = (UnknownDoubleQuantileEstimator) super.clone();
+    if (this.sampler != null) {
+      copy.sampler = (WeightedRandomSampler) copy.sampler.clone();
+    }
+    return copy;
+  }
+
+  /** Not yet commented. */
+  @Override
+  protected void newBuffer() {
+    currentBufferToFill = bufferSet._getFirstEmptyBuffer();
+    if (currentBufferToFill == null) {
+      throw new IllegalStateException("Oops, no empty buffer.");
+    }
+
+    currentBufferToFill.level(currentTreeHeight - 1);
+    currentBufferToFill.weight(sampler.getWeight());
+  }
+
+  /** Not yet commented. */
+  @Override
+  protected void postCollapse(DoubleBuffer[] toCollapse) {
+    if (toCollapse.length == bufferSet.b()) { //delta for unknown finder
+      currentTreeHeight++;
+      if (currentTreeHeight >= treeHeightStartingSampling) {
+        sampler.setWeight(sampler.getWeight() * 2);
+      }
+    }
+  }
+
+  /**
+   * Computes the specified quantile elements over the values previously added.
+   *
+   * @param phis the quantiles for which elements are to be computed. Each phi must be in the interval (0.0,1.0].
+   *             <tt>phis</tt> must be sorted ascending.
+   * @return the approximate quantile elements.
+   */
+  @Override
+  public DoubleArrayList quantileElements(DoubleArrayList phis) {
+    if (precomputeEpsilon <= 0.0) {
+      return super.quantileElements(phis);
+    }
+
+    int quantilesToPrecompute = (int) Utils.epsilonCeiling(1.0 / precomputeEpsilon);
+
+    //select that quantile from the precomputed set that corresponds to a position closest to phi.
+    phis = phis.copy();
+    double e = precomputeEpsilon;
+    for (int index = phis.size(); --index >= 0;) {
+      double phi = phis.get(index);
+      int i = (int) Math.round(((2.0 * phi / e) - 1.0) / 2.0); // finds closest
+      i = Math.min(quantilesToPrecompute - 1, Math.max(0, i));
+      double augmentedPhi = (e / 2.0) * (1 + 2 * i);
+      phis.set(index, augmentedPhi);
+    }
+
+    return super.quantileElements(phis);
+  }
+
+  /** Not yet commented. */
+  @Override
+  protected boolean sampleNextElement() {
+    return sampler.sampleNextElement();
+  }
+
+  /** To do. This could faster be done without sorting (min and second min). */
+  private static void sortAscendingByLevel(DoubleBuffer[] fullBuffers) {
+    new ObjectArrayList(fullBuffers).quickSortFromTo(0, fullBuffers.length - 1,
+        new Comparator<Object>() {
+          @Override
+          public int compare(Object o1, Object o2) {
+            int l1 = ((Buffer) o1).level();
+            int l2 = ((Buffer) o2).level();
+            return l1 < l2 ? -1 : l1 == l2 ? 0 : 1;
+          }
+        }
+    );
+  }
+
+  /** Returns a String representation of the receiver. */
+  public String toString() {
+    StringBuffer buf = new StringBuffer(super.toString());
+    buf.setLength(buf.length() - 1);
+    return buf + ", h=" + currentTreeHeight + ", hStartSampling=" + treeHeightStartingSampling +
+        ", precomputeEpsilon=" + precomputeEpsilon + ')';
+  }
+}

Propchange: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/UnknownDoubleQuantileEstimator.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/Utils.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/Utils.java?rev=891983&view=auto
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/Utils.java (added)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/Utils.java Thu Dec 17 23:22:16 2009
@@ -0,0 +1,22 @@
+/*
+Copyright � 1999 CERN - European Organization for Nuclear Research.
+Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose 
+is hereby granted without fee, provided that the above copyright notice appear in all copies and 
+that both that copyright notice and this permission notice appear in supporting documentation. 
+CERN makes no representations about the suitability of this software for any purpose. 
+It is provided "as is" without expressed or implied warranty.
+*/
+package org.apache.mahout.math.jet.stat.quantile;
+
+/** Holds some utility methods shared by different quantile finding implementations. */
+class Utils {
+
+  private Utils() {
+  }
+
+  /** Similar to Math.ceil(value), but adjusts small numerical rounding errors +- epsilon. */
+  public static long epsilonCeiling(double value) {
+    double epsilon = 0.0000001;
+    return (long) Math.ceil(value - epsilon);
+  }
+}

Propchange: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/Utils.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/package.html
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/package.html?rev=891983&view=auto
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/package.html (added)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/package.html Thu Dec 17 23:22:16 2009
@@ -0,0 +1,30 @@
+<HTML>
+<BODY>
+Scalable algorithms and data structures to compute approximate quantiles over very large data sequences.
+The approximation guarantees are explicit, and apply for arbitrary value distributions and arrival distributions of the
+dataset.
+The main memory requirements are smaller than for any other known technique by an order of magnitude.
+
+<p>The approx. algorithms are primarily intended to help applications scale.
+  When faced with a large data sequence, traditional methods either need very large memories or time consuming disk
+  based sorting.
+  In constrast, the approx. algorithms can deal with > 10^10 values without disk based sorting.
+
+<p>All classes can be seen from various angles, for example as
+  <dt>1. Algorithm to compute quantiles.
+  <dt>2. 1-dim-equi-depth histogram.
+  <dt>3. 1-dim-histogram arbitrarily rebinnable in real-time.
+  <dt>4. A space efficient MultiSet data structure using lossy compression.
+  <dt>5. A space efficient value preserving bin of a 2-dim or d-dim histogram.
+  <dt>(All subject to an accuracy specified by the user.)
+
+    Have a look at the documentation of class {@link org.apache.mahout.math.jet.stat.quantile.QuantileFinderFactory} and the
+    interface
+    {@link org.apache.mahout.math.jet.stat.quantile.DoubleQuantileFinder} to learn more.
+    Most users will never need to know more than how to use these.
+    Actual implementations of the <tt>QuantileFinder</tt> interface are hidden.
+    They are indirectly constructed via the the factory.
+    <br>
+    Also see {@link hep.aida.bin.QuantileBin1D}, demonstrating how this package can be used.
+</BODY>
+</HTML>

Propchange: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/jet/stat/quantile/package.html
------------------------------------------------------------------------------
    svn:eol-style = native