You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sr...@apache.org on 2009/11/25 04:31:49 UTC
svn commit: r883972 [9/10] - in /lucene/mahout/trunk:
core/src/main/java/org/apache/mahout/fpm/pfpgrowth/
matrix/src/main/java/org/apache/mahout/jet/math/
matrix/src/main/java/org/apache/mahout/jet/random/
matrix/src/main/java/org/apache/mahout/jet/ran...
Modified: lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/KnownDoubleQuantileEstimator.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/KnownDoubleQuantileEstimator.java?rev=883972&r1=883971&r2=883972&view=diff
==============================================================================
--- lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/KnownDoubleQuantileEstimator.java (original)
+++ lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/KnownDoubleQuantileEstimator.java Wed Nov 25 03:31:47 2009
@@ -32,19 +32,17 @@
* Proc. of the 1998 ACM SIGMOD Int. Conf. on Management of Data,
* Paper available <A HREF="http://www-cad.eecs.berkeley.edu/~manku"> here</A>.
*
- * @author wolfgang.hoschek@cern.ch
- * @version 1.0, 09/24/99
* @see QuantileFinderFactory
* @see UnknownApproximateDoubleQuantileFinder
*/
class KnownDoubleQuantileEstimator extends DoubleQuantileEstimator {
- protected double beta; //correction factor for phis
+ protected double beta; //correction factor for phis
- protected boolean weHadMoreThanOneEmptyBuffer;
+ protected boolean weHadMoreThanOneEmptyBuffer;
- protected RandomSamplingAssistant samplingAssistant;
- protected double samplingRate; // see method sampleNextElement()
- protected long N; // see method sampleNextElement()
+ protected RandomSamplingAssistant samplingAssistant;
+ protected double samplingRate; // see method sampleNextElement()
+ protected long N; // see method sampleNextElement()
/**
* Constructs an approximate quantile finder with b buffers, each having k elements.
* @param b the number of buffers
@@ -54,68 +52,68 @@
* @param generator a uniform random number generator.
*/
public KnownDoubleQuantileEstimator(int b, int k, long N, double samplingRate, RandomEngine generator) {
- this.samplingRate = samplingRate;
- this.N = N;
+ this.samplingRate = samplingRate;
+ this.N = N;
- if (this.samplingRate <= 1.0) {
- this.samplingAssistant = null;
- }
- else {
- this.samplingAssistant = new RandomSamplingAssistant(Arithmetic.floor(N/samplingRate), N, generator);
- }
-
- setUp(b,k);
- this.clear();
+ if (this.samplingRate <= 1.0) {
+ this.samplingAssistant = null;
+ }
+ else {
+ this.samplingAssistant = new RandomSamplingAssistant(Arithmetic.floor(N/samplingRate), N, generator);
+ }
+
+ setUp(b,k);
+ this.clear();
}
/**
* @param infinities the number of infinities to fill.
* @param buffer the buffer into which the infinities shall be filled.
*/
protected void addInfinities(int missingInfinities, DoubleBuffer buffer) {
- RandomSamplingAssistant oldAssistant = this.samplingAssistant;
- this.samplingAssistant = null; // switch off sampler
- //double[] infinities = new double[missingInfinities];
-
- boolean even=true;
- for (int i=0; i<missingInfinities; i++) {
- if (even) buffer.values.add(Double.MAX_VALUE);
- else buffer.values.add(-Double.MAX_VALUE);
-
- //if (even) {infinities[i]=Double.MAX_VALUE;}
- //else {infinities[i]=-Double.MAX_VALUE;}
-
- //if (even) {this.add(Double.MAX_VALUE);}
- //else {this.add(-Double.MAX_VALUE);}
- even = !even;
- }
-
- //buffer.values.addAllOfFromTo(new DoubleArrayList(infinities),0,missingInfinities-1);
-
- //this.totalElementsFilled -= infinities;
-
- this.samplingAssistant = oldAssistant; // switch on sampler again
+ RandomSamplingAssistant oldAssistant = this.samplingAssistant;
+ this.samplingAssistant = null; // switch off sampler
+ //double[] infinities = new double[missingInfinities];
+
+ boolean even=true;
+ for (int i=0; i<missingInfinities; i++) {
+ if (even) buffer.values.add(Double.MAX_VALUE);
+ else buffer.values.add(-Double.MAX_VALUE);
+
+ //if (even) {infinities[i]=Double.MAX_VALUE;}
+ //else {infinities[i]=-Double.MAX_VALUE;}
+
+ //if (even) {this.add(Double.MAX_VALUE);}
+ //else {this.add(-Double.MAX_VALUE);}
+ even = !even;
+ }
+
+ //buffer.values.addAllOfFromTo(new DoubleArrayList(infinities),0,missingInfinities-1);
+
+ //this.totalElementsFilled -= infinities;
+
+ this.samplingAssistant = oldAssistant; // switch on sampler again
}
/**
* Not yet commented.
*/
protected DoubleBuffer[] buffersToCollapse() {
- int minLevel = bufferSet._getMinLevelOfFullOrPartialBuffers();
- return bufferSet._getFullOrPartialBuffersWithLevel(minLevel);
+ int minLevel = bufferSet._getMinLevelOfFullOrPartialBuffers();
+ return bufferSet._getFullOrPartialBuffersWithLevel(minLevel);
}
/**
* Removes all elements from the receiver. The receiver will
* be empty after this call returns, and its memory requirements will be close to zero.
*/
public void clear() {
- super.clear();
- this.beta=1.0;
- this.weHadMoreThanOneEmptyBuffer = false;
- //this.setSamplingRate(samplingRate,N);
-
- RandomSamplingAssistant assist = this.samplingAssistant;
- if (assist != null) {
- this.samplingAssistant = new RandomSamplingAssistant(Arithmetic.floor(N/samplingRate), N, assist.getRandomGenerator());
- }
+ super.clear();
+ this.beta=1.0;
+ this.weHadMoreThanOneEmptyBuffer = false;
+ //this.setSamplingRate(samplingRate,N);
+
+ RandomSamplingAssistant assist = this.samplingAssistant;
+ if (assist != null) {
+ this.samplingAssistant = new RandomSamplingAssistant(Arithmetic.floor(N/samplingRate), N, assist.getRandomGenerator());
+ }
}
/**
* Returns a deep copy of the receiver.
@@ -123,50 +121,50 @@
* @return a deep copy of the receiver.
*/
public Object clone() {
- KnownDoubleQuantileEstimator copy = (KnownDoubleQuantileEstimator) super.clone();
- if (this.samplingAssistant != null) copy.samplingAssistant = (RandomSamplingAssistant) copy.samplingAssistant.clone();
- return copy;
+ KnownDoubleQuantileEstimator copy = (KnownDoubleQuantileEstimator) super.clone();
+ if (this.samplingAssistant != null) copy.samplingAssistant = (RandomSamplingAssistant) copy.samplingAssistant.clone();
+ return copy;
}
/**
* Not yet commented.
*/
protected void newBuffer() {
- int numberOfEmptyBuffers = this.bufferSet._getNumberOfEmptyBuffers();
- //DoubleBuffer[] emptyBuffers = this.bufferSet._getEmptyBuffers();
- if (numberOfEmptyBuffers==0) throw new RuntimeException("Oops, no empty buffer.");
-
- this.currentBufferToFill = this.bufferSet._getFirstEmptyBuffer();
- if (numberOfEmptyBuffers==1 && ! this.weHadMoreThanOneEmptyBuffer) {
- this.currentBufferToFill.level(this.bufferSet._getMinLevelOfFullOrPartialBuffers());
- }
- else {
- this.weHadMoreThanOneEmptyBuffer = true;
- this.currentBufferToFill.level(0);
- /*
- for (int i=0; i<emptyBuffers.length; i++) {
- emptyBuffers[i].level = 0;
- }
- */
- }
- //currentBufferToFill.state = DoubleBuffer.PARTIAL;
- this.currentBufferToFill.weight(1);
+ int numberOfEmptyBuffers = this.bufferSet._getNumberOfEmptyBuffers();
+ //DoubleBuffer[] emptyBuffers = this.bufferSet._getEmptyBuffers();
+ if (numberOfEmptyBuffers==0) throw new RuntimeException("Oops, no empty buffer.");
+
+ this.currentBufferToFill = this.bufferSet._getFirstEmptyBuffer();
+ if (numberOfEmptyBuffers==1 && ! this.weHadMoreThanOneEmptyBuffer) {
+ this.currentBufferToFill.level(this.bufferSet._getMinLevelOfFullOrPartialBuffers());
+ }
+ else {
+ this.weHadMoreThanOneEmptyBuffer = true;
+ this.currentBufferToFill.level(0);
+ /*
+ for (int i=0; i<emptyBuffers.length; i++) {
+ emptyBuffers[i].level = 0;
+ }
+ */
+ }
+ //currentBufferToFill.state = DoubleBuffer.PARTIAL;
+ this.currentBufferToFill.weight(1);
}
/**
* Not yet commented.
*/
protected void postCollapse(DoubleBuffer[] toCollapse) {
- this.weHadMoreThanOneEmptyBuffer = false;
+ this.weHadMoreThanOneEmptyBuffer = false;
}
/**
*/
protected DoubleArrayList preProcessPhis(DoubleArrayList phis) {
- if (beta>1.0) {
- phis = phis.copy();
- for (int i=phis.size(); --i >=0;) {
- phis.set(i, (2*phis.get(i) + beta - 1) / (2*beta));
- }
- }
- return phis;
+ if (beta>1.0) {
+ phis = phis.copy();
+ for (int i=phis.size(); --i >=0;) {
+ phis.set(i, (2*phis.get(i) + beta - 1) / (2*beta));
+ }
+ }
+ return phis;
}
/**
* Computes the specified quantile elements over the values previously added.
@@ -174,41 +172,41 @@
* @return the approximate quantile elements.
*/
public DoubleArrayList quantileElements(DoubleArrayList phis) {
- /*
- * The KNOWN quantile finder reads off quantiles from FULL buffers only.
- * Since there might be a partially full buffer, this method first satisfies this constraint by temporarily filling a few +infinity, -infinity elements to make up a full block.
- * This is in full conformance with the explicit approximation guarantees.
- *
- * For those of you working on online apps:
- * The approximation guarantees are given for computing quantiles AFTER N elements have been filled, not for intermediate displays.
- * If you have one thread filling and another thread displaying concurrently, you will note that in the very beginning the infinities will dominate the display.
- * This could confuse users, because, of course, they don't expect any infinities, even if they "disappear" after a short while.
- * To prevent panic exclude phi's close to zero or one in the early phases of processing.
- */
- DoubleBuffer partial = this.bufferSet._getPartialBuffer();
- int missingValues = 0;
- if (partial != null) { // any auxiliary infinities needed?
- missingValues = bufferSet.k() - partial.size();
- if (missingValues <= 0) throw new RuntimeException("Oops! illegal missing values.");
-
- //System.out.println("adding "+missingValues+" infinity elements...");
- this.addInfinities(missingValues,partial);
-
- //determine beta (N + Infinity values = beta * N)
- this.beta = (this.totalElementsFilled + missingValues) / (double)this.totalElementsFilled;
- }
- else {
- this.beta=1.0;
- }
-
- DoubleArrayList quantileElements = super.quantileElements(phis);
-
- // restore state we were in before.
- // remove the temporarily added infinities.
- if (partial != null) removeInfinitiesFrom(missingValues, partial);
+ /*
+ * The KNOWN quantile finder reads off quantiles from FULL buffers only.
+ * Since there might be a partially full buffer, this method first satisfies this constraint by temporarily filling a few +infinity, -infinity elements to make up a full block.
+ * This is in full conformance with the explicit approximation guarantees.
+ *
+ * For those of you working on online apps:
+ * The approximation guarantees are given for computing quantiles AFTER N elements have been filled, not for intermediate displays.
+ * If you have one thread filling and another thread displaying concurrently, you will note that in the very beginning the infinities will dominate the display.
+ * This could confuse users, because, of course, they don't expect any infinities, even if they "disappear" after a short while.
+ * To prevent panic exclude phi's close to zero or one in the early phases of processing.
+ */
+ DoubleBuffer partial = this.bufferSet._getPartialBuffer();
+ int missingValues = 0;
+ if (partial != null) { // any auxiliary infinities needed?
+ missingValues = bufferSet.k() - partial.size();
+ if (missingValues <= 0) throw new RuntimeException("Oops! illegal missing values.");
+
+ //System.out.println("adding "+missingValues+" infinity elements...");
+ this.addInfinities(missingValues,partial);
+
+ //determine beta (N + Infinity values = beta * N)
+ this.beta = (this.totalElementsFilled + missingValues) / (double)this.totalElementsFilled;
+ }
+ else {
+ this.beta=1.0;
+ }
+
+ DoubleArrayList quantileElements = super.quantileElements(phis);
+
+ // restore state we were in before.
+ // remove the temporarily added infinities.
+ if (partial != null) removeInfinitiesFrom(missingValues, partial);
- // now you can continue filling the remaining values, if any.
- return quantileElements;
+ // now you can continue filling the remaining values, if any.
+ return quantileElements;
}
/**
* Reading off quantiles requires to fill some +infinity, -infinity values to make a partial buffer become full.
@@ -220,42 +218,42 @@
* @param buffer the buffer into which the infinities were filled.
*/
protected void removeInfinitiesFrom(int infinities, DoubleBuffer buffer) {
- int plusInf = 0;
- int minusInf = 0;
- // count them (this is not very clever but it's safe)
- boolean even=true;
- for (int i=0; i<infinities; i++) {
- if (even) plusInf++;
- else minusInf++;
- even = !even;
- }
-
- buffer.values.removeFromTo(buffer.size()-plusInf, buffer.size()-1);
- buffer.values.removeFromTo(0, minusInf-1);
- //this.totalElementsFilled -= infinities;
+ int plusInf = 0;
+ int minusInf = 0;
+ // count them (this is not very clever but it's safe)
+ boolean even=true;
+ for (int i=0; i<infinities; i++) {
+ if (even) plusInf++;
+ else minusInf++;
+ even = !even;
+ }
+
+ buffer.values.removeFromTo(buffer.size()-plusInf, buffer.size()-1);
+ buffer.values.removeFromTo(0, minusInf-1);
+ //this.totalElementsFilled -= infinities;
}
/**
* Not yet commented.
*/
protected boolean sampleNextElement() {
- if (samplingAssistant == null) return true;
+ if (samplingAssistant == null) return true;
- /*
- * This is a KNOWN N quantile finder!
- * One should not try to fill more than N elements,
- * because otherwise we can't give explicit approximation guarantees anymore.
- * Use an UNKNOWN quantile finder instead if your app may fill more than N elements.
- *
- * However, to make this class meaningful even under wired use cases, we actually do allow to fill more than N elements (without explicit approx. guarantees, of course).
- * Normally, elements beyond N will not get sampled because the sampler is exhausted.
- * Therefore the histogram will no more change no matter how much you fill.
- * This might not be what the user expects.
- * Therefore we use a new (unexhausted) sampler with the same parametrization.
- *
- * If you want this class to ignore any elements beyong N, then comment the following line.
- */
- //if ((totalElementsFilled-1) % N == 0) setSamplingRate(samplingRate, N); // delete if appropriate
+ /*
+ * This is a KNOWN N quantile finder!
+ * One should not try to fill more than N elements,
+ * because otherwise we can't give explicit approximation guarantees anymore.
+ * Use an UNKNOWN quantile finder instead if your app may fill more than N elements.
+ *
+ * However, to make this class meaningful even under wired use cases, we actually do allow to fill more than N elements (without explicit approx. guarantees, of course).
+ * Normally, elements beyond N will not get sampled because the sampler is exhausted.
+ * Therefore the histogram will no more change no matter how much you fill.
+ * This might not be what the user expects.
+ * Therefore we use a new (unexhausted) sampler with the same parametrization.
+ *
+ * If you want this class to ignore any elements beyong N, then comment the following line.
+ */
+ //if ((totalElementsFilled-1) % N == 0) setSamplingRate(samplingRate, N); // delete if appropriate
- return samplingAssistant.sampleNextElement();
+ return samplingAssistant.sampleNextElement();
}
}
Modified: lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/Quantile1Test.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/Quantile1Test.java?rev=883972&r1=883971&r2=883972&view=diff
==============================================================================
--- lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/Quantile1Test.java (original)
+++ lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/Quantile1Test.java Wed Nov 25 03:31:47 2009
@@ -17,82 +17,82 @@
@Deprecated
public class Quantile1Test {
public static void main(String[] argv) {
- /*
- * Get the number of examples from the first argument
- */
- int numExamples = 0;
- try {
- numExamples = Integer.parseInt(argv[0]);
- }
- catch (Exception e) {
- System.err.println("Unable to parse input line count argument");
- System.err.println(e.getMessage());
- System.exit(1);
- }
- System.out.println("Got numExamples=" + numExamples);
-
- /*
- * Get N from the second argument
- */
- long N = 0;
- try {
- if (argv[1].equals("L")) {
- N = Long.MAX_VALUE;
- } else if (argv[1].equals("I")) {
- N = (long)Integer.MAX_VALUE;
- } else {
- N = Long.parseLong(argv[1]);
- }
- }
- catch (Exception e) {
- System.err.println("Error parsing flag for N");
- System.err.println(e.getMessage());
- System.exit(1);
- }
- System.out.println("Got N=" + N);
-
- /*
- * Set up the QuantileBin1D object
- *
- DRand rand = new DRand(new Date());
- QuantileBin1D qAccum = new QuantileBin1D(false,
- N,
- 1.e-4,
- 1.e-3,
- 200,
- rand,
- false,
- false,
- 2);
-
- DynamicBin1D dbin = new DynamicBin1D();
-
- *
- * Use a new random number generator to generate numExamples
- * random gaussians, and add them to the QuantileBin1D
- *
- Uniform dataRand = new Uniform(new DRand(7757));
- for (int i = 1; i <= numExamples; i++) {
- double gauss = dataRand.nextDouble();
- qAccum.add(gauss);
- dbin.add(gauss);
- }
-
- *
- * print out the percentiles
- *
- DecimalFormat fmt = new DecimalFormat("0.00");
- System.out.println();
- //int step = 1;
- int step = 10;
- for (int i = 1; i < 100; ) {
- double percent = ((double)i) * 0.01;
- double quantile = qAccum.quantile(percent);
- System.out.println(fmt.format(percent) + " " + quantile + ", " + dbin.quantile(percent)+ ", " + (dbin.quantile(percent)-quantile));
- i = i + step;
- *
- }
- */
+ /*
+ * Get the number of examples from the first argument
+ */
+ int numExamples = 0;
+ try {
+ numExamples = Integer.parseInt(argv[0]);
+ }
+ catch (Exception e) {
+ System.err.println("Unable to parse input line count argument");
+ System.err.println(e.getMessage());
+ System.exit(1);
+ }
+ System.out.println("Got numExamples=" + numExamples);
+
+ /*
+ * Get N from the second argument
+ */
+ long N = 0;
+ try {
+ if (argv[1].equals("L")) {
+ N = Long.MAX_VALUE;
+ } else if (argv[1].equals("I")) {
+ N = (long)Integer.MAX_VALUE;
+ } else {
+ N = Long.parseLong(argv[1]);
+ }
+ }
+ catch (Exception e) {
+ System.err.println("Error parsing flag for N");
+ System.err.println(e.getMessage());
+ System.exit(1);
+ }
+ System.out.println("Got N=" + N);
+
+ /*
+ * Set up the QuantileBin1D object
+ *
+ DRand rand = new DRand(new Date());
+ QuantileBin1D qAccum = new QuantileBin1D(false,
+ N,
+ 1.e-4,
+ 1.e-3,
+ 200,
+ rand,
+ false,
+ false,
+ 2);
+
+ DynamicBin1D dbin = new DynamicBin1D();
+
+ *
+ * Use a new random number generator to generate numExamples
+ * random gaussians, and add them to the QuantileBin1D
+ *
+ Uniform dataRand = new Uniform(new DRand(7757));
+ for (int i = 1; i <= numExamples; i++) {
+ double gauss = dataRand.nextDouble();
+ qAccum.add(gauss);
+ dbin.add(gauss);
+ }
+
+ *
+ * print out the percentiles
+ *
+ DecimalFormat fmt = new DecimalFormat("0.00");
+ System.out.println();
+ //int step = 1;
+ int step = 10;
+ for (int i = 1; i < 100; ) {
+ double percent = ((double)i) * 0.01;
+ double quantile = qAccum.quantile(percent);
+ System.out.println(fmt.format(percent) + " " + quantile + ", " + dbin.quantile(percent)+ ", " + (dbin.quantile(percent)-quantile));
+ i = i + step;
+ *
+ }
+ */
}
}
Modified: lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/QuantileCalc.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/QuantileCalc.java?rev=883972&r1=883971&r2=883972&view=diff
==============================================================================
--- lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/QuantileCalc.java (original)
+++ lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/QuantileCalc.java Wed Nov 25 03:31:47 2009
@@ -18,19 +18,19 @@
* Tries to avoid numeric overflows.
* @return the binomial coefficient.
*/
-public static double binomial(long n, long k) {
- if (k==0 || k==n) {return 1.0;}
+public static double binomial(long n, long k) {
+ if (k==0 || k==n) {return 1.0;}
- // since binomial(n,k)==binomial(n,n-k), we can enforce the faster variant,
- // which is also the variant minimizing number overflows.
- if (k>n/2.0) k=n-k;
-
- double binomial=1.0;
- long N=n-k+1;
- for (long i=k; i>0; ) {
- binomial *= ((double) N++) / (double) (i--);
- }
- return binomial;
+ // since binomial(n,k)==binomial(n,n-k), we can enforce the faster variant,
+ // which is also the variant minimizing number overflows.
+ if (k>n/2.0) k=n-k;
+
+ double binomial=1.0;
+ long N=n-k+1;
+ for (long i=k; i>0; ) {
+ binomial *= ((double) N++) / (double) (i--);
+ }
+ return binomial;
}
/**
* Returns the smallest <code>long >= value</code>.
@@ -38,7 +38,7 @@
* This method is safer than using (long) Math.ceil(value), because of possible rounding error.
*/
public static long ceiling(double value) {
- return Math.round(Math.ceil(value));
+ return Math.round(Math.ceil(value));
}
/**
* Computes the number of buffers and number of values per buffer such that
@@ -55,11 +55,11 @@
* @return <tt>long[2]</tt> - <tt>long[0]</tt>=the number of buffers, <tt>long[1]</tt>=the number of elements per buffer, <tt>returnSamplingRate[0]</tt>=the required sampling rate.
*/
public static long[] known_N_compute_B_and_K(long N, double epsilon, double delta, int quantiles, double[] returnSamplingRate) {
- if (delta > 0.0) {
- return known_N_compute_B_and_K_slow(N, epsilon, delta, quantiles, returnSamplingRate);
- }
- returnSamplingRate[0] = 1.0;
- return known_N_compute_B_and_K_quick(N, epsilon);
+ if (delta > 0.0) {
+ return known_N_compute_B_and_K_slow(N, epsilon, delta, quantiles, returnSamplingRate);
+ }
+ returnSamplingRate[0] = 1.0;
+ return known_N_compute_B_and_K_quick(N, epsilon);
}
/**
* Computes the number of buffers and number of values per buffer such that
@@ -70,100 +70,100 @@
* @param epsilon the approximation error which is guaranteed not to be exceeded (e.g. <tt>0.001</tt>) (<tt>0 <= epsilon <= 1</tt>). To get exact result, set <tt>epsilon=0.0</tt>;
*/
protected static long[] known_N_compute_B_and_K_quick(long N, double epsilon) {
- if (epsilon<=0.0) {
- // no way around exact quantile search
- long[] result = new long[2];
- result[0]=1;
- result[1]=N;
- return result;
- }
-
- final int maxBuffers = 50;
- final int maxHeight = 50;
- final double N_double = (double) N;
- final double c = N_double * epsilon * 2.0;
- int[] heightMaximums = new int[maxBuffers-1];
-
- // for each b, determine maximum height, i.e. the height for which x<=0 and x is a maximum
- // with x = binomial(b+h-2, h-1) - binomial(b+h-3, h-3) + binomial(b+h-3, h-2) - N * epsilon * 2.0
- for (int b=2; b<=maxBuffers; b++) {
- int h = 3;
-
- while ( h<=maxHeight && // skip heights until x<=0
- (h-2) * ((double)Math.round(binomial(b+h-2, h-1))) -
- ((double) Math.round(binomial(b+h-3, h-3))) +
- ((double) Math.round(binomial(b+h-3, h-2))) - c
- > 0.0
- ) {h++;}
- //from now on x is monotonically growing...
- while ( h<=maxHeight && // skip heights until x>0
- (h-2) * ((double)Math.round(binomial(b+h-2, h-1))) -
- ((double) Math.round(binomial(b+h-3, h-3))) +
- ((double) Math.round(binomial(b+h-3, h-2))) - c
- <= 0.0
- ) {h++;}
- h--; //go back to last height
-
- // was x>0 or did we loop without finding anything?
- int hMax;
- if (h>=maxHeight &&
- (h-2) * ((double)Math.round(binomial(b+h-2, h-1))) -
- ((double) Math.round(binomial(b+h-3, h-3))) +
- ((double) Math.round(binomial(b+h-3, h-2))) - c
- > 0.0) {
- hMax=Integer.MIN_VALUE;
- }
- else {
- hMax=h;
- }
-
- heightMaximums[b-2]=hMax; //safe some space
- } //end for
-
-
- // for each b, determine the smallest k satisfying the constraints, i.e.
- // for each b, determine kMin, with kMin = N/binomial(b+hMax-2,hMax-1)
- long[] kMinimums = new long[maxBuffers-1];
- for (int b=2; b<=maxBuffers; b++) {
- int h=heightMaximums[b-2];
- long kMin=Long.MAX_VALUE;
- if (h>Integer.MIN_VALUE) {
- double value = ((double)Math.round(binomial(b+h-2, h-1)));
- long tmpK=ceiling(N_double/value);
- if (tmpK<=Long.MAX_VALUE) {
- kMin = tmpK;
- }
- }
- kMinimums[b-2]=kMin;
- }
-
- // from all b's, determine b that minimizes b*kMin
- long multMin = Long.MAX_VALUE;
- int minB = -1;
- for (int b=2; b<=maxBuffers; b++) {
- if (kMinimums[b-2]<Long.MAX_VALUE) {
- long mult = ((long)b) * ((long)kMinimums[b-2]);
- if (mult<multMin) {
- multMin=mult;
- minB = b;
- }
- }
- }
-
- long b, k;
- if (minB != -1) { // epsilon large enough?
- b = minB;
- k = kMinimums[minB-2];
- }
- else { // epsilon is very small or zero.
- b = 1; // the only possible solution without violating the
- k = N; // approximation guarantees is exact quantile search.
- }
-
- long[] result = new long[2];
- result[0]=b;
- result[1]=k;
- return result;
+ if (epsilon<=0.0) {
+ // no way around exact quantile search
+ long[] result = new long[2];
+ result[0]=1;
+ result[1]=N;
+ return result;
+ }
+
+ final int maxBuffers = 50;
+ final int maxHeight = 50;
+ final double N_double = (double) N;
+ final double c = N_double * epsilon * 2.0;
+ int[] heightMaximums = new int[maxBuffers-1];
+
+ // for each b, determine maximum height, i.e. the height for which x<=0 and x is a maximum
+ // with x = binomial(b+h-2, h-1) - binomial(b+h-3, h-3) + binomial(b+h-3, h-2) - N * epsilon * 2.0
+ for (int b=2; b<=maxBuffers; b++) {
+ int h = 3;
+
+ while ( h<=maxHeight && // skip heights until x<=0
+ (h-2) * ((double)Math.round(binomial(b+h-2, h-1))) -
+ ((double) Math.round(binomial(b+h-3, h-3))) +
+ ((double) Math.round(binomial(b+h-3, h-2))) - c
+ > 0.0
+ ) {h++;}
+ //from now on x is monotonically growing...
+ while ( h<=maxHeight && // skip heights until x>0
+ (h-2) * ((double)Math.round(binomial(b+h-2, h-1))) -
+ ((double) Math.round(binomial(b+h-3, h-3))) +
+ ((double) Math.round(binomial(b+h-3, h-2))) - c
+ <= 0.0
+ ) {h++;}
+ h--; //go back to last height
+
+ // was x>0 or did we loop without finding anything?
+ int hMax;
+ if (h>=maxHeight &&
+ (h-2) * ((double)Math.round(binomial(b+h-2, h-1))) -
+ ((double) Math.round(binomial(b+h-3, h-3))) +
+ ((double) Math.round(binomial(b+h-3, h-2))) - c
+ > 0.0) {
+ hMax=Integer.MIN_VALUE;
+ }
+ else {
+ hMax=h;
+ }
+
+ heightMaximums[b-2]=hMax; //safe some space
+ } //end for
+
+
+ // for each b, determine the smallest k satisfying the constraints, i.e.
+ // for each b, determine kMin, with kMin = N/binomial(b+hMax-2,hMax-1)
+ long[] kMinimums = new long[maxBuffers-1];
+ for (int b=2; b<=maxBuffers; b++) {
+ int h=heightMaximums[b-2];
+ long kMin=Long.MAX_VALUE;
+ if (h>Integer.MIN_VALUE) {
+ double value = ((double)Math.round(binomial(b+h-2, h-1)));
+ long tmpK=ceiling(N_double/value);
+ if (tmpK<=Long.MAX_VALUE) {
+ kMin = tmpK;
+ }
+ }
+ kMinimums[b-2]=kMin;
+ }
+
+ // from all b's, determine b that minimizes b*kMin
+ long multMin = Long.MAX_VALUE;
+ int minB = -1;
+ for (int b=2; b<=maxBuffers; b++) {
+ if (kMinimums[b-2]<Long.MAX_VALUE) {
+ long mult = ((long)b) * ((long)kMinimums[b-2]);
+ if (mult<multMin) {
+ multMin=mult;
+ minB = b;
+ }
+ }
+ }
+
+ long b, k;
+ if (minB != -1) { // epsilon large enough?
+ b = minB;
+ k = kMinimums[minB-2];
+ }
+ else { // epsilon is very small or zero.
+ b = 1; // the only possible solution without violating the
+ k = N; // approximation guarantees is exact quantile search.
+ }
+
+ long[] result = new long[2];
+ result[0]=b;
+ result[1]=k;
+ return result;
}
/**
* Computes the number of buffers and number of values per buffer such that
@@ -178,149 +178,149 @@
* @return <tt>long[2]</tt> - <tt>long[0]</tt>=the number of buffers, <tt>long[1]</tt>=the number of elements per buffer, <tt>returnSamplingRate[0]</tt>=the required sampling rate.
*/
protected static long[] known_N_compute_B_and_K_slow(long N, double epsilon, double delta, int quantiles, double[] returnSamplingRate) {
- // delta can be set to zero, i.e., all quantiles should be approximate with probability 1
- if (epsilon<=0.0) {
- // no way around exact quantile search
- long[] result = new long[2];
- result[0]=1;
- result[1]=N;
- returnSamplingRate[0]=1.0;
- return result;
- }
-
-
- final int maxBuffers = 50;
- final int maxHeight = 50;
- final double N_double = N;
-
- // One possibility is to use one buffer of size N
- //
- long ret_b = 1;
- long ret_k = N;
- double sampling_rate = 1.0;
- long memory = N;
-
-
- // Otherwise, there are at least two buffers (b >= 2)
- // and the height of the tree is at least three (h >= 3)
- //
- // We restrict the search for b and h to MAX_BINOM, a large enough value for
- // practical values of epsilon >= 0.001 and delta >= 0.00001
- //
- final double logarithm = Math.log(2.0*quantiles/delta);
- final double c = 2.0 * epsilon * N_double;
- for (long b=2 ; b<maxBuffers ; b++)
- for (long h=3 ; h<maxHeight ; h++) {
- double binomial = binomial(b+h-2, h-1);
- long tmp = ceiling(N_double / binomial);
- if ((b * tmp < memory) &&
- ((h-2) * binomial - binomial(b+h-3, h-3) + binomial(b+h-3, h-2)
- <= c) ) {
- ret_k = tmp ;
- ret_b = b ;
- memory = ret_k * b;
- sampling_rate = 1.0 ;
- }
- if (delta > 0.0) {
- double t = (h-2) * binomial(b+h-2, h-1) - binomial(b+h-3, h-3) + binomial(b+h-3, h-2) ;
- double u = logarithm / epsilon ;
- double v = binomial (b+h-2, h-1) ;
- double w = logarithm / (2.0*epsilon*epsilon) ;
-
- // From our SIGMOD 98 paper, we have two equantions to satisfy:
- // t <= u * alpha/(1-alpha)^2
- // kv >= w/(1-alpha)^2
- //
- // Denoting 1/(1-alpha) by x,
- // we see that the first inequality is equivalent to
- // t/u <= x^2 - x
- // which is satisfied by x >= 0.5 + 0.5 * sqrt (1 + 4t/u)
- // Plugging in this value into second equation yields
- // k >= wx^2/v
-
- double x = 0.5 + 0.5 * Math.sqrt(1.0 + 4.0*t/u) ;
- long k = ceiling(w*x*x/v) ;
- if (b * k < memory) {
- ret_k = k ;
- ret_b = b ;
- memory = b * k ;
- sampling_rate = N_double*2.0*epsilon*epsilon / logarithm ;
- }
- }
- }
-
- long[] result = new long[2];
- result[0]=ret_b;
- result[1]=ret_k;
- returnSamplingRate[0]=sampling_rate;
- return result;
+ // delta can be set to zero, i.e., all quantiles should be approximate with probability 1
+ if (epsilon<=0.0) {
+ // no way around exact quantile search
+ long[] result = new long[2];
+ result[0]=1;
+ result[1]=N;
+ returnSamplingRate[0]=1.0;
+ return result;
+ }
+
+
+ final int maxBuffers = 50;
+ final int maxHeight = 50;
+ final double N_double = N;
+
+ // One possibility is to use one buffer of size N
+ //
+ long ret_b = 1;
+ long ret_k = N;
+ double sampling_rate = 1.0;
+ long memory = N;
+
+
+ // Otherwise, there are at least two buffers (b >= 2)
+ // and the height of the tree is at least three (h >= 3)
+ //
+ // We restrict the search for b and h to MAX_BINOM, a large enough value for
+ // practical values of epsilon >= 0.001 and delta >= 0.00001
+ //
+ final double logarithm = Math.log(2.0*quantiles/delta);
+ final double c = 2.0 * epsilon * N_double;
+ for (long b=2 ; b<maxBuffers ; b++)
+ for (long h=3 ; h<maxHeight ; h++) {
+ double binomial = binomial(b+h-2, h-1);
+ long tmp = ceiling(N_double / binomial);
+ if ((b * tmp < memory) &&
+ ((h-2) * binomial - binomial(b+h-3, h-3) + binomial(b+h-3, h-2)
+ <= c) ) {
+ ret_k = tmp ;
+ ret_b = b ;
+ memory = ret_k * b;
+ sampling_rate = 1.0 ;
+ }
+ if (delta > 0.0) {
+ double t = (h-2) * binomial(b+h-2, h-1) - binomial(b+h-3, h-3) + binomial(b+h-3, h-2) ;
+ double u = logarithm / epsilon ;
+ double v = binomial (b+h-2, h-1) ;
+ double w = logarithm / (2.0*epsilon*epsilon) ;
+
+ // From our SIGMOD 98 paper, we have two equantions to satisfy:
+ // t <= u * alpha/(1-alpha)^2
+ // kv >= w/(1-alpha)^2
+ //
+ // Denoting 1/(1-alpha) by x,
+ // we see that the first inequality is equivalent to
+ // t/u <= x^2 - x
+ // which is satisfied by x >= 0.5 + 0.5 * sqrt (1 + 4t/u)
+ // Plugging in this value into second equation yields
+ // k >= wx^2/v
+
+ double x = 0.5 + 0.5 * Math.sqrt(1.0 + 4.0*t/u) ;
+ long k = ceiling(w*x*x/v) ;
+ if (b * k < memory) {
+ ret_k = k ;
+ ret_b = b ;
+ memory = b * k ;
+ sampling_rate = N_double*2.0*epsilon*epsilon / logarithm ;
+ }
+ }
+ }
+
+ long[] result = new long[2];
+ result[0]=ret_b;
+ result[1]=ret_k;
+ returnSamplingRate[0]=sampling_rate;
+ return result;
}
public static void main(String[] args) {
- test_B_and_K_Calculation(args);
+ test_B_and_K_Calculation(args);
}
/**
* Computes b and k for different parameters.
*/
public static void test_B_and_K_Calculation(String[] args) {
- boolean known_N;
- if (args==null) known_N = false;
- else known_N = new Boolean(args[0]).booleanValue();
-
- int[] quantiles = {1,1000};
-
- long[] sizes = {100000, 1000000, 10000000, 1000000000};
-
- double[] deltas = {0.0, 0.001, 0.0001, 0.00001};
-
- double[] epsilons = {0.0, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0000001};
-
-
-
- if (! known_N) sizes = new long[] {0};
- System.out.println("\n\n");
- if (known_N)
- System.out.println("Computing b's and k's for KNOWN N");
- else
- System.out.println("Computing b's and k's for UNKNOWN N");
- System.out.println("mem [elements/1024]");
- System.out.println("***********************************");
-
- for (int q=0; q<quantiles.length; q++) {
- int p = quantiles[q];
- System.out.println("------------------------------");
- System.out.println("computing for p = "+p);
- for (int s=0; s<sizes.length; s++) {
- long N = sizes[s];
- System.out.println(" ------------------------------");
- System.out.println(" computing for N = "+N);
- for (int d=0; d<deltas.length; d++) {
- double delta = deltas[d];
- System.out.println(" ------------------------------");
- System.out.println(" computing for delta = "+delta);
- for (int e=0; e<epsilons.length; e++) {
- double epsilon = epsilons[e];
-
- double[] returnSamplingRate = new double[1];
- long[] result;
- if (known_N) {
- result = known_N_compute_B_and_K(N, epsilon, delta, p, returnSamplingRate);
- }
- else {
- result = unknown_N_compute_B_and_K(epsilon, delta, p);
- }
-
- long b = result[0];
- long k = result[1];
- System.out.print(" (e,d,N,p)=("+epsilon+","+delta+","+N+","+p+") --> ");
- System.out.print("(b,k,mem");
- if (known_N) System.out.print(",sampling");
- System.out.print(")=("+b+","+k+","+(b*k/1024));
- if (known_N) System.out.print(","+returnSamplingRate[0]);
- System.out.println(")");
- }
- }
- }
- }
+ boolean known_N;
+ if (args==null) known_N = false;
+ else known_N = new Boolean(args[0]).booleanValue();
+
+ int[] quantiles = {1,1000};
+
+ long[] sizes = {100000, 1000000, 10000000, 1000000000};
+
+ double[] deltas = {0.0, 0.001, 0.0001, 0.00001};
+
+ double[] epsilons = {0.0, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0000001};
+
+
+
+ if (! known_N) sizes = new long[] {0};
+ System.out.println("\n\n");
+ if (known_N)
+ System.out.println("Computing b's and k's for KNOWN N");
+ else
+ System.out.println("Computing b's and k's for UNKNOWN N");
+ System.out.println("mem [elements/1024]");
+ System.out.println("***********************************");
+
+ for (int q=0; q<quantiles.length; q++) {
+ int p = quantiles[q];
+ System.out.println("------------------------------");
+ System.out.println("computing for p = "+p);
+ for (int s=0; s<sizes.length; s++) {
+ long N = sizes[s];
+ System.out.println(" ------------------------------");
+ System.out.println(" computing for N = "+N);
+ for (int d=0; d<deltas.length; d++) {
+ double delta = deltas[d];
+ System.out.println(" ------------------------------");
+ System.out.println(" computing for delta = "+delta);
+ for (int e=0; e<epsilons.length; e++) {
+ double epsilon = epsilons[e];
+
+ double[] returnSamplingRate = new double[1];
+ long[] result;
+ if (known_N) {
+ result = known_N_compute_B_and_K(N, epsilon, delta, p, returnSamplingRate);
+ }
+ else {
+ result = unknown_N_compute_B_and_K(epsilon, delta, p);
+ }
+
+ long b = result[0];
+ long k = result[1];
+ System.out.print(" (e,d,N,p)=("+epsilon+","+delta+","+N+","+p+") --> ");
+ System.out.print("(b,k,mem");
+ if (known_N) System.out.print(",sampling");
+ System.out.print(")=("+b+","+k+","+(b*k/1024));
+ if (known_N) System.out.print(","+returnSamplingRate[0]);
+ System.out.println(")");
+ }
+ }
+ }
+ }
}
/**
@@ -333,117 +333,117 @@
* @return <tt>long[3]</tt> - <tt>long[0]</tt>=the number of buffers, <tt>long[1]</tt>=the number of elements per buffer, <tt>long[2]</tt>=the tree height where sampling shall start.
*/
public static long[] unknown_N_compute_B_and_K(double epsilon, double delta, int quantiles) {
- // delta can be set to zero, i.e., all quantiles should be approximate with probability 1
- if (epsilon<=0.0 || delta<=0.0) {
- // no way around exact quantile search
- long[] result = new long[3];
- result[0]=1;
- result[1]=Long.MAX_VALUE;
- result[2]=Long.MAX_VALUE;
- return result;
- }
-
- int max_b = 50;
- int max_h = 50;
- int max_H = 50;
- int max_Iterations = 2;
-
- long best_b = Long.MAX_VALUE;
- long best_k = Long.MAX_VALUE;
- long best_h = Long.MAX_VALUE;
- long best_memory = Long.MAX_VALUE;
-
- double pow = Math.pow(2.0, max_H);
- double logDelta = Math.log(2.0/(delta/quantiles)) / (2.0*epsilon*epsilon);
- //double logDelta = Math.log(2.0/(quantiles*delta)) / (2.0*epsilon*epsilon);
-
- while (best_b == Long.MAX_VALUE && max_Iterations-- > 0) { //until we find a solution
- // identify that combination of b and h that minimizes b*k.
- // exhaustive search.
- for (int b=2; b<=max_b; b++) {
- for (int h=2; h<=max_h; h++) {
- double Ld = binomial(b+h-2, h-1);
- double Ls = binomial(b+h-3,h-1);
-
- // now we have k>=c*(1-alpha)^-2.
- // let's compute c.
- //double c = Math.log(2.0/(delta/quantiles)) / (2.0*epsilon*epsilon*Math.min(Ld, 8.0*Ls/3.0));
- double c = logDelta / Math.min(Ld, 8.0*Ls/3.0);
-
- // now we have k>=d/alpha.
- // let's compute d.
- double beta = Ld/Ls;
- double cc = (beta-2.0)*(max_H-2.0) / (beta + pow - 2.0);
- double d = (h+3+cc) / (2.0*epsilon);
-
- /*
- double d = (Ld*(h+max_H-1.0) + Ls*((h+1)*pow - 2.0*(h+max_H))) / (Ld + Ls*(pow-2.0));
- d = (d + 2.0) / (2.0*epsilon);
- */
-
- // now we have c*(1-alpha)^-2 == d/alpha.
- // we solve this equation for alpha yielding two solutions
- // alpha_1,2 = (c + 2*d +- Sqrt(c*c + 4*c*d))/(2*d)
- double f = c*c + 4.0*c*d;
- if (f<0.0) continue; // non real solution to equation
- double root = Math.sqrt(f);
- double alpha_one = (c + 2.0*d + root)/(2.0*d);
- double alpha_two = (c + 2.0*d - root)/(2.0*d);
-
- // any alpha must satisfy 0<alpha<1 to yield valid solutions
- boolean alpha_one_OK = false;
- boolean alpha_two_OK = false;
- if (0.0 < alpha_one && alpha_one < 1.0) alpha_one_OK = true;
- if (0.0 < alpha_two && alpha_two < 1.0) alpha_two_OK = true;
- if (alpha_one_OK || alpha_two_OK) {
- double alpha = alpha_one;
- if (alpha_one_OK && alpha_two_OK) {
- // take the alpha that minimizes d/alpha
- alpha = Math.max(alpha_one, alpha_two);
- }
- else if (alpha_two_OK) {
- alpha = alpha_two;
- }
-
- // now we have k=Ceiling(Max(d/alpha, (h+1)/(2*epsilon)))
- long k = ceiling(Math.max(d/alpha, (h+1)/(2.0*epsilon)));
- if (k>0) { // valid solution?
- long memory = b*k;
- if (memory < best_memory) {
- // found a solution requiring less memory
- best_k = k;
- best_b = b;
- best_h = h;
- best_memory = memory;
- }
- }
- }
- } //end for h
- } //end for b
-
- if (best_b == Long.MAX_VALUE) {
- System.out.println("Warning: Computing b and k looks like a lot of work!");
- // no solution found so far. very unlikely. Anyway, try again.
- max_b *= 2;
- max_h *= 2;
- max_H *= 2;
- }
- } //end while
-
- long[] result = new long[3];
- if (best_b == Long.MAX_VALUE) {
- // no solution found.
- // no way around exact quantile search.
- result[0]=1;
- result[1]=Long.MAX_VALUE;
- result[2]=Long.MAX_VALUE;
- }
- else {
- result[0]=best_b;
- result[1]=best_k;
- result[2]=best_h;
- }
+ // delta can be set to zero, i.e., all quantiles should be approximate with probability 1
+ if (epsilon<=0.0 || delta<=0.0) {
+ // no way around exact quantile search
+ long[] result = new long[3];
+ result[0]=1;
+ result[1]=Long.MAX_VALUE;
+ result[2]=Long.MAX_VALUE;
+ return result;
+ }
+
+ int max_b = 50;
+ int max_h = 50;
+ int max_H = 50;
+ int max_Iterations = 2;
+
+ long best_b = Long.MAX_VALUE;
+ long best_k = Long.MAX_VALUE;
+ long best_h = Long.MAX_VALUE;
+ long best_memory = Long.MAX_VALUE;
+
+ double pow = Math.pow(2.0, max_H);
+ double logDelta = Math.log(2.0/(delta/quantiles)) / (2.0*epsilon*epsilon);
+ //double logDelta = Math.log(2.0/(quantiles*delta)) / (2.0*epsilon*epsilon);
+
+ while (best_b == Long.MAX_VALUE && max_Iterations-- > 0) { //until we find a solution
+ // identify that combination of b and h that minimizes b*k.
+ // exhaustive search.
+ for (int b=2; b<=max_b; b++) {
+ for (int h=2; h<=max_h; h++) {
+ double Ld = binomial(b+h-2, h-1);
+ double Ls = binomial(b+h-3,h-1);
+
+ // now we have k>=c*(1-alpha)^-2.
+ // let's compute c.
+ //double c = Math.log(2.0/(delta/quantiles)) / (2.0*epsilon*epsilon*Math.min(Ld, 8.0*Ls/3.0));
+ double c = logDelta / Math.min(Ld, 8.0*Ls/3.0);
+
+ // now we have k>=d/alpha.
+ // let's compute d.
+ double beta = Ld/Ls;
+ double cc = (beta-2.0)*(max_H-2.0) / (beta + pow - 2.0);
+ double d = (h+3+cc) / (2.0*epsilon);
+
+ /*
+ double d = (Ld*(h+max_H-1.0) + Ls*((h+1)*pow - 2.0*(h+max_H))) / (Ld + Ls*(pow-2.0));
+ d = (d + 2.0) / (2.0*epsilon);
+ */
+
+ // now we have c*(1-alpha)^-2 == d/alpha.
+ // we solve this equation for alpha yielding two solutions
+ // alpha_1,2 = (c + 2*d +- Sqrt(c*c + 4*c*d))/(2*d)
+ double f = c*c + 4.0*c*d;
+ if (f<0.0) continue; // non real solution to equation
+ double root = Math.sqrt(f);
+ double alpha_one = (c + 2.0*d + root)/(2.0*d);
+ double alpha_two = (c + 2.0*d - root)/(2.0*d);
+
+ // any alpha must satisfy 0<alpha<1 to yield valid solutions
+ boolean alpha_one_OK = false;
+ boolean alpha_two_OK = false;
+ if (0.0 < alpha_one && alpha_one < 1.0) alpha_one_OK = true;
+ if (0.0 < alpha_two && alpha_two < 1.0) alpha_two_OK = true;
+ if (alpha_one_OK || alpha_two_OK) {
+ double alpha = alpha_one;
+ if (alpha_one_OK && alpha_two_OK) {
+ // take the alpha that minimizes d/alpha
+ alpha = Math.max(alpha_one, alpha_two);
+ }
+ else if (alpha_two_OK) {
+ alpha = alpha_two;
+ }
+
+ // now we have k=Ceiling(Max(d/alpha, (h+1)/(2*epsilon)))
+ long k = ceiling(Math.max(d/alpha, (h+1)/(2.0*epsilon)));
+ if (k>0) { // valid solution?
+ long memory = b*k;
+ if (memory < best_memory) {
+ // found a solution requiring less memory
+ best_k = k;
+ best_b = b;
+ best_h = h;
+ best_memory = memory;
+ }
+ }
+ }
+ } //end for h
+ } //end for b
+
+ if (best_b == Long.MAX_VALUE) {
+ System.out.println("Warning: Computing b and k looks like a lot of work!");
+ // no solution found so far. very unlikely. Anyway, try again.
+ max_b *= 2;
+ max_h *= 2;
+ max_H *= 2;
+ }
+ } //end while
+
+ long[] result = new long[3];
+ if (best_b == Long.MAX_VALUE) {
+ // no solution found.
+ // no way around exact quantile search.
+ result[0]=1;
+ result[1]=Long.MAX_VALUE;
+ result[2]=Long.MAX_VALUE;
+ }
+ else {
+ result[0]=best_b;
+ result[1]=best_k;
+ result[2]=best_h;
+ }
- return result;
+ return result;
}
}
Modified: lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/QuantileFinderFactory.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/QuantileFinderFactory.java?rev=883972&r1=883971&r2=883972&view=diff
==============================================================================
--- lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/QuantileFinderFactory.java (original)
+++ lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/QuantileFinderFactory.java Wed Nov 25 03:31:47 2009
@@ -83,8 +83,6 @@
*</pre><p>
*
*
- * @author wolfgang.hoschek@cern.ch
- * @version 1.0, 09/24/99
* @see KnownDoubleQuantileEstimator
* @see UnknownDoubleQuantileEstimator
*/
@@ -113,26 +111,26 @@
* @return <tt>long[2]</tt> - <tt>long[0]</tt>=the number of buffers, <tt>long[1]</tt>=the number of elements per buffer, <tt>returnSamplingRate[0]</tt>=the required sampling rate.
*/
public static long[] known_N_compute_B_and_K(long N, double epsilon, double delta, int quantiles, double[] returnSamplingRate) {
- returnSamplingRate[0] = 1.0;
- if (epsilon<=0.0) {
- // no way around exact quantile search
- long[] result = new long[2];
- result[0]=1;
- result[1]=N;
- return result;
- }
- if (epsilon>=1.0 || delta>=1.0) {
- // can make any error we wish
- long[] result = new long[2];
- result[0]=2;
- result[1]=1;
- return result;
- }
-
- if (delta > 0.0) {
- return known_N_compute_B_and_K_slow(N, epsilon, delta, quantiles, returnSamplingRate);
- }
- return known_N_compute_B_and_K_quick(N, epsilon);
+ returnSamplingRate[0] = 1.0;
+ if (epsilon<=0.0) {
+ // no way around exact quantile search
+ long[] result = new long[2];
+ result[0]=1;
+ result[1]=N;
+ return result;
+ }
+ if (epsilon>=1.0 || delta>=1.0) {
+ // can make any error we wish
+ long[] result = new long[2];
+ result[0]=2;
+ result[1]=1;
+ return result;
+ }
+
+ if (delta > 0.0) {
+ return known_N_compute_B_and_K_slow(N, epsilon, delta, quantiles, returnSamplingRate);
+ }
+ return known_N_compute_B_and_K_quick(N, epsilon);
}
/**
* Computes the number of buffers and number of values per buffer such that
@@ -143,92 +141,92 @@
* @param epsilon the approximation error which is guaranteed not to be exceeded (e.g. <tt>0.001</tt>) (<tt>0 <= epsilon <= 1</tt>). To get exact result, set <tt>epsilon=0.0</tt>;
*/
protected static long[] known_N_compute_B_and_K_quick(long N, double epsilon) {
- final int maxBuffers = 50;
- final int maxHeight = 50;
- final double N_double = (double) N;
- final double c = N_double * epsilon * 2.0;
- int[] heightMaximums = new int[maxBuffers-1];
-
- // for each b, determine maximum height, i.e. the height for which x<=0 and x is a maximum
- // with x = binomial(b+h-2, h-1) - binomial(b+h-3, h-3) + binomial(b+h-3, h-2) - N * epsilon * 2.0
- for (int b=2; b<=maxBuffers; b++) {
- int h = 3;
-
- while ( h<=maxHeight && // skip heights until x<=0
- (h-2) * (Arithmetic.binomial(b+h-2, h-1)) -
- (Arithmetic.binomial(b+h-3, h-3)) +
- (Arithmetic.binomial(b+h-3, h-2)) - c
- > 0.0
- ) {h++;}
- //from now on x is monotonically growing...
- while ( h<=maxHeight && // skip heights until x>0
- (h-2) * (Arithmetic.binomial(b+h-2, h-1)) -
- (Arithmetic.binomial(b+h-3, h-3)) +
- (Arithmetic.binomial(b+h-3, h-2)) - c
- <= 0.0
- ) {h++;}
- h--; //go back to last height
-
- // was x>0 or did we loop without finding anything?
- int hMax;
- if (h>=maxHeight &&
- (h-2) * (Arithmetic.binomial(b+h-2, h-1)) -
- (Arithmetic.binomial(b+h-3, h-3)) +
- (Arithmetic.binomial(b+h-3, h-2)) - c
- > 0.0) {
- hMax=Integer.MIN_VALUE;
- }
- else {
- hMax=h;
- }
-
- heightMaximums[b-2]=hMax; //safe some space
- } //end for
-
-
- // for each b, determine the smallest k satisfying the constraints, i.e.
- // for each b, determine kMin, with kMin = N/binomial(b+hMax-2,hMax-1)
- long[] kMinimums = new long[maxBuffers-1];
- for (int b=2; b<=maxBuffers; b++) {
- int h=heightMaximums[b-2];
- long kMin=Long.MAX_VALUE;
- if (h>Integer.MIN_VALUE) {
- double value = (Arithmetic.binomial(b+h-2, h-1));
- long tmpK=(long)(Math.ceil(N_double/value));
- if (tmpK<=Long.MAX_VALUE) {
- kMin = tmpK;
- }
- }
- kMinimums[b-2]=kMin;
- }
-
- // from all b's, determine b that minimizes b*kMin
- long multMin = Long.MAX_VALUE;
- int minB = -1;
- for (int b=2; b<=maxBuffers; b++) {
- if (kMinimums[b-2]<Long.MAX_VALUE) {
- long mult = ((long)b) * ((long)kMinimums[b-2]);
- if (mult<multMin) {
- multMin=mult;
- minB = b;
- }
- }
- }
-
- long b, k;
- if (minB != -1) { // epsilon large enough?
- b = minB;
- k = kMinimums[minB-2];
- }
- else { // epsilon is very small or zero.
- b = 1; // the only possible solution without violating the
- k = N; // approximation guarantees is exact quantile search.
- }
-
- long[] result = new long[2];
- result[0]=b;
- result[1]=k;
- return result;
+ final int maxBuffers = 50;
+ final int maxHeight = 50;
+ final double N_double = (double) N;
+ final double c = N_double * epsilon * 2.0;
+ int[] heightMaximums = new int[maxBuffers-1];
+
+ // for each b, determine maximum height, i.e. the height for which x<=0 and x is a maximum
+ // with x = binomial(b+h-2, h-1) - binomial(b+h-3, h-3) + binomial(b+h-3, h-2) - N * epsilon * 2.0
+ for (int b=2; b<=maxBuffers; b++) {
+ int h = 3;
+
+ while ( h<=maxHeight && // skip heights until x<=0
+ (h-2) * (Arithmetic.binomial(b+h-2, h-1)) -
+ (Arithmetic.binomial(b+h-3, h-3)) +
+ (Arithmetic.binomial(b+h-3, h-2)) - c
+ > 0.0
+ ) {h++;}
+ //from now on x is monotonically growing...
+ while ( h<=maxHeight && // skip heights until x>0
+ (h-2) * (Arithmetic.binomial(b+h-2, h-1)) -
+ (Arithmetic.binomial(b+h-3, h-3)) +
+ (Arithmetic.binomial(b+h-3, h-2)) - c
+ <= 0.0
+ ) {h++;}
+ h--; //go back to last height
+
+ // was x>0 or did we loop without finding anything?
+ int hMax;
+ if (h>=maxHeight &&
+ (h-2) * (Arithmetic.binomial(b+h-2, h-1)) -
+ (Arithmetic.binomial(b+h-3, h-3)) +
+ (Arithmetic.binomial(b+h-3, h-2)) - c
+ > 0.0) {
+ hMax=Integer.MIN_VALUE;
+ }
+ else {
+ hMax=h;
+ }
+
+ heightMaximums[b-2]=hMax; //safe some space
+ } //end for
+
+
+ // for each b, determine the smallest k satisfying the constraints, i.e.
+ // for each b, determine kMin, with kMin = N/binomial(b+hMax-2,hMax-1)
+ long[] kMinimums = new long[maxBuffers-1];
+ for (int b=2; b<=maxBuffers; b++) {
+ int h=heightMaximums[b-2];
+ long kMin=Long.MAX_VALUE;
+ if (h>Integer.MIN_VALUE) {
+ double value = (Arithmetic.binomial(b+h-2, h-1));
+ long tmpK=(long)(Math.ceil(N_double/value));
+ if (tmpK<=Long.MAX_VALUE) {
+ kMin = tmpK;
+ }
+ }
+ kMinimums[b-2]=kMin;
+ }
+
+ // from all b's, determine b that minimizes b*kMin
+ long multMin = Long.MAX_VALUE;
+ int minB = -1;
+ for (int b=2; b<=maxBuffers; b++) {
+ if (kMinimums[b-2]<Long.MAX_VALUE) {
+ long mult = ((long)b) * ((long)kMinimums[b-2]);
+ if (mult<multMin) {
+ multMin=mult;
+ minB = b;
+ }
+ }
+ }
+
+ long b, k;
+ if (minB != -1) { // epsilon large enough?
+ b = minB;
+ k = kMinimums[minB-2];
+ }
+ else { // epsilon is very small or zero.
+ b = 1; // the only possible solution without violating the
+ k = N; // approximation guarantees is exact quantile search.
+ }
+
+ long[] result = new long[2];
+ result[0]=b;
+ result[1]=k;
+ return result;
}
/**
* Computes the number of buffers and number of values per buffer such that
@@ -243,71 +241,71 @@
* @return <tt>long[2]</tt> - <tt>long[0]</tt>=the number of buffers, <tt>long[1]</tt>=the number of elements per buffer, <tt>returnSamplingRate[0]</tt>=the required sampling rate.
*/
protected static long[] known_N_compute_B_and_K_slow(long N, double epsilon, double delta, int quantiles, double[] returnSamplingRate) {
- final int maxBuffers = 50;
- final int maxHeight = 50;
- final double N_double = N;
-
- // One possibility is to use one buffer of size N
- //
- long ret_b = 1;
- long ret_k = N;
- double sampling_rate = 1.0;
- long memory = N;
-
-
- // Otherwise, there are at least two buffers (b >= 2)
- // and the height of the tree is at least three (h >= 3)
- //
- // We restrict the search for b and h to MAX_BINOM, a large enough value for
- // practical values of epsilon >= 0.001 and delta >= 0.00001
- //
- final double logarithm = Math.log(2.0*quantiles/delta);
- final double c = 2.0 * epsilon * N_double;
- for (long b=2 ; b<maxBuffers ; b++)
- for (long h=3 ; h<maxHeight ; h++) {
- double binomial = Arithmetic.binomial(b+h-2, h-1);
- long tmp = (long) Math.ceil(N_double / binomial);
- if ((b * tmp < memory) &&
- ((h-2) * binomial - Arithmetic.binomial(b+h-3, h-3) + Arithmetic.binomial(b+h-3, h-2)
- <= c) ) {
- ret_k = tmp ;
- ret_b = b ;
- memory = ret_k * b;
- sampling_rate = 1.0 ;
- }
- if (delta > 0.0) {
- double t = (h-2) * Arithmetic.binomial(b+h-2, h-1) - Arithmetic.binomial(b+h-3, h-3) + Arithmetic.binomial(b+h-3, h-2) ;
- double u = logarithm / epsilon ;
- double v = Arithmetic.binomial (b+h-2, h-1) ;
- double w = logarithm / (2.0*epsilon*epsilon) ;
-
- // From our SIGMOD 98 paper, we have two equantions to satisfy:
- // t <= u * alpha/(1-alpha)^2
- // kv >= w/(1-alpha)^2
- //
- // Denoting 1/(1-alpha) by x,
- // we see that the first inequality is equivalent to
- // t/u <= x^2 - x
- // which is satisfied by x >= 0.5 + 0.5 * sqrt (1 + 4t/u)
- // Plugging in this value into second equation yields
- // k >= wx^2/v
-
- double x = 0.5 + 0.5 * Math.sqrt(1.0 + 4.0*t/u) ;
- long k = (long) Math.ceil(w*x*x/v) ;
- if (b * k < memory) {
- ret_k = k ;
- ret_b = b ;
- memory = b * k ;
- sampling_rate = N_double*2.0*epsilon*epsilon / logarithm ;
- }
- }
- }
-
- long[] result = new long[2];
- result[0]=ret_b;
- result[1]=ret_k;
- returnSamplingRate[0]=sampling_rate;
- return result;
+ final int maxBuffers = 50;
+ final int maxHeight = 50;
+ final double N_double = N;
+
+ // One possibility is to use one buffer of size N
+ //
+ long ret_b = 1;
+ long ret_k = N;
+ double sampling_rate = 1.0;
+ long memory = N;
+
+
+ // Otherwise, there are at least two buffers (b >= 2)
+ // and the height of the tree is at least three (h >= 3)
+ //
+ // We restrict the search for b and h to MAX_BINOM, a large enough value for
+ // practical values of epsilon >= 0.001 and delta >= 0.00001
+ //
+ final double logarithm = Math.log(2.0*quantiles/delta);
+ final double c = 2.0 * epsilon * N_double;
+ for (long b=2 ; b<maxBuffers ; b++)
+ for (long h=3 ; h<maxHeight ; h++) {
+ double binomial = Arithmetic.binomial(b+h-2, h-1);
+ long tmp = (long) Math.ceil(N_double / binomial);
+ if ((b * tmp < memory) &&
+ ((h-2) * binomial - Arithmetic.binomial(b+h-3, h-3) + Arithmetic.binomial(b+h-3, h-2)
+ <= c) ) {
+ ret_k = tmp ;
+ ret_b = b ;
+ memory = ret_k * b;
+ sampling_rate = 1.0 ;
+ }
+ if (delta > 0.0) {
+ double t = (h-2) * Arithmetic.binomial(b+h-2, h-1) - Arithmetic.binomial(b+h-3, h-3) + Arithmetic.binomial(b+h-3, h-2) ;
+ double u = logarithm / epsilon ;
+ double v = Arithmetic.binomial (b+h-2, h-1) ;
+ double w = logarithm / (2.0*epsilon*epsilon) ;
+
+ // From our SIGMOD 98 paper, we have two equantions to satisfy:
+ // t <= u * alpha/(1-alpha)^2
+ // kv >= w/(1-alpha)^2
+ //
+ // Denoting 1/(1-alpha) by x,
+ // we see that the first inequality is equivalent to
+ // t/u <= x^2 - x
+ // which is satisfied by x >= 0.5 + 0.5 * sqrt (1 + 4t/u)
+ // Plugging in this value into second equation yields
+ // k >= wx^2/v
+
+ double x = 0.5 + 0.5 * Math.sqrt(1.0 + 4.0*t/u) ;
+ long k = (long) Math.ceil(w*x*x/v) ;
+ if (b * k < memory) {
+ ret_k = k ;
+ ret_b = b ;
+ memory = b * k ;
+ sampling_rate = N_double*2.0*epsilon*epsilon / logarithm ;
+ }
+ }
+ }
+
+ long[] result = new long[2];
+ result[0]=ret_b;
+ result[1]=ret_k;
+ returnSamplingRate[0]=sampling_rate;
+ return result;
}
/**
* Returns a quantile finder that minimizes the amount of memory needed under the user provided constraints.
@@ -318,8 +316,8 @@
*
* @param known_N specifies whether the number of elements over which quantiles are to be computed is known or not.
* @param N if <tt>known_N==true</tt>, the number of elements over which quantiles are to be computed.
- * if <tt>known_N==false</tt>, the upper limit on the number of elements over which quantiles are to be computed.
- * If such an upper limit is a-priori unknown, then set <tt>N = Long.MAX_VALUE</tt>.
+ * if <tt>known_N==false</tt>, the upper limit on the number of elements over which quantiles are to be computed.
+ * If such an upper limit is a-priori unknown, then set <tt>N = Long.MAX_VALUE</tt>.
* @param epsilon the approximation error which is guaranteed not to be exceeded (e.g. <tt>0.001</tt>) (<tt>0 <= epsilon <= 1</tt>). To get exact result, set <tt>epsilon=0.0</tt>;
* @param delta the probability that the approximation error is more than than epsilon (e.g. 0.0001) (0 <= delta <= 1). To avoid probabilistic answers, set <tt>delta=0.0</tt>.
* @param quantiles the number of quantiles to be computed (e.g. <tt>100</tt>) (<tt>quantiles >= 1</tt>). If unknown in advance, set this number large, e.g. <tt>quantiles >= 10000</tt>.
@@ -327,62 +325,62 @@
* @return the quantile finder minimizing memory requirements under the given constraints.
*/
public static DoubleQuantileFinder newDoubleQuantileFinder(boolean known_N, long N, double epsilon, double delta, int quantiles, RandomEngine generator) {
- //boolean known_N = true;
- //if (N==Long.MAX_VALUE) known_N = false;
- // check parameters.
- // if they are illegal, keep quite and return an exact finder.
- if (epsilon <= 0.0 || N < 1000) return new ExactDoubleQuantileFinder();
- if (epsilon > 1) epsilon = 1;
- if (delta < 0) delta = 0;
- if (delta > 1) delta = 1;
- if (quantiles < 1) quantiles = 1;
- if (quantiles > N) N = quantiles;
-
- KnownDoubleQuantileEstimator finder;
- if (known_N) {
- double[] samplingRate = new double[1];
- long[] resultKnown = known_N_compute_B_and_K(N, epsilon, delta, quantiles, samplingRate);
- long b = resultKnown[0];
- long k = resultKnown[1];
- if (b == 1) return new ExactDoubleQuantileFinder();
- return new KnownDoubleQuantileEstimator((int)b,(int)k, N, samplingRate[0], generator);
- }
- else {
- long[] resultUnknown = unknown_N_compute_B_and_K(epsilon, delta, quantiles);
- long b1 = resultUnknown[0];
- long k1 = resultUnknown[1];
- long h1 = resultUnknown[2];
- double preComputeEpsilon = -1.0;
- if (resultUnknown[3] == 1) preComputeEpsilon = epsilon;
-
- //if (N==Long.MAX_VALUE) { // no maximum N provided by user.
-
- // if (true) fixes bug reported by LarryPeranich@fairisaac.com
- if (true) { // no maximum N provided by user.
- if (b1 == 1) return new ExactDoubleQuantileFinder();
- return new UnknownDoubleQuantileEstimator((int)b1,(int)k1, (int)h1, preComputeEpsilon, generator);
- }
-
- // determine whether UnknownFinder or KnownFinder with maximum N requires less memory.
- double[] samplingRate = new double[1];
-
- // IMPORTANT: for known finder, switch sampling off (delta == 0) !!!
- // with knownN-sampling we can only guarantee the errors if the input sequence has EXACTLY N elements.
- // with knownN-no sampling we can also guarantee the errors for sequences SMALLER than N elements.
- long[] resultKnown = known_N_compute_B_and_K(N, epsilon, 0, quantiles, samplingRate);
-
- long b2 = resultKnown[0];
- long k2 = resultKnown[1];
-
- if (b2 * k2 < b1 * k1) { // the KnownFinder is smaller
- if (b2 == 1) return new ExactDoubleQuantileFinder();
- return new KnownDoubleQuantileEstimator((int)b2,(int)k2, N, samplingRate[0], generator);
- }
-
- // the UnknownFinder is smaller
- if (b1 == 1) return new ExactDoubleQuantileFinder();
- return new UnknownDoubleQuantileEstimator((int)b1,(int)k1, (int)h1, preComputeEpsilon, generator);
- }
+ //boolean known_N = true;
+ //if (N==Long.MAX_VALUE) known_N = false;
+ // check parameters.
+ // if they are illegal, keep quite and return an exact finder.
+ if (epsilon <= 0.0 || N < 1000) return new ExactDoubleQuantileFinder();
+ if (epsilon > 1) epsilon = 1;
+ if (delta < 0) delta = 0;
+ if (delta > 1) delta = 1;
+ if (quantiles < 1) quantiles = 1;
+ if (quantiles > N) N = quantiles;
+
+ KnownDoubleQuantileEstimator finder;
+ if (known_N) {
+ double[] samplingRate = new double[1];
+ long[] resultKnown = known_N_compute_B_and_K(N, epsilon, delta, quantiles, samplingRate);
+ long b = resultKnown[0];
+ long k = resultKnown[1];
+ if (b == 1) return new ExactDoubleQuantileFinder();
+ return new KnownDoubleQuantileEstimator((int)b,(int)k, N, samplingRate[0], generator);
+ }
+ else {
+ long[] resultUnknown = unknown_N_compute_B_and_K(epsilon, delta, quantiles);
+ long b1 = resultUnknown[0];
+ long k1 = resultUnknown[1];
+ long h1 = resultUnknown[2];
+ double preComputeEpsilon = -1.0;
+ if (resultUnknown[3] == 1) preComputeEpsilon = epsilon;
+
+ //if (N==Long.MAX_VALUE) { // no maximum N provided by user.
+
+ // if (true) fixes bug reported by LarryPeranich@fairisaac.com
+ if (true) { // no maximum N provided by user.
+ if (b1 == 1) return new ExactDoubleQuantileFinder();
+ return new UnknownDoubleQuantileEstimator((int)b1,(int)k1, (int)h1, preComputeEpsilon, generator);
+ }
+
+ // determine whether UnknownFinder or KnownFinder with maximum N requires less memory.
+ double[] samplingRate = new double[1];
+
+ // IMPORTANT: for known finder, switch sampling off (delta == 0) !!!
+ // with knownN-sampling we can only guarantee the errors if the input sequence has EXACTLY N elements.
+ // with knownN-no sampling we can also guarantee the errors for sequences SMALLER than N elements.
+ long[] resultKnown = known_N_compute_B_and_K(N, epsilon, 0, quantiles, samplingRate);
+
+ long b2 = resultKnown[0];
+ long k2 = resultKnown[1];
+
+ if (b2 * k2 < b1 * k1) { // the KnownFinder is smaller
+ if (b2 == 1) return new ExactDoubleQuantileFinder();
+ return new KnownDoubleQuantileEstimator((int)b2,(int)k2, N, samplingRate[0], generator);
+ }
+
+ // the UnknownFinder is smaller
+ if (b1 == 1) return new ExactDoubleQuantileFinder();
+ return new UnknownDoubleQuantileEstimator((int)b1,(int)k1, (int)h1, preComputeEpsilon, generator);
+ }
}
/**
* Convenience method that computes phi's for equi-depth histograms.
@@ -390,9 +388,9 @@
* @return the equi-depth phi's
*/
public static org.apache.mahout.matrix.list.DoubleArrayList newEquiDepthPhis(int quantiles) {
- org.apache.mahout.matrix.list.DoubleArrayList phis = new org.apache.mahout.matrix.list.DoubleArrayList(quantiles-1);
- for (int i=1; i<=quantiles-1; i++) phis.add(i / (double)quantiles);
- return phis;
+ org.apache.mahout.matrix.list.DoubleArrayList phis = new org.apache.mahout.matrix.list.DoubleArrayList(quantiles-1);
+ for (int i=1; i<=quantiles-1; i++) phis.add(i / (double)quantiles);
+ return phis;
}
/**
* Computes the number of buffers and number of values per buffer such that
@@ -404,30 +402,30 @@
* @return <tt>long[4]</tt> - <tt>long[0]</tt>=the number of buffers, <tt>long[1]</tt>=the number of elements per buffer, <tt>long[2]</tt>=the tree height where sampling shall start, <tt>long[3]==1</tt> if precomputing is better, otherwise 0;
*/
public static long[] unknown_N_compute_B_and_K(double epsilon, double delta, int quantiles) {
- return unknown_N_compute_B_and_K_raw(epsilon,delta,quantiles);
- // move stuff from _raw(..) here and delete _raw(...)
-
- /*
- long[] result_1 = unknown_N_compute_B_and_K_raw(epsilon,delta,quantiles);
- long b1 = result_1[0];
- long k1 = result_1[1];
-
-
- int quantilesToPrecompute = (int) Doubles.ceiling(1.0 / epsilon);
-
- if (quantiles>quantilesToPrecompute) {
- // try if precomputing quantiles requires less memory.
- long[] result_2 = unknown_N_compute_B_and_K_raw(epsilon/2.0,delta,quantilesToPrecompute);
-
- long b2 = result_2[0];
- long k2 = result_2[1];
- if (b2*k2 < b1*k1) {
- result_2[3] = 1; //precomputation is better
- result_1 = result_2;
- }
- }
- return result_1;
- */
+ return unknown_N_compute_B_and_K_raw(epsilon,delta,quantiles);
+ // move stuff from _raw(..) here and delete _raw(...)
+
+ /*
+ long[] result_1 = unknown_N_compute_B_and_K_raw(epsilon,delta,quantiles);
+ long b1 = result_1[0];
+ long k1 = result_1[1];
+
+
+ int quantilesToPrecompute = (int) Doubles.ceiling(1.0 / epsilon);
+
+ if (quantiles>quantilesToPrecompute) {
+ // try if precomputing quantiles requires less memory.
+ long[] result_2 = unknown_N_compute_B_and_K_raw(epsilon/2.0,delta,quantilesToPrecompute);
+
+ long b2 = result_2[0];
+ long k2 = result_2[1];
+ if (b2*k2 < b1*k1) {
+ result_2[3] = 1; //precomputation is better
+ result_1 = result_2;
+ }
+ }
+ return result_1;
+ */
}
/**
* Computes the number of buffers and number of values per buffer such that
@@ -440,136 +438,136 @@
* @return <tt>long[4]</tt> - <tt>long[0]</tt>=the number of buffers, <tt>long[1]</tt>=the number of elements per buffer, <tt>long[2]</tt>=the tree height where sampling shall start, <tt>long[3]==1</tt> if precomputing is better, otherwise 0;
*/
protected static long[] unknown_N_compute_B_and_K_raw(double epsilon, double delta, int quantiles) {
- // delta can be set to zero, i.e., all quantiles should be approximate with probability 1
- if (epsilon<=0.0) {
- long[] result = new long[4];
- result[0]=1;
- result[1]=Long.MAX_VALUE;
- result[2]=Long.MAX_VALUE;
- result[3]=0;
- return result;
- }
- if (epsilon>=1.0 || delta>=1.0) {
- // can make any error we wish
- long[] result = new long[4];
- result[0]=2;
- result[1]=1;
- result[2]=3;
- result[3]=0;
- return result;
- }
- if (delta<=0.0) {
- // no way around exact quantile search
- long[] result = new long[4];
- result[0]=1;
- result[1]=Long.MAX_VALUE;
- result[2]=Long.MAX_VALUE;
- result[3]=0;
- return result;
- }
-
- int max_b = 50;
- int max_h = 50;
- int max_H = 50;
- int max_Iterations = 2;
-
- long best_b = Long.MAX_VALUE;
- long best_k = Long.MAX_VALUE;
- long best_h = Long.MAX_VALUE;
- long best_memory = Long.MAX_VALUE;
-
- double pow = Math.pow(2.0, max_H);
- double logDelta = Math.log(2.0/(delta/quantiles)) / (2.0*epsilon*epsilon);
- //double logDelta = Math.log(2.0/(quantiles*delta)) / (2.0*epsilon*epsilon);
-
- while (best_b == Long.MAX_VALUE && max_Iterations-- > 0) { //until we find a solution
- // identify that combination of b and h that minimizes b*k.
- // exhaustive search.
- for (int b=2; b<=max_b; b++) {
- for (int h=2; h<=max_h; h++) {
- double Ld = Arithmetic.binomial(b+h-2, h-1);
- double Ls = Arithmetic.binomial(b+h-3,h-1);
-
- // now we have k>=c*(1-alpha)^-2.
- // let's compute c.
- //double c = Math.log(2.0/(delta/quantiles)) / (2.0*epsilon*epsilon*Math.min(Ld, 8.0*Ls/3.0));
- double c = logDelta / Math.min(Ld, 8.0*Ls/3.0);
-
- // now we have k>=d/alpha.
- // let's compute d.
- double beta = Ld/Ls;
- double cc = (beta-2.0)*(max_H-2.0) / (beta + pow - 2.0);
- double d = (h+3+cc) / (2.0*epsilon);
-
- /*
- double d = (Ld*(h+max_H-1.0) + Ls*((h+1)*pow - 2.0*(h+max_H))) / (Ld + Ls*(pow-2.0));
- d = (d + 2.0) / (2.0*epsilon);
- */
-
- // now we have c*(1-alpha)^-2 == d/alpha.
- // we solve this equation for alpha yielding two solutions
- // alpha_1,2 = (c + 2*d +- Sqrt(c*c + 4*c*d))/(2*d)
- double f = c*c + 4.0*c*d;
- if (f<0.0) continue; // non real solution to equation
- double root = Math.sqrt(f);
- double alpha_one = (c + 2.0*d + root)/(2.0*d);
- double alpha_two = (c + 2.0*d - root)/(2.0*d);
-
- // any alpha must satisfy 0<alpha<1 to yield valid solutions
- boolean alpha_one_OK = false;
- boolean alpha_two_OK = false;
- if (0.0 < alpha_one && alpha_one < 1.0) alpha_one_OK = true;
- if (0.0 < alpha_two && alpha_two < 1.0) alpha_two_OK = true;
- if (alpha_one_OK || alpha_two_OK) {
- double alpha = alpha_one;
- if (alpha_one_OK && alpha_two_OK) {
- // take the alpha that minimizes d/alpha
- alpha = Math.max(alpha_one, alpha_two);
- }
- else if (alpha_two_OK) {
- alpha = alpha_two;
- }
-
- // now we have k=Ceiling(Max(d/alpha, (h+1)/(2*epsilon)))
- long k = (long) Math.ceil(Math.max(d/alpha, (h+1)/(2.0*epsilon)));
- if (k>0) { // valid solution?
- long memory = b*k;
- if (memory < best_memory) {
- // found a solution requiring less memory
- best_k = k;
- best_b = b;
- best_h = h;
- best_memory = memory;
- }
- }
- }
- } //end for h
- } //end for b
-
- if (best_b == Long.MAX_VALUE) {
- System.out.println("Warning: Computing b and k looks like a lot of work!");
- // no solution found so far. very unlikely. Anyway, try again.
- max_b *= 2;
- max_h *= 2;
- max_H *= 2;
- }
- } //end while
-
- long[] result = new long[4];
- result[3]=0;
- if (best_b == Long.MAX_VALUE) {
- // no solution found.
- // no way around exact quantile search.
- result[0]=1;
- result[1]=Long.MAX_VALUE;
- result[2]=Long.MAX_VALUE;
- }
- else {
- result[0]=best_b;
- result[1]=best_k;
- result[2]=best_h;
- }
+ // delta can be set to zero, i.e., all quantiles should be approximate with probability 1
+ if (epsilon<=0.0) {
+ long[] result = new long[4];
+ result[0]=1;
+ result[1]=Long.MAX_VALUE;
+ result[2]=Long.MAX_VALUE;
+ result[3]=0;
+ return result;
+ }
+ if (epsilon>=1.0 || delta>=1.0) {
+ // can make any error we wish
+ long[] result = new long[4];
+ result[0]=2;
+ result[1]=1;
+ result[2]=3;
+ result[3]=0;
+ return result;
+ }
+ if (delta<=0.0) {
+ // no way around exact quantile search
+ long[] result = new long[4];
+ result[0]=1;
+ result[1]=Long.MAX_VALUE;
+ result[2]=Long.MAX_VALUE;
+ result[3]=0;
+ return result;
+ }
+
+ int max_b = 50;
+ int max_h = 50;
+ int max_H = 50;
+ int max_Iterations = 2;
+
+ long best_b = Long.MAX_VALUE;
+ long best_k = Long.MAX_VALUE;
+ long best_h = Long.MAX_VALUE;
+ long best_memory = Long.MAX_VALUE;
+
+ double pow = Math.pow(2.0, max_H);
+ double logDelta = Math.log(2.0/(delta/quantiles)) / (2.0*epsilon*epsilon);
+ //double logDelta = Math.log(2.0/(quantiles*delta)) / (2.0*epsilon*epsilon);
+
+ while (best_b == Long.MAX_VALUE && max_Iterations-- > 0) { //until we find a solution
+ // identify that combination of b and h that minimizes b*k.
+ // exhaustive search.
+ for (int b=2; b<=max_b; b++) {
+ for (int h=2; h<=max_h; h++) {
+ double Ld = Arithmetic.binomial(b+h-2, h-1);
+ double Ls = Arithmetic.binomial(b+h-3,h-1);
+
+ // now we have k>=c*(1-alpha)^-2.
+ // let's compute c.
+ //double c = Math.log(2.0/(delta/quantiles)) / (2.0*epsilon*epsilon*Math.min(Ld, 8.0*Ls/3.0));
+ double c = logDelta / Math.min(Ld, 8.0*Ls/3.0);
+
+ // now we have k>=d/alpha.
+ // let's compute d.
+ double beta = Ld/Ls;
+ double cc = (beta-2.0)*(max_H-2.0) / (beta + pow - 2.0);
+ double d = (h+3+cc) / (2.0*epsilon);
+
+ /*
+ double d = (Ld*(h+max_H-1.0) + Ls*((h+1)*pow - 2.0*(h+max_H))) / (Ld + Ls*(pow-2.0));
+ d = (d + 2.0) / (2.0*epsilon);
+ */
+
+ // now we have c*(1-alpha)^-2 == d/alpha.
+ // we solve this equation for alpha yielding two solutions
+ // alpha_1,2 = (c + 2*d +- Sqrt(c*c + 4*c*d))/(2*d)
+ double f = c*c + 4.0*c*d;
+ if (f<0.0) continue; // non real solution to equation
+ double root = Math.sqrt(f);
+ double alpha_one = (c + 2.0*d + root)/(2.0*d);
+ double alpha_two = (c + 2.0*d - root)/(2.0*d);
+
+ // any alpha must satisfy 0<alpha<1 to yield valid solutions
+ boolean alpha_one_OK = false;
+ boolean alpha_two_OK = false;
+ if (0.0 < alpha_one && alpha_one < 1.0) alpha_one_OK = true;
+ if (0.0 < alpha_two && alpha_two < 1.0) alpha_two_OK = true;
+ if (alpha_one_OK || alpha_two_OK) {
+ double alpha = alpha_one;
+ if (alpha_one_OK && alpha_two_OK) {
+ // take the alpha that minimizes d/alpha
+ alpha = Math.max(alpha_one, alpha_two);
+ }
+ else if (alpha_two_OK) {
+ alpha = alpha_two;
+ }
+
+ // now we have k=Ceiling(Max(d/alpha, (h+1)/(2*epsilon)))
+ long k = (long) Math.ceil(Math.max(d/alpha, (h+1)/(2.0*epsilon)));
+ if (k>0) { // valid solution?
+ long memory = b*k;
+ if (memory < best_memory) {
+ // found a solution requiring less memory
+ best_k = k;
+ best_b = b;
+ best_h = h;
+ best_memory = memory;
+ }
+ }
+ }
+ } //end for h
+ } //end for b
+
+ if (best_b == Long.MAX_VALUE) {
+ System.out.println("Warning: Computing b and k looks like a lot of work!");
+ // no solution found so far. very unlikely. Anyway, try again.
+ max_b *= 2;
+ max_h *= 2;
+ max_H *= 2;
+ }
+ } //end while
+
+ long[] result = new long[4];
+ result[3]=0;
+ if (best_b == Long.MAX_VALUE) {
+ // no solution found.
+ // no way around exact quantile search.
+ result[0]=1;
+ result[1]=Long.MAX_VALUE;
+ result[2]=Long.MAX_VALUE;
+ }
+ else {
+ result[0]=best_b;
+ result[1]=best_k;
+ result[2]=best_h;
+ }
- return result;
+ return result;
}
}