You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by gs...@apache.org on 2009/11/23 16:14:38 UTC

svn commit: r883365 [8/47] - in /lucene/mahout/trunk: ./ examples/ matrix/ matrix/src/ matrix/src/main/ matrix/src/main/java/ matrix/src/main/java/org/ matrix/src/main/java/org/apache/ matrix/src/main/java/org/apache/mahout/ matrix/src/main/java/org/ap...

Added: lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/Buffer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/Buffer.java?rev=883365&view=auto
==============================================================================
--- lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/Buffer.java (added)
+++ lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/Buffer.java Mon Nov 23 15:14:26 2009
@@ -0,0 +1,85 @@
+/*
+Copyright 1999 CERN - European Organization for Nuclear Research.
+Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose 
+is hereby granted without fee, provided that the above copyright notice appear in all copies and 
+that both that copyright notice and this permission notice appear in supporting documentation. 
+CERN makes no representations about the suitability of this software for any purpose. 
+It is provided "as is" without expressed or implied warranty.
+*/
+package org.apache.mahout.jet.stat.quantile;
+
+/**
+ * A buffer holding elements; internally used for computing approximate quantiles.
+ */
+abstract class Buffer extends org.apache.mahout.colt.PersistentObject {
+	protected int weight;
+	protected int level;
+	protected int k;
+	protected boolean isAllocated;
+/**
+ * This method was created in VisualAge.
+ * @param k int
+ */
+public Buffer(int k) {
+	this.k=k;
+	this.weight=1;
+	this.level=0;
+	this.isAllocated=false;
+}
+/**
+ * Clears the receiver.
+ */
+public abstract void clear();
+/**
+ * Returns whether the receiver is already allocated.
+ */
+public boolean isAllocated() {
+	return isAllocated;
+}
+/**
+ * Returns whether the receiver is empty.
+ */
+public abstract boolean isEmpty();
+/**
+ * Returns whether the receiver is empty.
+ */
+public abstract boolean isFull();
+/**
+ * Returns whether the receiver is partial.
+ */
+public boolean isPartial() {
+	return ! (isEmpty() || isFull());
+}
+/**
+ * Returns whether the receiver's level.
+ */
+public int level() {
+	return level;
+}
+/**
+ * Sets the receiver's level.
+ */
+public void level(int level) {
+	this.level = level;
+}
+/**
+ * Returns the number of elements contained in the receiver.
+ */
+public abstract int size();
+/**
+ * Sorts the receiver.
+ */
+public abstract void sort();
+/**
+ * Returns whether the receiver's weight.
+ */
+public int weight() {
+	return weight;
+}
+/**
+ * Sets the receiver's weight.
+ */
+public void weight(int weight) {
+	this.weight = weight;
+}
+}

Propchange: lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/Buffer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/BufferSet.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/BufferSet.java?rev=883365&view=auto
==============================================================================
--- lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/BufferSet.java (added)
+++ lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/BufferSet.java Mon Nov 23 15:14:26 2009
@@ -0,0 +1,15 @@
+/*
+Copyright 1999 CERN - European Organization for Nuclear Research.
+Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose 
+is hereby granted without fee, provided that the above copyright notice appear in all copies and 
+that both that copyright notice and this permission notice appear in supporting documentation. 
+CERN makes no representations about the suitability of this software for any purpose. 
+It is provided "as is" without expressed or implied warranty.
+*/
+package org.apache.mahout.jet.stat.quantile;
+
+/**
+ * An abstract set of buffers; internally used for computing approximate quantiles.
+ */
+abstract class BufferSet extends org.apache.mahout.colt.PersistentObject {
+}

Propchange: lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/BufferSet.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/DoubleBuffer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/DoubleBuffer.java?rev=883365&view=auto
==============================================================================
--- lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/DoubleBuffer.java (added)
+++ lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/DoubleBuffer.java Mon Nov 23 15:14:26 2009
@@ -0,0 +1,134 @@
+/*
+Copyright � 1999 CERN - European Organization for Nuclear Research.
+Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose 
+is hereby granted without fee, provided that the above copyright notice appear in all copies and 
+that both that copyright notice and this permission notice appear in supporting documentation. 
+CERN makes no representations about the suitability of this software for any purpose. 
+It is provided "as is" without expressed or implied warranty.
+*/
+package org.apache.mahout.jet.stat.quantile;
+
+import org.apache.mahout.colt.list.DoubleArrayList;
+/**
+ * A buffer holding <tt>double</tt> elements; internally used for computing approximate quantiles.
+ */
+class DoubleBuffer extends Buffer {
+	protected DoubleArrayList values;
+	protected boolean isSorted;
+/**
+ * This method was created in VisualAge.
+ * @param k int
+ */
+public DoubleBuffer(int k) {
+	super(k);
+	this.values = new DoubleArrayList(0);
+	this.isSorted = false;
+}
+/**
+ * Adds a value to the receiver.
+ */
+public void add(double value) {
+	if (! isAllocated) allocate(); // lazy buffer allocation can safe memory.
+	values.add(value);
+	this.isSorted = false;
+}
+/**
+ * Adds a value to the receiver.
+ */
+public void addAllOfFromTo(DoubleArrayList elements,int from, int to) {
+	if (! isAllocated) allocate(); // lazy buffer allocation can safe memory.
+	values.addAllOfFromTo(elements,from,to);
+	this.isSorted = false;
+}
+/**
+ * Allocates the receiver.
+ */
+protected void allocate() {
+	isAllocated = true;
+	values.ensureCapacity(k);
+}
+/**
+ * Clears the receiver.
+ */
+public void clear() {
+	values.clear();
+}
+/**
+ * Returns a deep copy of the receiver.
+ *
+ * @return a deep copy of the receiver.
+ */
+public Object clone() {
+	DoubleBuffer copy = (DoubleBuffer) super.clone();
+	if (this.values != null) copy.values = copy.values.copy();
+	return copy;
+}
+/**
+ * Returns whether the specified element is contained in the receiver.
+ */
+public boolean contains(double element) {
+	this.sort();
+	return values.contains(element);
+}
+/**
+ * Returns whether the receiver is empty.
+ */
+public boolean isEmpty() {
+	return values.size()==0;
+}
+/**
+ * Returns whether the receiver is empty.
+ */
+public boolean isFull() {
+	return values.size()==k;
+}
+/**
+ * Returns the number of elements currently needed to store all contained elements.
+ * This number usually differs from the results of method <tt>size()</tt>, according to the underlying algorithm.
+ */
+public int memory() {
+	return values.elements().length;
+}
+/**
+ * Returns the rank of a given element within the sorted sequence of the receiver.
+ * A rank is the number of elements <= element.
+ * Ranks are of the form {1,2,...size()}.
+ * If no element is <= element, then the rank is zero.
+ * If the element lies in between two contained elements, then uses linear interpolation.
+ * @return the rank of the element.
+ * @param list org.apache.mahout.colt.list.DoubleArrayList
+ * @param element the element to search for
+ */
+public double rank(double element) {
+	this.sort();
+	return org.apache.mahout.jet.stat.Descriptive.rankInterpolated(this.values, element);
+}
+/**
+ * Returns the number of elements contained in the receiver.
+ */
+public int size() {
+	return values.size();
+}
+/**
+ * Sorts the receiver.
+ */
+public void sort() {
+	if (! this.isSorted) {
+		// IMPORTANT: TO DO : replace mergeSort with quickSort!
+		// currently it is mergeSort only for debugging purposes (JDK 1.2 can't be imported into VisualAge).
+		values.sort();
+		//values.mergeSort();
+		this.isSorted = true;
+	}
+}
+/**
+ * Returns a String representation of the receiver.
+ */
+public String toString() {
+	return	"k="+this.k +
+			", w="+Long.toString(weight()) +
+			", l="+Integer.toString(level()) + 
+			", size=" + values.size();
+			//", v=" + values.toString();
+}
+}

Propchange: lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/DoubleBuffer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/DoubleBufferSet.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/DoubleBufferSet.java?rev=883365&view=auto
==============================================================================
--- lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/DoubleBufferSet.java (added)
+++ lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/DoubleBufferSet.java Mon Nov 23 15:14:26 2009
@@ -0,0 +1,392 @@
+/*
+Copyright 1999 CERN - European Organization for Nuclear Research.
+Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose 
+is hereby granted without fee, provided that the above copyright notice appear in all copies and 
+that both that copyright notice and this permission notice appear in supporting documentation. 
+CERN makes no representations about the suitability of this software for any purpose. 
+It is provided "as is" without expressed or implied warranty.
+*/
+package org.apache.mahout.jet.stat.quantile;
+
+/**
+ * A set of buffers holding <tt>double</tt> elements; internally used for computing approximate quantiles.
+ */
+class DoubleBufferSet extends BufferSet {
+	protected DoubleBuffer[] buffers;
+	private boolean nextTriggerCalculationState; //tmp var only
+/**
+ * Constructs a buffer set with b buffers, each having k elements
+ * @param b the number of buffers
+ * @param k the number of elements per buffer
+ */
+public DoubleBufferSet(int b, int k) {
+	this.buffers = new DoubleBuffer[b];
+	this.clear(k);
+}
+/**
+ * Returns an empty buffer if at least one exists.
+ * Preferably returns a buffer which has already been used,
+ * i.e. a buffer which has already been allocated.
+ */
+public DoubleBuffer _getFirstEmptyBuffer() {
+	DoubleBuffer emptyBuffer = null;
+	for (int i=buffers.length; --i >= 0;) {
+		if (buffers[i].isEmpty()) {
+			if (buffers[i].isAllocated()) return buffers[i];
+			emptyBuffer = buffers[i];
+		}
+	}
+		
+	return emptyBuffer;
+}
+/**
+ * Returns all full or partial buffers.
+ */
+public DoubleBuffer[] _getFullOrPartialBuffers() {
+	//count buffers
+	int count = 0;
+	for (int i=buffers.length; --i >= 0;) {
+		if (! buffers[i].isEmpty()) count++;
+	}
+
+	//collect buffers
+	DoubleBuffer[] collectedBuffers = new DoubleBuffer[count];
+	int j=0;
+	for (int i=buffers.length; --i >= 0;) {
+		if (! buffers[i].isEmpty()) {
+			collectedBuffers[j++]=buffers[i];
+		}
+	}
+
+	return collectedBuffers;
+}
+/**
+ * Determines all full buffers having the specified level.
+ * @return all full buffers having the specified level
+ */
+public DoubleBuffer[] _getFullOrPartialBuffersWithLevel(int level) {
+	//count buffers
+	int count = 0;
+	for (int i=buffers.length; --i >= 0;) {
+		if ((! buffers[i].isEmpty()) && buffers[i].level()==level) count++;
+	}
+
+	//collect buffers
+	DoubleBuffer[] collectedBuffers = new DoubleBuffer[count];
+	int j=0;
+	for (int i=buffers.length; --i >= 0;) {
+		if ((! buffers[i].isEmpty()) && buffers[i].level()==level) {
+			collectedBuffers[j++]=buffers[i];
+		}
+	}
+
+	return collectedBuffers;
+}
+/**
+ * @return The minimum level of all buffers which are full.
+ */
+public int _getMinLevelOfFullOrPartialBuffers() {
+	int b=this.b();
+	int minLevel=Integer.MAX_VALUE;
+	DoubleBuffer buffer;
+	
+	for (int i=0; i<b; i++) {
+		buffer = this.buffers[i];
+		if ((! buffer.isEmpty()) && (buffer.level() < minLevel)) {
+			minLevel = buffer.level();
+		}
+	}
+	return minLevel;
+}
+/**
+ * Returns the number of empty buffers.
+ */
+public int _getNumberOfEmptyBuffers() {
+	int count = 0;
+	for (int i=buffers.length; --i >= 0;) {
+		if (buffers[i].isEmpty()) count++;
+	}
+
+	return count;
+}
+/**
+ * Returns all empty buffers.
+ */
+public DoubleBuffer _getPartialBuffer() {
+	for (int i=buffers.length; --i >= 0;) {
+		if (buffers[i].isPartial()) return buffers[i];
+	}
+	return null;
+}
+/**
+ * @return the number of buffers
+ */
+public int b() {
+	return buffers.length;
+}
+/**
+ * Removes all elements from the receiver.  The receiver will
+ * be empty after this call returns, and its memory requirements will be close to zero.
+ */
+public void clear() {
+	clear(k());
+}
+/**
+ * Removes all elements from the receiver.  The receiver will
+ * be empty after this call returns, and its memory requirements will be close to zero.
+ */
+protected void clear(int k) {
+	for (int i=b(); --i >=0; ) this.buffers[i]=new DoubleBuffer(k);
+	this.nextTriggerCalculationState = true;
+}
+/**
+ * Returns a deep copy of the receiver.
+ *
+ * @return a deep copy of the receiver.
+ */
+public Object clone() {
+	DoubleBufferSet copy = (DoubleBufferSet) super.clone();
+
+	copy.buffers = (DoubleBuffer[]) copy.buffers.clone();
+	for (int i=buffers.length; --i >= 0; ) {
+		copy.buffers[i] = (DoubleBuffer) copy.buffers[i].clone();
+	}
+	return copy;
+}
+/**
+ * Collapses the specified full buffers (must not include partial buffer).
+ * @return a full buffer containing the collapsed values. The buffer has accumulated weight.
+ * @param buffers the buffers to be collapsed (all of them must be full or partially full)
+ */
+public DoubleBuffer collapse(DoubleBuffer[] buffers) {
+	//determine W
+	int W=0;								//sum of all weights
+	for (int i=0; i<buffers.length; i++) { W += buffers[i].weight(); }
+
+	//determine outputTriggerPositions
+	int k=this.k();
+	long[] triggerPositions = new long[k]; 
+	for (int j=0; j<k; j++) { triggerPositions[j]=this.nextTriggerPosition(j,W); }
+
+	//do the main work: determine values at given positions in sorted sequence
+	double[] outputValues = this.getValuesAtPositions(buffers, triggerPositions);
+
+	//mark all full buffers as empty, except the first, which will contain the output
+	for (int b=1; b<buffers.length; b++) buffers[b].clear(); 
+
+	DoubleBuffer outputBuffer = buffers[0];
+	outputBuffer.values.elements(outputValues);
+	outputBuffer.weight(W);
+	
+	return outputBuffer;
+}
+/**
+ * Returns whether the specified element is contained in the receiver.
+ */
+public boolean contains(double element) {
+	for (int i=buffers.length; --i >= 0;) {
+		if ((! buffers[i].isEmpty()) && buffers[i].contains(element)) {
+			return true;
+		}
+	}
+
+	return false;
+}
+/**
+ * Applies a procedure to each element of the receiver, if any.
+ * Iterates over the receiver in no particular order.
+ * @param procedure    the procedure to be applied. Stops iteration if the procedure returns <tt>false</tt>, otherwise continues. 
+ */
+public boolean forEach(org.apache.mahout.colt.function.DoubleProcedure procedure) {
+	for (int i=buffers.length; --i >=0; ) {
+		for (int w=buffers[i].weight(); --w >=0; ) {
+			if (! (buffers[i].values.forEach(procedure))) return false;
+		}
+	}
+	return true;
+}
+/**
+ * Determines all values of the specified buffers positioned at the specified triggerPositions within the sorted sequence and fills them into outputValues.
+ * @param buffers the buffers to be searched (all must be full or partial) 
+ * @param triggerPositions the positions of elements within the sorted sequence to be retrieved
+ * @return outputValues a list filled with the values at triggerPositions
+ */
+protected double[] getValuesAtPositions(DoubleBuffer[] buffers, long[] triggerPositions) {
+	//if (buffers.length==0) 
+	//{
+	//	throw new IllegalArgumentException("Oops! buffer.length==0.");
+	//}
+
+	//System.out.println("triggers="+cern.it.util.Arrays.toString(positions));
+
+	//new DoubleArrayList(outputValues).fillFromToWith(0, outputValues.length-1, 0.0f);
+	//delte the above line, it is only for testing
+	
+	//cern.it.util.Log.println("\nEntering getValuesAtPositions...");
+	//cern.it.util.Log.println("hitPositions="+cern.it.util.Arrays.toString(positions));
+
+	// sort buffers.
+	for (int i=buffers.length; --i >= 0; ) {
+		buffers[i].sort();
+	}
+
+	// collect some infos into fast cache; for tuning purposes only.
+	int[] bufferSizes = new int[buffers.length];		
+	double[][] bufferValues = new double[buffers.length][];
+	int totalBuffersSize = 0;
+	for (int i=buffers.length; --i >= 0; ) {
+		bufferSizes[i] = buffers[i].size();
+		bufferValues[i] = buffers[i].values.elements();
+		totalBuffersSize += bufferSizes[i];
+		//cern.it.util.Log.println("buffer["+i+"]="+buffers[i].values);
+	}
+	
+	// prepare merge of equi-distant elements within buffers into output values
+	
+	// first collect some infos into fast cache; for tuning purposes only.
+	final int buffersSize = buffers.length;		
+	final int triggerPositionsLength = triggerPositions.length; 
+
+	// now prepare the important things.
+	int j=0; 								//current position in collapsed values
+	int[] cursors = new int[buffers.length];	//current position in each buffer; init with zeroes
+	long counter = 0; 						//current position in sorted sequence
+	long nextHit = triggerPositions[j];		//next position in sorted sequence to trigger output population
+	double[] outputValues = new double[triggerPositionsLength];
+
+	if (totalBuffersSize==0) {
+		// nothing to output, because no elements have been filled (we are empty).
+		// return meaningless values
+		for (int i=0; i<triggerPositions.length; i++) {
+			outputValues[i] = Double.NaN;
+		}
+		return outputValues;
+	}
+	
+	// fill all output values with equi-distant elements.
+	while (j<triggerPositionsLength) { 
+		//System.out.println("\nj="+j);
+		//System.out.println("counter="+counter);
+		//System.out.println("nextHit="+nextHit);
+
+		// determine buffer with smallest value at cursor position.
+		double minValue = Double.POSITIVE_INFINITY;
+		int minBufferIndex = -1;
+
+		for (int b=buffersSize; --b >= 0; ) {
+			//DoubleBuffer buffer = buffers[b];
+			//if (cursors[b] < buffer.length) { 
+			if (cursors[b] < bufferSizes[b]) { 
+				///double value = buffer.values[cursors[b]];
+				double value = bufferValues[b][cursors[b]];
+				if (value <= minValue) {
+					minValue = value;
+					minBufferIndex = b;
+				}
+			}
+		}
+
+		DoubleBuffer minBuffer=buffers[minBufferIndex];
+
+		// trigger copies into output sequence, if necessary.
+		counter += minBuffer.weight();
+		while (counter>nextHit && j<triggerPositionsLength) { 
+			outputValues[j++]=minValue;
+			//System.out.println("adding to output="+minValue);
+			if (j<triggerPositionsLength) nextHit=triggerPositions[j];
+		}
+
+
+		// that element has now been treated, move further.
+		cursors[minBufferIndex]++;
+		//System.out.println("cursors="+cern.it.util.Arrays.toString(cursors));
+	
+	} //end while (j<k)
+	
+	//cern.it.util.Log.println("returning output="+cern.it.util.Arrays.toString(outputValues));
+	return outputValues;
+}
+/**
+ * @return the number of elements within a buffer.
+ */
+public int k() {
+	return buffers[0].k;
+}
+/**
+ * Returns the number of elements currently needed to store all contained elements.
+ */
+public long memory() {
+	long memory=0;
+	for (int i=buffers.length; --i >=0;) {
+		memory = memory + buffers[i].memory();
+	}
+	return memory;
+}
+/**
+ * Computes the next triggerPosition for collapse
+ * @return the next triggerPosition for collapse
+ * @param j specifies that the j-th trigger position is to be computed
+ * @param W the accumulated weights
+ */
+protected long nextTriggerPosition(int j, long W) {
+	long nextTriggerPosition;
+	
+	if (W%2L != 0) { //is W odd?
+		nextTriggerPosition=j*W + (W+1)/2;
+	}
+	
+	else { //W is even
+		//alternate between both possible next hit positions upon successive invocations
+		if (nextTriggerCalculationState) {
+			nextTriggerPosition=j*W + W/2;
+		}
+		else {
+			nextTriggerPosition=j*W + (W+2)/2;
+		}
+	}
+			
+	return nextTriggerPosition;
+}
+/**
+ * Returns how many percent of the elements contained in the receiver are <tt>&lt;= element</tt>.
+ * Does linear interpolation if the element is not contained but lies in between two contained elements.
+ *
+ * @param the element to search for.
+ * @return the percentage <tt>p</tt> of elements <tt>&lt;= element</tt> (<tt>0.0 &lt;= p &lt;=1.0)</tt>.
+ */
+public double phi(double element) {
+	double elementsLessThanOrEqualToElement=0.0;							
+	for (int i=buffers.length; --i >= 0; ) {
+		if (! buffers[i].isEmpty()) {
+			elementsLessThanOrEqualToElement += buffers[i].weight * buffers[i].rank(element);
+		}
+	}
+
+	return elementsLessThanOrEqualToElement / totalSize();
+}
+/**
+ * @return a String representation of the receiver
+ */
+public String toString() {
+	StringBuffer buf = new StringBuffer();
+	for (int b=0; b<this.b(); b++) {
+		if (! buffers[b].isEmpty()) {
+			buf.append("buffer#"+b+" = ");
+			buf.append(buffers[b].toString()+"\n");
+		}
+	}
+	return buf.toString();
+}
+/**
+ * Returns the number of elements in all buffers.
+ */
+public long totalSize() {
+	DoubleBuffer[] fullBuffers = _getFullOrPartialBuffers();
+	long totalSize=0;							
+	for (int i=fullBuffers.length; --i >= 0; ) {
+		totalSize += fullBuffers[i].size() * fullBuffers[i].weight();
+	}
+
+	return totalSize;
+}
+}

Propchange: lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/DoubleBufferSet.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/DoubleQuantileEstimator.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/DoubleQuantileEstimator.java?rev=883365&view=auto
==============================================================================
--- lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/DoubleQuantileEstimator.java (added)
+++ lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/DoubleQuantileEstimator.java Mon Nov 23 15:14:26 2009
@@ -0,0 +1,260 @@
+/*
+Copyright � 1999 CERN - European Organization for Nuclear Research.
+Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose 
+is hereby granted without fee, provided that the above copyright notice appear in all copies and 
+that both that copyright notice and this permission notice appear in supporting documentation. 
+CERN makes no representations about the suitability of this software for any purpose. 
+It is provided "as is" without expressed or implied warranty.
+*/
+package org.apache.mahout.jet.stat.quantile;
+
+import org.apache.mahout.colt.list.DoubleArrayList;
+import org.apache.mahout.colt.list.ObjectArrayList;
+/**
+  * The abstract base class for approximate quantile finders computing quantiles over a sequence of <tt>double</tt> elements.
+  */
+//abstract class ApproximateDoubleQuantileFinder extends Object implements DoubleQuantileFinder {
+abstract class DoubleQuantileEstimator extends org.apache.mahout.colt.PersistentObject implements DoubleQuantileFinder {
+	protected DoubleBufferSet bufferSet;
+	protected DoubleBuffer currentBufferToFill;
+	protected int totalElementsFilled;
+/**
+ * Makes this class non instantiable, but still let's others inherit from it.
+ */
+protected DoubleQuantileEstimator() {}
+/**
+ * Adds a value to the receiver.
+ * @param value the value to add.
+ */
+public void add(double value) {
+	totalElementsFilled++;
+	if (! sampleNextElement()) return;
+
+	//System.out.println("adding "+value);
+
+	if (currentBufferToFill==null) { 
+		if (bufferSet._getFirstEmptyBuffer()==null) collapse();
+		newBuffer();
+	}
+
+	currentBufferToFill.add(value);
+	if (currentBufferToFill.isFull()) currentBufferToFill = null;
+}
+/**
+ * Adds all values of the specified list to the receiver.
+ * @param values the list of which all values shall be added.
+ */
+public void addAllOf(DoubleArrayList values) {
+	addAllOfFromTo(values,0,values.size()-1);
+}
+/**
+ * Adds the part of the specified list between indexes <tt>from</tt> (inclusive) and <tt>to</tt> (inclusive) to the receiver.
+ *
+ * @param values the list of which elements shall be added.
+ * @param from the index of the first element to be added (inclusive).
+ * @param to the index of the last element to be added (inclusive).
+ */
+public void addAllOfFromTo(DoubleArrayList values, int from, int to) {
+	/*
+	// the obvious version, but we can do quicker...
+	double[] theValues = values.elements();
+	int theSize=values.size();
+	for (int i=0; i<theSize; ) add(theValues[i++]);
+	*/
+	
+	double[] valuesToAdd = values.elements();
+	int k = this.bufferSet.k();
+	int bufferSize = k;
+	double[] bufferValues = null;
+	if (currentBufferToFill != null) {
+		bufferValues = currentBufferToFill.values.elements();
+		bufferSize = currentBufferToFill.size();
+	}
+
+	for (int i=from-1; ++i <= to; ) {
+		if (sampleNextElement()) {
+			if (bufferSize == k) { // full
+				if (bufferSet._getFirstEmptyBuffer()==null) collapse();
+				newBuffer();
+				if (!currentBufferToFill.isAllocated) currentBufferToFill.allocate();
+				currentBufferToFill.isSorted = false;
+				bufferValues = currentBufferToFill.values.elements();
+				bufferSize = 0;
+			}
+
+			bufferValues[bufferSize++] = valuesToAdd[i];
+			if (bufferSize == k) { // full
+				currentBufferToFill.values.setSize(bufferSize);
+				currentBufferToFill = null;
+			}
+		}
+	}
+	if (this.currentBufferToFill != null) {
+		this.currentBufferToFill.values.setSize(bufferSize);
+	}
+	
+	this.totalElementsFilled += to-from+1;
+}
+/**
+ * Not yet commented.
+ */
+protected DoubleBuffer[] buffersToCollapse() {
+	int minLevel = bufferSet._getMinLevelOfFullOrPartialBuffers();
+	return bufferSet._getFullOrPartialBuffersWithLevel(minLevel);
+}
+/**
+ * Removes all elements from the receiver.  The receiver will
+ * be empty after this call returns, and its memory requirements will be close to zero.
+ */
+public void clear() {
+	this.totalElementsFilled = 0;
+	this.currentBufferToFill = null;
+	this.bufferSet.clear();
+}
+/**
+ * Returns a deep copy of the receiver.
+ *
+ * @return a deep copy of the receiver.
+ */
+public Object clone() {
+	DoubleQuantileEstimator copy = (DoubleQuantileEstimator) super.clone();
+	if (this.bufferSet != null) {
+		copy.bufferSet = (DoubleBufferSet) copy.bufferSet.clone();
+		if (this.currentBufferToFill != null) {
+			 int index = new ObjectArrayList(this.bufferSet.buffers).indexOf(this.currentBufferToFill, true);
+			 copy.currentBufferToFill = copy.bufferSet.buffers[index];
+		}		
+	}
+	return copy;
+}
+/**
+ * Not yet commented.
+ */
+protected void collapse() {
+	DoubleBuffer[] toCollapse = buffersToCollapse();
+	DoubleBuffer outputBuffer = bufferSet.collapse(toCollapse);
+
+	int minLevel = toCollapse[0].level();
+	outputBuffer.level(minLevel + 1);
+
+	postCollapse(toCollapse);
+}
+/**
+ * Returns whether the specified element is contained in the receiver.
+ */
+public boolean contains(double element) {
+	return bufferSet.contains(element);
+}
+/**
+ * Applies a procedure to each element of the receiver, if any.
+ * Iterates over the receiver in no particular order.
+ * @param procedure    the procedure to be applied. Stops iteration if the procedure returns <tt>false</tt>, otherwise continues. 
+ * @return <tt>false</tt> if the procedure stopped before all elements where iterated over, <tt>true</tt> otherwise. 
+ */
+public boolean forEach(org.apache.mahout.colt.function.DoubleProcedure procedure) {
+	return this.bufferSet.forEach(procedure);
+}
+/**
+ * Returns the number of elements currently needed to store all contained elements.
+ * This number usually differs from the results of method <tt>size()</tt>, according to the underlying datastructure.
+ */
+public long memory() {
+	return bufferSet.memory();
+}
+/**
+ * Not yet commented.
+ */
+protected abstract void newBuffer();
+/**
+ * Returns how many percent of the elements contained in the receiver are <tt>&lt;= element</tt>.
+ * Does linear interpolation if the element is not contained but lies in between two contained elements.
+ *
+ * @param the element to search for.
+ * @return the percentage <tt>p</tt> of elements <tt>&lt;= element</tt> (<tt>0.0 &lt;= p &lt;=1.0)</tt>.
+ */
+public double phi(double element) {
+	return bufferSet.phi(element);
+}
+/**
+ * Not yet commented.
+ */
+protected abstract void postCollapse(DoubleBuffer[] toCollapse);
+/**
+ * Default implementation does nothing.
+ */
+protected DoubleArrayList preProcessPhis(DoubleArrayList phis) {
+	return phis;
+}
+/**
+ * Computes the specified quantile elements over the values previously added.
+ * @param phis the quantiles for which elements are to be computed. Each phi must be in the interval [0.0,1.0]. <tt>phis</tt> must be sorted ascending.
+ * @return the approximate quantile elements.
+ */
+public DoubleArrayList quantileElements(DoubleArrayList phis) {
+	/*
+	//check parameter
+	DoubleArrayList sortedPhiList = phis.copy();
+	sortedPhiList.sort();
+	if (! phis.equals(sortedPhiList)) {
+		throw new IllegalArgumentException("Phis must be sorted ascending.");
+	}
+	*/
+
+	//System.out.println("starting to augment missing values, if necessary...");
+
+	phis = preProcessPhis(phis);
+	
+	long[] triggerPositions = new long[phis.size()];
+	long totalSize = this.bufferSet.totalSize();
+	for (int i=phis.size(); --i >=0;) {
+		triggerPositions[i]=Utils.epsilonCeiling(phis.get(i) * totalSize)-1;
+	}
+
+	//System.out.println("triggerPositions="+org.apache.mahout.colt.Arrays.toString(triggerPositions));
+	//System.out.println("starting to determine quantiles...");
+	//System.out.println(bufferSet);
+
+	DoubleBuffer[] fullBuffers = bufferSet._getFullOrPartialBuffers();
+	double[] quantileElements = new double[phis.size()];
+
+	//do the main work: determine values at given positions in sorted sequence
+	return new DoubleArrayList(bufferSet.getValuesAtPositions(fullBuffers, triggerPositions));
+}
+/**
+ * Not yet commented.
+ */
+protected abstract boolean sampleNextElement();
+/**
+ * Initializes the receiver
+ */
+protected void setUp(int b, int k) {
+	if (!(b>=2 && k>=1)) {
+		throw new IllegalArgumentException("Assertion: b>=2 && k>=1");
+	}
+	this.bufferSet = new DoubleBufferSet(b,k);
+	this.clear();
+}
+/**
+ * Returns the number of elements currently contained in the receiver (identical to the number of values added so far).
+ */
+public long size() {
+	return totalElementsFilled;
+}
+/**
+ * Returns a String representation of the receiver.
+ */
+public String toString() {
+	String s=this.getClass().getName();
+	s = s.substring(s.lastIndexOf('.')+1);
+	int b = bufferSet.b();
+	int k = bufferSet.k();
+	return s + "(mem="+memory()+", b="+b+", k="+k+", size="+size()+", totalSize="+this.bufferSet.totalSize()+")";
+}
+/**
+ * Returns the number of elements currently needed to store all contained elements.
+ * This number usually differs from the results of method <tt>size()</tt>, according to the underlying datastructure.
+ */
+public long totalMemory() {
+	return bufferSet.b()*bufferSet.k();
+}
+}

Propchange: lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/DoubleQuantileEstimator.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/DoubleQuantileFinder.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/DoubleQuantileFinder.java?rev=883365&view=auto
==============================================================================
--- lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/DoubleQuantileFinder.java (added)
+++ lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/DoubleQuantileFinder.java Mon Nov 23 15:14:26 2009
@@ -0,0 +1,89 @@
+/*
+Copyright 1999 CERN - European Organization for Nuclear Research.
+Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose 
+is hereby granted without fee, provided that the above copyright notice appear in all copies and 
+that both that copyright notice and this permission notice appear in supporting documentation. 
+CERN makes no representations about the suitability of this software for any purpose. 
+It is provided "as is" without expressed or implied warranty.
+*/
+package org.apache.mahout.jet.stat.quantile;
+
+import org.apache.mahout.colt.list.DoubleArrayList;
+/**
+ * The interface shared by all quantile finders, no matter if they are exact or approximate.
+ * It is usually completely sufficient to operate on this interface only.
+ * Also see {@link hep.aida.bin.QuantileBin1D}, demonstrating how this package can be used.
+ */
+/** 
+ * @deprecated until unit tests are in place.  Until this time, this class/interface is unsupported.
+ */
+@Deprecated
+public interface DoubleQuantileFinder extends java.io.Serializable {
+//public interface DoubleQuantileFinder extends com.objy.db.iapp.PersistentEvents, java.io.Serializable {
+/**
+ * Adds a value to the receiver.
+ * @param value the value to add.
+ */
+public void add(double value);
+/**
+ * Adds all values of the specified list to the receiver.
+ * @param values the list of which all values shall be added.
+ */
+public void addAllOf(org.apache.mahout.colt.list.DoubleArrayList values);
+/**
+ * Adds the part of the specified list between indexes <tt>from</tt> (inclusive) and <tt>to</tt> (inclusive) to the receiver.
+ *
+ * @param values the list of which elements shall be added.
+ * @param from the index of the first element to be added (inclusive).
+ * @param to the index of the last element to be added (inclusive).
+ */
+public void addAllOfFromTo(DoubleArrayList values, int from, int to);
+/**
+ * Removes all elements from the receiver.  The receiver will
+ * be empty after this call returns, and its memory requirements will be close to zero.
+ */
+public void clear();
+/**
+ * Returns a deep copy of the receiver.
+ *
+ * @return a deep copy of the receiver.
+ */
+public abstract Object clone();
+/**
+ * Applies a procedure to each element of the receiver, if any.
+ * Iterates over the receiver in no particular order.
+ * @param procedure    the procedure to be applied. Stops iteration if the procedure returns <tt>false</tt>, otherwise continues. 
+ * @return <tt>false</tt> if the procedure stopped before all elements where iterated over, <tt>true</tt> otherwise. 
+ */
+public boolean forEach(org.apache.mahout.colt.function.DoubleProcedure procedure);
+/**
+ * Returns the number of elements currently needed to store all contained elements.
+ * This number usually differs from the results of method <tt>size()</tt>, according to the underlying datastructure.
+ */
+public long memory();
+/**
+ * Returns how many percent of the elements contained in the receiver are <tt>&lt;= element</tt>.
+ * Does linear interpolation if the element is not contained but lies in between two contained elements.
+ *
+ * Writing a wrapper is a good idea if you can think of better ways of doing interpolation.
+ * Same if you want to keep min,max and other such measures.
+ * @param the element to search for.
+ * @return the percentage <tt>p</tt> of elements <tt>&lt;= element</tt> (<tt>0.0 &lt;= p &lt;=1.0)</tt>.
+ */
+public double phi(double element);
+/**
+ * Computes the specified quantile elements over the values previously added.
+ * @param phis the quantiles for which elements are to be computed. Each phi must be in the interval [0.0,1.0]. <tt>phis</tt> must be sorted ascending.
+ * @return the quantile elements.
+ */
+public DoubleArrayList quantileElements(DoubleArrayList phis);
+/**
+ * Returns the number of elements currently contained in the receiver (identical to the number of values added so far).
+ */
+public long size();
+/**
+ * Returns the number of elements currently needed to store all contained elements.
+ * This number usually differs from the results of method <tt>size()</tt>, according to the underlying datastructure.
+ */
+public abstract long totalMemory();
+}

Propchange: lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/DoubleQuantileFinder.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/EquiDepthHistogram.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/EquiDepthHistogram.java?rev=883365&view=auto
==============================================================================
--- lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/EquiDepthHistogram.java (added)
+++ lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/EquiDepthHistogram.java Mon Nov 23 15:14:26 2009
@@ -0,0 +1,136 @@
+/*
+Copyright � 1999 CERN - European Organization for Nuclear Research.
+Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose 
+is hereby granted without fee, provided that the above copyright notice appear in all copies and 
+that both that copyright notice and this permission notice appear in supporting documentation. 
+CERN makes no representations about the suitability of this software for any purpose. 
+It is provided "as is" without expressed or implied warranty.
+*/
+package org.apache.mahout.jet.stat.quantile;
+
+/**
+ * Read-only equi-depth histogram for selectivity estimation.
+ * Assume you have collected statistics over a data set, among them a one-dimensional equi-depth histogram (quantiles).
+ * Then an applications or DBMS might want to estimate the <i>selectivity</i> of some range query <tt>[from,to]</tt>, i.e. the percentage of data set elements contained in the query range.
+ * This class does not collect equi-depth histograms but only space efficiently stores already produced histograms and provides operations for selectivity estimation.
+ * Uses linear interpolation.
+ * <p>
+ * This class stores a list <tt>l</tt> of <tt>float</tt> values for which holds:
+ * <li>Let <tt>v</tt> be a list of values (sorted ascending) an equi-depth histogram has been computed over.</li>
+ * <li>Let <tt>s=l.length</tt>.</li>
+ * <li>Let <tt>p=(0, 1/s-1), 2/s-1,..., s-1/s-1=1.0)</tt> be a list of the <tt>s</tt> percentages.</li>
+ * <li>Then for each <tt>i=0..s-1: l[i] = e : v.contains(e) && v[0],..., v[p[i]*v.length] &lt;= e</tt>.</li>
+ * <li>(In particular: <tt>l[0]=min(v)=v[0]</tt> and <tt>l[s-1]=max(v)=v[s-1]</tt>.)</li>
+ * 
+ * @author wolfgang.hoschek@cern.ch
+ * @version 1.0, 09/24/99
+ */
+/** 
+ * @deprecated until unit tests are in place.  Until this time, this class/interface is unsupported.
+ */
+@Deprecated
+public class EquiDepthHistogram extends org.apache.mahout.colt.PersistentObject {
+	protected float[] binBoundaries;
+/**
+ * Constructs an equi-depth histogram with the given quantile elements.
+ * Quantile elements must be sorted ascending and have the form specified in the class documentation.
+ */
+public EquiDepthHistogram(float[] quantileElements) {
+	this.binBoundaries = quantileElements;
+}
+/**
+ * Returns the bin index of the given element.
+ * In other words, returns a handle to the range the element falls into.
+ *
+ * @param element the element to search for.
+ * @throws java.lang.IllegalArgumentException if the element is not contained in any bin.
+ */
+public int binOfElement(float element) {
+	int index = java.util.Arrays.binarySearch(binBoundaries,element);
+	if (index>=0) {
+		// element found.
+		if (index==binBoundaries.length-1) index--; // last bin is a closed interval.
+	}
+	else {
+		// element not found.
+		index -= -1; // index = -index-1; now index is the insertion point.
+		if (index==0 || index==binBoundaries.length) {
+			throw new IllegalArgumentException("Element="+element+" not contained in any bin.");
+		}
+		index--;
+	}
+	return index;
+}
+/**
+ * Returns the number of bins. In other words, returns the number of subdomains partitioning the entire value domain.
+ */
+public int bins() {
+	return binBoundaries.length-1;
+}
+/**
+ * Returns the end of the range associated with the given bin.
+ * @throws ArrayIndexOutOfBoundsException if <tt>binIndex &lt; 0 || binIndex &gt;= bins()</tt>.
+ */
+public float endOfBin(int binIndex) {
+	return binBoundaries[binIndex+1];
+}
+/**
+ * Returns the percentage of elements in the range (from,to]. Does linear interpolation.
+ * @param from the start point (exclusive).
+ * @param to the end point (inclusive).
+ * @returns a number in the closed interval <tt>[0.0,1.0]</tt>.
+ */
+public double percentFromTo(float from, float to) {
+	return phi(to)-phi(from);
+}
+/**
+ * Returns how many percent of the elements contained in the receiver are <tt>&lt;= element</tt>.
+ * Does linear interpolation.
+ *
+ * @param the element to search for.
+ * @returns a number in the closed interval <tt>[0.0,1.0]</tt>.
+ */
+public double phi(float element) {
+	int size = binBoundaries.length;
+	if (element<=binBoundaries[0]) return 0.0;
+	if (element>=binBoundaries[size-1]) return 1.0;
+
+	double binWidth = 1.0/(size-1);
+	int index = java.util.Arrays.binarySearch(binBoundaries, element);
+	//int index = new FloatArrayList(binBoundaries).binarySearch(element);
+	if (index>=0) { // found
+		return binWidth*index;
+	}
+
+	// do linear interpolation
+	int insertionPoint = -index-1;
+	double from = binBoundaries[insertionPoint-1];
+	double to = binBoundaries[insertionPoint]-from;
+	double p = (element - from) / to;
+	return binWidth * (p+(insertionPoint-1));
+}
+/**
+ * @deprecated
+ * Deprecated.
+ * Returns the number of bin boundaries.
+ */
+public int size() {
+	return binBoundaries.length;
+}
+/**
+ * Returns the start of the range associated with the given bin.
+ * @throws ArrayIndexOutOfBoundsException if <tt>binIndex &lt; 0 || binIndex &gt;= bins()</tt>.
+ */
+public float startOfBin(int binIndex) {
+	return binBoundaries[binIndex];
+}
+/**
+ * Not yet commented.
+ */
+public static void test(float element) {
+	float[] quantileElements =
+		{50.0f, 100.0f, 200.0f, 300.0f, 1400.0f, 1500.0f,  1600.0f, 1700.0f, 1800.0f, 1900.0f, 2000.0f};
+	EquiDepthHistogram histo = new EquiDepthHistogram(quantileElements);
+	System.out.println("elem="+element+", phi="+histo.phi(element));
+}
+}

Propchange: lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/EquiDepthHistogram.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/ExactDoubleQuantileFinder.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/ExactDoubleQuantileFinder.java?rev=883365&view=auto
==============================================================================
--- lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/ExactDoubleQuantileFinder.java (added)
+++ lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/ExactDoubleQuantileFinder.java Mon Nov 23 15:14:26 2009
@@ -0,0 +1,164 @@
+/*
+Copyright � 1999 CERN - European Organization for Nuclear Research.
+Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose 
+is hereby granted without fee, provided that the above copyright notice appear in all copies and 
+that both that copyright notice and this permission notice appear in supporting documentation. 
+CERN makes no representations about the suitability of this software for any purpose. 
+It is provided "as is" without expressed or implied warranty.
+*/
+package org.apache.mahout.jet.stat.quantile;
+
+import org.apache.mahout.colt.list.DoubleArrayList;
+/**
+ * Exact quantile finding algorithm for known and unknown <tt>N</tt> requiring large main memory; computes quantiles over a sequence of <tt>double</tt> elements.
+ * The folkore algorithm: Keeps all elements in main memory, sorts the list, then picks the quantiles.
+ *
+ * @author wolfgang.hoschek@cern.ch
+ * @version 1.0, 09/24/99
+ */
+//class ExactDoubleQuantileFinder extends Object implements DoubleQuantileFinder {
+class ExactDoubleQuantileFinder extends org.apache.mahout.colt.PersistentObject implements DoubleQuantileFinder {
+	protected DoubleArrayList buffer;
+	protected boolean isSorted;
+/**
+ * Constructs an empty exact quantile finder.
+ */
+public ExactDoubleQuantileFinder() {
+	this.buffer = new DoubleArrayList(0);
+	this.clear();
+}
+/**
+ * Adds a value to the receiver.
+ * @param value the value to add.
+ */
+public void add(double value) {
+	this.buffer.add(value);
+	this.isSorted = false;
+}
+/**
+ * Adds all values of the specified list to the receiver.
+ * @param values the list of which all values shall be added.
+ */
+public void addAllOf(DoubleArrayList values) {
+	addAllOfFromTo(values, 0, values.size()-1);
+}
+/**
+ * Adds the part of the specified list between indexes <tt>from</tt> (inclusive) and <tt>to</tt> (inclusive) to the receiver.
+ *
+ * @param values the list of which elements shall be added.
+ * @param from the index of the first element to be added (inclusive).
+ * @param to the index of the last element to be added (inclusive).
+ */
+public void addAllOfFromTo(DoubleArrayList values, int from, int to) {
+	buffer.addAllOfFromTo(values, from, to);
+	this.isSorted = false;
+}
+/**
+ * Removes all elements from the receiver.  The receiver will
+ * be empty after this call returns, and its memory requirements will be close to zero.
+ */
+public void clear() {
+	this.buffer.clear();
+	this.buffer.trimToSize();
+	this.isSorted = false;
+}
+/**
+ * Returns a deep copy of the receiver.
+ *
+ * @return a deep copy of the receiver.
+ */
+public Object clone() {
+	ExactDoubleQuantileFinder copy = (ExactDoubleQuantileFinder) super.clone();
+	if (this.buffer != null) copy.buffer = copy.buffer.copy();
+	return copy;
+}
+/**
+ * Returns whether the specified element is contained in the receiver.
+ */
+public boolean contains(double element) {
+	this.sort();
+	return buffer.binarySearch(element)>=0;
+}
+/**
+ * Applies a procedure to each element of the receiver, if any.
+ * Iterates over the receiver in no particular order.
+ * @param procedure    the procedure to be applied. Stops iteration if the procedure returns <tt>false</tt>, otherwise continues. 
+ * @return <tt>false</tt> if the procedure stopped before all elements where iterated over, <tt>true</tt> otherwise. 
+ */
+public boolean forEach(org.apache.mahout.colt.function.DoubleProcedure procedure) {
+	double[] theElements = buffer.elements();
+	int theSize = (int) size();
+	
+	for (int i=0; i<theSize;) if (! procedure.apply(theElements[i++])) return false;
+	return true;
+}
+/**
+ * Returns the number of elements currently needed to store all contained elements.
+ * This number usually differs from the results of method <tt>size()</tt>, according to the underlying datastructure.
+ */
+public long memory() {
+	return buffer.elements().length;
+}
+/**
+ * Returns how many percent of the elements contained in the receiver are <tt>&lt;= element</tt>.
+ * Does linear interpolation if the element is not contained but lies in between two contained elements.
+ *
+ * @param the element to search for.
+ * @return the percentage <tt>p</tt> of elements <tt>&lt;= element</tt> (<tt>0.0 &lt;= p &lt;=1.0)</tt>.
+ */
+public double phi(double element) {
+	this.sort();
+	return org.apache.mahout.jet.stat.Descriptive.rankInterpolated(buffer,element) / this.size();
+}
+/**
+ * Computes the specified quantile elements over the values previously added.
+ * @param phis the quantiles for which elements are to be computed. Each phi must be in the interval [0.0,1.0]. <tt>phis</tt> must be sorted ascending.
+ * @return the exact quantile elements.
+ */
+public DoubleArrayList quantileElements(DoubleArrayList phis) {
+	this.sort();
+	return org.apache.mahout.jet.stat.Descriptive.quantiles(this.buffer, phis);
+	/*
+	int bufferSize = (int) this.size();
+	double[] quantileElements = new double[phis.size()];
+	for (int i=phis.size(); --i >=0;) {
+		int rank=(int)Utils.epsilonCeiling(phis.get(i)*bufferSize) -1;
+		quantileElements[i]=buffer.get(rank);
+	}
+	return new DoubleArrayList(quantileElements);
+	*/
+}
+/**
+ * Returns the number of elements currently contained in the receiver (identical to the number of values added so far).
+ */
+public long size() {
+	return buffer.size();
+}
+/**
+ * Sorts the receiver.
+ */
+protected void sort() {
+	if (! isSorted) {
+		// IMPORTANT: TO DO : replace mergeSort with quickSort!
+		// currently it is mergeSort because JDK 1.2 can't be imported into VisualAge.
+		buffer.sort();
+		//this.buffer.mergeSort();
+		this.isSorted = true;
+	}
+}
+/**
+ * Returns a String representation of the receiver.
+ */
+public String toString() {
+	String s=this.getClass().getName();
+	s = s.substring(s.lastIndexOf('.')+1);
+	return s + "(mem="+memory()+", size="+size()+")";
+}
+/**
+ * Returns the number of elements currently needed to store all contained elements.
+ * This number usually differs from the results of method <tt>size()</tt>, according to the underlying datastructure.
+ */
+public long totalMemory() {
+	return memory();
+}
+}

Propchange: lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/ExactDoubleQuantileFinder.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/KnownDoubleQuantileEstimator.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/KnownDoubleQuantileEstimator.java?rev=883365&view=auto
==============================================================================
--- lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/KnownDoubleQuantileEstimator.java (added)
+++ lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/KnownDoubleQuantileEstimator.java Mon Nov 23 15:14:26 2009
@@ -0,0 +1,261 @@
+/*
+Copyright � 1999 CERN - European Organization for Nuclear Research.
+Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose 
+is hereby granted without fee, provided that the above copyright notice appear in all copies and 
+that both that copyright notice and this permission notice appear in supporting documentation. 
+CERN makes no representations about the suitability of this software for any purpose. 
+It is provided "as is" without expressed or implied warranty.
+*/
+package org.apache.mahout.jet.stat.quantile;
+
+import org.apache.mahout.colt.list.DoubleArrayList;
+import org.apache.mahout.jet.math.Arithmetic;
+import org.apache.mahout.jet.random.engine.RandomEngine;
+import org.apache.mahout.jet.random.sampling.RandomSamplingAssistant;
+/**
+ * Approximate quantile finding algorithm for known <tt>N</tt> requiring only one pass and little main memory; computes quantiles over a sequence of <tt>double</tt> elements.
+ *
+ * <p>Needs as input the following parameters:<p>
+ * <dt>1. <tt>N</tt> - the number of values of the data sequence over which quantiles are to be determined.
+ * <dt>2. <tt>quantiles</tt> - the number of quantiles to be computed.
+ * <dt>3. <tt>epsilon</tt> - the allowed approximation error on quantiles. The approximation guarantee of this algorithm is explicit.
+ *
+ * <p>It is also possible to couple the approximation algorithm with random sampling to further reduce memory requirements. 
+ * With sampling, the approximation guarantees are explicit but probabilistic, i.e. they apply with respect to a (user controlled) confidence parameter "delta".
+ *
+ * <dt>4. <tt>delta</tt> - the probability allowed that the approximation error fails to be smaller than epsilon. Set <tt>delta</tt> to zero for explicit non probabilistic guarantees.
+ *
+ * You usually don't instantiate quantile finders by using the constructor. Instead use the factory <tt>QuantileFinderFactor</tt> to do so. It will set up the right parametrization for you.
+ * 
+ * <p>After Gurmeet Singh Manku, Sridhar Rajagopalan and Bruce G. Lindsay, 
+ * Approximate Medians and other Quantiles in One Pass and with Limited Memory,
+ * Proc. of the 1998 ACM SIGMOD Int. Conf. on Management of Data,
+ * Paper available <A HREF="http://www-cad.eecs.berkeley.edu/~manku"> here</A>.
+ *
+ * @author wolfgang.hoschek@cern.ch
+ * @version 1.0, 09/24/99
+ * @see QuantileFinderFactory
+ * @see UnknownApproximateDoubleQuantileFinder
+ */
+class KnownDoubleQuantileEstimator extends DoubleQuantileEstimator {
+	protected double beta; //correction factor for phis
+
+	protected boolean weHadMoreThanOneEmptyBuffer;
+
+	protected RandomSamplingAssistant samplingAssistant;
+	protected double samplingRate; // see method sampleNextElement()
+	protected long N; // see method sampleNextElement()
+/**
+ * Constructs an approximate quantile finder with b buffers, each having k elements.
+ * @param b the number of buffers
+ * @param k the number of elements per buffer
+ * @param N the total number of elements over which quantiles are to be computed.
+ * @param samplingRate 1.0 --> all elements are consumed. 10.0 --> Consumes one random element from successive blocks of 10 elements each. Etc.
+ * @param generator a uniform random number generator.
+ */
+public KnownDoubleQuantileEstimator(int b, int k, long N, double samplingRate, RandomEngine generator) {
+	this.samplingRate = samplingRate;
+	this.N = N;
+
+	if (this.samplingRate <= 1.0) {
+		this.samplingAssistant = null;
+	}
+	else {
+		this.samplingAssistant = new RandomSamplingAssistant(Arithmetic.floor(N/samplingRate), N, generator);
+	}
+	
+	setUp(b,k);
+	this.clear();
+}
+/**
+ * @param infinities the number of infinities to fill.
+ * @param buffer the buffer into which the infinities shall be filled.
+ */
+protected void addInfinities(int missingInfinities,  DoubleBuffer buffer) {
+	RandomSamplingAssistant oldAssistant = this.samplingAssistant;
+	this.samplingAssistant = null; // switch off sampler
+	//double[] infinities = new double[missingInfinities];
+	
+	boolean even=true;
+	for (int i=0; i<missingInfinities; i++) {
+		if (even) buffer.values.add(Double.MAX_VALUE);
+		else	  buffer.values.add(-Double.MAX_VALUE);
+		
+		//if (even) {infinities[i]=Double.MAX_VALUE;}
+		//else	  {infinities[i]=-Double.MAX_VALUE;}
+		
+		//if (even) {this.add(Double.MAX_VALUE);}
+		//else	  {this.add(-Double.MAX_VALUE);}
+		even = !even;
+	}
+	
+	//buffer.values.addAllOfFromTo(new DoubleArrayList(infinities),0,missingInfinities-1);
+	
+	//this.totalElementsFilled -= infinities;
+	
+	this.samplingAssistant = oldAssistant; // switch on sampler again
+}
+/**
+ * Not yet commented.
+ */
+protected DoubleBuffer[] buffersToCollapse() {
+	int minLevel = bufferSet._getMinLevelOfFullOrPartialBuffers();
+	return bufferSet._getFullOrPartialBuffersWithLevel(minLevel);
+}
+/**
+ * Removes all elements from the receiver.  The receiver will
+ * be empty after this call returns, and its memory requirements will be close to zero.
+ */
+public void clear() {
+	super.clear();
+	this.beta=1.0;
+	this.weHadMoreThanOneEmptyBuffer = false;
+	//this.setSamplingRate(samplingRate,N);
+
+	RandomSamplingAssistant assist = this.samplingAssistant;
+	if (assist != null) {
+		this.samplingAssistant = new RandomSamplingAssistant(Arithmetic.floor(N/samplingRate), N, assist.getRandomGenerator());
+	}
+}
+/**
+ * Returns a deep copy of the receiver.
+ *
+ * @return a deep copy of the receiver.
+ */
+public Object clone() {
+	KnownDoubleQuantileEstimator copy = (KnownDoubleQuantileEstimator) super.clone();
+	if (this.samplingAssistant != null) copy.samplingAssistant = (RandomSamplingAssistant) copy.samplingAssistant.clone();
+	return copy;
+}
+/**
+ * Not yet commented.
+ */
+protected void newBuffer() {
+	int numberOfEmptyBuffers = this.bufferSet._getNumberOfEmptyBuffers();
+	//DoubleBuffer[] emptyBuffers = this.bufferSet._getEmptyBuffers();
+	if (numberOfEmptyBuffers==0) throw new RuntimeException("Oops, no empty buffer.");
+
+	this.currentBufferToFill = this.bufferSet._getFirstEmptyBuffer();
+	if (numberOfEmptyBuffers==1 && ! this.weHadMoreThanOneEmptyBuffer) {
+		this.currentBufferToFill.level(this.bufferSet._getMinLevelOfFullOrPartialBuffers());
+	}
+	else {
+		this.weHadMoreThanOneEmptyBuffer = true;
+		this.currentBufferToFill.level(0);
+		/*
+		for (int i=0; i<emptyBuffers.length; i++) {
+			emptyBuffers[i].level = 0;			
+		}
+		*/
+	}
+	//currentBufferToFill.state = DoubleBuffer.PARTIAL;
+	this.currentBufferToFill.weight(1);
+}
+/**
+ * Not yet commented.
+ */
+protected void postCollapse(DoubleBuffer[] toCollapse) {
+	this.weHadMoreThanOneEmptyBuffer = false;
+}
+/**
+ */
+protected DoubleArrayList preProcessPhis(DoubleArrayList phis) {
+	if (beta>1.0) {
+		phis = phis.copy();
+		for (int i=phis.size(); --i >=0;) {
+			phis.set(i, (2*phis.get(i) + beta - 1) / (2*beta));
+		}		
+	}
+	return phis;
+}
+/**
+ * Computes the specified quantile elements over the values previously added.
+ * @param phis the quantiles for which elements are to be computed. Each phi must be in the interval [0.0,1.0]. <tt>phis</tt> must be sorted ascending.
+ * @return the approximate quantile elements.
+ */
+public DoubleArrayList quantileElements(DoubleArrayList phis) {
+	/*
+	* The KNOWN quantile finder reads off quantiles from FULL buffers only.
+	* Since there might be a partially full buffer, this method first satisfies this constraint by temporarily filling a few +infinity, -infinity elements to make up a full block.
+	* This is in full conformance with the explicit approximation guarantees.
+ 	*
+ 	* For those of you working on online apps:
+  	* The approximation guarantees are given for computing quantiles AFTER N elements have been filled, not for intermediate displays.
+	* If you have one thread filling and another thread displaying concurrently, you will note that in the very beginning the infinities will dominate the display.
+ 	* This could confuse users, because, of course, they don't expect any infinities, even if they "disappear" after a short while.
+ 	* To prevent panic exclude phi's close to zero or one in the early phases of processing.
+  	*/
+	DoubleBuffer partial = this.bufferSet._getPartialBuffer();
+	int missingValues = 0;
+	if (partial != null) { // any auxiliary infinities needed?
+		missingValues = bufferSet.k() - partial.size();
+		if (missingValues <= 0) throw new RuntimeException("Oops! illegal missing values.");
+
+		//System.out.println("adding "+missingValues+" infinity elements...");
+		this.addInfinities(missingValues,partial);
+
+		//determine beta (N + Infinity values = beta * N)
+		this.beta = (this.totalElementsFilled + missingValues) / (double)this.totalElementsFilled;
+		}
+	else {
+		this.beta=1.0; 
+	}
+	 
+	DoubleArrayList quantileElements = super.quantileElements(phis);
+
+	// restore state we were in before.
+	// remove the temporarily added infinities.
+	if (partial != null) removeInfinitiesFrom(missingValues, partial);
+
+	// now you can continue filling the remaining values, if any.
+	return quantileElements;
+}
+/**
+ * Reading off quantiles requires to fill some +infinity, -infinity values to make a partial buffer become full.
+ *
+ * This method removes the infinities which were previously temporarily added to a partial buffer.
+ * Removing them is necessary if we want to continue filling more elements.
+ * Precondition: the buffer is sorted ascending.
+ * @param infinities the number of infinities previously filled.
+ * @param buffer the buffer into which the infinities were filled.
+ */
+protected void removeInfinitiesFrom(int infinities, DoubleBuffer buffer) {
+	int plusInf = 0;
+	int minusInf = 0;
+	// count them (this is not very clever but it's safe)
+	boolean even=true;
+	for (int i=0; i<infinities; i++) {
+		if (even) plusInf++;
+		else	  minusInf++;
+		even = !even;
+	}
+
+	buffer.values.removeFromTo(buffer.size()-plusInf, buffer.size()-1);
+	buffer.values.removeFromTo(0, minusInf-1);
+	//this.totalElementsFilled -= infinities;
+}
+/**
+ * Not yet commented.
+ */
+protected boolean sampleNextElement() {
+	if (samplingAssistant == null) return true;
+
+	/*
+	 * This is a KNOWN N quantile finder!
+	 * One should not try to fill more than N elements,
+	 * because otherwise we can't give explicit approximation guarantees anymore.
+	 * Use an UNKNOWN quantile finder instead if your app may fill more than N elements.
+	 *
+	 * However, to make this class meaningful even under wired use cases, we actually do allow to fill more than N elements (without explicit approx. guarantees, of course).
+	 * Normally, elements beyond N will not get sampled because the sampler is exhausted. 
+	 * Therefore the histogram will no more change no matter how much you fill.
+	 * This might not be what the user expects.
+	 * Therefore we use a new (unexhausted) sampler with the same parametrization.
+	 *
+	 * If you want this class to ignore any elements beyong N, then comment the following line.
+	 */
+	//if ((totalElementsFilled-1) % N == 0) setSamplingRate(samplingRate, N); // delete if appropriate
+
+	return samplingAssistant.sampleNextElement();
+}
+}

Propchange: lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/KnownDoubleQuantileEstimator.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/Quantile1Test.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/Quantile1Test.java?rev=883365&view=auto
==============================================================================
--- lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/Quantile1Test.java (added)
+++ lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/Quantile1Test.java Mon Nov 23 15:14:26 2009
@@ -0,0 +1,100 @@
+package org.apache.mahout.jet.stat.quantile;
+
+
+/**
+ * A class to test the QuantileBin1D code.
+ * The command line is "java Quantile1Test numExamples N"
+ * where numExamples is the number of random (Gaussian) numbers to
+ * be presented to the QuantileBin1D.add method, and N is
+ * the absolute maximum number of examples the QuantileBin1D is setup
+ * to receive in the constructor.  N can be set to "L", which will use
+ * Long.MAX_VALUE, or to "I", which will use Integer.MAX_VALUE, or to 
+ * any positive long value.
+ */
+/** 
+ * @deprecated until unit tests are in place.  Until this time, this class/interface is unsupported.
+ */
+@Deprecated
+public class Quantile1Test {
+    public static void main(String[] argv) {
+	/*
+	 * Get the number of examples from the first argument
+	 */
+	int numExamples = 0;
+	try {
+	    numExamples = Integer.parseInt(argv[0]);
+	}
+	catch (Exception e) {
+	    System.err.println("Unable to parse input line count argument");
+	    System.err.println(e.getMessage());
+	    System.exit(1);
+	}
+	System.out.println("Got numExamples=" + numExamples);
+
+	/*
+	 * Get N from the second argument
+	 */
+	long N = 0;
+	try {
+	    if (argv[1].equals("L")) {
+		N = Long.MAX_VALUE;
+	    } else if (argv[1].equals("I")) {
+		N = (long)Integer.MAX_VALUE;
+	    } else {
+		N = Long.parseLong(argv[1]);
+	    }
+	}
+	catch (Exception e) {
+	    System.err.println("Error parsing flag for N");
+	    System.err.println(e.getMessage());
+	    System.exit(1);
+	}
+	System.out.println("Got N=" + N);
+
+	/*
+	 * Set up the QuantileBin1D object
+	 *
+	DRand rand = new DRand(new Date());
+	QuantileBin1D qAccum = new QuantileBin1D(false,
+						 N,
+						 1.e-4,
+						 1.e-3,
+						 200,
+						 rand,
+						 false,
+						 false,
+						 2);
+
+	DynamicBin1D dbin = new DynamicBin1D();
+	
+	*
+	 * Use a new random number generator to generate numExamples
+	 * random gaussians, and add them to the QuantileBin1D
+	 *
+	Uniform dataRand = new Uniform(new DRand(7757));
+	for (int i = 1; i <= numExamples; i++) {
+		double gauss = dataRand.nextDouble();
+	    qAccum.add(gauss);
+	    dbin.add(gauss);
+	}
+	
+	*
+	 * print out the percentiles
+	 *
+	DecimalFormat fmt = new DecimalFormat("0.00");
+	System.out.println();
+	//int step = 1;
+	int step = 10;
+	for (int i = 1; i < 100; ) {
+	    double percent = ((double)i) * 0.01;
+	    double quantile = qAccum.quantile(percent);
+	    System.out.println(fmt.format(percent) + "  " + quantile + ",  " + dbin.quantile(percent)+ ",  " + (dbin.quantile(percent)-quantile));
+	    i = i + step;
+	    *
+	}
+	*/
+}
+}
+
+
+

Propchange: lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/Quantile1Test.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/QuantileCalc.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/QuantileCalc.java?rev=883365&view=auto
==============================================================================
--- lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/QuantileCalc.java (added)
+++ lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/QuantileCalc.java Mon Nov 23 15:14:26 2009
@@ -0,0 +1,449 @@
+/*
+Copyright 1999 CERN - European Organization for Nuclear Research.
+Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose 
+is hereby granted without fee, provided that the above copyright notice appear in all copies and 
+that both that copyright notice and this permission notice appear in supporting documentation. 
+CERN makes no representations about the suitability of this software for any purpose. 
+It is provided "as is" without expressed or implied warranty.
+*/
+package org.apache.mahout.jet.stat.quantile;
+
+/**
+ * Computes b and k vor various parameters.
+ */ 
+class QuantileCalc extends Object {
+/**
+ * Efficiently computes the binomial coefficient, often also referred to as "n over k" or "n choose k".
+ * The binomial coefficient is defined as n!/((n-k)!*k!).
+ * Tries to avoid numeric overflows. 
+ * @return the binomial coefficient.
+ */
+public static double binomial(long n, long k) {	
+	if (k==0 || k==n) {return 1.0;}
+
+	// since binomial(n,k)==binomial(n,n-k), we can enforce the faster variant,
+	// which is also the variant minimizing number overflows.
+	if (k>n/2.0) k=n-k;
+	
+	double binomial=1.0;
+	long N=n-k+1;
+	for (long i=k; i>0; ) {
+		binomial *= ((double) N++) / (double) (i--);
+	}
+	return binomial;
+}
+/**
+ * Returns the smallest <code>long &gt;= value</code>.
+ * <dt>Examples: <code>1.0 -> 1, 1.2 -> 2, 1.9 -> 2</code>.
+ * This method is safer than using (long) Math.ceil(value), because of possible rounding error.
+ */
+public static long ceiling(double value) {
+	return Math.round(Math.ceil(value));
+}
+/**
+ * Computes the number of buffers and number of values per buffer such that
+ * quantiles can be determined with an approximation error no more than epsilon with a certain probability.
+ *
+ * Assumes that quantiles are to be computed over N values.
+ * The required sampling rate is computed and stored in the first element of the provided <tt>returnSamplingRate</tt> array, which, therefore must be at least of length 1.
+ *
+ * @param N the number of values over which quantiles shall be computed (e.g <tt>10^6</tt>).
+ * @param epsilon the approximation error which is guaranteed not to be exceeded (e.g. <tt>0.001</tt>) (<tt>0 &lt;= epsilon &lt;= 1</tt>). To get exact result, set <tt>epsilon=0.0</tt>;
+ * @param delta the probability that the approximation error is more than than epsilon (e.g. <tt>0.0001</tt>) (<tt>0 &lt;= delta &lt;= 1</tt>). To avoid probabilistic answers, set <tt>delta=0.0</tt>.
+ * @param quantiles the number of quantiles to be computed (e.g. <tt>100</tt>) (<tt>quantiles &gt;= 1</tt>). If unknown in advance, set this number large, e.g. <tt>quantiles &gt;= 10000</tt>.
+ * @param samplingRate a <tt>double[1]</tt> where the sampling rate is to be filled in.
+ * @return <tt>long[2]</tt> - <tt>long[0]</tt>=the number of buffers, <tt>long[1]</tt>=the number of elements per buffer, <tt>returnSamplingRate[0]</tt>=the required sampling rate.
+ */
+public static long[] known_N_compute_B_and_K(long N, double epsilon, double delta, int quantiles, double[] returnSamplingRate) {
+	if (delta > 0.0) {
+		return known_N_compute_B_and_K_slow(N, epsilon, delta, quantiles, returnSamplingRate);
+	}
+	returnSamplingRate[0] = 1.0;
+	return known_N_compute_B_and_K_quick(N, epsilon);
+}
+/**
+ * Computes the number of buffers and number of values per buffer such that
+ * quantiles can be determined with a <b>guaranteed</b> approximation error no more than epsilon.
+ * Assumes that quantiles are to be computed over N values.
+ * @return <tt>long[2]</tt> - <tt>long[0]</tt>=the number of buffers, <tt>long[1]</tt>=the number of elements per buffer.
+ * @param N the anticipated number of values over which quantiles shall be determined.
+ * @param epsilon the approximation error which is guaranteed not to be exceeded (e.g. <tt>0.001</tt>) (<tt>0 &lt;= epsilon &lt;= 1</tt>). To get exact result, set <tt>epsilon=0.0</tt>;
+ */
+protected static long[] known_N_compute_B_and_K_quick(long N, double epsilon) {
+	if (epsilon<=0.0) {
+		// no way around exact quantile search
+		long[] result = new long[2];
+		result[0]=1;
+		result[1]=N;
+		return result;
+	}
+
+	final int maxBuffers = 50;
+	final int maxHeight = 50;
+	final double N_double = (double) N;
+	final double c = N_double * epsilon * 2.0;
+	int[] heightMaximums = new int[maxBuffers-1];
+
+	// for each b, determine maximum height, i.e. the height for which x<=0 and x is a maximum
+	// with x = binomial(b+h-2, h-1) - binomial(b+h-3, h-3) + binomial(b+h-3, h-2) - N * epsilon * 2.0
+	for (int b=2; b<=maxBuffers; b++) {
+		int h = 3;
+		
+		while ( h<=maxHeight && // skip heights until x<=0
+				(h-2) * ((double)Math.round(binomial(b+h-2, h-1))) -
+				((double) Math.round(binomial(b+h-3, h-3))) +
+				((double) Math.round(binomial(b+h-3, h-2))) - c
+				> 0.0
+			  ) {h++;}
+		//from now on x is monotonically growing...
+		while ( h<=maxHeight && // skip heights until x>0
+				(h-2) * ((double)Math.round(binomial(b+h-2, h-1))) -
+				((double) Math.round(binomial(b+h-3, h-3))) +
+				((double) Math.round(binomial(b+h-3, h-2))) - c
+				<= 0.0
+			  ) {h++;}
+		h--; //go back to last height
+
+		// was x>0 or did we loop without finding anything?
+		int hMax;
+		if (h>=maxHeight && 
+				(h-2) * ((double)Math.round(binomial(b+h-2, h-1))) -
+				((double) Math.round(binomial(b+h-3, h-3))) +
+				((double) Math.round(binomial(b+h-3, h-2))) - c
+				> 0.0) {
+			hMax=Integer.MIN_VALUE;
+		}
+		else {
+			hMax=h;
+		}
+				
+		heightMaximums[b-2]=hMax; //safe some space
+	} //end for
+
+
+	// for each b, determine the smallest k satisfying the constraints, i.e.
+	// for each b, determine kMin, with kMin = N/binomial(b+hMax-2,hMax-1)
+	long[] kMinimums = new long[maxBuffers-1];
+	for (int b=2; b<=maxBuffers; b++) {
+		int h=heightMaximums[b-2];
+		long kMin=Long.MAX_VALUE;
+		if (h>Integer.MIN_VALUE) {
+			double value = ((double)Math.round(binomial(b+h-2, h-1)));
+			long tmpK=ceiling(N_double/value);
+			if (tmpK<=Long.MAX_VALUE) {
+				kMin = tmpK;
+			}
+		}
+		kMinimums[b-2]=kMin;
+	}
+
+	// from all b's, determine b that minimizes b*kMin
+	long multMin = Long.MAX_VALUE;
+	int minB = -1;
+	for (int b=2; b<=maxBuffers; b++) {
+		if (kMinimums[b-2]<Long.MAX_VALUE) {
+			long mult = ((long)b) * ((long)kMinimums[b-2]);
+			if (mult<multMin) {
+				multMin=mult;
+				minB = b;
+			}
+		}
+	}
+
+	long b, k;
+	if (minB != -1) { // epsilon large enough?
+		b = minB;
+		k = kMinimums[minB-2];
+	}
+	else {     // epsilon is very small or zero.
+		b = 1; // the only possible solution without violating the 
+		k = N; // approximation guarantees is exact quantile search.
+	}
+
+	long[] result = new long[2];
+	result[0]=b;
+	result[1]=k;
+	return result;
+}
+/**
+ * Computes the number of buffers and number of values per buffer such that
+ * quantiles can be determined with an approximation error no more than epsilon with a certain probability.
+ * Assumes that quantiles are to be computed over N values.
+ * The required sampling rate is computed and stored in the first element of the provided <tt>returnSamplingRate</tt> array, which, therefore must be at least of length 1.
+ * @param N the anticipated number of values over which quantiles shall be computed (e.g 10^6).
+ * @param epsilon the approximation error which is guaranteed not to be exceeded (e.g. <tt>0.001</tt>) (<tt>0 &lt;= epsilon &lt;= 1</tt>). To get exact result, set <tt>epsilon=0.0</tt>;
+ * @param delta the probability that the approximation error is more than than epsilon (e.g. <tt>0.0001</tt>) (<tt>0 &lt;= delta &lt;= 1</tt>). To avoid probabilistic answers, set <tt>delta=0.0</tt>.
+ * @param quantiles the number of quantiles to be computed (e.g. <tt>100</tt>) (<tt>quantiles &gt;= 1</tt>). If unknown in advance, set this number large, e.g. <tt>quantiles &gt;= 10000</tt>.
+ * @param samplingRate a <tt>double[1]</tt> where the sampling rate is to be filled in.
+ * @return <tt>long[2]</tt> - <tt>long[0]</tt>=the number of buffers, <tt>long[1]</tt>=the number of elements per buffer, <tt>returnSamplingRate[0]</tt>=the required sampling rate.
+ */
+protected static long[] known_N_compute_B_and_K_slow(long N, double epsilon, double delta, int quantiles, double[] returnSamplingRate) {
+	// delta can be set to zero, i.e., all quantiles should be approximate with probability 1	
+	if (epsilon<=0.0) {
+		// no way around exact quantile search
+		long[] result = new long[2];
+		result[0]=1;
+		result[1]=N;
+		returnSamplingRate[0]=1.0;
+		return result;
+	}
+
+
+	final int maxBuffers = 50;
+	final int maxHeight = 50;
+	final double N_double = N;
+
+	// One possibility is to use one buffer of size N
+	//
+	long ret_b = 1;
+	long ret_k = N;
+	double sampling_rate = 1.0;
+	long memory = N;
+
+
+	// Otherwise, there are at least two buffers (b >= 2)
+	// and the height of the tree is at least three (h >= 3)
+	//
+	// We restrict the search for b and h to MAX_BINOM, a large enough value for
+	// practical values of    epsilon >= 0.001   and    delta >= 0.00001
+	//
+	final double logarithm = Math.log(2.0*quantiles/delta);
+	final double c = 2.0 * epsilon * N_double;
+	for (long b=2 ; b<maxBuffers ; b++)
+		for (long h=3 ; h<maxHeight ; h++) {
+			double binomial = binomial(b+h-2, h-1);
+			long tmp = ceiling(N_double / binomial);
+			if ((b * tmp < memory) && 
+					((h-2) * binomial - binomial(b+h-3, h-3) + binomial(b+h-3, h-2)
+					<= c) ) {
+				ret_k = tmp ;
+				ret_b = b ;
+				memory = ret_k * b;
+				sampling_rate = 1.0 ;
+			}
+			if (delta > 0.0) {
+				double t = (h-2) * binomial(b+h-2, h-1) - binomial(b+h-3, h-3) + binomial(b+h-3, h-2) ;
+				double u = logarithm / epsilon ;
+				double v = binomial (b+h-2, h-1) ;
+				double w = logarithm / (2.0*epsilon*epsilon) ;
+
+				// From our SIGMOD 98 paper, we have two equantions to satisfy:
+				// t  <= u * alpha/(1-alpha)^2
+				// kv >= w/(1-alpha)^2
+				//
+				// Denoting 1/(1-alpha)    by x,
+				// we see that the first inequality is equivalent to
+				// t/u <= x^2 - x
+				// which is satisfied by x >= 0.5 + 0.5 * sqrt (1 + 4t/u)
+				// Plugging in this value into second equation yields
+				// k >= wx^2/v
+
+				double x = 0.5 + 0.5 * Math.sqrt(1.0 + 4.0*t/u) ;
+				long k = ceiling(w*x*x/v) ;
+				if (b * k < memory) {
+					ret_k = k ;
+					ret_b = b ;
+					memory = b * k ;
+					sampling_rate = N_double*2.0*epsilon*epsilon / logarithm ;
+				}
+			}
+		}
+		
+	long[] result = new long[2];
+	result[0]=ret_b;
+	result[1]=ret_k;
+	returnSamplingRate[0]=sampling_rate;
+	return result;
+}
+public static void main(String[] args) {
+	test_B_and_K_Calculation(args);
+}
+/**
+ * Computes b and k for different parameters.
+ */
+public static void test_B_and_K_Calculation(String[] args) {
+	boolean known_N;
+	if (args==null) known_N = false;
+	else known_N = new Boolean(args[0]).booleanValue();
+
+	int[] quantiles = {1,1000};
+	
+	long[] sizes = {100000, 1000000, 10000000, 1000000000};
+	
+	double[] deltas = {0.0, 0.001, 0.0001, 0.00001};
+	
+	double[] epsilons = {0.0, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0000001};
+
+
+	
+	if (! known_N) sizes = new long[] {0};
+	System.out.println("\n\n");
+	if (known_N) 
+		System.out.println("Computing b's and k's for KNOWN N");
+	else 
+		System.out.println("Computing b's and k's for UNKNOWN N");
+	System.out.println("mem [elements/1024]");
+	System.out.println("***********************************");
+
+	for (int q=0; q<quantiles.length; q++) {
+		int p = quantiles[q];
+		System.out.println("------------------------------");
+		System.out.println("computing for p = "+p);
+		for (int s=0; s<sizes.length; s++) {
+			long N = sizes[s];
+			System.out.println("   ------------------------------");
+			System.out.println("   computing for N = "+N);
+			for (int d=0; d<deltas.length; d++) {
+				double delta = deltas[d];
+				System.out.println("      ------------------------------");
+				System.out.println("      computing for delta = "+delta);
+				for (int e=0; e<epsilons.length; e++) {
+					double epsilon = epsilons[e];
+					
+					double[] returnSamplingRate = new double[1];
+					long[] result;
+					if (known_N) {
+						result = known_N_compute_B_and_K(N, epsilon, delta, p, returnSamplingRate);
+					}
+					else {
+						result = unknown_N_compute_B_and_K(epsilon, delta, p);
+					}
+
+					long b = result[0];
+					long k = result[1];
+					System.out.print("         (e,d,N,p)=("+epsilon+","+delta+","+N+","+p+") --> ");
+					System.out.print("(b,k,mem");
+					if (known_N) System.out.print(",sampling");
+					System.out.print(")=("+b+","+k+","+(b*k/1024));
+					if (known_N) System.out.print(","+returnSamplingRate[0]);
+					System.out.println(")");
+				}
+			}
+		}
+	}
+
+}
+/**
+ * Computes the number of buffers and number of values per buffer such that
+ * quantiles can be determined with an approximation error no more than epsilon with a certain probability.
+ *
+ * @param epsilon the approximation error which is guaranteed not to be exceeded (e.g. <tt>0.001</tt>) (<tt>0 &lt;= epsilon &lt;= 1</tt>). To get exact results, set <tt>epsilon=0.0</tt>;
+ * @param delta the probability that the approximation error is more than than epsilon (e.g. <tt>0.0001</tt>) (<tt>0 &lt;= delta &lt;= 1</tt>). To get exact results, set <tt>delta=0.0</tt>.
+ * @param quantiles the number of quantiles to be computed (e.g. <tt>100</tt>) (<tt>quantiles &gt;= 1</tt>). If unknown in advance, set this number large, e.g. <tt>quantiles &gt;= 10000</tt>.
+ * @return <tt>long[3]</tt> - <tt>long[0]</tt>=the number of buffers, <tt>long[1]</tt>=the number of elements per buffer, <tt>long[2]</tt>=the tree height where sampling shall start.
+ */
+public static long[] unknown_N_compute_B_and_K(double epsilon, double delta, int quantiles) {
+	// delta can be set to zero, i.e., all quantiles should be approximate with probability 1	
+	if (epsilon<=0.0 || delta<=0.0) {
+		// no way around exact quantile search
+		long[] result = new long[3];
+		result[0]=1;
+		result[1]=Long.MAX_VALUE;
+		result[2]=Long.MAX_VALUE;
+		return result;
+	}
+
+	int max_b = 50;
+	int max_h = 50;
+	int max_H = 50;
+	int max_Iterations = 2;
+
+	long best_b = Long.MAX_VALUE;
+	long best_k = Long.MAX_VALUE;
+	long best_h = Long.MAX_VALUE;
+	long best_memory = Long.MAX_VALUE;
+
+	double pow = Math.pow(2.0, max_H);
+	double logDelta =  Math.log(2.0/(delta/quantiles)) / (2.0*epsilon*epsilon);
+	//double logDelta =  Math.log(2.0/(quantiles*delta)) / (2.0*epsilon*epsilon);
+
+	while (best_b == Long.MAX_VALUE && max_Iterations-- > 0) { //until we find a solution
+		// identify that combination of b and h that minimizes b*k.
+		// exhaustive search.
+		for (int b=2; b<=max_b; b++) {
+			for (int h=2; h<=max_h; h++) {
+				double Ld = binomial(b+h-2, h-1);
+				double Ls = binomial(b+h-3,h-1);
+				
+				// now we have k>=c*(1-alpha)^-2.
+				// let's compute c.
+				//double c = Math.log(2.0/(delta/quantiles)) / (2.0*epsilon*epsilon*Math.min(Ld, 8.0*Ls/3.0));
+				double c = logDelta / Math.min(Ld, 8.0*Ls/3.0);
+
+				// now we have k>=d/alpha.
+				// let's compute d.				
+				double beta = Ld/Ls;
+				double cc = (beta-2.0)*(max_H-2.0) / (beta + pow - 2.0);
+				double d = (h+3+cc) / (2.0*epsilon);
+				
+				/*
+				double d = (Ld*(h+max_H-1.0)  +  Ls*((h+1)*pow - 2.0*(h+max_H)))   /   (Ld + Ls*(pow-2.0));
+				d = (d + 2.0) / (2.0*epsilon);
+				*/
+
+				// now we have c*(1-alpha)^-2 == d/alpha.
+				// we solve this equation for alpha yielding two solutions
+				// alpha_1,2 = (c + 2*d  +-  Sqrt(c*c + 4*c*d))/(2*d)				
+				double f = c*c + 4.0*c*d;
+				if (f<0.0) continue; // non real solution to equation
+				double root = Math.sqrt(f);
+				double alpha_one = (c + 2.0*d + root)/(2.0*d);
+				double alpha_two = (c + 2.0*d - root)/(2.0*d);
+
+				// any alpha must satisfy 0<alpha<1 to yield valid solutions
+				boolean alpha_one_OK = false;
+				boolean alpha_two_OK = false;
+				if (0.0 < alpha_one && alpha_one < 1.0) alpha_one_OK = true;
+				if (0.0 < alpha_two && alpha_two < 1.0) alpha_two_OK = true;
+				if (alpha_one_OK || alpha_two_OK) {
+					double alpha = alpha_one;
+					if (alpha_one_OK && alpha_two_OK) {
+						// take the alpha that minimizes d/alpha
+						alpha = Math.max(alpha_one, alpha_two);
+					}
+					else if (alpha_two_OK) {
+						alpha = alpha_two;
+					}
+
+					// now we have k=Ceiling(Max(d/alpha, (h+1)/(2*epsilon)))
+					long k = ceiling(Math.max(d/alpha, (h+1)/(2.0*epsilon)));
+					if (k>0) { // valid solution?
+						long memory = b*k;
+						if (memory < best_memory) { 
+							// found a solution requiring less memory
+							best_k = k;
+							best_b = b;
+							best_h = h;
+							best_memory = memory;
+						}
+					}
+				}
+			} //end for h
+		} //end for b
+		
+		if (best_b == Long.MAX_VALUE) {
+			System.out.println("Warning: Computing b and k looks like a lot of work!");
+			// no solution found so far. very unlikely. Anyway, try again.
+			max_b *= 2;
+			max_h *= 2;
+			max_H *= 2;
+		}
+	} //end while
+
+	long[] result = new long[3];
+	if (best_b == Long.MAX_VALUE) {
+		// no solution found.
+		// no way around exact quantile search.
+		result[0]=1;
+		result[1]=Long.MAX_VALUE;
+		result[2]=Long.MAX_VALUE;
+	}
+	else {
+		result[0]=best_b;
+		result[1]=best_k;
+		result[2]=best_h;
+	}
+
+	return result;
+}
+}

Propchange: lucene/mahout/trunk/matrix/src/main/java/org/apache/mahout/jet/stat/quantile/QuantileCalc.java
------------------------------------------------------------------------------
    svn:eol-style = native