You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2016/09/12 21:32:13 UTC

[1/2] incubator-systemml git commit: [SYSTEMML-382] Performance sparse matrix read (avoid reallocations)

Repository: incubator-systemml
Updated Branches:
  refs/heads/master adc4a5b6f -> deed646e3


[SYSTEMML-382] Performance sparse matrix read (avoid reallocations)

On OpenJDK 8, we observed severe performance issues when reading large
sparse matrices due to excessive GC caused by repeated sparse row
re-allocations. This patch introduces a special case for single-block
appends to sparse matrices: if the target row is still empty and the
column offset is 0, the source row is copied as a whole instead of being
appended cell by cell, which avoids the re-allocations. For reading the
imagenet dataset, this reduced the read time from 77.9s (with 75s GC)
to 3.3s.

Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/538d2713
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/538d2713
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/538d2713

Branch: refs/heads/master
Commit: 538d27136f314a74ad276414f6880618fbbac3ab
Parents: adc4a5b
Author: Matthias Boehm <mb...@us.ibm.com>
Authored: Mon Sep 12 19:51:26 2016 +0200
Committer: Matthias Boehm <mb...@us.ibm.com>
Committed: Mon Sep 12 23:31:02 2016 +0200

----------------------------------------------------------------------
 .../sysml/runtime/matrix/data/MatrixBlock.java     | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/538d2713/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
index 14a409b..73b5d25 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
@@ -756,19 +756,24 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 		
 		if( that.sparse ) //SPARSE <- SPARSE
 		{
+			SparseBlock b = that.sparseBlock;
 			for( int i=0; i<that.rlen; i++ )
 			{
-				SparseBlock b = that.sparseBlock;
-				if( !b.isEmpty(i) ) {
-					int aix = rowoffset+i;
+				if( b.isEmpty(i) ) continue;
+				int aix = rowoffset+i;
+					
+				//single block append (avoid re-allocations)
+				if( sparseBlock.isEmpty(aix) && coloffset==0 ) { 
+					sparseBlock.set(aix, b.get(i), true);
+				}
+				else { //general case
 					int pos = b.pos(i);
 					int len = b.size(i);
 					int[] ix = b.indexes(i);
 					double[] val = b.values(i);
-					
-					sparseBlock.allocate(aix, estimatedNNzsPerRow,clen);
+					sparseBlock.allocate(aix, estimatedNNzsPerRow, clen);
 					for( int j=pos; j<pos+len; j++ )
-						sparseBlock.append(aix, coloffset+ix[j], val[j]);		
+						sparseBlock.append(aix, coloffset+ix[j], val[j]);	
 				}
 			}
 		}


[2/2] incubator-systemml git commit: [SYSTEMML-816] Performance row aggregates over compressed matrices

Posted by mb...@apache.org.
[SYSTEMML-816] Performance row aggregates over compressed matrices

This patch introduces a more efficient way of computing row aggregates
(e.g., rowSums, rowMins, etc.) over compressed matrices. The new approach
does not need a temporary output vector per thread; instead, it
parallelizes over disjoint row partitions. Furthermore, this patch
includes additional tests and various smaller fixes: (1) correcting the
computation of the nnz per row of OLE groups, and (2) removing
unnecessary stdout debug output.

Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/deed646e
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/deed646e
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/deed646e

Branch: refs/heads/master
Commit: deed646e3cc10f0d1e452cb54185b9b46e280096
Parents: 538d271
Author: Matthias Boehm <mb...@us.ibm.com>
Authored: Mon Sep 12 23:27:21 2016 +0200
Committer: Matthias Boehm <mb...@us.ibm.com>
Committed: Mon Sep 12 23:31:07 2016 +0200

----------------------------------------------------------------------
 .../sysml/runtime/compress/ColGroupBitmap.java  |  12 +
 .../sysml/runtime/compress/ColGroupOLE.java     |  74 +-
 .../sysml/runtime/compress/ColGroupRLE.java     |  88 +-
 .../runtime/compress/CompressedMatrixBlock.java |  63 +-
 .../compress/LargeParUnaryAggregateTest.java    | 984 +++++++++++++++++++
 5 files changed, 1149 insertions(+), 72 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/deed646e/src/main/java/org/apache/sysml/runtime/compress/ColGroupBitmap.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/ColGroupBitmap.java b/src/main/java/org/apache/sysml/runtime/compress/ColGroupBitmap.java
index d939d3f..bc57f3d 100644
--- a/src/main/java/org/apache/sysml/runtime/compress/ColGroupBitmap.java
+++ b/src/main/java/org/apache/sysml/runtime/compress/ColGroupBitmap.java
@@ -33,6 +33,7 @@ import org.apache.sysml.runtime.compress.utils.LinearAlgebraUtils;
 import org.apache.sysml.runtime.functionobjects.Builtin;
 import org.apache.sysml.runtime.functionobjects.Builtin.BuiltinCode;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.runtime.matrix.operators.AggregateUnaryOperator;
 import org.apache.sysml.runtime.matrix.operators.ScalarOperator;
 
 
@@ -302,6 +303,17 @@ public abstract class ColGroupBitmap extends ColGroup
 	
 	/**
 	 * 
+	 * @param op
+	 * @param result
+	 * @param rl
+	 * @param ru
+	 * @throws DMLRuntimeException
+	 */
+	public abstract void unaryAggregateOperations(AggregateUnaryOperator op, MatrixBlock result, int rl, int ru)
+		throws DMLRuntimeException;
+	
+	/**
+	 * 
 	 * @param bitmapIx
 	 * @return
 	 */

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/deed646e/src/main/java/org/apache/sysml/runtime/compress/ColGroupOLE.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/ColGroupOLE.java b/src/main/java/org/apache/sysml/runtime/compress/ColGroupOLE.java
index b31cb74..fcfecde 100644
--- a/src/main/java/org/apache/sysml/runtime/compress/ColGroupOLE.java
+++ b/src/main/java/org/apache/sysml/runtime/compress/ColGroupOLE.java
@@ -432,6 +432,13 @@ public class ColGroupOLE extends ColGroupBitmap
 	public void unaryAggregateOperations(AggregateUnaryOperator op, MatrixBlock result) 
 		throws DMLRuntimeException 
 	{
+		unaryAggregateOperations(op, result, 0, getNumRows());
+	}
+	
+	@Override
+	public void unaryAggregateOperations(AggregateUnaryOperator op, MatrixBlock result, int rl, int ru) 
+		throws DMLRuntimeException 
+	{
 		//sum and sumsq (reduceall/reducerow over tuples and counts)
 		if( op.aggOp.increOp.fn instanceof KahanPlus || op.aggOp.increOp.fn instanceof KahanPlusSq ) 
 		{
@@ -441,7 +448,7 @@ public class ColGroupOLE extends ColGroupBitmap
 			if( op.indexFn instanceof ReduceAll )
 				computeSum(result, kplus);
 			else if( op.indexFn instanceof ReduceCol )
-				computeRowSums(result, kplus);
+				computeRowSums(result, kplus, rl, ru);
 			else if( op.indexFn instanceof ReduceRow )
 				computeColSums(result, kplus);
 		}
@@ -455,7 +462,7 @@ public class ColGroupOLE extends ColGroupBitmap
 			if( op.indexFn instanceof ReduceAll )
 				computeMxx(result, builtin);
 			else if( op.indexFn instanceof ReduceCol )
-				computeRowMxx(result, builtin);
+				computeRowMxx(result, builtin, rl, ru);
 			else if( op.indexFn instanceof ReduceRow )
 				computeColMxx(result, builtin);
 		}
@@ -497,12 +504,13 @@ public class ColGroupOLE extends ColGroupBitmap
 	 * 
 	 * @param result
 	 */
-	private void computeRowSums(MatrixBlock result, KahanFunction kplus)
+	private void computeRowSums(MatrixBlock result, KahanFunction kplus, int rl, int ru)
 	{
 		KahanObject kbuff = new KahanObject(0, 0);
 	
 		final int blksz = BitmapEncoder.BITMAP_BLOCK_SZ;
 		final int numVals = getNumValues();
+		double[] c = result.getDenseBlock();
 		
 		//iterate over all values and their bitmaps
 		for (int k = 0; k < numVals; k++) 
@@ -514,16 +522,16 @@ public class ColGroupOLE extends ColGroupBitmap
 			
 			//iterate over bitmap blocks and add values
 			if (val != 0) {
-				int off = 0;
 				int slen;
-				for( int bix = 0; bix < blen; bix += slen + 1, off += blksz ) {
+				int bix = skipScanVal(k, rl);
+				for( int off=bix*blksz; bix<blen && off<ru; bix+=slen+1, off+=blksz ) {
 					slen = _data[boff+bix];
 					for (int i = 1; i <= slen; i++) {
 						int rix = off + _data[boff+bix + i];
-						kbuff.set(result.quickGetValue(rix, 0), result.quickGetValue(rix, 1));
+						kbuff.set(c[2*rix], c[2*rix+1]);
 						kplus.execute2(kbuff, val);
-						result.quickSetValue(rix, 0, kbuff._sum);
-						result.quickSetValue(rix, 1, kbuff._correction);
+						c[2*rix] = kbuff._sum;
+						c[2*rix+1] = kbuff._correction;
 					}
 				}
 			}
@@ -567,12 +575,12 @@ public class ColGroupOLE extends ColGroupBitmap
 	 * 
 	 * @param result
 	 */
-	private void computeRowMxx(MatrixBlock result, Builtin builtin)
+	private void computeRowMxx(MatrixBlock result, Builtin builtin, int rl, int ru)
 	{
 		//NOTE: zeros handled once for all column groups outside
-		
 		final int blksz = BitmapEncoder.BITMAP_BLOCK_SZ;
 		final int numVals = getNumValues();
+		double[] c = result.getDenseBlock();
 		
 		//iterate over all values and their bitmaps
 		for (int k = 0; k < numVals; k++) 
@@ -583,15 +591,13 @@ public class ColGroupOLE extends ColGroupBitmap
 			double val = mxxValues(k, builtin);
 			
 			//iterate over bitmap blocks and add values
-			if (val != 0) {
-				int slen;
-				for( int bix=0, off=0; bix < blen; bix += slen + 1, off += blksz ) {
-					slen = _data[boff+bix];
-					for (int i = 1; i <= slen; i++) {
-						int rix = off + _data[boff+bix + i];
-						result.quickSetValue(rix, 0, 
-							builtin.execute2(result.quickGetValue(rix, 0), val));
-					}
+			int slen;
+			int bix = skipScanVal(k, rl);
+			for( int off=bix*blksz; bix<blen && off<ru; bix+=slen+1, off+=blksz ) {
+				slen = _data[boff+bix];
+				for (int i = 1; i <= slen; i++) {
+					int rix = off + _data[boff+bix + i];
+					c[rix] = builtin.execute2(c[rix], val);
 				}
 			}
 		}
@@ -645,7 +651,6 @@ public class ColGroupOLE extends ColGroupBitmap
 		//current pos per OLs / output values
 		int[] apos = skipScan(numVals, rl);
 		
-		
 		//cache conscious count via horizontal scans 
 		for( int bi=rl; bi<ru; bi+=blksz2 )  {
 			int bimax = Math.min(bi+blksz2, ru);
@@ -660,8 +665,9 @@ public class ColGroupOLE extends ColGroupBitmap
 				//iterate over bitmap blocks and add values
 				for( int off=bi, slen=0; bix<blen && off<bimax; bix+=slen+1, off+=blksz ) {
 					slen = _data[boff+bix];
-					for (int blckIx = 1; blckIx <= slen; blckIx++)
-						rnnz[off + _data[boff+bix + blckIx] - bi] += numCols;
+					for (int blckIx = 1; blckIx <= slen; blckIx++) {
+						rnnz[off + _data[boff+bix + blckIx] - rl] += numCols;
+					}
 				}
 				
 				apos[k] = bix;
@@ -702,4 +708,28 @@ public class ColGroupOLE extends ColGroupBitmap
 		
 		return ret;
 	}
+	
+	/**
+	 * 
+	 * @param k
+	 * @param rl
+	 * @return
+	 */
+	private int skipScanVal(int k, int rl) {
+		final int blksz = BitmapEncoder.BITMAP_BLOCK_SZ;
+		
+		if( rl > 0 ) { //rl aligned with blksz		
+			int rskip = (getNumRows()/2/blksz)*blksz;
+			int boff = _ptr[k];
+			int blen = len(k);
+			int start = (rl>=rskip)?rskip:0;
+			int bix = (rl>=rskip)?_skiplist[k]:0;
+			for( int i=start; i<rl && bix<blen; i+=blksz ) {
+				bix += _data[boff+bix] + 1;
+			}
+			return bix;
+		}
+		
+		return 0;
+	}
 }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/deed646e/src/main/java/org/apache/sysml/runtime/compress/ColGroupRLE.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/ColGroupRLE.java b/src/main/java/org/apache/sysml/runtime/compress/ColGroupRLE.java
index 19fede7..35b876b 100644
--- a/src/main/java/org/apache/sysml/runtime/compress/ColGroupRLE.java
+++ b/src/main/java/org/apache/sysml/runtime/compress/ColGroupRLE.java
@@ -35,6 +35,7 @@ import org.apache.sysml.runtime.functionobjects.ReduceRow;
 import org.apache.sysml.runtime.functionobjects.Builtin.BuiltinCode;
 import org.apache.sysml.runtime.instructions.cp.KahanObject;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.runtime.matrix.data.Pair;
 import org.apache.sysml.runtime.matrix.operators.AggregateUnaryOperator;
 import org.apache.sysml.runtime.matrix.operators.ScalarOperator;
 
@@ -437,6 +438,14 @@ public class ColGroupRLE extends ColGroupBitmap
 	
 	@Override
 	public void unaryAggregateOperations(AggregateUnaryOperator op, MatrixBlock result) 
+		throws DMLRuntimeException
+	{
+		unaryAggregateOperations(op, result, 0, getNumRows());
+	}
+	
+	
+	@Override
+	public void unaryAggregateOperations(AggregateUnaryOperator op, MatrixBlock result, int rl, int ru) 
 		throws DMLRuntimeException 
 	{
 		//sum and sumsq (reduceall/reducerow over tuples and counts)
@@ -448,7 +457,7 @@ public class ColGroupRLE extends ColGroupBitmap
 			if( op.indexFn instanceof ReduceAll )
 				computeSum(result, kplus);
 			else if( op.indexFn instanceof ReduceCol )
-				computeRowSums(result, kplus);
+				computeRowSums(result, kplus, rl, ru);
 			else if( op.indexFn instanceof ReduceRow )
 				computeColSums(result, kplus);
 		}
@@ -462,7 +471,7 @@ public class ColGroupRLE extends ColGroupBitmap
 			if( op.indexFn instanceof ReduceAll )
 				computeMxx(result, builtin);
 			else if( op.indexFn instanceof ReduceCol )
-				computeRowMxx(result, builtin);
+				computeRowMxx(result, builtin, rl, ru);
 			else if( op.indexFn instanceof ReduceRow )
 				computeColMxx(result, builtin);
 		}
@@ -504,10 +513,11 @@ public class ColGroupRLE extends ColGroupBitmap
 	 * 
 	 * @param result
 	 */
-	private void computeRowSums(MatrixBlock result, KahanFunction kplus)
+	private void computeRowSums(MatrixBlock result, KahanFunction kplus, int rl, int ru)
 	{
 		KahanObject kbuff = new KahanObject(0, 0);
 		final int numVals = getNumValues();
+		double[] c = result.getDenseBlock();
 		
 		for (int k = 0; k < numVals; k++) {
 			int boff = _ptr[k];
@@ -515,16 +525,18 @@ public class ColGroupRLE extends ColGroupBitmap
 			double val = sumValues(k);
 					
 			if (val != 0.0) {
-				int curRunStartOff = 0;
-				int curRunEnd = 0;
-				for (int bix = 0; bix < blen; bix+=2) {
+				Pair<Integer,Integer> tmp = skipScanVal(k, rl);
+				int bix = tmp.getKey();
+				int curRunStartOff = tmp.getValue();
+				int curRunEnd = tmp.getValue();
+				for ( ; bix<blen && curRunEnd<ru; bix+=2) {
 					curRunStartOff = curRunEnd + _data[boff+bix];
 					curRunEnd = curRunStartOff + _data[boff+bix+1];
-					for (int rix = curRunStartOff; rix < curRunEnd; rix++) {
-						kbuff.set(result.quickGetValue(rix, 0), result.quickGetValue(rix, 1));
+					for (int rix=curRunStartOff; rix<curRunEnd && rix<ru; rix++) {
+						kbuff.set(c[2*rix], c[2*rix+1]);
 						kplus.execute2(kbuff, val);
-						result.quickSetValue(rix, 0, kbuff._sum);
-						result.quickSetValue(rix, 1, kbuff._correction);
+						c[2*rix] = kbuff._sum;
+						c[2*rix+1] = kbuff._correction;
 					}
 				}
 			}
@@ -569,28 +581,26 @@ public class ColGroupRLE extends ColGroupBitmap
 	 * 
 	 * @param result
 	 */
-	private void computeRowMxx(MatrixBlock result, Builtin builtin)
+	private void computeRowMxx(MatrixBlock result, Builtin builtin, int rl, int ru)
 	{
 		//NOTE: zeros handled once for all column groups outside
-		
 		final int numVals = getNumValues();
+		double[] c = result.getDenseBlock();
 		
 		for (int k = 0; k < numVals; k++) {
 			int boff = _ptr[k];
 			int blen = len(k);
 			double val = mxxValues(k, builtin);
-					
-			if (val != 0.0) {
-				int curRunStartOff = 0;
-				int curRunEnd = 0;
-				for (int bix = 0; bix < blen; bix+=2) {
-					curRunStartOff = curRunEnd + _data[boff+bix];
-					curRunEnd = curRunStartOff + _data[boff+bix+1];
-					for (int rix = curRunStartOff; rix < curRunEnd; rix++) {
-						result.quickSetValue(rix, 0, 
-								builtin.execute2(result.quickGetValue(rix, 0), val));
-					}
-				}
+			
+			Pair<Integer,Integer> tmp = skipScanVal(k, rl);
+			int bix = tmp.getKey();
+			int curRunStartOff = tmp.getValue();
+			int curRunEnd = tmp.getValue();
+			for(; bix < blen && curRunEnd < ru; bix+=2) {
+				curRunStartOff = curRunEnd + _data[boff+bix];
+				curRunEnd = curRunStartOff + _data[boff+bix+1];
+				for (int rix=curRunStartOff; rix<curRunEnd && rix<ru; rix++)
+					c[rix] = builtin.execute2(c[rix], val);
 			}
 		}
 	}
@@ -684,4 +694,34 @@ public class ColGroupRLE extends ColGroupBitmap
 		
 		return apos;
 	}
+	
+	/**
+	 * 
+	 * @param k
+	 * @param rl
+	 * @return
+	 */
+	private Pair<Integer,Integer> skipScanVal(int k, int rl) {
+		int apos = 0; 
+		int astart = 0;
+		
+		if( rl > 0 ) { //rl aligned with blksz	
+			int boff = _ptr[k];
+			int blen = len(k);
+			int bix = 0;
+			int start = 0;
+			while( bix<blen ) {	
+				int lstart = _data[boff + bix]; //start
+				int llen = _data[boff + bix + 1]; //len
+				if( start+lstart+llen >= rl )
+					break;
+				start += lstart + llen;
+				bix += 2;
+			}
+			apos = bix;
+			astart = start;
+		}
+		
+		return new Pair<Integer,Integer>(apos, astart);
+	}
 }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/deed646e/src/main/java/org/apache/sysml/runtime/compress/CompressedMatrixBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/CompressedMatrixBlock.java b/src/main/java/org/apache/sysml/runtime/compress/CompressedMatrixBlock.java
index 2b23520..16d03b5 100644
--- a/src/main/java/org/apache/sysml/runtime/compress/CompressedMatrixBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/compress/CompressedMatrixBlock.java
@@ -59,8 +59,8 @@ import org.apache.sysml.runtime.functionobjects.Builtin.BuiltinCode;
 import org.apache.sysml.runtime.functionobjects.KahanPlus;
 import org.apache.sysml.runtime.functionobjects.KahanPlusSq;
 import org.apache.sysml.runtime.functionobjects.Multiply;
+import org.apache.sysml.runtime.functionobjects.ReduceAll;
 import org.apache.sysml.runtime.functionobjects.ReduceCol;
-import org.apache.sysml.runtime.functionobjects.ReduceRow;
 import org.apache.sysml.runtime.instructions.cp.CM_COV_Object;
 import org.apache.sysml.runtime.instructions.cp.ScalarObject;
 import org.apache.sysml.runtime.matrix.data.CTableMap;
@@ -1020,6 +1020,8 @@ public class CompressedMatrixBlock extends MatrixBlock implements Externalizable
 			throw new DMLRuntimeException("Unary aggregates other than sum/sumsq/min/max not supported yet.");
 		}
 		
+		Timing time = LOG.isDebugEnabled() ? new Timing(true) : null;
+
 		//prepare output dimensions
 		CellIndex tempCellIndex = new CellIndex(-1,-1);
 		op.indexFn.computeDimension(rlen, clen, tempCellIndex);
@@ -1053,9 +1055,9 @@ public class CompressedMatrixBlock extends MatrixBlock implements Externalizable
 		if(    op.getNumThreads() > 1 
 			&& getExactSizeOnDisk() > MIN_PAR_AGG_THRESHOLD ) 
 		{
-			
 			//multi-threaded execution of all groups 
-			ArrayList<ColGroup>[] grpParts = createStaticTaskPartitioning(op.getNumThreads(), false);
+			ArrayList<ColGroup>[] grpParts = createStaticTaskPartitioning(
+					(op.indexFn instanceof ReduceCol) ? 1 : op.getNumThreads(), false);
 			ColGroupUncompressed uc = getUncompressedColGroup();
 			try {
 				//compute uncompressed column group in parallel (otherwise bottleneck)
@@ -1064,20 +1066,26 @@ public class CompressedMatrixBlock extends MatrixBlock implements Externalizable
 				//compute all compressed column groups
 				ExecutorService pool = Executors.newFixedThreadPool( op.getNumThreads() );
 				ArrayList<UnaryAggregateTask> tasks = new ArrayList<UnaryAggregateTask>();
-				for( ArrayList<ColGroup> grp : grpParts )
-					tasks.add(new UnaryAggregateTask(grp, ret, op));
-				pool.invokeAll(tasks);	
+				if( op.indexFn instanceof ReduceCol && grpParts.length > 0 ) {
+					int seqsz = BitmapEncoder.BITMAP_BLOCK_SZ;
+					int blklen = (int)(Math.ceil((double)rlen/op.getNumThreads()));
+					blklen += (blklen%seqsz != 0)?seqsz-blklen%seqsz:0;
+					for( int i=0; i<op.getNumThreads() & i*blklen<rlen; i++ )
+						tasks.add(new UnaryAggregateTask(grpParts[0], ret, i*blklen, Math.min((i+1)*blklen,rlen), op));
+				}
+				else
+					for( ArrayList<ColGroup> grp : grpParts )
+						tasks.add(new UnaryAggregateTask(grp, ret, 0, rlen, op));
+				List<Future<MatrixBlock>> rtasks = pool.invokeAll(tasks);	
 				pool.shutdown();
 				
 				//aggregate partial results
-				if( !(op.indexFn instanceof ReduceRow) ) {
-					for( int i=0; i<ret.getNumRows(); i++ ) {
-						double val = ret.quickGetValue(i, 0);
-						for( UnaryAggregateTask task : tasks )
-							val = op.aggOp.increOp.fn.execute(val,
-									task.getResult().quickGetValue(i, 0));
-						ret.quickSetValue(i, 0, val);
-					}
+				if( op.indexFn instanceof ReduceAll ) {
+					double val = ret.quickGetValue(0, 0);
+					for( Future<MatrixBlock> rtask : rtasks )
+						val = op.aggOp.increOp.fn.execute(val, 
+								rtask.get().quickGetValue(0, 0));
+					ret.quickSetValue(0, 0, val);
 				}		
 			}
 			catch(Exception ex) {
@@ -1114,6 +1122,10 @@ public class CompressedMatrixBlock extends MatrixBlock implements Externalizable
 		//post-processing
 		ret.recomputeNonZeros();
 		
+		if( LOG.isDebugEnabled() )
+			LOG.debug("Compressed uagg k="+op.getNumThreads()+" in "+time.stop());
+		
+		
 		return ret;
 	}
 	
@@ -1525,17 +1537,21 @@ public class CompressedMatrixBlock extends MatrixBlock implements Externalizable
 		}
 	}
 	
-	private static class UnaryAggregateTask implements Callable<Object> 
+	private static class UnaryAggregateTask implements Callable<MatrixBlock> 
 	{
 		private ArrayList<ColGroup> _groups = null;
+		private int _rl = -1;
+		private int _ru = -1;
 		private MatrixBlock _ret = null;
 		private AggregateUnaryOperator _op = null;
 		
-		protected UnaryAggregateTask( ArrayList<ColGroup> groups, MatrixBlock ret, AggregateUnaryOperator op)  {
+		protected UnaryAggregateTask( ArrayList<ColGroup> groups, MatrixBlock ret, int rl, int ru, AggregateUnaryOperator op)  {
 			_groups = groups;
 			_op = op;
+			_rl = rl;
+			_ru = ru;
 			
-			if( !(_op.indexFn instanceof ReduceRow) ) { //sum/rowSums
+			if( _op.indexFn instanceof ReduceAll ) { //sum
 				_ret = new MatrixBlock(ret.getNumRows(), ret.getNumColumns(), false);
 				_ret.allocateDenseBlock();
 				if( _op.aggOp.increOp.fn instanceof Builtin )
@@ -1544,19 +1560,14 @@ public class CompressedMatrixBlock extends MatrixBlock implements Externalizable
 			else { //colSums
 				_ret = ret;
 			}
-			System.out.println(_ret.getNonZeros());
 		}
 		
 		@Override
-		public Object call() throws DMLRuntimeException 
-		{
-			// delegate vector-matrix operation to each column group
+		public MatrixBlock call() throws DMLRuntimeException {
+			// delegate unary aggregate operation to each column group
+			// (uncompressed column group handles separately)
 			for( ColGroup grp : _groups )
-				grp.unaryAggregateOperations(_op, _ret);
-			return null;
-		}
-		
-		public MatrixBlock getResult(){
+				((ColGroupBitmap)grp).unaryAggregateOperations(_op, _ret, _rl, _ru);
 			return _ret;
 		}
 	}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/deed646e/src/test/java/org/apache/sysml/test/integration/functions/compress/LargeParUnaryAggregateTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/compress/LargeParUnaryAggregateTest.java b/src/test/java/org/apache/sysml/test/integration/functions/compress/LargeParUnaryAggregateTest.java
new file mode 100644
index 0000000..6cd1f35
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/integration/functions/compress/LargeParUnaryAggregateTest.java
@@ -0,0 +1,984 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.integration.functions.compress;
+
+import org.apache.sysml.runtime.compress.BitmapEncoder;
+import org.apache.sysml.runtime.compress.CompressedMatrixBlock;
+import org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
+import org.apache.sysml.runtime.instructions.InstructionUtils;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.runtime.matrix.operators.AggregateUnaryOperator;
+import org.apache.sysml.runtime.util.DataConverter;
+import org.apache.sysml.test.integration.AutomatedTestBase;
+import org.apache.sysml.test.utils.TestUtils;
+import org.junit.Test;
+
+
+/**
+ * 
+ */
+public class LargeParUnaryAggregateTest extends AutomatedTestBase
+{	
+	private static final int rows = 5*BitmapEncoder.BITMAP_BLOCK_SZ;
+	private static final int cols = 20;
+	private static final double sparsity1 = 0.9;
+	private static final double sparsity2 = 0.1;
+	private static final double sparsity3 = 0.0;
+	
+	public enum SparsityType {
+		DENSE,
+		SPARSE,
+		EMPTY,
+	}
+	
+	public enum ValueType {
+		RAND,
+		RAND_ROUND,
+		CONST,
+	}
+	
+	public enum AggType {
+		ROWSUMS,
+		COLSUMS,
+		SUM,
+		ROWSUMSSQ,
+		COLSUMSSQ,
+		SUMSQ,
+		ROWMAXS,
+		COLMAXS,
+		MAX,
+		ROWMINS,
+		COLMINS,
+		MIN,
+	}
+	
+	@Override
+	public void setUp() {
+		
+	}
+	
+	@Test
+	public void testRowSumsDenseRandDataCompression() {
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND, AggType.ROWSUMS, true);
+	}
+	
+	@Test
+	public void testRowSumsSparseRandDataCompression() {
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND, AggType.ROWSUMS, true);
+	}
+	
+	@Test
+	public void testRowSumsEmptyCompression() {
+		runUnaryAggregateTest(SparsityType.EMPTY, ValueType.RAND, AggType.ROWSUMS, true);
+	}
+	
+	@Test
+	public void testRowSumsDenseRoundRandDataCompression() {
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND_ROUND, AggType.ROWSUMS, true);
+	}
+	
+	@Test
+	public void testRowSumsSparseRoundRandDataCompression() {
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND_ROUND, AggType.ROWSUMS, true);
+	}
+	
+	@Test
+	public void testRowSumsDenseConstantDataCompression() {
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.CONST, AggType.ROWSUMS, true);
+	}
+	
+	@Test
+	public void testRowSumsSparseConstDataCompression() {
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.CONST, AggType.ROWSUMS, true);
+	}
+	
+	@Test
+	public void testRowSumsDenseRandDataNoCompression() {
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND, AggType.ROWSUMS, false);
+	}
+	
+	@Test
+	public void testRowSumsSparseRandDataNoCompression() {
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND, AggType.ROWSUMS, false);
+	}
+	
+	@Test
+	public void testRowSumsEmptyNoCompression() {
+		runUnaryAggregateTest(SparsityType.EMPTY, ValueType.RAND, AggType.ROWSUMS, false);
+	}
+	
+	@Test
+	public void testRowSumsDenseRoundRandDataNoCompression() {
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND_ROUND, AggType.ROWSUMS, false);
+	}
+	
+	@Test
+	public void testRowSumsSparseRoundRandDataNoCompression() {
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND_ROUND, AggType.ROWSUMS, false);
+	}
+	
+	@Test
+	public void testRowSumsDenseConstDataNoCompression() {
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.CONST, AggType.ROWSUMS, false);
+	}
+	
+	@Test
+	public void testRowSumsSparseConstDataNoCompression() {
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.CONST, AggType.ROWSUMS, false);
+	}
+	
+	@Test
+	public void testColSumsDenseRandDataCompression() {
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND, AggType.COLSUMS, true);
+	}
+	
+	@Test
+	public void testColSumsSparseRandDataCompression() {
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND, AggType.COLSUMS, true);
+	}
+	
+	@Test
+	public void testColSumsEmptyCompression() {
+		runUnaryAggregateTest(SparsityType.EMPTY, ValueType.RAND, AggType.COLSUMS, true);
+	}
+	
+	@Test
+	public void testColSumsDenseRoundRandDataCompression() {
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND_ROUND, AggType.COLSUMS, true);
+	}
+	
+	@Test
+	public void testColSumsSparseRoundRandDataCompression() {
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND_ROUND, AggType.COLSUMS, true);
+	}
+	
+	@Test
+	public void testColSumsDenseConstantDataCompression() {
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.CONST, AggType.COLSUMS, true);
+	}
+	
+	@Test
+	public void testColSumsSparseConstDataCompression() {
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.CONST, AggType.COLSUMS, true);
+	}
+	
+	@Test
+	public void testColSumsDenseRandDataNoCompression() {
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND, AggType.COLSUMS, false);
+	}
+	
+	@Test
+	public void testColSumsSparseRandDataNoCompression() {
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND, AggType.COLSUMS, false);
+	}
+	
+	@Test
+	public void testColSumsEmptyNoCompression() {
+		runUnaryAggregateTest(SparsityType.EMPTY, ValueType.RAND, AggType.COLSUMS, false);
+	}
+	
+	@Test
+	public void testColSumsDenseRoundRandDataNoCompression() {
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND_ROUND, AggType.COLSUMS, false);
+	}
+	
+	@Test
+	public void testColSumsSparseRoundRandDataNoCompression() {
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND_ROUND, AggType.COLSUMS, false);
+	}
+	
+	@Test
+	public void testColSumsDenseConstDataNoCompression() {
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.CONST, AggType.COLSUMS, false);
+	}
+	
+	@Test
+	public void testColSumsSparseConstDataNoCompression() {
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.CONST, AggType.COLSUMS, false);
+	}
+
+	@Test
+	public void testSumDenseRandDataCompression() {
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND, AggType.SUM, true);
+	}
+	
+	@Test
+	public void testSumSparseRandDataCompression() {
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND, AggType.SUM, true);
+	}
+	
+	@Test
+	public void testSumEmptyCompression() {
+		runUnaryAggregateTest(SparsityType.EMPTY, ValueType.RAND, AggType.SUM, true);
+	}
+	
+	@Test
+	public void testSumDenseRoundRandDataCompression() {
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND_ROUND, AggType.SUM, true);
+	}
+	
+	@Test
+	public void testSumSparseRoundRandDataCompression() {
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND_ROUND, AggType.SUM, true);
+	}
+	
+	/** Checks SUM on DENSE/CONST input with compression. */
+	@Test
+	public void testSumDenseConstantDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.CONST, AggType.SUM, compress);
+	}
+	
+	/** Checks SUM on SPARSE/CONST input with compression. */
+	@Test
+	public void testSumSparseConstDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.CONST, AggType.SUM, compress);
+	}
+	
+	/** Checks SUM on DENSE/RAND input without compression. */
+	@Test
+	public void testSumDenseRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND, AggType.SUM, compress);
+	}
+	
+	/** Checks SUM on SPARSE/RAND input without compression. */
+	@Test
+	public void testSumSparseRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND, AggType.SUM, compress);
+	}
+	
+	/** Checks SUM on EMPTY/RAND input without compression. */
+	@Test
+	public void testSumEmptyNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.EMPTY, ValueType.RAND, AggType.SUM, compress);
+	}
+	
+	/** Checks SUM on DENSE/RAND_ROUND input without compression. */
+	@Test
+	public void testSumDenseRoundRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND_ROUND, AggType.SUM, compress);
+	}
+	
+	/** Checks SUM on SPARSE/RAND_ROUND input without compression. */
+	@Test
+	public void testSumSparseRoundRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND_ROUND, AggType.SUM, compress);
+	}
+	
+	/** Checks SUM on DENSE/CONST input without compression. */
+	@Test
+	public void testSumDenseConstDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.CONST, AggType.SUM, compress);
+	}
+	
+	/** Checks SUM on SPARSE/CONST input without compression. */
+	@Test
+	public void testSumSparseConstDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.CONST, AggType.SUM, compress);
+	}
+	
+	/** Checks ROWSUMSSQ on DENSE/RAND input with compression. */
+	@Test
+	public void testRowSumsSqDenseRandDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND, AggType.ROWSUMSSQ, compress);
+	}
+	
+	/** Checks ROWSUMSSQ on SPARSE/RAND input with compression. */
+	@Test
+	public void testRowSumsSqSparseRandDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND, AggType.ROWSUMSSQ, compress);
+	}
+	
+	/** Checks ROWSUMSSQ on EMPTY/RAND input with compression. */
+	@Test
+	public void testRowSumsSqEmptyCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.EMPTY, ValueType.RAND, AggType.ROWSUMSSQ, compress);
+	}
+	
+	/** Checks ROWSUMSSQ on DENSE/RAND_ROUND input with compression. */
+	@Test
+	public void testRowSumsSqDenseRoundRandDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND_ROUND, AggType.ROWSUMSSQ, compress);
+	}
+	
+	/** Checks ROWSUMSSQ on SPARSE/RAND_ROUND input with compression. */
+	@Test
+	public void testRowSumsSqSparseRoundRandDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND_ROUND, AggType.ROWSUMSSQ, compress);
+	}
+	
+	/** Checks ROWSUMSSQ on DENSE/CONST input with compression. */
+	@Test
+	public void testRowSumsSqDenseConstantDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.CONST, AggType.ROWSUMSSQ, compress);
+	}
+	
+	//@Test
+	//public void testRowSumsSqSparseConstDataCompression() {
+	//	runUnaryAggregateTest(SparsityType.SPARSE, ValueType.CONST, AggType.ROWSUMSSQ, true);
+	//}
+	
+	/** Checks ROWSUMSSQ on DENSE/RAND input without compression. */
+	@Test
+	public void testRowSumsSqDenseRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND, AggType.ROWSUMSSQ, compress);
+	}
+	
+	/** Checks ROWSUMSSQ on SPARSE/RAND input without compression. */
+	@Test
+	public void testRowSumsSqSparseRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND, AggType.ROWSUMSSQ, compress);
+	}
+	
+	/** Checks ROWSUMSSQ on EMPTY/RAND input without compression. */
+	@Test
+	public void testRowSumsSqEmptyNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.EMPTY, ValueType.RAND, AggType.ROWSUMSSQ, compress);
+	}
+	
+	/** Checks ROWSUMSSQ on DENSE/RAND_ROUND input without compression. */
+	@Test
+	public void testRowSumsSqDenseRoundRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND_ROUND, AggType.ROWSUMSSQ, compress);
+	}
+	
+	/** Checks ROWSUMSSQ on SPARSE/RAND_ROUND input without compression. */
+	@Test
+	public void testRowSumsSqSparseRoundRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND_ROUND, AggType.ROWSUMSSQ, compress);
+	}
+	
+	/** Checks ROWSUMSSQ on DENSE/CONST input without compression. */
+	@Test
+	public void testRowSumsSqDenseConstDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.CONST, AggType.ROWSUMSSQ, compress);
+	}
+	
+	/** Checks ROWSUMSSQ on SPARSE/CONST input without compression. */
+	@Test
+	public void testRowSumsSqSparseConstDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.CONST, AggType.ROWSUMSSQ, compress);
+	}
+	
+	/** Checks COLSUMSSQ on DENSE/RAND input with compression. */
+	@Test
+	public void testColSumsSqDenseRandDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND, AggType.COLSUMSSQ, compress);
+	}
+	
+	/** Checks COLSUMSSQ on SPARSE/RAND input with compression. */
+	@Test
+	public void testColSumsSqSparseRandDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND, AggType.COLSUMSSQ, compress);
+	}
+	
+	/** Checks COLSUMSSQ on EMPTY/RAND input with compression. */
+	@Test
+	public void testColSumsSqEmptyCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.EMPTY, ValueType.RAND, AggType.COLSUMSSQ, compress);
+	}
+	
+	/** Checks COLSUMSSQ on DENSE/RAND_ROUND input with compression. */
+	@Test
+	public void testColSumsSqDenseRoundRandDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND_ROUND, AggType.COLSUMSSQ, compress);
+	}
+	
+	/** Checks COLSUMSSQ on SPARSE/RAND_ROUND input with compression. */
+	@Test
+	public void testColSumsSqSparseRoundRandDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND_ROUND, AggType.COLSUMSSQ, compress);
+	}
+	
+	/** Checks COLSUMSSQ on DENSE/CONST input with compression. */
+	@Test
+	public void testColSumsSqDenseConstantDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.CONST, AggType.COLSUMSSQ, compress);
+	}
+	
+	/** Checks COLSUMSSQ on SPARSE/CONST input with compression. */
+	@Test
+	public void testColSumsSqSparseConstDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.CONST, AggType.COLSUMSSQ, compress);
+	}
+	
+	/** Checks COLSUMSSQ on DENSE/RAND input without compression. */
+	@Test
+	public void testColSumsSqDenseRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND, AggType.COLSUMSSQ, compress);
+	}
+	
+	/** Checks COLSUMSSQ on SPARSE/RAND input without compression. */
+	@Test
+	public void testColSumsSqSparseRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND, AggType.COLSUMSSQ, compress);
+	}
+	
+	/** Checks COLSUMSSQ on EMPTY/RAND input without compression. */
+	@Test
+	public void testColSumsSqEmptyNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.EMPTY, ValueType.RAND, AggType.COLSUMSSQ, compress);
+	}
+	
+	/** Checks COLSUMSSQ on DENSE/RAND_ROUND input without compression. */
+	@Test
+	public void testColSumsSqDenseRoundRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND_ROUND, AggType.COLSUMSSQ, compress);
+	}
+	
+	/** Checks COLSUMSSQ on SPARSE/RAND_ROUND input without compression. */
+	@Test
+	public void testColSumsSqSparseRoundRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND_ROUND, AggType.COLSUMSSQ, compress);
+	}
+	
+	/** Checks COLSUMSSQ on DENSE/CONST input without compression. */
+	@Test
+	public void testColSumsSqDenseConstDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.CONST, AggType.COLSUMSSQ, compress);
+	}
+	
+	/** Checks COLSUMSSQ on SPARSE/CONST input without compression. */
+	@Test
+	public void testColSumsSqSparseConstDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.CONST, AggType.COLSUMSSQ, compress);
+	}
+
+	/** Checks SUMSQ on DENSE/RAND input with compression. */
+	@Test
+	public void testSumSqDenseRandDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND, AggType.SUMSQ, compress);
+	}
+	
+	/** Checks SUMSQ on SPARSE/RAND input with compression. */
+	@Test
+	public void testSumSqSparseRandDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND, AggType.SUMSQ, compress);
+	}
+	
+	/** Checks SUMSQ on EMPTY/RAND input with compression. */
+	@Test
+	public void testSumSqEmptyCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.EMPTY, ValueType.RAND, AggType.SUMSQ, compress);
+	}
+	
+	/** Checks SUMSQ on DENSE/RAND_ROUND input with compression. */
+	@Test
+	public void testSumSqDenseRoundRandDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND_ROUND, AggType.SUMSQ, compress);
+	}
+	
+	/** Checks SUMSQ on SPARSE/RAND_ROUND input with compression. */
+	@Test
+	public void testSumSqSparseRoundRandDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND_ROUND, AggType.SUMSQ, compress);
+	}
+	
+	/** Checks SUMSQ on DENSE/CONST input with compression. */
+	@Test
+	public void testSumSqDenseConstantDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.CONST, AggType.SUMSQ, compress);
+	}
+	
+	/** Checks SUMSQ on SPARSE/CONST input with compression. */
+	@Test
+	public void testSumSqSparseConstDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.CONST, AggType.SUMSQ, compress);
+	}
+	
+	/** Checks SUMSQ on DENSE/RAND input without compression. */
+	@Test
+	public void testSumSqDenseRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND, AggType.SUMSQ, compress);
+	}
+	
+	/** Checks SUMSQ on SPARSE/RAND input without compression. */
+	@Test
+	public void testSumSqSparseRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND, AggType.SUMSQ, compress);
+	}
+	
+	/** Checks SUMSQ on EMPTY/RAND input without compression. */
+	@Test
+	public void testSumSqEmptyNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.EMPTY, ValueType.RAND, AggType.SUMSQ, compress);
+	}
+	
+	/** Checks SUMSQ on DENSE/RAND_ROUND input without compression. */
+	@Test
+	public void testSumSqDenseRoundRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND_ROUND, AggType.SUMSQ, compress);
+	}
+	
+	/** Checks SUMSQ on SPARSE/RAND_ROUND input without compression. */
+	@Test
+	public void testSumSqSparseRoundRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND_ROUND, AggType.SUMSQ, compress);
+	}
+	
+	/** Checks SUMSQ on DENSE/CONST input without compression. */
+	@Test
+	public void testSumSqDenseConstDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.CONST, AggType.SUMSQ, compress);
+	}
+	
+	/** Checks SUMSQ on SPARSE/CONST input without compression. */
+	@Test
+	public void testSumSqSparseConstDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.CONST, AggType.SUMSQ, compress);
+	}
+	
+
+	/** Checks ROWMAXS on DENSE/RAND input with compression. */
+	@Test
+	public void testRowMaxsDenseRandDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND, AggType.ROWMAXS, compress);
+	}
+	
+	/** Checks ROWMAXS on SPARSE/RAND input with compression. */
+	@Test
+	public void testRowMaxsSparseRandDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND, AggType.ROWMAXS, compress);
+	}
+	
+	/** Checks ROWMAXS on EMPTY/RAND input with compression. */
+	@Test
+	public void testRowMaxsEmptyCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.EMPTY, ValueType.RAND, AggType.ROWMAXS, compress);
+	}
+	
+	/** Checks ROWMAXS on DENSE/RAND_ROUND input with compression. */
+	@Test
+	public void testRowMaxsDenseRoundRandDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND_ROUND, AggType.ROWMAXS, compress);
+	}
+	
+	/** Checks ROWMAXS on SPARSE/RAND_ROUND input with compression. */
+	@Test
+	public void testRowMaxsSparseRoundRandDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND_ROUND, AggType.ROWMAXS, compress);
+	}
+	
+	/** Checks ROWMAXS on DENSE/CONST input with compression. */
+	@Test
+	public void testRowMaxsDenseConstantDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.CONST, AggType.ROWMAXS, compress);
+	}
+	
+	/** Checks ROWMAXS on SPARSE/CONST input with compression. */
+	@Test
+	public void testRowMaxsSparseConstDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.CONST, AggType.ROWMAXS, compress);
+	}
+	
+	/** Checks ROWMAXS on DENSE/RAND input without compression. */
+	@Test
+	public void testRowMaxsDenseRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND, AggType.ROWMAXS, compress);
+	}
+	
+	/** Checks ROWMAXS on SPARSE/RAND input without compression. */
+	@Test
+	public void testRowMaxsSparseRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND, AggType.ROWMAXS, compress);
+	}
+	
+	/** Checks ROWMAXS on EMPTY/RAND input without compression. */
+	@Test
+	public void testRowMaxsEmptyNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.EMPTY, ValueType.RAND, AggType.ROWMAXS, compress);
+	}
+	
+	/** Checks ROWMAXS on DENSE/RAND_ROUND input without compression. */
+	@Test
+	public void testRowMaxsDenseRoundRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND_ROUND, AggType.ROWMAXS, compress);
+	}
+	
+	/** Checks ROWMAXS on SPARSE/RAND_ROUND input without compression. */
+	@Test
+	public void testRowMaxsSparseRoundRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND_ROUND, AggType.ROWMAXS, compress);
+	}
+	
+	/** Checks ROWMAXS on DENSE/CONST input without compression. */
+	@Test
+	public void testRowMaxsDenseConstDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.CONST, AggType.ROWMAXS, compress);
+	}
+	
+	/** Checks ROWMAXS on SPARSE/CONST input without compression. */
+	@Test
+	public void testRowMaxsSparseConstDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.CONST, AggType.ROWMAXS, compress);
+	}
+	
+	/** Checks COLMAXS on DENSE/RAND input with compression. */
+	@Test
+	public void testColMaxsDenseRandDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND, AggType.COLMAXS, compress);
+	}
+	
+	/** Checks COLMAXS on SPARSE/RAND input with compression. */
+	@Test
+	public void testColMaxsSparseRandDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND, AggType.COLMAXS, compress);
+	}
+	
+	/** Checks COLMAXS on EMPTY/RAND input with compression. */
+	@Test
+	public void testColMaxsEmptyCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.EMPTY, ValueType.RAND, AggType.COLMAXS, compress);
+	}
+	
+	/** Checks COLMAXS on DENSE/RAND_ROUND input with compression. */
+	@Test
+	public void testColMaxsDenseRoundRandDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND_ROUND, AggType.COLMAXS, compress);
+	}
+	
+	/** Checks COLMAXS on SPARSE/RAND_ROUND input with compression. */
+	@Test
+	public void testColMaxsSparseRoundRandDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND_ROUND, AggType.COLMAXS, compress);
+	}
+	
+	/** Checks COLMAXS on DENSE/CONST input with compression. */
+	@Test
+	public void testColMaxsDenseConstantDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.CONST, AggType.COLMAXS, compress);
+	}
+	
+	/** Checks COLMAXS on SPARSE/CONST input with compression. */
+	@Test
+	public void testColMaxsSparseConstDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.CONST, AggType.COLMAXS, compress);
+	}
+	
+	/** Checks COLMAXS on DENSE/RAND input without compression. */
+	@Test
+	public void testColMaxsDenseRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND, AggType.COLMAXS, compress);
+	}
+	
+	/** Checks COLMAXS on SPARSE/RAND input without compression. */
+	@Test
+	public void testColMaxsSparseRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND, AggType.COLMAXS, compress);
+	}
+	
+	/** Checks COLMAXS on EMPTY/RAND input without compression. */
+	@Test
+	public void testColMaxsEmptyNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.EMPTY, ValueType.RAND, AggType.COLMAXS, compress);
+	}
+	
+	/** Checks COLMAXS on DENSE/RAND_ROUND input without compression. */
+	@Test
+	public void testColMaxsDenseRoundRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND_ROUND, AggType.COLMAXS, compress);
+	}
+	
+	/** Checks COLMAXS on SPARSE/RAND_ROUND input without compression. */
+	@Test
+	public void testColMaxsSparseRoundRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND_ROUND, AggType.COLMAXS, compress);
+	}
+	
+	/** Checks COLMAXS on DENSE/CONST input without compression. */
+	@Test
+	public void testColMaxsDenseConstDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.CONST, AggType.COLMAXS, compress);
+	}
+	
+	/** Checks COLMAXS on SPARSE/CONST input without compression. */
+	@Test
+	public void testColMaxsSparseConstDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.CONST, AggType.COLMAXS, compress);
+	}
+
+	/** Checks MAX on DENSE/RAND input with compression. */
+	@Test
+	public void testMaxDenseRandDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND, AggType.MAX, compress);
+	}
+	
+	/** Checks MAX on SPARSE/RAND input with compression. */
+	@Test
+	public void testMaxSparseRandDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND, AggType.MAX, compress);
+	}
+	
+	/** Checks MAX on EMPTY/RAND input with compression. */
+	@Test
+	public void testMaxEmptyCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.EMPTY, ValueType.RAND, AggType.MAX, compress);
+	}
+	
+	/** Checks MAX on DENSE/RAND_ROUND input with compression. */
+	@Test
+	public void testMaxDenseRoundRandDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND_ROUND, AggType.MAX, compress);
+	}
+	
+	/** Checks MAX on SPARSE/RAND_ROUND input with compression. */
+	@Test
+	public void testMaxSparseRoundRandDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND_ROUND, AggType.MAX, compress);
+	}
+	
+	/** Checks MAX on DENSE/CONST input with compression. */
+	@Test
+	public void testMaxDenseConstantDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.CONST, AggType.MAX, compress);
+	}
+	
+	/** Checks MAX on SPARSE/CONST input with compression. */
+	@Test
+	public void testMaxSparseConstDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.CONST, AggType.MAX, compress);
+	}
+	
+	/** Checks MAX on DENSE/RAND input without compression. */
+	@Test
+	public void testMaxDenseRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND, AggType.MAX, compress);
+	}
+	
+	/** Checks MAX on SPARSE/RAND input without compression. */
+	@Test
+	public void testMaxSparseRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND, AggType.MAX, compress);
+	}
+	
+	/** Checks MAX on EMPTY/RAND input without compression. */
+	@Test
+	public void testMaxEmptyNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.EMPTY, ValueType.RAND, AggType.MAX, compress);
+	}
+	
+	/** Checks MAX on DENSE/RAND_ROUND input without compression. */
+	@Test
+	public void testMaxDenseRoundRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND_ROUND, AggType.MAX, compress);
+	}
+	
+	/** Checks MAX on SPARSE/RAND_ROUND input without compression. */
+	@Test
+	public void testMaxSparseRoundRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND_ROUND, AggType.MAX, compress);
+	}
+	
+	/** Checks MAX on DENSE/CONST input without compression. */
+	@Test
+	public void testMaxDenseConstDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.CONST, AggType.MAX, compress);
+	}
+	
+	/** Checks MAX on SPARSE/CONST input without compression. */
+	@Test
+	public void testMaxSparseConstDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.CONST, AggType.MAX, compress);
+	}
+	
+	/** Checks ROWMINS on DENSE/RAND input with compression. */
+	@Test
+	public void testRowMinsDenseRandDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND, AggType.ROWMINS, compress);
+	}
+	
+	/** Checks ROWMINS on SPARSE/RAND input with compression. */
+	@Test
+	public void testRowMinsSparseRandDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND, AggType.ROWMINS, compress);
+	}
+	
+	/** Checks ROWMINS on EMPTY/RAND input with compression. */
+	@Test
+	public void testRowMinsEmptyCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.EMPTY, ValueType.RAND, AggType.ROWMINS, compress);
+	}
+	
+	/** Checks ROWMINS on DENSE/RAND_ROUND input with compression. */
+	@Test
+	public void testRowMinsDenseRoundRandDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND_ROUND, AggType.ROWMINS, compress);
+	}
+	
+	/** Checks ROWMINS on SPARSE/RAND_ROUND input with compression. */
+	@Test
+	public void testRowMinsSparseRoundRandDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND_ROUND, AggType.ROWMINS, compress);
+	}
+	
+	/** Checks ROWMINS on DENSE/CONST input with compression. */
+	@Test
+	public void testRowMinsDenseConstantDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.CONST, AggType.ROWMINS, compress);
+	}
+	
+	/** Checks ROWMINS on SPARSE/CONST input with compression. */
+	@Test
+	public void testRowMinsSparseConstDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.CONST, AggType.ROWMINS, compress);
+	}
+	
+	/** Checks ROWMINS on DENSE/RAND input without compression. */
+	@Test
+	public void testRowMinsDenseRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND, AggType.ROWMINS, compress);
+	}
+	
+	/** Checks ROWMINS on SPARSE/RAND input without compression. */
+	@Test
+	public void testRowMinsSparseRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND, AggType.ROWMINS, compress);
+	}
+	
+	/** Checks ROWMINS on EMPTY/RAND input without compression. */
+	@Test
+	public void testRowMinsEmptyNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.EMPTY, ValueType.RAND, AggType.ROWMINS, compress);
+	}
+	
+	/** Checks ROWMINS on DENSE/RAND_ROUND input without compression. */
+	@Test
+	public void testRowMinsDenseRoundRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND_ROUND, AggType.ROWMINS, compress);
+	}
+	
+	/** Checks ROWMINS on SPARSE/RAND_ROUND input without compression. */
+	@Test
+	public void testRowMinsSparseRoundRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND_ROUND, AggType.ROWMINS, compress);
+	}
+	
+	/** Checks ROWMINS on DENSE/CONST input without compression. */
+	@Test
+	public void testRowMinsDenseConstDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.CONST, AggType.ROWMINS, compress);
+	}
+	
+	/** Checks ROWMINS on SPARSE/CONST input without compression. */
+	@Test
+	public void testRowMinsSparseConstDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.CONST, AggType.ROWMINS, compress);
+	}
+	
+	/** Checks COLMINS on DENSE/RAND input with compression. */
+	@Test
+	public void testColMinsDenseRandDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND, AggType.COLMINS, compress);
+	}
+	
+	/** Checks COLMINS on SPARSE/RAND input with compression. */
+	@Test
+	public void testColMinsSparseRandDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND, AggType.COLMINS, compress);
+	}
+	
+	/** Checks COLMINS on EMPTY/RAND input with compression. */
+	@Test
+	public void testColMinsEmptyCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.EMPTY, ValueType.RAND, AggType.COLMINS, compress);
+	}
+	
+	/** Checks COLMINS on DENSE/RAND_ROUND input with compression. */
+	@Test
+	public void testColMinsDenseRoundRandDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND_ROUND, AggType.COLMINS, compress);
+	}
+	
+	/** Checks COLMINS on SPARSE/RAND_ROUND input with compression. */
+	@Test
+	public void testColMinsSparseRoundRandDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND_ROUND, AggType.COLMINS, compress);
+	}
+	
+	/** Checks COLMINS on DENSE/CONST input with compression. */
+	@Test
+	public void testColMinsDenseConstantDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.CONST, AggType.COLMINS, compress);
+	}
+	
+	/** Checks COLMINS on SPARSE/CONST input with compression. */
+	@Test
+	public void testColMinsSparseConstDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.CONST, AggType.COLMINS, compress);
+	}
+	
+	/** Checks COLMINS on DENSE/RAND input without compression. */
+	@Test
+	public void testColMinsDenseRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND, AggType.COLMINS, compress);
+	}
+	
+	/** Checks COLMINS on SPARSE/RAND input without compression. */
+	@Test
+	public void testColMinsSparseRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND, AggType.COLMINS, compress);
+	}
+	
+	/** Checks COLMINS on EMPTY/RAND input without compression. */
+	@Test
+	public void testColMinsEmptyNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.EMPTY, ValueType.RAND, AggType.COLMINS, compress);
+	}
+	
+	/** Checks COLMINS on DENSE/RAND_ROUND input without compression. */
+	@Test
+	public void testColMinsDenseRoundRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND_ROUND, AggType.COLMINS, compress);
+	}
+	
+	/** Checks COLMINS on SPARSE/RAND_ROUND input without compression. */
+	@Test
+	public void testColMinsSparseRoundRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND_ROUND, AggType.COLMINS, compress);
+	}
+	
+	/** Checks COLMINS on DENSE/CONST input without compression. */
+	@Test
+	public void testColMinsDenseConstDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.CONST, AggType.COLMINS, compress);
+	}
+	
+	/** Checks COLMINS on SPARSE/CONST input without compression. */
+	@Test
+	public void testColMinsSparseConstDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.CONST, AggType.COLMINS, compress);
+	}
+
+	/** Checks MIN on DENSE/RAND input with compression. */
+	@Test
+	public void testMinDenseRandDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND, AggType.MIN, compress);
+	}
+	
+	/** Checks MIN on SPARSE/RAND input with compression. */
+	@Test
+	public void testMinSparseRandDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND, AggType.MIN, compress);
+	}
+	
+	/** Checks MIN on EMPTY/RAND input with compression. */
+	@Test
+	public void testMinEmptyCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.EMPTY, ValueType.RAND, AggType.MIN, compress);
+	}
+	
+	/** Checks MIN on DENSE/RAND_ROUND input with compression. */
+	@Test
+	public void testMinDenseRoundRandDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND_ROUND, AggType.MIN, compress);
+	}
+	
+	/** Checks MIN on SPARSE/RAND_ROUND input with compression. */
+	@Test
+	public void testMinSparseRoundRandDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND_ROUND, AggType.MIN, compress);
+	}
+	
+	/** Checks MIN on DENSE/CONST input with compression. */
+	@Test
+	public void testMinDenseConstantDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.CONST, AggType.MIN, compress);
+	}
+	
+	/** Checks MIN on SPARSE/CONST input with compression. */
+	@Test
+	public void testMinSparseConstDataCompression() {
+		final boolean compress = true;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.CONST, AggType.MIN, compress);
+	}
+	
+	/** Checks MIN on DENSE/RAND input without compression. */
+	@Test
+	public void testMinDenseRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND, AggType.MIN, compress);
+	}
+	
+	/** Checks MIN on SPARSE/RAND input without compression. */
+	@Test
+	public void testMinSparseRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND, AggType.MIN, compress);
+	}
+	
+	/** Checks MIN on EMPTY/RAND input without compression. */
+	@Test
+	public void testMinEmptyNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.EMPTY, ValueType.RAND, AggType.MIN, compress);
+	}
+	
+	/** Checks MIN on DENSE/RAND_ROUND input without compression. */
+	@Test
+	public void testMinDenseRoundRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.RAND_ROUND, AggType.MIN, compress);
+	}
+	
+	/** Checks MIN on SPARSE/RAND_ROUND input without compression. */
+	@Test
+	public void testMinSparseRoundRandDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.RAND_ROUND, AggType.MIN, compress);
+	}
+	
+	/** Checks MIN on DENSE/CONST input without compression. */
+	@Test
+	public void testMinDenseConstDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.DENSE, ValueType.CONST, AggType.MIN, compress);
+	}
+	
+	/** Checks MIN on SPARSE/CONST input without compression. */
+	@Test
+	public void testMinSparseConstDataNoCompression() {
+		final boolean compress = false;
+		runUnaryAggregateTest(SparsityType.SPARSE, ValueType.CONST, AggType.MIN, compress);
+	}
+		
+	/**
+	 * Evaluates one unary aggregate on a generated matrix twice, once on the plain
+	 * MatrixBlock and once on a CompressedMatrixBlock created from it, and compares
+	 * the two results cell by cell.
+	 * 
+	 * @param sptype selects which sparsity constant (sparsity1/2/3) is used to generate the input
+	 * @param vtype CONST passes 10 instead of -10 as the lower value bound, RAND_ROUND 
+	 *        rounds the generated values, RAND uses the generated values as they are
+	 * @param aggtype aggregate to evaluate
+	 * @param compress if true, compress() is called on the CompressedMatrixBlock before aggregating
+	 */
+	private void runUnaryAggregateTest(SparsityType sptype, ValueType vtype, AggType aggtype, boolean compress)
+	{
+		try
+		{
+			//prepare sparsity for input data
+			double sparsity = -1;
+			switch( sptype ){
+				case DENSE: sparsity = sparsity1; break;
+				case SPARSE: sparsity = sparsity2; break;
+				case EMPTY: sparsity = sparsity3; break;
+			}
+			
+			//generate input data
+			double min = (vtype==ValueType.CONST)? 10 : -10;
+			double[][] input = TestUtils.generateTestMatrix(rows, cols, min, 10, sparsity, 7);
+			if( vtype==ValueType.RAND_ROUND )
+				input = TestUtils.round(input);
+			MatrixBlock mb = DataConverter.convertToMatrixBlock(input);
+			mb = mb.appendOperations(MatrixBlock.seqOperations(0.1, rows-0.1, 1), new MatrixBlock()); //uc group
+			
+			//prepare unary aggregate operator
+			AggregateUnaryOperator auop = null;
+			switch (aggtype) {
+				case SUM: auop = InstructionUtils.parseBasicAggregateUnaryOperator("uak+"); break;
+				case ROWSUMS: auop = InstructionUtils.parseBasicAggregateUnaryOperator("uark+"); break;
+				case COLSUMS: auop = InstructionUtils.parseBasicAggregateUnaryOperator("uack+"); break;
+				case SUMSQ: auop = InstructionUtils.parseBasicAggregateUnaryOperator("uasqk+"); break;
+				case ROWSUMSSQ: auop = InstructionUtils.parseBasicAggregateUnaryOperator("uarsqk+"); break;
+				case COLSUMSSQ: auop = InstructionUtils.parseBasicAggregateUnaryOperator("uacsqk+"); break;
+				case MAX: auop = InstructionUtils.parseBasicAggregateUnaryOperator("uamax"); break;
+				case ROWMAXS: auop = InstructionUtils.parseBasicAggregateUnaryOperator("uarmax"); break;
+				case COLMAXS: auop = InstructionUtils.parseBasicAggregateUnaryOperator("uacmax"); break;
+				case MIN: auop = InstructionUtils.parseBasicAggregateUnaryOperator("uamin"); break;
+				case ROWMINS: auop = InstructionUtils.parseBasicAggregateUnaryOperator("uarmin"); break;
+				case COLMINS: auop = InstructionUtils.parseBasicAggregateUnaryOperator("uacmin"); break;
+				//fail with a clear message instead of a NullPointerException below
+				default: throw new IllegalArgumentException("Unsupported aggregate type: " + aggtype);
+			}
+			auop.setNumThreads(InfrastructureAnalyzer.getLocalParallelism());
+			
+			//compress given matrix block
+			CompressedMatrixBlock cmb = new CompressedMatrixBlock(mb);
+			if( compress )
+				cmb.compress();
+			
+			//unary aggregate uncompressed
+			MatrixBlock ret1 = (MatrixBlock)mb.aggregateUnaryOperations(auop, new MatrixBlock(), 1000, 1000, null, true);
+			
+			//unary aggregate compressed
+			MatrixBlock ret2 = (MatrixBlock)cmb.aggregateUnaryOperations(auop, new MatrixBlock(), 1000, 1000, null, true);
+			
+			//compare uncompressed and compressed results: row-wise aggregates are 
+			//compared over all rows, column-wise aggregates over cols columns, and
+			//full aggregates over a single cell
+			//NOTE(review): the appended uc column presumably makes column-wise results
+			//cols+1 wide, in which case the last column is not compared - confirm
+			double[][] d1 = DataConverter.convertToDoubleMatrix(ret1);
+			double[][] d2 = DataConverter.convertToDoubleMatrix(ret2);
+			int dim1 = (aggtype == AggType.ROWSUMS || aggtype == AggType.ROWSUMSSQ 
+					|| aggtype == AggType.ROWMAXS || aggtype == AggType.ROWMINS)?rows:1;
+			int dim2 = (aggtype == AggType.COLSUMS || aggtype == AggType.COLSUMSSQ 
+					|| aggtype == AggType.COLMAXS || aggtype == AggType.COLMINS)?cols:1;
+			TestUtils.compareMatrices(d1, d2, dim1, dim2, 0.00000000001);
+		}
+		catch(Exception ex) {
+			throw new RuntimeException(ex);
+		}
+	}
+}