You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by ba...@apache.org on 2020/09/06 12:51:12 UTC

[systemds] branch master updated (5596fcf -> d0895af)

This is an automated email from the ASF dual-hosted git repository.

baunsgaard pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git.


    from 5596fcf  [SYSTEMDS-2650] Non-recursive construction of HOPs from Lineage
     add 677cc65  [SYSTEMDS-371] ColGroup Quantization
     add 592039e  [SYSTEMDS-372] ColGroup Base Data to Byte
     add e73d196  [SYSTEMDS-373] Fix error in new compression
     add e611942  [SYSTEMDS-2615] Compressed Binary Cell Operations
     new d0895af  [SYSTEMDS-2614] Compressed Left Matrix Mult

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 dev/Tasks-obsolete.txt                             |  11 +-
 pom.xml                                            |   1 +
 src/main/java/org/apache/sysds/conf/DMLConfig.java |   6 +-
 .../compress/AbstractCompressedMatrixBlock.java    | 191 +++--
 .../sysds/runtime/compress/BitmapEncoder.java      | 556 ++++++++-----
 .../runtime/compress/CompressedMatrixBlock.java    | 922 ++++++++++++++-------
 .../compress/CompressedMatrixBlockFactory.java     | 164 ++--
 .../runtime/compress/CompressionSettings.java      |  84 +-
 .../compress/CompressionSettingsBuilder.java       | 182 +++-
 .../runtime/compress/CompressionStatistics.java    |  18 -
 .../runtime/compress/ReaderColumnSelection.java    |  18 +-
 .../compress/ReaderColumnSelectionDense.java       |  23 +-
 .../compress/ReaderColumnSelectionDenseSample.java |  29 +-
 .../compress/ReaderColumnSelectionSparse.java      |  25 +-
 .../compress/cocode/ColumnGroupPartitioner.java    |   4 +-
 .../cocode/ColumnGroupPartitionerBinPacking.java   |   4 +-
 .../cocode/ColumnGroupPartitionerStatic.java       |   7 +-
 .../runtime/compress/cocode/PlanningCoCoder.java   |  11 +-
 .../runtime/compress/colgroup/ADictionary.java     | 199 +++++
 .../sysds/runtime/compress/colgroup/ColGroup.java  | 145 ++--
 .../compress/colgroup/ColGroupConverter.java       |   8 +-
 .../runtime/compress/colgroup/ColGroupDDC.java     | 363 +++++---
 .../runtime/compress/colgroup/ColGroupDDC1.java    | 336 +-------
 .../runtime/compress/colgroup/ColGroupDDC2.java    | 282 +------
 .../runtime/compress/colgroup/ColGroupFactory.java |  75 +-
 .../runtime/compress/colgroup/ColGroupIO.java      |  49 +-
 .../runtime/compress/colgroup/ColGroupOLE.java     | 452 ++++++----
 .../runtime/compress/colgroup/ColGroupOffset.java  | 126 +--
 .../runtime/compress/colgroup/ColGroupRLE.java     | 389 +++++----
 .../runtime/compress/colgroup/ColGroupSizes.java   |  75 +-
 .../compress/colgroup/ColGroupUncompressed.java    |  98 ++-
 .../runtime/compress/colgroup/ColGroupValue.java   | 375 +++++----
 .../compress/colgroup/DenseRowIterator.java        |   4 +-
 .../runtime/compress/colgroup/Dictionary.java      | 162 +++-
 .../compress/colgroup/DictionaryShared.java        |  79 --
 .../runtime/compress/colgroup/QDictionary.java     | 302 +++++++
 .../compress/colgroup/SparseRowIterator.java       |   4 +-
 .../compress/estim/CompressedSizeEstimator.java    |  98 +--
 .../estim/CompressedSizeEstimatorExact.java        |   8 +-
 .../estim/CompressedSizeEstimatorFactory.java      |   3 +-
 .../estim/CompressedSizeEstimatorSample.java       |  67 +-
 .../runtime/compress/estim/CompressedSizeInfo.java |   2 +-
 .../compress/estim/CompressedSizeInfoColGroup.java |  25 +-
 ...timationFactors.java => EstimationFactors.java} |  81 +-
 .../compress/estim/sample/FrequencyCount.java      |   4 +-
 .../compress/estim/sample/HassAndStokes.java       |   4 +-
 .../compress/estim/sample/ShlosserEstimator.java   |   4 +-
 .../estim/sample/ShlosserJackknifeEstimator.java   |   4 +-
 .../estim/sample/SmoothedJackknifeEstimator.java   |   4 +-
 .../sysds/runtime/compress/utils/ABitmap.java      | 100 +++
 .../{UncompressedBitmap.java => utils/Bitmap.java} |  85 +-
 .../sysds/runtime/compress/utils/BitmapLossy.java  | 113 +++
 .../compress/utils/DblArrayIntListHashMap.java     |  38 +-
 .../compress/utils/DoubleIntListHashMap.java       |  16 +-
 .../sysds/runtime/compress/utils/IntArrayList.java |  22 +-
 .../runtime/compress/utils/LinearAlgebraUtils.java |   2 +
 .../parfor/opt/CostEstimatorHops.java              |   1 +
 .../sysds/runtime/functionobjects/Builtin.java     |   5 +
 .../runtime/instructions/CPInstructionParser.java  |   2 +-
 .../instructions/cp/CompressionCPInstruction.java  |   5 +-
 .../spark/CompressionSPInstruction.java            |   2 +-
 .../matrix/data/LibMatrixCountDistinct.java        | 133 ++-
 .../sysds/runtime/matrix/data/MatrixBlock.java     |   7 +
 .../matrix/operators/LeftScalarOperator.java       |  14 +-
 .../matrix/operators/RightScalarOperator.java      |  12 +-
 .../runtime/matrix/operators/ScalarOperator.java   |  14 +-
 .../apache/sysds/runtime/util/DataConverter.java   |   2 +-
 src/test/java/org/apache/sysds/test/TestUtils.java | 137 ++-
 .../compress/AbstractCompressedUnaryTests.java     | 110 ++-
 .../component/compress/CompressedMatrixTest.java   | 267 ++----
 .../component/compress/CompressedTestBase.java     | 479 ++++++++++-
 .../component/compress/CompressedVectorTest.java   |  31 +-
 .../compress/CompressibleInputGenerator.java       | 130 ++-
 .../compress/ParCompressedMatrixTest.java          | 173 +---
 .../sysds/test/component/compress/TestBase.java    |  48 +-
 .../test/component/compress/TestConstants.java     |  32 +-
 .../compress/colgroup/JolEstimateDDCTest.java      |  49 +-
 .../compress/colgroup/JolEstimateOLETest.java      |  70 +-
 .../compress/colgroup/JolEstimateRLETest.java      |  93 +--
 .../compress/colgroup/JolEstimateTest.java         | 137 ++-
 .../compress/colgroup/JolEstimateTestEmpty.java    | 129 ---
 .../colgroup/JolEstimateUncompressedTest.java      |  13 +-
 .../test/component/matrix/CountDistinctTest.java   |  19 +-
 .../functions/countDistinct/CountDistinct.java     |   2 +-
 .../countDistinct/CountDistinctApprox.java         |  11 +-
 .../functions/countDistinct/CountDistinctBase.java |  52 +-
 .../functions/countDistinct/countDistinct.dml      |   5 +-
 .../countDistinct/countDistinctApprox.dml          |   4 +-
 88 files changed, 5015 insertions(+), 3632 deletions(-)
 create mode 100644 src/main/java/org/apache/sysds/runtime/compress/colgroup/ADictionary.java
 delete mode 100644 src/main/java/org/apache/sysds/runtime/compress/colgroup/DictionaryShared.java
 create mode 100644 src/main/java/org/apache/sysds/runtime/compress/colgroup/QDictionary.java
 rename src/main/java/org/apache/sysds/runtime/compress/estim/{CompressedSizeEstimationFactors.java => EstimationFactors.java} (54%)
 create mode 100644 src/main/java/org/apache/sysds/runtime/compress/utils/ABitmap.java
 rename src/main/java/org/apache/sysds/runtime/compress/{UncompressedBitmap.java => utils/Bitmap.java} (52%)
 create mode 100644 src/main/java/org/apache/sysds/runtime/compress/utils/BitmapLossy.java
 delete mode 100644 src/test/java/org/apache/sysds/test/component/compress/colgroup/JolEstimateTestEmpty.java


[systemds] 01/01: [SYSTEMDS-2614] Compressed Left Matrix Mult

Posted by ba...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

baunsgaard pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git

commit d0895af4b58c2d1838cc6e74e1bc81165f3e2c62
Author: baunsgaard <ba...@tugraz.at>
AuthorDate: Mon Aug 10 18:34:03 2020 +0200

    [SYSTEMDS-2614] Compressed Left Matrix Mult
    
    - This commit also contains the column mean operation.
    - Change left multiply to multiply many rows at different offsets.
    - Introduce two arguments to the matrix multiplication argument list
      to allow different parallelization schemes.
    - Multiply incompressible columns as well.
    
    Closes #1016
---
 .../apache/sysds/hops/rewrite/ProgramRewriter.java |   1 +
 .../runtime/compress/CompressedMatrixBlock.java    | 359 +++++++++++++++++----
 .../compress/CompressedMatrixBlockFactory.java     |   6 +-
 .../sysds/runtime/compress/colgroup/ColGroup.java  |  32 +-
 .../runtime/compress/colgroup/ColGroupDDC.java     | 128 +++++---
 .../runtime/compress/colgroup/ColGroupOLE.java     | 113 ++++++-
 .../runtime/compress/colgroup/ColGroupOffset.java  |  12 -
 .../runtime/compress/colgroup/ColGroupRLE.java     |  97 +++++-
 .../runtime/compress/colgroup/ColGroupSizes.java   |   9 -
 .../compress/colgroup/ColGroupUncompressed.java    |  38 +--
 .../runtime/compress/colgroup/ColGroupValue.java   |  14 +-
 .../runtime/compress/utils/LinearAlgebraUtils.java |   2 +
 .../sysds/runtime/matrix/data/MatrixBlock.java     |   7 +
 .../compress/AbstractCompressedUnaryTests.java     |  42 ++-
 .../component/compress/CompressedTestBase.java     | 134 ++++++--
 .../test/component/compress/TestConstants.java     |   2 +-
 16 files changed, 785 insertions(+), 211 deletions(-)

diff --git a/src/main/java/org/apache/sysds/hops/rewrite/ProgramRewriter.java b/src/main/java/org/apache/sysds/hops/rewrite/ProgramRewriter.java
index a225047..87df183 100644
--- a/src/main/java/org/apache/sysds/hops/rewrite/ProgramRewriter.java
+++ b/src/main/java/org/apache/sysds/hops/rewrite/ProgramRewriter.java
@@ -52,6 +52,7 @@ public class ProgramRewriter
 	
 	private ArrayList<HopRewriteRule> _dagRuleSet = null;
 	private ArrayList<StatementBlockRewriteRule> _sbRuleSet = null;
+
 	public ProgramRewriter() {
 		// by default which is used during initial compile 
 		// apply all (static and dynamic) rewrites
diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java
index 6fef4b5..db2b8ba 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java
@@ -45,7 +45,7 @@ import org.apache.sysds.runtime.DMLRuntimeException;
 import org.apache.sysds.runtime.compress.colgroup.ColGroup;
 import org.apache.sysds.runtime.compress.colgroup.ColGroup.CompressionType;
 import org.apache.sysds.runtime.compress.colgroup.ColGroupConverter;
-import org.apache.sysds.runtime.compress.colgroup.ColGroupDDC1;
+import org.apache.sysds.runtime.compress.colgroup.ColGroupDDC;
 import org.apache.sysds.runtime.compress.colgroup.ColGroupIO;
 import org.apache.sysds.runtime.compress.colgroup.ColGroupUncompressed;
 import org.apache.sysds.runtime.compress.colgroup.ColGroupValue;
@@ -55,6 +55,7 @@ import org.apache.sysds.runtime.compress.utils.ColumnGroupIterator;
 import org.apache.sysds.runtime.compress.utils.LinearAlgebraUtils;
 import org.apache.sysds.runtime.controlprogram.caching.MatrixObject.UpdateType;
 import org.apache.sysds.runtime.controlprogram.parfor.stat.Timing;
+import org.apache.sysds.runtime.data.DenseBlock;
 import org.apache.sysds.runtime.data.SparseBlock;
 import org.apache.sysds.runtime.data.SparseRow;
 import org.apache.sysds.runtime.functionobjects.Builtin;
@@ -418,24 +419,28 @@ public class CompressedMatrixBlock extends AbstractCompressedMatrixBlock {
 
 	}
 
-	protected void binaryMV(MatrixBlock m2, CompressedMatrixBlock ret, BinaryOperator op, BinaryAccessType aType ){
-		if(aType == BinaryAccessType.MATRIX_COL_VECTOR){
+	protected void binaryMV(MatrixBlock m2, CompressedMatrixBlock ret, BinaryOperator op, BinaryAccessType aType) {
+		if(aType == BinaryAccessType.MATRIX_COL_VECTOR) {
 			throw new NotImplementedException("Binary Matrix Col Vector operations are not implemented CLA");
-		}else if(aType== BinaryAccessType.MATRIX_ROW_VECTOR){
+		}
+		else if(aType == BinaryAccessType.MATRIX_ROW_VECTOR) {
 			// Apply the operation to each of the column groups.
 			// Most implementations will only modify metadata.
 			ArrayList<ColGroup> newColGroups = new ArrayList<>();
 
 			for(ColGroup grp : _colGroups) {
-				if(grp instanceof ColGroupUncompressed){
+				if(grp instanceof ColGroupUncompressed) {
 					LOG.error("NOT HANDLING UNCOMPRESSED IN BINARY MV");
-				}else{
+				}
+				else {
 
-					if(grp.getNumCols() == 1){
-						ScalarOperator sop = new LeftScalarOperator(op.fn, m2.getValue(0, grp.getColIndices()[0]),1);
+					if(grp.getNumCols() == 1) {
+						ScalarOperator sop = new LeftScalarOperator(op.fn, m2.getValue(0, grp.getColIndices()[0]), 1);
 						newColGroups.add(grp.scalarOperation(sop));
-					}else{
-						throw new NotImplementedException("Cocoded columns (nr cols:" + grp.getNumCols() + ") groupType: not implemented for Binary Matrix Row Vector operations");
+					}
+					else {
+						throw new NotImplementedException("Cocoded columns (nr cols:" + grp.getNumCols()
+							+ ") groupType: not implemented for Binary Matrix Row Vector operations");
 					}
 				}
 				// newColGroups.add(grp.binaryMVR(m2, op));
@@ -446,11 +451,11 @@ public class CompressedMatrixBlock extends AbstractCompressedMatrixBlock {
 		}
 	}
 
-	protected void binaryVV(MatrixBlock m2, CompressedMatrixBlock ret, BinaryOperator op, BinaryAccessType aType ){
+	protected void binaryVV(MatrixBlock m2, CompressedMatrixBlock ret, BinaryOperator op, BinaryAccessType aType) {
 		throw new NotImplementedException("Binary Vector Vector operations are not implemented");
 	}
 
-	protected void binaryMM(MatrixBlock m2, CompressedMatrixBlock ret, BinaryOperator op){
+	protected void binaryMM(MatrixBlock m2, CompressedMatrixBlock ret, BinaryOperator op) {
 		throw new NotImplementedException("Binary Matrix Matrix operations are not implemented");
 	}
 
@@ -584,15 +589,6 @@ public class CompressedMatrixBlock extends AbstractCompressedMatrixBlock {
 	public MatrixBlock aggregateBinaryOperations(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret,
 		AggregateBinaryOperator op) {
 
-		// Should not happen that it is a single uncompressed group.
-		// multi-threaded MM of single uncompressed ColGroup
-		// if(isSingleUncompressedGroup()) {
-		// MatrixBlock tmp = ((ColGroupUncompressed) _colGroups.get(0)).getData();
-		// return tmp.aggregateBinaryOperations(this == m1 ? tmp : m1, this == m2 ? tmp : m2, ret, op);
-		// }
-
-		Timing time = LOG.isDebugEnabled() ? new Timing(true) : null;
-
 		// setup meta data (dimensions, sparsity)
 		int rl = m1.getNumRows();
 		int cl = m2.getNumColumns();
@@ -606,11 +602,22 @@ public class CompressedMatrixBlock extends AbstractCompressedMatrixBlock {
 		// compute matrix mult
 		if(m1.getNumRows() > 1 && m2.getNumColumns() == 1) { // MV right
 			LOG.debug("Matrix Vector !");
-			CompressedMatrixBlock cmb = (CompressedMatrixBlock) m1;
-			if(op.getNumThreads() > 1)
-				cmb.rightMultByVector(m2, ret, op.getNumThreads());
-			else
-				cmb.rightMultByVector(m2, ret);
+			if(m1 == this) {
+				if(op.getNumThreads() > 1)
+					rightMultByVector(m2, ret, op.getNumThreads());
+				else
+					rightMultByVector(m2, ret);
+			}
+			else if(m2 == this) {
+				// MatrixBlock tmpIn = new MatrixBlock(1, 1, false).allocateBlock();
+
+				leftMultByMatrix(_colGroups, m1, ret, op.getNumThreads(), 1);
+
+			}
+			else {
+				throw new DMLRuntimeException(
+					"Error in execution of aggregate Binary Operation, where m1 or m2 is not this");
+			}
 		}
 		else if(m1.getNumRows() == 1 && m2.getNumColumns() > 1) { // MV left
 			LOG.debug("Vector Matrix");
@@ -650,16 +657,12 @@ public class CompressedMatrixBlock extends AbstractCompressedMatrixBlock {
 				}
 			}
 			else { // MM left
-
 				LOG.debug("MM left");
-				for(int i = 0; i < that.getNumRows(); i++) {
-					tmpIn = that.slice(i, i, 0, that.getNumColumns() - 1, tmpIn);
-					if(op.getNumThreads() > 1)
-						leftMultByVectorTranspose(_colGroups, tmpIn, tmpOut, false, op.getNumThreads());
-					else
-						leftMultByVectorTranspose(_colGroups, tmpIn, tmpOut, false, true);
-					ret.leftIndexingOperations(tmpOut, i, i, 0, ret.getNumColumns() - 1, ret, UpdateType.INPLACE);
-				}
+				// if(op.getNumThreads() > 1)
+				// leftMultByMatrixTranspose(_colGroups, m1, ret, false, op.getNumThreads());
+				// else
+				leftMultByMatrix(_colGroups, m1, ret, op.getNumThreads(), getNumColumns());
+
 			}
 		}
 
@@ -672,6 +675,7 @@ public class CompressedMatrixBlock extends AbstractCompressedMatrixBlock {
 
 		// check for supported operations
 		if(!(op.aggOp.increOp.fn instanceof KahanPlus || op.aggOp.increOp.fn instanceof KahanPlusSq ||
+			op.aggOp.increOp.fn instanceof Mean ||
 			(op.aggOp.increOp.fn instanceof Builtin &&
 				(((Builtin) op.aggOp.increOp.fn).getBuiltinCode() == BuiltinCode.MIN ||
 					((Builtin) op.aggOp.increOp.fn).getBuiltinCode() == BuiltinCode.MAX)))) {
@@ -729,7 +733,7 @@ public class CompressedMatrixBlock extends AbstractCompressedMatrixBlock {
 		// core unary aggregate
 		if(op.getNumThreads() > 1 && getExactSizeOnDisk() > MIN_PAR_AGG_THRESHOLD) {
 			// multi-threaded execution of all groups
-			ArrayList<ColGroup>[] grpParts = createStaticTaskPartitioning(
+			ArrayList<ColGroup>[] grpParts = createStaticTaskPartitioning(_colGroups,
 				(op.indexFn instanceof ReduceCol) ? 1 : op.getNumThreads(),
 				false);
 			ColGroupUncompressed uc = getUncompressedColGroup();
@@ -800,8 +804,9 @@ public class CompressedMatrixBlock extends AbstractCompressedMatrixBlock {
 
 		// special handling of mean
 		if(op.aggOp.increOp.fn instanceof Mean) {
-			if(op.indexFn instanceof ReduceAll)
+			if(op.indexFn instanceof ReduceAll) {
 				ret.quickSetValue(0, 0, ret.quickGetValue(0, 0) / (getNumColumns() * getNumRows()));
+			}
 			else if(op.indexFn instanceof ReduceCol) {
 				for(int i = 0; i < getNumRows(); i++) {
 					ret.quickSetValue(i, 0, ret.quickGetValue(i, 0) / getNumColumns());
@@ -833,7 +838,7 @@ public class CompressedMatrixBlock extends AbstractCompressedMatrixBlock {
 		int rl, int ru) {
 
 		// Seems misplaced logic for when to use CacheDDC
-		boolean cacheDDC1 = false;
+		// boolean cacheDDC1 = false;
 		// op.indexFn instanceof ReduceCol && op.aggOp.increOp.fn instanceof KahanPlus // rowSums
 		// && ColGroupOffset.ALLOW_CACHE_CONSCIOUS_ROWSUMS && ru - rl > CompressionSettings.BITMAP_BLOCK_SZ;
 
@@ -856,7 +861,7 @@ public class CompressedMatrixBlock extends AbstractCompressedMatrixBlock {
 			// throw new RuntimeException("aggregateUnaryOperation failed to materialize matrix data");
 		}
 		for(ColGroup grp : groups)
-			if(!(grp instanceof ColGroupUncompressed) && !(cacheDDC1 && grp instanceof ColGroupDDC1))
+			if(!(grp instanceof ColGroupUncompressed))
 				grp.unaryAggregateOperations(op, c, rl, ru);
 		// LOG.debug(Arrays.toString(c));
 	}
@@ -984,6 +989,7 @@ public class CompressedMatrixBlock extends AbstractCompressedMatrixBlock {
 			for(int i = 0; i < k & i * blklen < getNumRows(); i++)
 				tasks.add(
 					new RightMatrixMultTask(_colGroups, vector, result, i * blklen, Math.min((i + 1) * blklen, rlen)));
+			
 			List<Future<Long>> ret = pool.invokeAll(tasks);
 			pool.shutdown();
 
@@ -993,7 +999,7 @@ public class CompressedMatrixBlock extends AbstractCompressedMatrixBlock {
 				lnnz += tmp.get();
 			result.setNonZeros(lnnz);
 		}
-		catch(InterruptedException | ExecutionException e) {
+		catch(Exception e) {
 			LOG.fatal(e);
 			throw new DMLRuntimeException(e);
 		}
@@ -1058,17 +1064,18 @@ public class CompressedMatrixBlock extends AbstractCompressedMatrixBlock {
 		result.allocateDenseBlock();
 
 		// setup memory pool for reuse
-		if(allocTmp){
+		if(allocTmp) {
 			Pair<Integer, List<Integer>> v = getMaxNumValues(colGroups);
 			ColGroupValue.setupThreadLocalMemory(v.getLeft());
-			for(int i = 0; i< colGroups.size(); i++){
-				colGroups.get(i).leftMultByRowVector(rowVector, result, v.getRight().get(i));
+			for(int i = 0; i < colGroups.size(); i++) {
+				colGroups.get(i).leftMultByRowVector(rowVector.getDenseBlockValues(),
+					result.getDenseBlockValues(),
+					v.getRight().get(i));
 			}
 		}
-		else
-		{
+		else {
 			for(ColGroup grp : colGroups) {
-				grp.leftMultByRowVector(rowVector, result, -1);
+				grp.leftMultByRowVector(rowVector.getDenseBlockValues(), result.getDenseBlockValues(), -1);
 			}
 		}
 
@@ -1121,10 +1128,10 @@ public class CompressedMatrixBlock extends AbstractCompressedMatrixBlock {
 
 			// compute remaining compressed column groups in parallel
 			ExecutorService pool = CommonThreadPool.get(Math.min(colGroups.size(), k));
-			ArrayList<ColGroup>[] grpParts = createStaticTaskPartitioning(4 * k, true);
-			ArrayList<LeftMatrixMultTask> tasks = new ArrayList<>();
+			ArrayList<ColGroup>[] grpParts = createStaticTaskPartitioning(_colGroups, 4 * k, true);
+			ArrayList<LeftMatrixVectorMultTask> tasks = new ArrayList<>();
 			for(ArrayList<ColGroup> groups : grpParts)
-				tasks.add(new LeftMatrixMultTask(groups, rowVector, result));
+				tasks.add(new LeftMatrixVectorMultTask(groups, rowVector, result));
 			List<Future<Object>> ret;
 
 			ret = pool.invokeAll(tasks);
@@ -1143,6 +1150,145 @@ public class CompressedMatrixBlock extends AbstractCompressedMatrixBlock {
 		result.recomputeNonZeros();
 	}
 
+	/**
+	 * Multiply this matrix block by a matrix (i.e. v%*%X)
+	 * 
+	 * @param colGroups  List of column groups
+	 * @param that       Left-hand operand of the multiplication
+	 * @param ret        The result matrix to insert the results
+	 * @param tmp        buffer to hold the result; must have the appropriate size already
+	 * @param tmpIn      buffer to hold a single row of input.
+	 * @param k          The number of threads used
+	 * @param numColumns The number of columns in this colGroup
+	 */
+	private static void leftMultByMatrix(List<ColGroup> colGroups, MatrixBlock that, MatrixBlock ret, int k,
+		int numColumns) {
+		// transpose vector if required
+		// MatrixBlock result = new MatrixBlock(1, getNumColumns(), false).allocateBlock();
+		// if(op.getNumThreads() > 1)
+		// leftMultByMatrixTranspose(_colGroups, tmpIn, tmpOut, false, op.getNumThreads());
+		// else
+		// leftMultByMatrixTranspose(_colGroups, tmpIn, tmpOut, false, true);
+
+		// if(doTranspose) {
+		// rowVector = new MatrixBlock(1, vector.getNumRows(), false);
+		// LibMatrixReorg.transpose(vector, rowVector);
+		// }
+
+		ret.reset();
+		ret.allocateDenseBlock();
+		// double[] retV = ret.getDenseBlockValues();
+		// double[] thatV;
+
+		// initialize and allocate the result
+		// that.allocateDenseBlock();
+		// DenseBlock thatBlock = that.getDenseBlock();
+		// thatBlock.numBlocks();
+
+		// int blockSize = 25;// numColumns
+		if(that.isInSparseFormat()) {
+			leftMultBySparseMatrix(colGroups, that, ret, k, numColumns);
+		}
+		else {
+			leftMultByDenseMatrix(colGroups, that, ret, k, numColumns);
+		}
+		ret.recomputeNonZeros();
+	}
+
+	private static void leftMultByDenseMatrix(List<ColGroup> colGroups, MatrixBlock that, MatrixBlock ret, int k,
+		int numColumns) {
+		DenseBlock db = that.getDenseBlock();
+		double[] retV = ret.getDenseBlockValues();
+		double[] thatV;
+		int blockU;
+		int blockL = 0;
+
+
+		for(ColGroup grp : colGroups) {
+			if(grp instanceof ColGroupUncompressed)
+				((ColGroupUncompressed) grp).leftMultByMatrix(that, ret);
+		}
+
+		// System.out.println(db.numBlocks());
+		// System.out.println(that.getDenseBlock().getClass().getSimpleName());
+		for(int b = 0; b <= db.numBlocks(); b++) {
+			int blockSize = db.blockSize(b);
+			blockU = Math.min(blockL + blockSize, ret.getNumRows());
+			thatV = db.valuesAt(b);
+			// System.out.println("Length of values in block " + (thatV.length));
+
+			if(k == 1) {
+				// TODO make move singlethreaded to not materialize and getMaxNumValues multiple times.
+				double[][] materialized = new double[colGroups.size()][];
+				// byte[][] materializedByte = new byte[colGroups.size()][];
+				for(int i = 0; i < colGroups.size(); i++) {
+					// if(colGroups.get(i) instanceof ColGroupDDC && colGroups.get(i).isLossy()) {
+					// materializedByte[i] = ((ColGroupDDC) colGroups.get(i)).getByteValues();
+					// }
+					// else {
+					materialized[i] = colGroups.get(i).getValues();
+					// }
+				}
+				Pair<Integer, List<Integer>> v = getMaxNumValues(colGroups);
+				for(int j = 0; j < colGroups.size(); j++)
+					colGroups.get(j).leftMultByMatrix(thatV,
+						retV,
+						v.getRight().get(j),
+						materialized[j],
+						that.getNumRows(),
+						ret.getNumColumns(),
+						blockL,
+						blockU,
+						0);
+			}
+			else {
+
+				try {
+					ExecutorService pool = CommonThreadPool.get(Math.min(colGroups.size(), k));
+					// compute remaining compressed column groups in parallel
+					ArrayList<LeftMatrixMatrixMultTask> tasks = new ArrayList<>();
+					List<ColGroup>[] parts = createStaticTaskPartitioningForMatrixMult(colGroups, k, false);
+					int rowBlockSize = 2;
+					for(List<ColGroup> part : parts) {
+						for(int blo = blockL; blo < blockU; blo += rowBlockSize) {
+							// int voff = (blo - blockL) * that.getNumColumns();
+
+							tasks.add(new LeftMatrixMatrixMultTask(part, thatV, retV, that.getNumRows(), numColumns,
+								blo, Math.min(blo + rowBlockSize, blockU), blo - blockL));
+
+						}
+						// tasks.add(new LeftMatrixMatrixMultTask(part, thatV, retV, that.getNumRows(), numColumns,
+						// blockL, blockU, 0));
+						// if(colGroups.get(j) instanceof ColGroupDDC && colGroups.get(j).isLossy()) {
+						// tasks.add(new LeftMatrixMatrixMultTask(colGroups.get(j), thatSlice, tmp,
+						// v.getRight().get(j), materializedByte[j]));
+						// }
+						// else {
+					}
+
+					List<Future<Object>> futures = pool.invokeAll(tasks);
+
+					pool.shutdown();
+					for(Future<Object> future : futures)
+						future.get();
+				}
+				catch(InterruptedException | ExecutionException e) {
+					e.printStackTrace();
+					throw new DMLRuntimeException(e);
+				}
+
+			}
+			blockL += blockSize;
+		}
+	}
+
+	private static void leftMultBySparseMatrix(List<ColGroup> colGroups, MatrixBlock that, MatrixBlock ret, int k,
+		int numColumns) {
+
+		// SparseBlock sb = that.getSparseBlock();
+		throw new NotImplementedException("Sparse Block input not handled.");
+	}
+
 	private static void leftMultByTransposeSelf(List<ColGroup> groups, MatrixBlock result, int gl, int gu) {
 		final int numRows = groups.get(0).getNumRows();
 		final int numGroups = groups.size();
@@ -1194,18 +1340,19 @@ public class CompressedMatrixBlock extends AbstractCompressedMatrixBlock {
 	}
 
 	@SuppressWarnings("unchecked")
-	private ArrayList<ColGroup>[] createStaticTaskPartitioning(int k, boolean inclUncompressed) {
+	private static ArrayList<ColGroup>[] createStaticTaskPartitioning(List<ColGroup> colGroups, int k,
+		boolean inclUncompressed) {
 		// special case: single uncompressed col group
-		if(_colGroups.size() == 1 && _colGroups.get(0) instanceof ColGroupUncompressed) {
+		if(colGroups.size() == 1 && colGroups.get(0) instanceof ColGroupUncompressed) {
 			return new ArrayList[0];
 		}
 
 		// initialize round robin col group distribution
 		// (static task partitioning to reduce mem requirements/final agg)
-		int numTasks = Math.min(k, _colGroups.size());
+		int numTasks = Math.min(k, colGroups.size());
 		ArrayList<ColGroup>[] grpParts = new ArrayList[numTasks];
 		int pos = 0;
-		for(ColGroup grp : _colGroups) {
+		for(ColGroup grp : colGroups) {
 			if(grpParts[pos] == null)
 				grpParts[pos] = new ArrayList<>();
 			if(inclUncompressed || !(grp instanceof ColGroupUncompressed)) {
@@ -1217,16 +1364,44 @@ public class CompressedMatrixBlock extends AbstractCompressedMatrixBlock {
 		return grpParts;
 	}
 
+	@SuppressWarnings("unchecked")
+	private static List<ColGroup>[] createStaticTaskPartitioningForMatrixMult(List<ColGroup> colGroups, int k,
+		boolean inclUncompressed) {
+		int numTasks = Math.min(k, colGroups.size());
+		List<ColGroup>[] grpParts = new ArrayList[numTasks];
+		int pos = 0;
+		for(int i = 0; i < numTasks; i++) {
+			grpParts[pos++] = new ArrayList<>();
+		}
+		pos = 0;
+		for(ColGroup grp : colGroups) {
+
+			if(grp instanceof ColGroupDDC) {
+				grpParts[pos].add((ColGroupDDC) grp);
+				pos = (pos == numTasks - 1) ? 0 : pos + 1;
+			}
+		}
+		for(ColGroup grp : colGroups) {
+			if(!(grp instanceof ColGroupDDC) && (inclUncompressed || !(grp instanceof ColGroupUncompressed))) {
+				grpParts[pos].add(grp);
+				pos = (pos == numTasks - 1) ? 0 : pos + 1;
+			}
+		}
+
+		return grpParts;
+	}
+
 	private static Pair<Integer, List<Integer>> getMaxNumValues(List<ColGroup> groups) {
 		int numVals = 1;
 		List<Integer> numValues = new ArrayList<>(groups.size());
 		int nr;
 		for(ColGroup grp : groups)
-			if(grp instanceof ColGroupValue){
+			if(grp instanceof ColGroupValue) {
 				nr = ((ColGroupValue) grp).getNumValues();
 				numValues.add(nr);
 				numVals = Math.max(numVals, nr);
-			} else{
+			}
+			else {
 				numValues.add(-1);
 			}
 		return new ImmutablePair<>(numVals, numValues);
@@ -1244,12 +1419,12 @@ public class CompressedMatrixBlock extends AbstractCompressedMatrixBlock {
 		return null;
 	}
 
-	private static class LeftMatrixMultTask implements Callable<Object> {
+	private static class LeftMatrixVectorMultTask implements Callable<Object> {
 		private final ArrayList<ColGroup> _groups;
 		private final MatrixBlock _vect;
 		private final MatrixBlock _ret;
 
-		protected LeftMatrixMultTask(ArrayList<ColGroup> groups, MatrixBlock vect, MatrixBlock ret) {
+		protected LeftMatrixVectorMultTask(ArrayList<ColGroup> groups, MatrixBlock vect, MatrixBlock ret) {
 			_groups = groups;
 			_vect = vect;
 			_ret = ret;
@@ -1261,8 +1436,10 @@ public class CompressedMatrixBlock extends AbstractCompressedMatrixBlock {
 			try {
 				Pair<Integer, List<Integer>> v = getMaxNumValues(_groups);
 				ColGroupValue.setupThreadLocalMemory(v.getLeft());
-				for(int i = 0; i< _groups.size(); i++){
-					_groups.get(i).leftMultByRowVector(_vect, _ret, v.getRight().get(i));
+				for(int i = 0; i < _groups.size(); i++) {
+					_groups.get(i).leftMultByRowVector(_vect.getDenseBlockValues(),
+						_ret.getDenseBlockValues(),
+						v.getRight().get(i));
 				}
 
 				ColGroupValue.cleanupThreadLocalMemory();
@@ -1274,6 +1451,60 @@ public class CompressedMatrixBlock extends AbstractCompressedMatrixBlock {
 		}
 	}
 
+	private static class LeftMatrixMatrixMultTask implements Callable<Object> {
+		private final List<ColGroup> _group;
+		private final double[] _that;
+		private final double[] _ret;
+		private final int _numRows;
+		private final int _numCols;
+		private final int _rl;
+		private final int _ru;
+		private final int _vOff;
+
+		protected LeftMatrixMatrixMultTask(List<ColGroup> group, double[] that, double[] ret, int numRows, int numCols,
+			int rl, int ru, int vOff) {
+			_group = group;
+			_that = that;
+			_ret = ret;
+			_numRows = numRows;
+			_numCols = numCols;
+			_rl = rl;
+			_ru = ru;
+			_vOff = vOff;
+		}
+
+		@Override
+		public Object call() {
+			// setup memory pool for reuse
+
+			double[][] materialized = new double[_group.size()][];
+			for(int i = 0; i < _group.size(); i++) {
+				materialized[i] = _group.get(i).getValues();
+			}
+			Pair<Integer, List<Integer>> v = getMaxNumValues(_group);
+			try {
+				ColGroupValue.setupThreadLocalMemory(v.getLeft());
+				for(int j = 0; j < _group.size(); j++) {
+					_group.get(j).leftMultByMatrix(_that,
+						_ret,
+						v.getRight().get(j),
+						materialized[j],
+						_numRows,
+						_numCols,
+						_rl,
+						_ru,
+						_vOff);
+				}
+				ColGroupValue.cleanupThreadLocalMemory();
+
+			}
+			catch(Exception e) {
+				throw new DMLRuntimeException(e);
+			}
+			return null;
+		}
+	}
+
 	private static class RightMatrixMultTask implements Callable<Long> {
 		private final List<ColGroup> _groups;
 		private final MatrixBlock _vect;
@@ -1426,4 +1657,14 @@ public class CompressedMatrixBlock extends AbstractCompressedMatrixBlock {
 			blklen % CompressionSettings.BITMAP_BLOCK_SZ : 0);
 	}
 
+	@Override
+	public String toString() {
+		StringBuilder sb = new StringBuilder();
+		sb.append("\nCompressed Matrix:");
+		sb.append("\nCols:" + getNumColumns() + " Rows:" + getNumRows());
+		for(ColGroup cg : _colGroups) {
+			sb.append("\n" + cg);
+		}
+		return sb.toString();
+	}
 }
\ No newline at end of file
diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java
index fde7769..e7f3893 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java
@@ -111,7 +111,7 @@ public class CompressedMatrixBlockFactory {
 		LOG.debug("--compression phase 1: " + _stats.getLastTimePhase());
 
 		if(sizeInfos.colsC.isEmpty()) {
-			LOG.warn("Abort block compression because all columns are incompressible.");
+			LOG.info("Abort block compression because all columns are incompressible.");
 			return new ImmutablePair<>(new MatrixBlock().copyShallow(mb), _stats);
 		}
 		// --------------------------------------------------
@@ -168,7 +168,7 @@ public class CompressedMatrixBlockFactory {
 		_stats.ratio = _stats.originalSize / (double) _stats.size;
 
 		if(_stats.ratio < 1) {
-			LOG.warn("Abort block compression because compression ratio is less than 1.");
+			LOG.info("Abort block compression because compression ratio is less than 1.");
 			return new ImmutablePair<>(new MatrixBlock().copyShallow(mb), _stats);
 		}
 
@@ -179,7 +179,7 @@ public class CompressedMatrixBlockFactory {
 		_stats.setNextTimePhase(time.stop());
 		_stats.setColGroupsCounts(colGroupList);
 
-		LOG.info("--num col groups: " + colGroupList.size() + ", -- num input cols: " + numCols);
+		LOG.debug("--num col groups: " + colGroupList.size() + ", -- num input cols: " + numCols);
 		LOG.debug("--compression phase 5: " + _stats.getLastTimePhase());
 		LOG.debug("--col groups types " + _stats.getGroupsTypesString());
 		LOG.debug("--col groups sizes " + _stats.getGroupsSizesString());
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroup.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroup.java
index b150a57..0f6caea 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroup.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroup.java
@@ -268,7 +268,35 @@ public abstract class ColGroup implements Serializable {
 	 * @param result  matrix block result
 	 * @param numVals The Number of values contained in the Column.
 	 */
-	public abstract void leftMultByRowVector(MatrixBlock vector, MatrixBlock result, int numVals);
+	public abstract void leftMultByRowVector(double[] vector, double[] result, int numVals);
+
+	/**
+	 * Multiply the slice of the matrix that this column group represents by a row vector on the left (the original
+	 * column vector is assumed to be transposed already i.e. its size now is 1xn).
+	 * 
+	 * @param vector  row vector
+	 * @param result  matrix block result
+	 * @param numVals The Number of values contained in the Column.
+	 * @param values  The materialized list of values contained in the dictionary.
+	 */
+	public abstract void leftMultByRowVector(double[] vector, double[] result, int numVals, double[] values);
+
+	/**
+	 * Multiply the slice of the matrix that this column group represents by a row vector on the left (the original
+	 * column vector is assumed to be transposed already i.e. its size now is 1xn).
+	 * 
+	 * @param matrix  matrix to left multiply
+	 * @param result  matrix block result
+	 * @param numVals The Number of values contained in the Column.
+	 * @param values  The materialized list of values contained in the dictionary.
+	 * @param numRows The number of rows in the matrix input
+	 * @param numCols The number of columns in the colGroups parent matrix.
+	 * @param rl      The row to start the matrix multiplication from
+	 * @param ru      The row to stop the matrix multiplication at.
+	 * @param vOff    The offset into the first argument matrix to start at. 
+	 */
+	public abstract void leftMultByMatrix(double[] matrix, double[] result, int numVals, double[] values, int numRows,
+		int numCols, int rl, int ru, int vOff);
 
 	/**
 	 * Perform the specified scalar operation directly on the compressed column group, without decompressing individual
@@ -279,8 +307,6 @@ public abstract class ColGroup implements Serializable {
 	 */
 	public abstract ColGroup scalarOperation(ScalarOperator op);
 
-	// public abstract ColGroup binaryMVR(MatrixBlock m2, BinaryOperator op);
-
 	/**
 	 * Unary Aggregate operator, since aggregate operators require new object output, the output becomes an uncompressed
 	 * matrix.
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java
index a3c7487..92db020 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java
@@ -140,7 +140,7 @@ public abstract class ColGroupDDC extends ColGroupValue {
 	}
 
 	@Override
-	protected void computeRowSums(double[] c, KahanFunction kplus, int rl, int ru) {
+	protected void computeRowSums(double[] c, KahanFunction kplus, int rl, int ru, boolean mean) {
 		final int numVals = getNumValues();
 		KahanObject kbuff = new KahanObject(0, 0);
 		KahanPlus kplus2 = KahanPlus.getKahanPlusFnObject();
@@ -150,7 +150,7 @@ public abstract class ColGroupDDC extends ColGroupValue {
 		for(int rix = rl; rix < ru; rix++) {
 			int index = getIndex(rix);
 			if(index != numVals) {
-				setandExecute(c, kbuff, kplus2, vals[index], rix * 2);
+				setandExecute(c, kbuff, kplus2, vals[index], rix * (2 + (mean ? 1 : 0)));
 			}
 		}
 	}
@@ -173,36 +173,19 @@ public abstract class ColGroupDDC extends ColGroupValue {
 		}
 	}
 
-	protected final void postScaling(double[] vals, double[] c, int numVals) {
+	public void postScaling(double[] values, double[] vals, double[] c, int numVals) {
+		postScaling(values, vals, c, numVals, 0, 0);
+	}
+
+	public void postScaling(double[] values, double[] vals, double[] c, int numVals, int i, int totalCols) {
 		final int ncol = getNumCols();
-		// final int numVals = getNumValues();
 
-		if(_dict instanceof QDictionary) {
-			QDictionary d = (QDictionary) _dict;
-			byte[] values = d.getValuesByte();
+		for(int j = 0; j < ncol; j++) {
+			int colIx = _colIndexes[j] + i * totalCols;
 			for(int k = 0, valOff = 0; k < numVals; k++, valOff += ncol) {
 				double aval = vals[k];
 				if(valOff != numVals) {
-					for(int j = 0; j < ncol; j++) {
-						int colIx = _colIndexes[j];
-						c[colIx] += aval * values[valOff + j];
-					}
-				}
-			}
-			for(int j = 0; j < ncol; j++) {
-				int colIx = _colIndexes[j];
-				c[colIx] = c[colIx] * d._scale;
-			}
-		}
-		else {
-			double[] values = getValues();
-			for(int k = 0, valOff = 0; k < numVals; k++, valOff += ncol) {
-				double aval = vals[k];
-				if(valOff != numVals) {
-					for(int j = 0; j < ncol; j++) {
-						int colIx = _colIndexes[j];
-						c[colIx] += aval * values[valOff + j];
-					}
+					c[colIx] += aval * values[valOff + j];
 				}
 			}
 		}
@@ -248,34 +231,101 @@ public abstract class ColGroupDDC extends ColGroupValue {
 	}
 
 	@Override
-	public void leftMultByRowVector(MatrixBlock vector, MatrixBlock result, int numVals) {
-		double[] a = ColGroupConverter.getDenseVector(vector);
-		double[] c = result.getDenseBlockValues();
-		numVals = (numVals == -1) ? getNumValues(): numVals;
+	public void leftMultByMatrix(double[] a, double[] c, int numVals, double[] values, int numRows, int numCols, int rl,
+		int ru, int voff) {
+		numVals = (numVals == -1) ? getNumValues() : numVals;
+
+		for(int i = rl, j = voff; i < ru; i++, j++) {
+			if(8 * numVals < _numRows) {
+				// iterative over codes and pre-aggregate inputs per code (guaranteed <=255)
+				// temporary array also avoids false sharing in multi-threaded environments
+				double[] vals = preAggregate(a, numVals, j);
+				postScaling(values, vals, c, numVals, i, numCols);
+			}
+			else {
+				for(int k = 0, aOff = j  *_numRows; k < _numRows; k++, aOff++) {
+					double aval = a[aOff];
+					if(aval != 0) {
+						int valOff = getIndex(k) * _colIndexes.length;
+						if(valOff != numVals) {
+							for(int h = 0; h < _colIndexes.length; h++) {
+								int colIx = _colIndexes[h] + i * numCols;
+								c[colIx] += aval * values[valOff + h];
+							}
+						}
+					}
+				}
+			}
+		}
+	}
 
-		if(8 * numVals < _numRows) {
-			// iterative over codes and pre-aggregate inputs per code (guaranteed <=255)
-			// temporary array also avoids false sharing in multi-threaded environments
-			double[] vals = allocDVector(numVals, true);
+	@Override
+	public void leftMultByRowVector(double[] a, double[] result, int numVals) {
+		numVals = (numVals == -1) ? getNumValues() : numVals;
+		double[] values = getValues();
+
+		leftMultByRowVector(a, result, numVals, values);
+
+	}
+
+	public double[] preAggregate(double[] a, int numVals) {
+		return preAggregate(a, numVals, 0);
+	}
+
+	/**
+	 * Pre aggregates a specific row from the input a which can be a row or a matrix.
+	 * 
+	 * @param a       the input vector or matrix to multiply with
+	 * @param numVals the number of values contained in the dictionary
+	 * @param aRows   the row index from a
+	 * @return the pre-aggregated values.
+	 */
+	public double[] preAggregate(double[] a, int numVals, int aRows) {
+		double[] vals;
+		if(aRows > 0) {
+			vals = allocDVector(numVals, true);
+			// int off = _numRows * aRows;
+			for(int i = 0, off = _numRows * aRows; i < _numRows; i++, off++) {
+				int index = getIndex(i);
+				if(index != numVals) { // Since we know that multiplying with 0 is .. 0 don't begin to aggregate.
+					vals[index] += a[off];
+				}
+			}
+		}
+		else {
+			vals = allocDVector(numVals, true);
 			for(int i = 0; i < _numRows; i++) {
 				int index = getIndex(i);
 				if(index != numVals) { // Since we know that multiplying with 0 is .. 0 don't begin to aggregate.
 					vals[index] += a[i];
 				}
 			}
-			postScaling(vals, c, numVals);
+		}
+		return vals;
+	}
+
+	@Override
+	public void leftMultByRowVector(double[] a, double[] c, int numVals, double[] values) {
+		// double[] c = result.getDenseBlockValues();
+		numVals = (numVals == -1) ? getNumValues() : numVals;
+
+		if(8 * numVals < _numRows) {
+			// iterative over codes and pre-aggregate inputs per code (guaranteed <=255)
+			// temporary array also avoids false sharing in multi-threaded environments
+			double[] vals = preAggregate(a, numVals);
+			postScaling(values, vals, c, numVals);
 		}
 		else {
 			// iterate over codes, compute all, and add to the result
-			double[] values = getValues();
 			for(int i = 0; i < _numRows; i++) {
 				double aval = a[i];
 				if(aval != 0)
 					for(int j = 0, valOff = getIndex(i) * _colIndexes.length; j < _colIndexes.length; j++)
-						c[_colIndexes[j]] += aval * values[valOff + j];
+						if(valOff != numVals) {
+							c[_colIndexes[j]] += aval * values[valOff + j];
+						}
 			}
 		}
-
 	}
 
 	@Override
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOLE.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOLE.java
index e7bf0d2..44c0c47 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOLE.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOLE.java
@@ -320,10 +320,12 @@ public class ColGroupOLE extends ColGroupOffset {
 			int[] apos = skipScan(numVals, rl);
 			double[] aval = preaggValues(numVals, sb);
 
+			// LOG.error(Arrays.toString(apos));
+			// LOG.error(rl + " - " + ru);
 			// step 2: cache conscious matrix-vector via horizontal scans
-			for(int bi = rl; bi < ru; bi += blksz2) {
+			for(int bi = rl; bi < ru; bi += blksz) {
 				int bimax = Math.min(bi + blksz2, ru);
-
+				
 				// horizontal segment scan, incl pos maintenance
 				for(int k = 0; k < numVals; k++) {
 					int boff = _ptr[k];
@@ -331,13 +333,13 @@ public class ColGroupOLE extends ColGroupOffset {
 					double val = aval[k];
 					int bix = apos[k];
 
+					int len = _data[boff + bix];
+					int pos = boff + bix + 1;
+					// LOG.error("Len: "+pos +  " pos: "+bi + " ii " + len);
 					for(int ii = bi; ii < bimax && bix < blen; ii += blksz) {
 						// prepare length, start, and end pos
-						int len = _data[boff + bix];
-						int pos = boff + bix + 1;
-
 						// compute partial results
-						LinearAlgebraUtils.vectAdd(val, c, _data, pos, ii, len);
+						LinearAlgebraUtils.vectAdd(val, c, _data, pos, ii, Math.min(len, ru));
 						bix += len + 1;
 					}
 
@@ -379,13 +381,16 @@ public class ColGroupOLE extends ColGroupOffset {
 	}
 
 	@Override
-	public void leftMultByRowVector(MatrixBlock vector, MatrixBlock result, int numVals) {
-		double[] a = ColGroupConverter.getDenseVector(vector);
-		double[] c = result.getDenseBlockValues();
+	public void leftMultByRowVector(double[] a, double[] c, int numVals) {
+		numVals = (numVals == -1) ? getNumValues() : numVals;
+		final double[] values = getValues();
+		leftMultByRowVector(a, c, numVals, values);
+	}
+
+	@Override
+	public void leftMultByRowVector(double[] a, double[] c, int numVals, double[] values) {
 		final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
 		final int numCols = getNumCols();
-		// final int numVals = getNumValues();
-		final double[] values = getValues();
 
 		if(numVals >= 1 && _numRows > blksz) {
 			// cache blocking config (see matrix-vector mult for explanation)
@@ -447,12 +452,92 @@ public class ColGroupOLE extends ColGroupOffset {
 	}
 
 	@Override
+	public void leftMultByMatrix(double[] a, double[] c, int numVals, double[] values, int numRows, int numCols, int rl,
+		int ru, int voff) {
+		final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
+		final int thisNumCols = getNumCols();
+
+		if(numVals >= 1 && _numRows > blksz) {
+
+			// cache blocking config (see matrix-vector mult for explanation)
+			final int blksz2 = 2 * CompressionSettings.BITMAP_BLOCK_SZ;
+
+			// step 1: prepare position and value arrays
+
+			// current pos per OLs / output values
+			int[] apos = allocIVector(numVals, true);
+			double[] cvals = allocDVector(numVals, true);
+
+			for(int i = rl, off = voff* _numRows; i < ru; i++, off += _numRows) {
+				// step 2: cache conscious matrix-vector via horizontal scans
+				for(int ai = 0; ai < _numRows; ai += blksz2) {
+					int aimax = Math.min(ai + blksz2, _numRows);
+
+					// horizontal segment scan, incl pos maintenance
+					for(int k = 0; k < numVals; k++) {
+						int boff = _ptr[k];
+						int blen = len(k);
+						int bix = apos[k] + off;
+						double vsum = 0;
+
+						for(int ii = ai; ii < aimax && bix < blen; ii += blksz) {
+							// prepare length, start, and end pos
+							int len = _data[boff + bix];
+							int pos = boff + bix + 1;
+
+							// iterate over bitmap blocks and compute partial results (a[i]*1)
+							vsum += LinearAlgebraUtils.vectSum(a, _data, ii + off, pos, len);
+							bix += len + 1;
+						}
+
+						apos[k] = bix;
+						cvals[k] += vsum;
+					}
+				}
+
+				// step 3: scale partial results by values and write to global output
+				for(int k = 0, valOff = 0; k < numVals; k++, valOff += thisNumCols)
+					for(int j = 0; j < thisNumCols; j++) {
+						int colIx = _colIndexes[j] + i * numCols;
+						c[colIx] += cvals[k] * values[valOff + j];
+					}
+			}
+		}
+		else {
+
+			for(int i = rl, offR = voff* _numRows; i < ru; i++, offR += _numRows) {
+				for(int k = 0, valOff = 0; k < numVals; k++, valOff += thisNumCols) {
+					int boff = _ptr[k];
+					int blen = len(k);
+
+					// iterate over bitmap blocks and add partial results
+					double vsum = 0;
+					for(int bix = 0, off = 0; bix < blen; bix += _data[boff + bix] + 1, off += blksz)
+						vsum += LinearAlgebraUtils.vectSum(a, _data, off + offR, boff + bix + 1, _data[boff + bix]);
+
+					// scale partial results by values and write results
+
+					for(int j = 0; j < thisNumCols; j++) {
+						int colIx = _colIndexes[j] + i * numCols;
+						c[colIx] += vsum * values[valOff + j];
+					}
+				}
+			}
+		}
+	}
+
+	// @Override
+	// public void leftMultByRowVector(double[] a, double[] c, int numVals, byte[] values) {
+	// throw new NotImplementedException("Not Implemented Byte fore OLE");
+	// }
+
+	@Override
 	protected final void computeSum(double[] c, KahanFunction kplus) {
 		c[0] += _dict.sum(getCounts(), _colIndexes.length, kplus);
 	}
 
 	@Override
-	protected final void computeRowSums(double[] c, KahanFunction kplus, int rl, int ru) {
+	protected final void computeRowSums(double[] c, KahanFunction kplus, int rl, int ru, boolean mean) {
 		KahanObject kbuff = new KahanObject(0, 0);
 		KahanPlus kplus2 = KahanPlus.getKahanPlusFnObject();
 		final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
@@ -484,7 +569,7 @@ public class ColGroupOLE extends ColGroupOffset {
 						// compute partial results
 						for(int i = 0; i < len; i++) {
 							int rix = ii + _data[pos + i];
-							setandExecute(c, kbuff, kplus2, val, rix * 2);
+							setandExecute(c, kbuff, kplus2, val, rix * (2 + (mean ? 1 : 0)));
 						}
 						bix += len + 1;
 					}
@@ -509,7 +594,7 @@ public class ColGroupOLE extends ColGroupOffset {
 						slen = _data[boff + bix];
 						for(int i = 1; i <= slen; i++) {
 							int rix = off + _data[boff + bix + i];
-							setandExecute(c, kbuff, kplus2, val, rix * 2);
+							setandExecute(c, kbuff, kplus2, val, rix * (2 + (mean ? 1 : 0)));
 						}
 					}
 				}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOffset.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOffset.java
index 912910a..1d6f1d1 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOffset.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOffset.java
@@ -293,18 +293,6 @@ public abstract class ColGroupOffset extends ColGroupValue {
 
 	protected abstract boolean[] computeZeroIndicatorVector();
 
-	// protected boolean findZeros(){
-		// boolean[] lind = computeZeroIndicatorVector();
-		// _zeros = false;
-		// for(boolean x : lind){
-		// 	if(x){
-		// 		_zeros = true;
-		// 		break;
-		// 	}
-		// }
-		// return _zeros;
-	// }
-
 	@Override
 	public Iterator<IJV> getIterator(int rl, int ru, boolean inclZeros, boolean rowMajor) {
 		if(rowMajor)
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupRLE.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupRLE.java
index b27ec8e..65658c7 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupRLE.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupRLE.java
@@ -358,12 +358,15 @@ public class ColGroupRLE extends ColGroupOffset {
 	}
 
 	@Override
-	public void leftMultByRowVector(MatrixBlock vector, MatrixBlock result, int numVals) {
-		double[] a = ColGroupConverter.getDenseVector(vector);
-		double[] c = result.getDenseBlockValues();
-		final int numCols = getNumCols();
-		// final int numVals = getNumValues();
+	public void leftMultByRowVector(double[] a, double[] c, int numVals) {
+		numVals = (numVals == -1) ? getNumValues() : numVals;
 		final double[] values = getValues();
+		leftMultByRowVector(a, c, numVals, values);
+	}
+
+	@Override
+	public void leftMultByRowVector(double[] a, double[] c, int numVals, double[] values) {
+		final int numCols = getNumCols();
 
 		if(numVals >= 1 && _numRows > CompressionSettings.BITMAP_BLOCK_SZ) {
 			final int blksz = 2 * CompressionSettings.BITMAP_BLOCK_SZ;
@@ -429,6 +432,84 @@ public class ColGroupRLE extends ColGroupOffset {
 	}
 
 	@Override
+	public void leftMultByMatrix(double[] a, double[] c, int numVals, double[] values, int numRows, int numCols, int rl,
+		int ru, int voff) {
+		// throw new NotImplementedException();
+		final int thisNumCols = getNumCols();
+			
+		if(numVals >= 1 && _numRows > CompressionSettings.BITMAP_BLOCK_SZ) {
+			final int blksz = 2 * CompressionSettings.BITMAP_BLOCK_SZ;
+
+			// double[] aRow = new double[a.length / numRows];
+			// step 1: prepare position and value arrays
+			int[] astart = new int[numVals];
+			for(int i = rl, off = voff * _numRows; i < ru; i++, off += _numRows) {
+				// System.arraycopy(a, (a.length / numRows) * i, aRow, 0, a.length / numRows);
+				// current pos per OLs / output values
+				int[] apos = allocIVector(numVals, true);
+				double[] cvals = allocDVector(numVals, true);
+
+				// step 2: cache conscious matrix-vector via horizontal scans
+				for(int ai = 0; ai < _numRows; ai += blksz) {
+					int aimax = Math.min(ai + blksz, _numRows);
+
+					// horizontal scan, incl pos maintenance
+					for(int k = 0; k < numVals; k++) {
+						int boff = _ptr[k];
+						int blen = len(k);
+						int bix = apos[k];
+						int start = astart[k];
+
+						// compute partial results, not aligned
+						while(bix < blen & start + off < aimax) {
+							start += _data[boff + bix];
+							int len = _data[boff + bix + 1];
+							cvals[k] += LinearAlgebraUtils.vectSum(a, start + off, len);
+							start += len;
+							bix += 2;
+						}
+
+						apos[k] = bix;
+						astart[k] = start;
+					}
+				}
+
+				// step 3: scale partial results by values and write to global output
+				for(int k = 0, valOff = 0; k < numVals; k++, valOff += thisNumCols)
+					for(int j = 0; j < thisNumCols; j++){
+
+						int colIx = _colIndexes[j] + i * numCols;
+						c[colIx] += cvals[k] * values[valOff + j];
+					}
+			}
+		}
+		else {
+			// iterate over all values and their bitmaps
+			for(int i = rl, off = voff * _numRows; i < ru; i++, off += _numRows) {
+				for(int k = 0, valOff = 0; k < numVals; k++, valOff += thisNumCols) {
+					int boff = _ptr[k];
+					int blen = len(k);
+
+					double vsum = 0;
+					int curRunEnd = 0;
+					for(int bix = 0; bix < blen; bix += 2) {
+						int curRunStartOff = curRunEnd + _data[boff + bix];
+						int curRunLen = _data[boff + bix + 1];
+						vsum += LinearAlgebraUtils.vectSum(a, curRunStartOff + off, curRunLen);
+						curRunEnd = curRunStartOff + curRunLen;
+					}
+
+					for(int j = 0; j < thisNumCols; j++) {
+						int colIx = _colIndexes[j] + i * numCols;
+						// scale partial results by values and write results
+						c[colIx] += vsum * values[valOff + j];
+					}
+				}
+			}
+		}
+	}
+
+	@Override
 	public ColGroup scalarOperation(ScalarOperator op) {
 		double val0 = op.executeScalar(0);
 
@@ -462,7 +543,7 @@ public class ColGroupRLE extends ColGroupOffset {
 	}
 
 	@Override
-	protected final void computeRowSums(double[] c, KahanFunction kplus, int rl, int ru) {
+	protected final void computeRowSums(double[] c, KahanFunction kplus, int rl, int ru, boolean mean) {
 		KahanObject kbuff = new KahanObject(0, 0);
 		KahanPlus kplus2 = KahanPlus.getKahanPlusFnObject();
 
@@ -497,7 +578,7 @@ public class ColGroupRLE extends ColGroupOffset {
 						int from = Math.max(bi, start + lstart);
 						int to = Math.min(start + lstart + llen, bimax);
 						for(int rix = from; rix < to; rix++) {
-							setandExecute(c, kbuff, kplus2, val, rix);
+							setandExecute(c, kbuff, kplus2, val, rix * (2 + (mean ? 1 : 0)));
 						}
 						if(start + lstart + llen >= bimax)
 							break;
@@ -525,7 +606,7 @@ public class ColGroupRLE extends ColGroupOffset {
 						curRunStartOff = curRunEnd + _data[boff + bix];
 						curRunEnd = curRunStartOff + _data[boff + bix + 1];
 						for(int rix = curRunStartOff; rix < curRunEnd && rix < ru; rix++) {
-							setandExecute(c, kbuff, kplus2, val, rix * 2);
+							setandExecute(c, kbuff, kplus2, val, rix * (2 + (mean ? 1 : 0)));
 						}
 					}
 				}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSizes.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSizes.java
index 709975f..e18fb14 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSizes.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSizes.java
@@ -51,16 +51,12 @@ public class ColGroupSizes {
 
 	public static long estimateInMemorySizeDDC(int nrCols, int uniqueVals, boolean lossy) {
 		long size = estimateInMemorySizeGroupValue(nrCols, uniqueVals, lossy);
-		// if(!zeros){
-		// 	size += -nrCols * 8;
-		// }
 		return size;
 	}
 
 	public static long estimateInMemorySizeDDC1(int nrCols, int uniqueVals, int dataLength, boolean lossy) {
 		if(uniqueVals > 255)
 			return Long.MAX_VALUE;
-		// LOG.debug("DD1C: " + nrCols + " nr unique: " + uniqueVals + " DataLength: " + dataLength);
 		long size = estimateInMemorySizeDDC(nrCols, uniqueVals, lossy);
 		size += MemoryEstimates.byteArrayCost(dataLength);
 		return size;
@@ -69,14 +65,12 @@ public class ColGroupSizes {
 	public static long estimateInMemorySizeDDC2(int nrCols, int uniqueVals, int dataLength, boolean lossy) {
 		if(uniqueVals > Character.MAX_VALUE)
 			return Long.MAX_VALUE;
-		// LOG.debug("DD2C: " + nrCols + "nr unique: " + uniqueVals +" datalen: "+ dataLength);
 		long size = estimateInMemorySizeDDC(nrCols, uniqueVals, lossy);
 		size += MemoryEstimates.charArrayCost(dataLength);
 		return size;
 	}
 
 	public static long estimateInMemorySizeOffset(int nrColumns, int nrValues, int pointers, int offsetLength, boolean lossy) {
-		// LOG.debug("OFFSET list: nrC " + nrColumns +"\tnrV " + nrValues + "\tpl "+pointers +"\tdl "+ offsetLength);
 		long size = estimateInMemorySizeGroupValue(nrColumns, nrValues, lossy);
 		size += MemoryEstimates.intArrayCost(pointers);
 		size += MemoryEstimates.charArrayCost(offsetLength);
@@ -87,8 +81,6 @@ public class ColGroupSizes {
 		nrColumns = nrColumns > 0 ? nrColumns : 1;
 		offsetLength += (nrRows / CompressionSettings.BITMAP_BLOCK_SZ) * 2;
 		long size = 0;
-		// LOG.debug("OLE cols: " + nrColumns + " vals: " + nrValues + " pointers: " + (nrValues / nrColumns + 1)
-		// + " offsetLength: " + (offsetLength) + " runs: " + nrValues / nrColumns);
 		size = estimateInMemorySizeOffset(nrColumns, nrValues, (nrValues / nrColumns) + 1, offsetLength, lossy);
 		if (nrRows > CompressionSettings.BITMAP_BLOCK_SZ * 2){
 			size += MemoryEstimates.intArrayCost((int) nrValues / nrColumns);
@@ -99,7 +91,6 @@ public class ColGroupSizes {
 	public static long estimateInMemorySizeRLE(int nrColumns, int nrValues, int nrRuns, int nrRows, boolean lossy) {
 		nrColumns = nrColumns > 0 ? nrColumns : 1;
 		int offsetLength = (nrRuns) * 2;
-		// LOG.debug("\n\tRLE cols: " + nrColumns + " vals: " + nrValues + " offsetLength: " + offsetLength);
 		long size = estimateInMemorySizeOffset(nrColumns, nrValues, (nrValues / nrColumns) + 1, offsetLength, lossy);
 
 		return size;
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUncompressed.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUncompressed.java
index 74660a1..4f81210 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUncompressed.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUncompressed.java
@@ -274,9 +274,23 @@ public class ColGroupUncompressed extends ColGroup {
 	}
 
 	@Override
-	public void leftMultByRowVector(MatrixBlock vector, MatrixBlock result, int numVals) {
-		MatrixBlock pret = new MatrixBlock(1, _colIndexes.length, false);
-		LibMatrixMult.matrixMult(vector, _data, pret);
+	public void leftMultByRowVector(double[] vector, double[] c, int numVals) {
+		throw new NotImplementedException("Should not be called use other matrix function");
+	}
+
+	@Override
+	public void leftMultByRowVector(double[] vector, double[] c, int numVals, double[] values) {
+		throw new NotImplementedException("Should not be called use other matrix function");
+	}
+
+	@Override
+	public void leftMultByMatrix(double[] vector, double[] c, int numVals, double[] values, int numRows, int numCols, int rl, int ru, int vOff) {
+		throw new NotImplementedException("Should not be called use other matrix function");
+	}
+
+	public void leftMultByMatrix(MatrixBlock matrix, MatrixBlock result) {
+		MatrixBlock pret = new MatrixBlock(matrix.getNumRows(), _colIndexes.length, false);
+		LibMatrixMult.matrixMult(matrix, _data, pret);
 
 		// copying partialResult to the proper indices of the result
 		if(!pret.isEmptyBlock(false)) {
@@ -287,24 +301,6 @@ public class ColGroupUncompressed extends ColGroup {
 		}
 	}
 
-	// @Override
-	// public void leftMultByRowVector(ColGroupDDC vector, MatrixBlock result) {
-	// 	throw new NotImplementedException();
-	// }
-
-	// public void leftMultByRowVector(MatrixBlock vector, MatrixBlock result) {
-	// 	MatrixBlock pret = new MatrixBlock(1, _colIndexes.length, false);
-	// 	LibMatrixMult.matrixMult(vector, _data, pret, k);
-
-	// 	// copying partialResult to the proper indices of the result
-	// 	if(!pret.isEmptyBlock(false)) {
-	// 		double[] rsltArr = result.getDenseBlockValues();
-	// 		for(int colIx = 0; colIx < _colIndexes.length; colIx++)
-	// 			rsltArr[_colIndexes[colIx]] = pret.quickGetValue(0, colIx);
-	// 		result.recomputeNonZeros();
-	// 	}
-	// }
-
 	@Override
 	public ColGroup scalarOperation(ScalarOperator op) {
 		// execute scalar operations
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupValue.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupValue.java
index 3b184a2..d58b243 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupValue.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupValue.java
@@ -34,6 +34,7 @@ import org.apache.sysds.runtime.functionobjects.Builtin.BuiltinCode;
 import org.apache.sysds.runtime.functionobjects.KahanFunction;
 import org.apache.sysds.runtime.functionobjects.KahanPlus;
 import org.apache.sysds.runtime.functionobjects.KahanPlusSq;
+import org.apache.sysds.runtime.functionobjects.Mean;
 import org.apache.sysds.runtime.functionobjects.ReduceAll;
 import org.apache.sysds.runtime.functionobjects.ReduceCol;
 import org.apache.sysds.runtime.functionobjects.ReduceRow;
@@ -113,6 +114,10 @@ public abstract class ColGroupValue extends ColGroup {
 		return _dict.getValues();
 	}
 
+	public byte[] getByteValues() {
+		return ((QDictionary)_dict).getValuesByte();
+	}
+
 	@Override
 	public MatrixBlock getValuesAsBlock() {
 		final double[] values = getValues();
@@ -263,14 +268,15 @@ public abstract class ColGroupValue extends ColGroup {
 	@Override
 	public void unaryAggregateOperations(AggregateUnaryOperator op, double[] c, int rl, int ru) {
 		// sum and sumsq (reduceall/reducerow over tuples and counts)
-		if(op.aggOp.increOp.fn instanceof KahanPlus || op.aggOp.increOp.fn instanceof KahanPlusSq) {
-			KahanFunction kplus = (op.aggOp.increOp.fn instanceof KahanPlus) ? KahanPlus
+		if(op.aggOp.increOp.fn instanceof KahanPlus || op.aggOp.increOp.fn instanceof KahanPlusSq || op.aggOp.increOp.fn  instanceof Mean) {
+			KahanFunction kplus = (op.aggOp.increOp.fn instanceof KahanPlus || op.aggOp.increOp.fn instanceof Mean) ? KahanPlus
 				.getKahanPlusFnObject() : KahanPlusSq.getKahanPlusSqFnObject();
+			boolean mean = op.aggOp.increOp.fn  instanceof Mean;
 
 			if(op.indexFn instanceof ReduceAll)
 				computeSum(c, kplus);
 			else if(op.indexFn instanceof ReduceCol)
-				computeRowSums(c, kplus, rl, ru);
+				computeRowSums(c, kplus, rl, ru, mean);
 			else if(op.indexFn instanceof ReduceRow)
 				computeColSums(c, kplus);
 		}
@@ -406,7 +412,7 @@ public abstract class ColGroupValue extends ColGroup {
 
 	protected abstract void computeSum(double[] c, KahanFunction kplus);
 
-	protected abstract void computeRowSums(double[] c, KahanFunction kplus, int rl, int ru);
+	protected abstract void computeRowSums(double[] c, KahanFunction kplus, int rl, int ru, boolean mean);
 
 	protected abstract void computeColSums(double[] c, KahanFunction kplus);
 
diff --git a/src/main/java/org/apache/sysds/runtime/compress/utils/LinearAlgebraUtils.java b/src/main/java/org/apache/sysds/runtime/compress/utils/LinearAlgebraUtils.java
index b24d6f1..4093b25 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/utils/LinearAlgebraUtils.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/utils/LinearAlgebraUtils.java
@@ -51,6 +51,7 @@ public class LinearAlgebraUtils {
 		LibMatrixMult.vectAdd(a, c, ai, ci, len);
 	}
 
+
 	public static void vectAdd(final double aval, double[] c, char[] bix, final int bi, final int ci, final int len) {
 		final int bn = len % 8;
 
@@ -69,6 +70,7 @@ public class LinearAlgebraUtils {
 			c[ci + bix[j + 6]] += aval;
 			c[ci + bix[j + 7]] += aval;
 		}
+
 	}
 
 	public static void vectAdd(final double aval, double[] c, final int ci, final int len) {
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/MatrixBlock.java b/src/main/java/org/apache/sysds/runtime/matrix/data/MatrixBlock.java
index ed52481..0eb484a 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/MatrixBlock.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/MatrixBlock.java
@@ -207,6 +207,13 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 		}
 	}
 	
+	public MatrixBlock(int rl, int cl, DenseBlock dBlock){
+		rlen = rl;
+		clen = cl;
+		sparse = false;
+		denseBlock = dBlock;
+	}
+
 	////////
 	// Initialization methods
 	// (reset, init, allocate, etc)
diff --git a/src/test/java/org/apache/sysds/test/component/compress/AbstractCompressedUnaryTests.java b/src/test/java/org/apache/sysds/test/component/compress/AbstractCompressedUnaryTests.java
index 560d234..dd94aa2 100644
--- a/src/test/java/org/apache/sysds/test/component/compress/AbstractCompressedUnaryTests.java
+++ b/src/test/java/org/apache/sysds/test/component/compress/AbstractCompressedUnaryTests.java
@@ -43,7 +43,8 @@ public abstract class AbstractCompressedUnaryTests extends CompressedTestBase {
 	}
 
 	enum AggType {
-		ROWSUMS, COLSUMS, SUM, ROWSUMSSQ, COLSUMSSQ, SUMSQ, ROWMAXS, COLMAXS, MAX, ROWMINS, COLMINS, MIN, MEAN
+		ROWSUMS, COLSUMS, SUM, ROWSUMSSQ, COLSUMSSQ, SUMSQ, ROWMAXS, COLMAXS, MAX, ROWMINS, COLMINS, MIN, MEAN, COLMEAN,
+		ROWMEAN
 	}
 
 	@Test
@@ -106,14 +107,21 @@ public abstract class AbstractCompressedUnaryTests extends CompressedTestBase {
 		testUnaryOperators(AggType.MIN);
 	}
 
-	@Test(expected = NotImplementedException.class)
+	@Test
 	public void testUnaryOperator_MEAN() {
-		// if Input was not compressed then just pass test
-		if(!(cmb instanceof CompressedMatrixBlock))
-			throw new NotImplementedException("Test Passed");
 		testUnaryOperators(AggType.MEAN);
 	}
 
+	@Test
+	public void testUnaryOperator_COLMEAN() {
+		testUnaryOperators(AggType.COLMEAN);
+	}
+
+	@Test
+	public void testUnaryOperator_ROWMEAN() {
+		testUnaryOperators(AggType.ROWMEAN);
+	}
+
 	protected AggregateUnaryOperator getUnaryOperator(AggType aggType, int threads) {
 		switch(aggType) {
 			case SUM:
@@ -142,6 +150,10 @@ public abstract class AbstractCompressedUnaryTests extends CompressedTestBase {
 				return InstructionUtils.parseBasicAggregateUnaryOperator("uacmin", threads);
 			case MEAN:
 				return InstructionUtils.parseBasicAggregateUnaryOperator("uamean", threads);
+			case ROWMEAN:
+				return InstructionUtils.parseBasicAggregateUnaryOperator("uarmean", threads);
+			case COLMEAN:
+				return InstructionUtils.parseBasicAggregateUnaryOperator("uacmean", threads);
 			default:
 				throw new NotImplementedException("Not Supported Aggregate Unary operator in test");
 		}
@@ -162,9 +174,9 @@ public abstract class AbstractCompressedUnaryTests extends CompressedTestBase {
 			double[][] d1 = DataConverter.convertToDoubleMatrix(ret1);
 			double[][] d2 = DataConverter.convertToDoubleMatrix(ret2);
 			int dim1 = (aggType == AggType.ROWSUMS || aggType == AggType.ROWSUMSSQ || aggType == AggType.ROWMAXS ||
-				aggType == AggType.ROWMINS) ? rows : 1;
+				aggType == AggType.ROWMINS || aggType == AggType.ROWMEAN) ? rows : 1;
 			int dim2 = (aggType == AggType.COLSUMS || aggType == AggType.COLSUMSSQ || aggType == AggType.COLMAXS ||
-				aggType == AggType.COLMINS) ? cols : 1;
+				aggType == AggType.COLMINS || aggType == AggType.COLMEAN) ? cols : 1;
 
 			assertTrue("dim 1 is equal in non compressed res", d1.length == dim1);
 			assertTrue("dim 1 is equal in compressed res", d2.length == dim1);
@@ -181,15 +193,25 @@ public abstract class AbstractCompressedUnaryTests extends CompressedTestBase {
 				}
 				else if(aggType == AggType.SUM) {
 					TestUtils.compareMatrices(d1, d2, lossyTolerance * 10 * cols * rows, css);
-
+				}
+				else if(aggType == AggType.MEAN) {
+					TestUtils.compareMatrices(d1, d2, lossyTolerance * cols * rows, css);
+				}
+				else if(aggType == AggType.ROWMEAN) {
+					TestUtils.compareMatrices(d1, d2, lossyTolerance, css);
 				}
 				else {
 					boolean ignoreZero = true;
-					TestUtils.compareMatricesPercentageDistance(d1, d2, 0.1, 0.9, css, ignoreZero);
+					TestUtils.compareMatricesPercentageDistance(d1, d2, 0.8, 0.9, css, ignoreZero);
 				}
 			}
 			else {
-				TestUtils.compareMatricesBitAvgDistance(d1, d2, 2048, 30, css);
+				if(aggType == AggType.ROWMEAN) {
+					TestUtils.compareMatrices(d1, d2, 0.0001, css);
+				}
+				else {
+					TestUtils.compareMatricesBitAvgDistance(d1, d2, 2048, 30, css);
+				}
 			}
 		}
 		catch(NotImplementedException e) {
diff --git a/src/test/java/org/apache/sysds/test/component/compress/CompressedTestBase.java b/src/test/java/org/apache/sysds/test/component/compress/CompressedTestBase.java
index 2440c2d..b9276da 100644
--- a/src/test/java/org/apache/sysds/test/component/compress/CompressedTestBase.java
+++ b/src/test/java/org/apache/sysds/test/component/compress/CompressedTestBase.java
@@ -19,7 +19,7 @@
 
 package org.apache.sysds.test.component.compress;
 
-import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
 
 import java.util.ArrayList;
 import java.util.Collection;
@@ -57,7 +57,7 @@ public abstract class CompressedTestBase extends TestBase {
 	protected static final Log LOG = LogFactory.getLog(CompressedTestBase.class.getName());
 
 	protected static SparsityType[] usedSparsityTypes = new SparsityType[] { // Sparsity 0.9, 0.1, 0.01 and 0.0
-		SparsityType.FULL,
+		// SparsityType.FULL,
 		// SparsityType.DENSE,
 		SparsityType.SPARSE,
 		// SparsityType.ULTRA_SPARSE,
@@ -67,14 +67,13 @@ public abstract class CompressedTestBase extends TestBase {
 	protected static ValueType[] usedValueTypes = new ValueType[] {
 		// ValueType.RAND,
 		// ValueType.CONST,
-		ValueType.RAND_ROUND,
+		// ValueType.RAND_ROUND,
 		ValueType.OLE_COMPRESSIBLE,
 		// ValueType.RLE_COMPRESSIBLE,
 	};
 
-	protected static ValueRange[] usedValueRanges = new ValueRange[] {
-		ValueRange.SMALL, 
-		// ValueRange.LARGE,
+	protected static ValueRange[] usedValueRanges = new ValueRange[] {ValueRange.SMALL,
+		ValueRange.LARGE,
 		// ValueRange.BYTE
 	};
 
@@ -84,30 +83,30 @@ public abstract class CompressedTestBase extends TestBase {
 		// CLA TESTS!
 
 		new CompressionSettingsBuilder().setSamplingRatio(0.1).setSeed(compressionSeed)
-			.setValidCompressions(EnumSet.of(CompressionType.DDC)).setInvestigateEstimate(true).create(),
+		.setValidCompressions(EnumSet.of(CompressionType.DDC)).setInvestigateEstimate(true).create(),
 		new CompressionSettingsBuilder().setSamplingRatio(0.1).setSeed(compressionSeed)
-			.setValidCompressions(EnumSet.of(CompressionType.OLE)).setInvestigateEstimate(true).create(),
+		.setValidCompressions(EnumSet.of(CompressionType.OLE)).setInvestigateEstimate(true).create(),
 		new CompressionSettingsBuilder().setSamplingRatio(0.1).setSeed(compressionSeed)
-			.setValidCompressions(EnumSet.of(CompressionType.RLE)).setInvestigateEstimate(true).create(),
+		.setValidCompressions(EnumSet.of(CompressionType.RLE)).setInvestigateEstimate(true).create(),
 		new CompressionSettingsBuilder().setSamplingRatio(0.1).setSeed(compressionSeed).setInvestigateEstimate(true)
-			.create(),
+		.create(),
 		new CompressionSettingsBuilder().setSamplingRatio(1.0).setSeed(compressionSeed).setInvestigateEstimate(true)
-			.setAllowSharedDictionary(false).setmaxStaticColGroupCoCode(1).create(),
+		.setAllowSharedDictionary(false).setmaxStaticColGroupCoCode(1).create(),
 
-		// LOSSY TESTS!
+		// // LOSSY TESTS!
 
 		new CompressionSettingsBuilder().setSamplingRatio(0.1).setSeed(compressionSeed)
-			.setValidCompressions(EnumSet.of(CompressionType.DDC)).setInvestigateEstimate(true).setLossy(true).create(),
+		.setValidCompressions(EnumSet.of(CompressionType.DDC)).setInvestigateEstimate(true).setLossy(true).create(),
 		new CompressionSettingsBuilder().setSamplingRatio(0.1).setSeed(compressionSeed)
-			.setValidCompressions(EnumSet.of(CompressionType.OLE)).setInvestigateEstimate(true).setLossy(true).create(),
+		.setValidCompressions(EnumSet.of(CompressionType.OLE)).setInvestigateEstimate(true).setLossy(true).create(),
 		new CompressionSettingsBuilder().setSamplingRatio(0.1).setSeed(compressionSeed)
-			.setValidCompressions(EnumSet.of(CompressionType.RLE)).setInvestigateEstimate(true).setLossy(true).create(),
+		.setValidCompressions(EnumSet.of(CompressionType.RLE)).setInvestigateEstimate(true).setLossy(true).create(),
 
 		new CompressionSettingsBuilder().setSamplingRatio(0.1).setSeed(compressionSeed).setInvestigateEstimate(true)
 			.create(),
 
-		// new CompressionSettingsBuilder().setSamplingRatio(1.0).setSeed(compressionSeed).setInvestigateEstimate(true)
-		// .setAllowSharedDictionary(false).setmaxStaticColGroupCoCode(1).setLossy(true).create(),
+		new CompressionSettingsBuilder().setSamplingRatio(1.0).setSeed(compressionSeed).setInvestigateEstimate(true)
+		.setAllowSharedDictionary(false).setmaxStaticColGroupCoCode(1).setLossy(true).create(),
 
 		// COCODING TESTS!!
 
@@ -122,12 +121,12 @@ public abstract class CompressedTestBase extends TestBase {
 
 	protected static MatrixTypology[] usedMatrixTypology = new MatrixTypology[] { // Selected Matrix Types
 		MatrixTypology.SMALL,
-		// MatrixTypology.FEW_COL,
+		MatrixTypology.FEW_COL,
 		// MatrixTypology.FEW_ROW,
 		// MatrixTypology.LARGE,
 		MatrixTypology.SINGLE_COL,
 		// MatrixTypology.SINGLE_ROW,
-		MatrixTypology.L_ROWS,
+		// MatrixTypology.L_ROWS,
 		// MatrixTypology.XL_ROWS,
 	};
 
@@ -173,16 +172,17 @@ public abstract class CompressedTestBase extends TestBase {
 		}
 		catch(Exception e) {
 			e.printStackTrace();
-			assertTrue("\nCompressionTest Init failed with settings: " + this.toString(), false);
+			fail("\nCompressionTest Init failed with settings: " + this.toString());
 		}
 
 	}
-
+	/**
+	 * Tolerance for encoding values is the maximum value in the dataset divided by the number of distinct values
+	 * available in a single Byte (since we encode our quantization in Byte)
+	 * 
+	 * @param valueRange The value range used as input
+	 */
 	private void setLossyTolerance(ValueRange valueRange) {
-		/**
-		 * Tolerance for encoding values is the maximum value in dataset divided by number distinct values available in
-		 * a single Byte (since we encode our quntization in Byte)
-		 */
 		lossyTolerance = (double) (Math.max(TestConstants.getMaxRangeValue(valueRange),
 			Math.abs(TestConstants.getMinRangeValue(valueRange)))) * (1.0 / 127.0) / 2.0;
 		// LOG.debug("TOLERANCE IN TEST:" + lossyTolerance);
@@ -394,6 +394,86 @@ public abstract class CompressedTestBase extends TestBase {
 	}
 
 	@Test
+	public void testLeftMatrixMatrixMultSmall() {
+		try {
+			if(!(cmb instanceof CompressedMatrixBlock))
+				return; // Input was not compressed then just pass test
+
+			MatrixBlock matrix = DataConverter
+				.convertToMatrixBlock(TestUtils.generateTestMatrix(2, rows, 0.9, 1.5, 1.0, 3));
+
+			// Make Operator
+			AggregateBinaryOperator abop = InstructionUtils.getMatMultOperator(_k);
+
+			// matrix-matrix uncompressed
+			MatrixBlock ret1 = mb.aggregateBinaryOperations(matrix, mb, new MatrixBlock(), abop);
+
+			// matrix-matrix compressed
+			MatrixBlock ret2 = cmb.aggregateBinaryOperations(matrix, cmb, new MatrixBlock(), abop);
+
+			// compare compressed result against uncompressed result
+			double[][] d1 = DataConverter.convertToDoubleMatrix(ret1);
+			double[][] d2 = DataConverter.convertToDoubleMatrix(ret2);
+			if(compressionSettings.lossy) {
+				TestUtils.compareMatricesPercentageDistance(d1, d2, 0.25, 0.96, compressionSettings.toString());
+			}
+			else {
+				// rows
+				if(rows > 65000) {
+					TestUtils.compareMatricesPercentageDistance(d1, d2, 0.01, 0.99, compressionSettings.toString());
+				}
+				else {
+
+					TestUtils.compareMatricesBitAvgDistance(d1, d2, 10000, 500, compressionSettings.toString());
+				}
+			}
+		}
+		catch(Exception e) {
+			e.printStackTrace();
+			throw new RuntimeException(this.toString() + "\n" + e.getMessage(), e);
+		}
+	}
+
+	@Test
+	public void testLeftMatrixMatrixMultMedium() {
+		try {
+			if(!(cmb instanceof CompressedMatrixBlock))
+				return; // Input was not compressed then just pass test
+
+			MatrixBlock matrix = DataConverter
+				.convertToMatrixBlock(TestUtils.generateTestMatrix(402, rows, 0.9, 1.5, 1.0, 3));
+
+			// Make Operator
+			AggregateBinaryOperator abop = InstructionUtils.getMatMultOperator(_k);
+
+			// matrix-matrix uncompressed
+			MatrixBlock ret1 = mb.aggregateBinaryOperations(matrix, mb, new MatrixBlock(), abop);
+
+			// matrix-matrix compressed
+			MatrixBlock ret2 = cmb.aggregateBinaryOperations(matrix, cmb, new MatrixBlock(), abop);
+
+			// compare compressed result against uncompressed result
+			double[][] d1 = DataConverter.convertToDoubleMatrix(ret1);
+			double[][] d2 = DataConverter.convertToDoubleMatrix(ret2);
+			if(compressionSettings.lossy) {
+				TestUtils.compareMatricesPercentageDistance(d1, d2, 0.25, 0.96, compressionSettings.toString());
+			}
+			else {
+				if(rows > 65000) {
+					TestUtils.compareMatricesPercentageDistance(d1, d2, 0.01, 0.99, compressionSettings.toString());
+				}
+				else {
+					TestUtils.compareMatricesBitAvgDistance(d1, d2, 10000, 500, compressionSettings.toString());
+				}
+			}
+		}
+		catch(Exception e) {
+			e.printStackTrace();
+			throw new RuntimeException(this.toString() + "\n" + e.getMessage(), e);
+		}
+	}
+
+	@Test
 	public void testTransposeSelfMatrixMult() {
 		try {
 			if(!(cmb instanceof CompressedMatrixBlock))
@@ -413,9 +493,7 @@ public abstract class CompressedTestBase extends TestBase {
 				double[][] d2 = DataConverter.convertToDoubleMatrix(ret2);
 				// High probability that The value is off by some amount
 				if(compressionSettings.lossy) {
-					/**
-					 * Probably the worst thing you can do to increase the amount the values are estimated wrong
-					 */
+					// Probably the worst case for how far the estimated values can drift
 					TestUtils.compareMatricesPercentageDistance(d1, d2, 0.0, 0.8, compressionSettings.toString());
 				}
 				else {
diff --git a/src/test/java/org/apache/sysds/test/component/compress/TestConstants.java b/src/test/java/org/apache/sysds/test/component/compress/TestConstants.java
index a74c138..47895e8 100644
--- a/src/test/java/org/apache/sysds/test/component/compress/TestConstants.java
+++ b/src/test/java/org/apache/sysds/test/component/compress/TestConstants.java
@@ -24,7 +24,7 @@ package org.apache.sysds.test.component.compress;
  */
 public class TestConstants {
 
-	private static final int rows[] = {4, 2008, 1283, 5, 1, 251, 5000, 100000, 3123};
+	private static final int rows[] = {4, 2008, 1283, 5, 1, 251, 5000, 70000, 3123};
 	private static final int cols[] = {20, 20, 13, 998, 321, 1, 8, 10, 1};
 	private static final double[] sparsityValues = {0.9, 0.1, 0.01, 0.0, 1.0};