You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by ba...@apache.org on 2021/06/01 10:43:13 UTC

[systemds] 06/07: [SYSTEMDS-2998] CLA Offset and Mapping Tests

This is an automated email from the ASF dual-hosted git repository.

baunsgaard pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git

commit 9bfd7ff26a4e840805c209b7becc13498f10b61d
Author: baunsgaard <ba...@tugraz.at>
AuthorDate: Mon May 24 12:21:09 2021 +0200

    [SYSTEMDS-2998] CLA Offset and Mapping Tests
    
    Add various compression tests
    
    - InsertionSorterTests
    - OffsetTests
    - MappingTests
    
    Minor bug fixes and better mapping test.
    
    Better compression ratio on SDC with 3 distinct elements, since the
    dictionary contains the number of distinct elements - 1 for SDC. Therefore, if
    the SDC contains 3 distinct values, it only needs 2 distinct identifiers
    in the dictionary.
---
 .../runtime/compress/CompressedMatrixBlock.java    |   86 +-
 .../compress/CompressedMatrixBlockFactory.java     |   27 +-
 .../runtime/compress/CompressionSettings.java      |    6 +-
 .../compress/CompressionSettingsBuilder.java       |   38 +-
 .../runtime/compress/CompressionStatistics.java    |   14 +-
 .../compress/cocode/CoCodeCostMatrixMult.java      |    6 +-
 .../runtime/compress/cocode/CoCodeCostTSMM.java    |  188 +++
 .../runtime/compress/cocode/PlanningCoCoder.java   |   14 +-
 .../sysds/runtime/compress/colgroup/AColGroup.java |  271 +----
 .../compress/colgroup/ColGroupCompressed.java      |   39 +-
 .../runtime/compress/colgroup/ColGroupConst.java   |  306 ++---
 .../runtime/compress/colgroup/ColGroupDDC.java     |  576 +++++-----
 .../runtime/compress/colgroup/ColGroupEmpty.java   |   48 +-
 .../runtime/compress/colgroup/ColGroupFactory.java |   84 +-
 .../runtime/compress/colgroup/ColGroupOLE.java     |  658 +++++------
 .../runtime/compress/colgroup/ColGroupRLE.java     |  630 +++++-----
 .../runtime/compress/colgroup/ColGroupSDC.java     |  701 +++++------
 .../compress/colgroup/ColGroupSDCSingle.java       |  630 +++++-----
 .../compress/colgroup/ColGroupSDCSingleZeros.java  |  377 +++---
 .../compress/colgroup/ColGroupSDCZeros.java        |  473 ++++----
 .../runtime/compress/colgroup/ColGroupSizes.java   |   15 +-
 .../compress/colgroup/ColGroupUncompressed.java    |  298 +++--
 .../runtime/compress/colgroup/ColGroupValue.java   |  716 ++++++------
 .../compress/colgroup/dictionary/ADictionary.java  |  137 ++-
 .../compress/colgroup/dictionary/Dictionary.java   |   77 +-
 .../colgroup/dictionary/DictionaryFactory.java     |  101 +-
 .../colgroup/dictionary/MatrixBlockDictionary.java | 1213 ++++++++++----------
 .../compress/colgroup/dictionary/QDictionary.java  |   36 +-
 .../colgroup/insertionsort/AInsertionSorter.java   |   68 ++
 .../insertionsort/InsertionSorterFactory.java      |   47 +
 .../{tree => insertionsort}/MaterializeSort.java   |   69 +-
 .../{tree => insertionsort}/MergeSort.java         |  104 +-
 .../compress/colgroup/mapping/MapToBit.java        |   24 +-
 .../compress/colgroup/mapping/MapToByte.java       |    4 +-
 .../compress/colgroup/mapping/MapToChar.java       |    4 +-
 .../compress/colgroup/mapping/MapToFactory.java    |   45 +-
 .../compress/colgroup/mapping/MapToInt.java        |    4 +-
 .../runtime/compress/colgroup/offset/AOffset.java  |   15 +-
 .../compress/colgroup/offset/OffsetByte.java       |    6 +-
 .../compress/colgroup/offset/OffsetChar.java       |    7 +-
 .../compress/colgroup/offset/OffsetFactory.java    |   30 +-
 .../compress/colgroup/pre/IPreAggregate.java       |   79 --
 .../compress/colgroup/pre/MapPreAggregate.java     |   62 -
 .../compress/colgroup/pre/PreAggregateFactory.java |   41 -
 .../compress/colgroup/tree/AInsertionSorter.java   |   95 --
 .../colgroup/tree/InsertionSorterFactory.java      |   33 -
 .../runtime/compress/colgroup/tree/Naive.java      |  136 ---
 .../compress/estim/CompressedSizeEstimator.java    |    8 +-
 .../estim/CompressedSizeEstimatorSample.java       |    3 +-
 .../sysds/runtime/compress/lib/BitmapEncoder.java  |    2 +-
 .../runtime/compress/lib/BitmapLossyEncoder.java   |   54 +-
 .../runtime/compress/lib/CLALibBinaryCellOp.java   |   48 +-
 .../runtime/compress/lib/CLALibLeftMultBy.java     |  189 +--
 .../runtime/compress/lib/CLALibRelationalOp.java   |    6 +-
 .../sysds/runtime/compress/lib/CLALibSquash.java   |   13 +-
 .../compress/readers/ReaderColumnSelection.java    |    3 +-
 .../readers/ReaderColumnSelectionBitSet.java       |   16 +-
 .../ReaderColumnSelectionDenseMultiBlock.java      |    8 +-
 ...erColumnSelectionDenseMultiBlockTransposed.java |   12 +-
 .../ReaderColumnSelectionDenseSingleBlock.java     |   11 +-
 ...rColumnSelectionDenseSingleBlockTransposed.java |   18 +-
 .../readers/ReaderColumnSelectionSparse.java       |    4 +-
 .../ReaderColumnSelectionSparseTransposed.java     |    5 +-
 .../sysds/runtime/compress/utils/DblArray.java     |    4 +
 .../runtime/matrix/data/RandomMatrixGenerator.java |    2 +-
 src/test/java/org/apache/sysds/test/TestUtils.java |   33 +-
 .../compress/AbstractCompressedUnaryTests.java     |   13 +-
 .../component/compress/CompressedMatrixTest.java   |  281 ++++-
 .../component/compress/CompressedTestBase.java     |  323 ++++--
 .../component/compress/CompressedVectorTest.java   |   11 +-
 .../compress/ParCompressedMatrixTest.java          |   30 +-
 .../sysds/test/component/compress/TestBase.java    |   86 +-
 .../test/component/compress/TestConstants.java     |   28 +-
 .../compress/colgroup/JolEstimateTest.java         |   35 +-
 .../insertionsort/TestInsertionSorters.java        |  128 +++
 .../compress/insertionsorter/MergeSortTest.java    |  276 -----
 .../component/compress/mapping/MappingTests.java   |  182 +++
 .../compress/offset/OffsetNegativeTests.java       |   90 ++
 .../compress/offset/OffsetSingleTests.java}        |   38 +-
 .../component/compress/offset/OffsetTests.java     |  132 ++-
 80 files changed, 5562 insertions(+), 5168 deletions(-)

diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java
index 27d77ed..aff61b8 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java
@@ -159,15 +159,7 @@ public class CompressedMatrixBlock extends MatrixBlock {
 
 	public CompressedMatrixBlock(CompressedMatrixBlock that) {
 		super(that.getNumRows(), that.getNumColumns(), true);
-		sparseBlock = null;
-		denseBlock = null;
-		nonZeros = that.getNonZeros();
-
-		_colGroups = new ArrayList<>();
-		for(AColGroup cg : that._colGroups)
-			_colGroups.add(cg.copy());
-
-		overlappingColGroups = that.overlappingColGroups;
+		this.copyCompressedMatrix(that);
 	}
 
 	public boolean isSingleUncompressedGroup() {
@@ -175,6 +167,13 @@ public class CompressedMatrixBlock extends MatrixBlock {
 			_colGroups.get(0).getCompType() == CompressionType.UNCOMPRESSED);
 	}
 
+	/**
+	 * Allocate the given column group and remove all references to old column groups.
+	 * 
+	 * This is done by simply allocating a new _colGroups list and adding the given column group
+	 * 
+	 * @param cg The column group to use after.
+	 */
 	public void allocateColGroup(AColGroup cg) {
 		_colGroups = new ArrayList<>(1);
 		_colGroups.add(cg);
@@ -305,7 +304,8 @@ public class CompressedMatrixBlock extends MatrixBlock {
 					ColGroupUncompressed guc = (ColGroupUncompressed) g;
 					MatrixBlock gMB = guc.getData();
 					// Make sure that it is the correct dimensions
-					if(gMB.getNumColumns() == this.getNumColumns() && gMB.getNumRows() == this.getNumRows()) {
+					if(gMB.getNumColumns() == this.getNumColumns() && gMB.getNumRows() == this.getNumRows() &&
+						!gMB.isEmpty() && !gMB.isInSparseFormat()) {
 						_colGroups.remove(i);
 						return gMB;
 					}
@@ -666,20 +666,23 @@ public class CompressedMatrixBlock extends MatrixBlock {
 	@Override
 	public MatrixBlock transposeSelfMatrixMultOperations(MatrixBlock out, MMTSJType tstype, int k) {
 		// check for transpose type
-		if(tstype != MMTSJType.LEFT) // right not supported yet
+		if(tstype == MMTSJType.LEFT) {
+			if(isEmptyBlock()) {
+				return new MatrixBlock(clen, clen, true);
+			}
+			// create output matrix block
+			if(out == null)
+				out = new MatrixBlock(clen, clen, false);
+			else
+				out.reset(clen, clen, false);
+			out.allocateDenseBlock();
+			CLALibLeftMultBy.leftMultByTransposeSelf(_colGroups, out, k, getNumColumns(), getMaxNumValues(),
+				isOverlapping());
+			return out;
+		}
+		else {
 			throw new DMLRuntimeException("Invalid MMTSJ type '" + tstype.toString() + "'.");
-		if(isEmptyBlock())
-			return new MatrixBlock(clen, clen, true);
-		// create output matrix block
-		if(out == null)
-			out = new MatrixBlock(clen, clen, false);
-		else
-			out.reset(clen, clen, false);
-		out.allocateDenseBlock();
-		// compute matrix mult
-		CLALibLeftMultBy.leftMultByTransposeSelf(_colGroups, out, k, getNumColumns(), getMaxNumValues(),
-			isOverlapping());
-		return out;
+		}
 	}
 
 	@Override
@@ -758,7 +761,7 @@ public class CompressedMatrixBlock extends MatrixBlock {
 
 			// decompress row partition
 			for(AColGroup grp : _colGroups)
-				grp.decompressToBlock(_ret, _rl, _ru, false);
+				grp.decompressToBlockUnSafe(_ret, _rl, _ru);
 
 			// post processing (sort due to append)
 			if(_ret.isInSparseFormat())
@@ -771,7 +774,7 @@ public class CompressedMatrixBlock extends MatrixBlock {
 	@Override
 	public String toString() {
 		StringBuilder sb = new StringBuilder();
-		sb.append("\nCompressed Matrix:");
+		sb.append("CompressedMatrixBlock:");
 		sb.append("\nCols:" + getNumColumns() + " Rows:" + getNumRows() + " Overlapping: " + isOverlapping() + " nnz: "
 			+ nonZeros);
 		if(_colGroups != null)
@@ -811,7 +814,7 @@ public class CompressedMatrixBlock extends MatrixBlock {
 			// and it is not inclusive in decompression, and construction of MatrixBlock.
 			tmp = new MatrixBlock(ru + 1 - rl, getNumColumns(), false).allocateDenseBlock();
 			for(AColGroup g : getColGroups())
-				g.decompressToBlock(tmp, rl, ru + 1, 0);
+				g.decompressToBlockUnSafe(tmp, rl, ru + 1, 0);
 			tmp.recomputeNonZeros();
 			return tmp;
 		}
@@ -882,25 +885,25 @@ public class CompressedMatrixBlock extends MatrixBlock {
 
 	@Override
 	public double max() {
-		AggregateUnaryOperator op = InstructionUtils.parseBasicAggregateUnaryOperator("uamax", -1);
+		AggregateUnaryOperator op = InstructionUtils.parseBasicAggregateUnaryOperator("uamax", 1);
 		return aggregateUnaryOperations(op, null, 1000, null).getValue(0, 0);
 	}
 
 	@Override
 	public double min() {
-		AggregateUnaryOperator op = InstructionUtils.parseBasicAggregateUnaryOperator("uamin", -1);
+		AggregateUnaryOperator op = InstructionUtils.parseBasicAggregateUnaryOperator("uamin", 1);
 		return aggregateUnaryOperations(op, null, 1000, null).getValue(0, 0);
 	}
 
 	@Override
 	public double sum() {
-		AggregateUnaryOperator op = InstructionUtils.parseBasicAggregateUnaryOperator("uak+", -1);
+		AggregateUnaryOperator op = InstructionUtils.parseBasicAggregateUnaryOperator("uak+", 1);
 		return aggregateUnaryOperations(op, null, 1000, null).getValue(0, 0);
 	}
 
 	@Override
 	public double sumSq() {
-		AggregateUnaryOperator op = InstructionUtils.parseBasicAggregateUnaryOperator("uasqk+", -1);
+		AggregateUnaryOperator op = InstructionUtils.parseBasicAggregateUnaryOperator("uasqk+", 1);
 		return aggregateUnaryOperations(op, null, 1000, null).getValue(0, 0);
 	}
 
@@ -993,7 +996,7 @@ public class CompressedMatrixBlock extends MatrixBlock {
 		AColGroup grp = _colGroups.get(0);
 		MatrixBlock vals = grp.getValuesAsBlock();
 		if(grp instanceof ColGroupValue) {
-			MatrixBlock counts = getCountsAsBlock( ((ColGroupValue) grp).getCounts());
+			MatrixBlock counts = getCountsAsBlock(((ColGroupValue) grp).getCounts());
 			if(counts.isEmpty())
 				return vals.cmOperations(op);
 			return vals.cmOperations(op, counts);
@@ -1224,6 +1227,7 @@ public class CompressedMatrixBlock extends MatrixBlock {
 
 	@Override
 	public MatrixBlock randOperationsInPlace(RandomMatrixGenerator rgen, Well1024a bigrand, long bSeed) {
+		LOG.info("Inplace rand ops not on CompressedMatrix");
 		MatrixBlock ret = new MatrixBlock(getNumRows(), getNumColumns(), true);
 		LibMatrixDatagen.generateRandomMatrix(ret, rgen, bigrand, bSeed);
 		return ret;
@@ -1231,6 +1235,7 @@ public class CompressedMatrixBlock extends MatrixBlock {
 
 	@Override
 	public MatrixBlock randOperationsInPlace(RandomMatrixGenerator rgen, Well1024a bigrand, long bSeed, int k) {
+		LOG.info("Inplace rand ops not on CompressedMatrix");
 		MatrixBlock ret = new MatrixBlock(getNumRows(), getNumColumns(), true);
 		LibMatrixDatagen.generateRandomMatrix(ret, rgen, bigrand, bSeed, k);
 		return ret;
@@ -1284,37 +1289,36 @@ public class CompressedMatrixBlock extends MatrixBlock {
 
 	@Override
 	public void copy(MatrixValue thatValue) {
-		CompressedMatrixBlock that = checkType(thatValue);
-		if(this == that) // prevent data loss (e.g., on sparse-dense conversion)
-			throw new RuntimeException("Copy must not overwrite itself!");
-
+		copy(thatValue, false);
 	}
 
 	private static CompressedMatrixBlock checkType(MatrixValue thatValue) {
-		if(thatValue == null || !(thatValue instanceof CompressedMatrixBlock)) {
+		if(thatValue == null || !(thatValue instanceof CompressedMatrixBlock))
 			throw new DMLRuntimeException("Invalid call to copy, requre a compressed MatrixBlock to copy to");
-		}
+
 		return (CompressedMatrixBlock) thatValue;
 	}
 
 	@Override
 	public void copy(MatrixValue thatValue, boolean sp) {
 		CompressedMatrixBlock that = checkType(thatValue);
-
+		if(this == that) // prevent data loss (e.g., on sparse-dense conversion)
+			throw new RuntimeException("Copy must not overwrite itself!");
 		copyCompressedMatrix(that);
 	}
 
 	private void copyCompressedMatrix(CompressedMatrixBlock that) {
-		if(this == that) // prevent data loss (e.g., on sparse-dense conversion)
-			throw new RuntimeException("Copy must not overwrite itself!");
 		this.rlen = that.rlen;
 		this.clen = that.clen;
-
+		this.sparseBlock = null;
+		this.denseBlock = null;
 		this.nonZeros = that.getNonZeros();
+
 		this._colGroups = new ArrayList<>();
 		for(AColGroup cg : that._colGroups)
 			_colGroups.add(cg.copy());
 
 		overlappingColGroups = that.overlappingColGroups;
 	}
+
 }
diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java
index 1756018..914a707 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java
@@ -122,9 +122,9 @@ public class CompressedMatrixBlockFactory {
 	 */
 	public static CompressedMatrixBlock createConstant(int numRows, int numCols, double value) {
 		CompressedMatrixBlock block = new CompressedMatrixBlock(numRows, numCols);
-		ColGroupConst cg = ColGroupConst.genColGroupConst(numRows, numCols, value);
+		AColGroup cg = ColGroupFactory.genColGroupConst(numRows, numCols, value);
 		block.allocateColGroup(cg);
-		block.setNonZeros(value == 0.0 ? 0 : numRows * numCols);
+		block.recomputeNonZeros();
 		return block;
 	}
 
@@ -157,13 +157,11 @@ public class CompressedMatrixBlockFactory {
 	private void classifyPhase() {
 		CompressedSizeEstimator sizeEstimator = CompressedSizeEstimatorFactory.getSizeEstimator(mb, compSettings);
 		CompressedSizeInfo sizeInfos = sizeEstimator.computeCompressedSizeInfos(k);
-
-		if(compSettings.investigateEstimate)
-			_stats.estimatedSizeCols = sizeInfos.memoryEstimate();
-
+		_stats.estimatedSizeCols = sizeInfos.memoryEstimate();
 		logPhase();
-
-		if(_stats.estimatedSizeCols < _stats.originalSize || compSettings.columnPartitioner == PartitionerType.COST_MATRIX_MULT)
+		
+		if(_stats.estimatedSizeCols < _stats.originalSize ||
+			compSettings.columnPartitioner == PartitionerType.COST_MATRIX_MULT)
 			coCodePhase(sizeEstimator, sizeInfos, mb.getNumRows());
 		else {
 			LOG.info("Estimated Size of singleColGroups: " + _stats.estimatedSizeCols);
@@ -281,9 +279,9 @@ public class CompressedMatrixBlockFactory {
 		res.cleanupBlock(true, true);
 
 		_stats.size = res.estimateCompressedSizeInMemory();
-		
+
 		final double ratio = _stats.getRatio();
-		if(ratio < 1 && compSettings.columnPartitioner != PartitionerType.COST_MATRIX_MULT)  {
+		if(ratio < 1 && compSettings.columnPartitioner != PartitionerType.COST_MATRIX_MULT) {
 			LOG.info("--dense size:        " + _stats.denseSize);
 			LOG.info("--original size:     " + _stats.originalSize);
 			LOG.info("--compressed size:   " + _stats.size);
@@ -304,7 +302,7 @@ public class CompressedMatrixBlockFactory {
 	private Pair<MatrixBlock, CompressionStatistics> abortCompression() {
 		LOG.warn("Compression aborted at phase: " + phase);
 		if(compSettings.transposed)
-			LibMatrixReorg.transposeInPlace(mb,k);
+			LibMatrixReorg.transposeInPlace(mb, k);
 		return new ImmutablePair<>(mb, _stats);
 	}
 
@@ -346,10 +344,9 @@ public class CompressedMatrixBlockFactory {
 					LOG.debug("--compression ratio: " + _stats.getRatio());
 					int[] lengths = new int[res.getColGroups().size()];
 					int i = 0;
-					for(AColGroup colGroup : res.getColGroups()) {
-						if(colGroup.getValues() != null)
-							lengths[i++] = colGroup.getValues().length / colGroup.getColIndices().length;
-					}
+					for(AColGroup colGroup : res.getColGroups())
+						lengths[i++] = colGroup.getNumValues();
+
 					LOG.debug("--compressed colGroup dictionary sizes: " + Arrays.toString(lengths));
 					if(LOG.isTraceEnabled()) {
 						for(AColGroup colGroup : res.getColGroups()) {
diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java
index 895600d..aeabc92 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java
@@ -73,9 +73,6 @@ public class CompressionSettings {
 	/** If the seed is -1 then the system used system millisecond time and class hash for seeding. */
 	public final int seed;
 
-	/** Boolean specifying if the compression strategy should be investigated and monitored. */
-	public final boolean investigateEstimate;
-
 	/** True if lossy compression is enabled */
 	public final boolean lossy;
 
@@ -103,7 +100,7 @@ public class CompressionSettings {
 	public final int minimumSampleSize;
 
 	protected CompressionSettings(double samplingRatio, boolean allowSharedDictionary, String transposeInput,
-		boolean skipList, int seed, boolean investigateEstimate, boolean lossy,
+		boolean skipList, int seed, boolean lossy,
 		EnumSet<CompressionType> validCompressions, boolean sortValuesByLength, PartitionerType columnPartitioner,
 		int maxColGroupCoCode, double coCodePercentage, int minimumSampleSize) {
 		this.samplingRatio = samplingRatio;
@@ -111,7 +108,6 @@ public class CompressionSettings {
 		this.transposeInput = transposeInput;
 		this.skipList = skipList;
 		this.seed = seed;
-		this.investigateEstimate = investigateEstimate;
 		this.validCompressions = validCompressions;
 		this.lossy = lossy;
 		this.sortValuesByLength = sortValuesByLength;
diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java
index 83d01e5..3ec42a0 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java
@@ -36,12 +36,11 @@ public class CompressionSettingsBuilder {
 	private String transposeInput;
 	private boolean skipList = true;
 	private int seed = -1;
-	private boolean investigateEstimate = true;
 	private boolean lossy = false;
 	private EnumSet<CompressionType> validCompressions;
 	private boolean sortValuesByLength = true;
 	private PartitionerType columnPartitioner;
-	private int maxStaticColGroupCoCode = 10000;
+	private int maxColGroupCoCode = 10000;
 	private double coCodePercentage = 0.01;
 	private int minimumSampleSize = 2000;
 
@@ -74,9 +73,15 @@ public class CompressionSettingsBuilder {
 		this.samplingRatio = that.samplingRatio;
 		this.allowSharedDictionary = that.allowSharedDictionary;
 		this.transposeInput = that.transposeInput;
+		this.skipList = that.skipList;
 		this.seed = that.seed;
-		this.investigateEstimate = that.investigateEstimate;
+		this.lossy = that.lossy;
 		this.validCompressions = EnumSet.copyOf(that.validCompressions);
+		this.sortValuesByLength = that.sortValuesByLength;
+		this.columnPartitioner = that.columnPartitioner;
+		this.maxColGroupCoCode = that.maxColGroupCoCode;
+		this.coCodePercentage = that.coCodePercentage;
+		this.minimumSampleSize = that.minimumSampleSize;
 		return this;
 	}
 
@@ -170,17 +175,6 @@ public class CompressionSettingsBuilder {
 	}
 
 	/**
-	 * Set if the compression should be investigated while compressing.
-	 * 
-	 * @param investigateEstimate A boolean specifying it the input should be estimated.
-	 * @return The CompressionSettingsBuilder
-	 */
-	public CompressionSettingsBuilder setInvestigateEstimate(boolean investigateEstimate) {
-		this.investigateEstimate = investigateEstimate;
-		return this;
-	}
-
-	/**
 	 * Set the valid compression strategies used for the compression.
 	 * 
 	 * @param validCompressions An EnumSet of CompressionTypes to use in the compression
@@ -230,14 +224,14 @@ public class CompressionSettingsBuilder {
 	}
 
 	/**
-	 * Set the maximum number of columns to CoCode together in the static CoCoding strategy. Compression time increase
-	 * with higher numbers.
+	 * Set the maximum number of columns to CoCode together in the CoCoding strategy. Compression time increases with
+	 * higher numbers.
 	 * 
-	 * @param maxStaticColGroupCoCode The max selected.
+	 * @param maxColGroupCoCode The max selected.
 	 * @return The CompressionSettingsBuilder
 	 */
-	public CompressionSettingsBuilder setmaxStaticColGroupCoCode(int maxStaticColGroupCoCode) {
-		this.maxStaticColGroupCoCode = maxStaticColGroupCoCode;
+	public CompressionSettingsBuilder setMaxColGroupCoCode(int maxColGroupCoCode) {
+		this.maxColGroupCoCode = maxColGroupCoCode;
 		return this;
 	}
 
@@ -273,8 +267,8 @@ public class CompressionSettingsBuilder {
 	 * @return The CompressionSettings
 	 */
 	public CompressionSettings create() {
-		return new CompressionSettings(samplingRatio, allowSharedDictionary, transposeInput, skipList, seed,
-			investigateEstimate, lossy, validCompressions, sortValuesByLength, columnPartitioner,
-			maxStaticColGroupCoCode, coCodePercentage, minimumSampleSize);
+		return new CompressionSettings(samplingRatio, allowSharedDictionary, transposeInput, skipList, seed, lossy,
+			validCompressions, sortValuesByLength, columnPartitioner, maxColGroupCoCode, coCodePercentage,
+			minimumSampleSize);
 	}
 }
diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressionStatistics.java b/src/main/java/org/apache/sysds/runtime/compress/CompressionStatistics.java
index 466953a..f35d417 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/CompressionStatistics.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/CompressionStatistics.java
@@ -94,15 +94,21 @@ public class CompressionStatistics {
 	}
 
 	public double getRatio() {
-		return (double) originalSize / size;
+		return size == 0.0 ? Double.POSITIVE_INFINITY : (double) originalSize / size;
 	}
 
 	@Override
 	public String toString() {
 		StringBuilder sb = new StringBuilder();
-		sb.append("Compression Statistics:\n");
-		sb.append("\t" + getGroupsTypesString() + "\n");
-		sb.append("\t" + getGroupsSizesString() + "\n");
+		sb.append("CompressionStatistics:\n");
+		sb.append("Dense Size       : " + denseSize + "\n"); // terminate every field so the report has one entry per line
+		sb.append("Original Size    : " + originalSize + "\n");
+		sb.append("Compressed Size  : " + size + "\n");
+		sb.append("CompressionRatio : " + getRatio() + "\n");
+		if(colGroupCounts != null){
+			sb.append("\t" + getGroupsTypesString() + "\n");
+			sb.append("\t" + getGroupsSizesString() + "\n");
+		}
 		return sb.toString();
 	}
 
diff --git a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCostMatrixMult.java b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCostMatrixMult.java
index 0d39b47..53f3243 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCostMatrixMult.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCostMatrixMult.java
@@ -110,10 +110,10 @@ public class CoCodeCostMatrixMult extends AColumnCoCoder {
 
 			final int numberTuples = elm.getNumVals();
 			final double tupleSparsity = elm.getTupleSparsity();
-			final double postScalingCost = (nCols > 1 && elm.getTupleSparsity() > 0.4) ? numberTuples *
-				nCols : numberTuples * nCols * tupleSparsity;
+			final double postScalingCost = (nCols > 1 && tupleSparsity > 0.4) ? numberTuples * nCols : numberTuples *
+				nCols * tupleSparsity;
 
-			this.cost = preAggregateCost + postScalingCost ;
+			this.cost = preAggregateCost + postScalingCost;
 		}
 
 		@Override
diff --git a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCostTSMM.java b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCostTSMM.java
new file mode 100644
index 0000000..f31c53f
--- /dev/null
+++ b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCostTSMM.java
@@ -0,0 +1,188 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.compress.cocode;
+
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+import java.util.PriorityQueue;
+import java.util.Queue;
+
+import org.apache.sysds.runtime.compress.CompressionSettings;
+import org.apache.sysds.runtime.compress.estim.CompressedSizeEstimator;
+import org.apache.sysds.runtime.compress.estim.CompressedSizeEstimatorSample;
+import org.apache.sysds.runtime.compress.estim.CompressedSizeInfo;
+import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup;
+
+public class CoCodeCostTSMM extends AColumnCoCoder {
+
+	protected CoCodeCostTSMM(CompressedSizeEstimator e, CompressionSettings cs) {
+		super(e, cs);
+	}
+
+	@Override
+	protected CompressedSizeInfo coCodeColumns(CompressedSizeInfo colInfos, int k) {
+
+		List<CompressedSizeInfoColGroup> joinRes = join(colInfos.getInfo());
+
+		if(_cs.samplingRatio < 0.1 && _est instanceof CompressedSizeEstimatorSample) {
+			LOG.debug("Performing second join with double sample rate");
+			CompressedSizeEstimatorSample estS = (CompressedSizeEstimatorSample) _est;
+			estS.sampleData(estS.getSample().getNumRows() * 2);
+			List<int[]> colG = new ArrayList<>(joinRes.size());
+			for(CompressedSizeInfoColGroup g : joinRes)
+				colG.add(g.getColumns());
+
+			joinRes = join(estS.computeCompressedSizeInfos(colG, k));
+		}
+
+		colInfos.setInfo(joinRes);
+
+		return colInfos;
+	}
+
+	private List<CompressedSizeInfoColGroup> join(List<CompressedSizeInfoColGroup> currentGroups) {
+
+		Queue<CompressedSizeInfoColGroup> que = new PriorityQueue<>(currentGroups.size(),
+			new Comparator<CompressedSizeInfoColGroup>() {
+				@Override
+				public int compare(CompressedSizeInfoColGroup a, CompressedSizeInfoColGroup b) {
+					final int aNV = a.getNumVals();
+					final int bNV = b.getNumVals();
+					if(aNV == bNV)
+						return 0;
+					else if(aNV > bNV)
+						return 1;
+					else
+						return -1;
+				}
+			});
+
+		List<CompressedSizeInfoColGroup> ret = new ArrayList<>();
+		for(CompressedSizeInfoColGroup g : currentGroups)
+			que.add(g);
+
+		double currentCost = getCost(que, ret);
+		while(true) {
+			if(que.peek() != null) {
+				final CompressedSizeInfoColGroup l = que.poll();
+				if(que.peek() != null) {
+					final CompressedSizeInfoColGroup r = que.poll();
+					final CompressedSizeInfoColGroup g = joinWithAnalysis(l, r);
+					final double newCost = getCost(que, ret, g);
+					if(newCost < currentCost) {
+						currentCost = newCost;
+						que.add(g);
+					}
+					else {
+						ret.add(l);
+						que.add(r);
+					}
+				}
+				else {
+					ret.add(l);
+					break;
+				}
+			}
+			else
+				break;
+		}
+
+		for(CompressedSizeInfoColGroup g : que)
+			ret.add(g);
+
+		return ret;
+	}
+
+	private double getCost(Queue<CompressedSizeInfoColGroup> que, List<CompressedSizeInfoColGroup> ret) {
+		CompressedSizeInfoColGroup[] queValues = que.toArray(new CompressedSizeInfoColGroup[que.size()]);
+		return getCost(queValues, ret);
+	}
+
+	private double getCost(Queue<CompressedSizeInfoColGroup> que, List<CompressedSizeInfoColGroup> ret,
+		CompressedSizeInfoColGroup g) {
+		CompressedSizeInfoColGroup[] queValues = que.toArray(new CompressedSizeInfoColGroup[que.size()]);
+		double cost = getCost(queValues, ret);
+		cost += getCostOfSelfTSMM(g);
+		for(int i = 0; i < queValues.length; i++)
+			cost += getCostOfLeftTransposedMM(queValues[i], g);
+
+		for(int i = 0; i < ret.size(); i++)
+			cost += getCostOfLeftTransposedMM(ret.get(i), g);
+		return cost;
+	}
+
+	private double getCost(CompressedSizeInfoColGroup[] queValues, List<CompressedSizeInfoColGroup> ret) {
+		double cost = 0;
+		for(int i = 0; i < queValues.length; i++) {
+			cost += getCostOfSelfTSMM(queValues[i]);
+			for(int j = i + 1; j < queValues.length; j++)
+				cost += getCostOfLeftTransposedMM(queValues[i], queValues[j]);
+
+			for(CompressedSizeInfoColGroup g : ret)
+				cost += getCostOfLeftTransposedMM(queValues[i], g);
+
+		}
+		for(int i = 0; i < ret.size(); i++) {
+			cost += getCostOfSelfTSMM(ret.get(i));
+			for(int j = i + 1; j < ret.size(); j++)
+				cost += getCostOfLeftTransposedMM(ret.get(i), ret.get(j));
+
+		}
+		return cost;
+	}
+
+	private double getCostOfSelfTSMM(CompressedSizeInfoColGroup g) {
+		double cost = 0;
+		final int nCol = g.getColumns().length;
+		cost += g.getNumVals() * (nCol * (nCol + 1)) / 2;
+		return cost;
+	}
+	/** Estimated cost of multiplying two column groups: the minimum of the left-side and right-side cost estimates. */
+	private double getCostOfLeftTransposedMM(CompressedSizeInfoColGroup gl, CompressedSizeInfoColGroup gr) {
+		final int nRows = _est.getNumRows();
+		final int nColsL = gl.getColumns().length;
+		final int nColsR = gr.getColumns().length;
+		// Pre-aggregation is currently charged as nRows per side; the alternative estimates are kept commented out below.
+		// final double preAggLeft = (nRows / (1 - gl.getMostCommonFraction())) * nColsL;
+		// final double preAggRight = (nRows / (1 - gr.getMostCommonFraction())) * nColsR;
+
+		final double preAggLeft = nRows;
+		final double preAggRight = nRows;
+
+		final double tsL = gl.getTupleSparsity();
+		final double tsR = gr.getTupleSparsity();
+
+		// final double tsL = 1;
+		// final double tsR = 1;
+
+		final int nvL = gl.getNumVals();
+		final int nvR = gr.getNumVals();
+		// Multi-column groups with tuple sparsity above 0.4 are charged nv * nCols; otherwise the term is scaled by sparsity.
+		final double postScaleLeft = nColsL > 1 && tsL > 0.4 ? nvL * nColsL : nvL * nColsL * tsL;
+		final double postScaleRight = nColsR > 1 && tsR > 0.4 ? nvR * nColsR : nvR * nColsR * tsR;
+
+		final double costLeft = preAggLeft + postScaleLeft * 5;
+		final double costRight = preAggRight + postScaleRight * 5;
+
+		return Math.min(costLeft, costRight);
+	}
+
+}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/cocode/PlanningCoCoder.java b/src/main/java/org/apache/sysds/runtime/compress/cocode/PlanningCoCoder.java
index 3bae0e5..6074d1d 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/cocode/PlanningCoCoder.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/cocode/PlanningCoCoder.java
@@ -41,7 +41,17 @@ public class PlanningCoCoder {
 	 * The Valid coCoding techniques
 	 */
 	public enum PartitionerType {
-		BIN_PACKING, STATIC, COST, COST_MATRIX_MULT;
+		BIN_PACKING, STATIC, COST, COST_MATRIX_MULT, COST_TSMM;
+
+		public static boolean isCostBased(PartitionerType pt) {
+			// Cost based here means one of the two matrix multiplication
+			// partitioners; COST itself is reported as false. The equals
+			// calls are made on pt so that a null argument still raises
+			// a NullPointerException.
+			final boolean isMatrixMult = pt.equals(COST_MATRIX_MULT);
+			final boolean isTsmm = pt.equals(COST_TSMM);
+			return isMatrixMult || isTsmm;
+		}
 	}
 
 	/**
@@ -100,6 +110,8 @@ public class PlanningCoCoder {
 				return new CoCodeCost(est, cs);
 			case COST_MATRIX_MULT:
 				return new CoCodeCostMatrixMult(est, cs);
+			case COST_TSMM:
+				return new CoCodeCostTSMM(est, cs);
 			default:
 				throw new RuntimeException("Unsupported column group partitioner: " + type.toString());
 		}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java
index c1daeb9..0e68d0a 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java
@@ -23,19 +23,17 @@ import java.io.DataInput;
 import java.io.DataOutput;
 import java.io.IOException;
 import java.io.Serializable;
+import java.util.Arrays;
 import java.util.List;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
-import org.apache.sysds.runtime.DMLRuntimeException;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.apache.sysds.runtime.matrix.operators.AggregateUnaryOperator;
 import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
 import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
 import org.apache.sysds.utils.MemoryEstimates;
 
-import edu.emory.mathcs.backport.java.util.Arrays;
-
 /**
  * Abstract Class that is the lowest class type for the Compression framework.
  * 
@@ -58,34 +56,6 @@ public abstract class AColGroup implements Serializable {
 	 */
 	protected enum ColGroupType {
 		UNCOMPRESSED, RLE, OLE, DDC, CONST, EMPTY, SDC, SDCSingle, SDCSingleZeros, SDCZeros;
-
-		/**
-		 * Get the super type of the specific ColGroup Type used.
-		 * 
-		 * @param c The concrete ColGroupType
-		 * @return The super CompressionType.
-		 */
-		public static CompressionType getSuperType(ColGroupType c) {
-			switch(c) {
-				case RLE:
-					return CompressionType.RLE;
-				case OLE:
-					return CompressionType.OLE;
-				case DDC:
-					return CompressionType.DDC;
-				case CONST:
-					return CompressionType.CONST;
-				case EMPTY:
-					return CompressionType.EMPTY;
-				case SDC:
-				case SDCSingle:
-				case SDCSingleZeros:
-				case SDCZeros:
-					return CompressionType.SDC;
-				default:
-					return CompressionType.UNCOMPRESSED;
-			}
-		}
 	}
 
 	/** The ColGroup Indexes contained in the ColGroup */
@@ -101,10 +71,6 @@ public abstract class AColGroup implements Serializable {
 	 * @param colIndices offsets of the columns in the matrix block that make up the group
 	 */
 	protected AColGroup(int[] colIndices) {
-		if(colIndices == null)
-			throw new DMLRuntimeException("null input to ColGroup is invalid");
-		if(colIndices.length == 0)
-			throw new DMLRuntimeException("0 is an invalid number of columns in a ColGroup");
 		_colIndexes = colIndices;
 	}
 
@@ -118,16 +84,6 @@ public abstract class AColGroup implements Serializable {
 	}
 
 	/**
-	 * Obtain a column index value.
-	 * 
-	 * @param colNum column number
-	 * @return column index value
-	 */
-	public int getColIndex(int colNum) {
-		return _colIndexes[colNum];
-	}
-
-	/**
 	 * Set the column indexes of the column group.
 	 * 
 	 * @param colIndexes
@@ -183,7 +139,7 @@ public abstract class AColGroup implements Serializable {
 	 * 
 	 * @param offset The offset to move all columns
 	 */
-	public void shiftColIndices(int offset) {
+	public final void shiftColIndices(int offset) {
 		for(int i = 0; i < _colIndexes.length; i++)
 			_colIndexes[i] += offset;
 	}
@@ -193,81 +149,33 @@ public abstract class AColGroup implements Serializable {
 	 * 
 	 * @return an upper bound on the number of bytes used to store this ColGroup in memory.
 	 */
-	public long estimateInMemorySize(){
+	public long estimateInMemorySize() {
 		long size = 16; // object header
 		size += MemoryEstimates.intArrayCost(_colIndexes.length);
 		return size;
 	}
 
 	/**
-	 * Decompress the contents of this column group into the specified full matrix block.
+	 * Decompress the contents of this column group into the specified full matrix block while managing the number of
+	 * non zeros.
 	 * 
 	 * @param target a matrix block where the columns covered by this column group have not yet been filled in.
 	 * @param rl     row lower
 	 * @param ru     row upper
+	 * @param offT   Offset into target to assign from
 	 */
-	public void decompressToBlock(MatrixBlock target, int rl, int ru) {
-		decompressToBlock(target, rl, ru, rl, true);
-	}
-
-	/**
-	 * Decompress the contents of this column group into the specified full matrix block.
-	 * 
-	 * @param target a matrix block where the columns covered by this column group have not yet been filled in.
-	 * @param rl     The row to start at
-	 * @param ru     The row to end at
-	 * @param offT   The rowOffset into target to decompress to.
-	 */
-	public void decompressToBlock(MatrixBlock target, int rl, int ru, int offT) {
-		decompressToBlock(target, rl, ru, offT, true);
-	}
-
-
-	/**
-	 * Decompress the contents of this column group into the target matrixBlock, it is assumed that the target matrix
-	 * Block have the same number of columns and at least the number of rows ru.
-	 * 
-	 * @param target The target matrixBlock to decompress into
-	 * @param rl     The row to start at
-	 * @param ru     The row to end at
-	 * @param safe   Boolean specifying if the operation should be safe, aka counting nnz.
-	 */
-	public void decompressToBlock(MatrixBlock target, int rl, int ru, boolean safe) {
-		decompressToBlock(target, rl, ru, rl, safe);
-	}
-
-
-	/**
-	 * Decompress the contents of this column group into the target matrixBlock with an offset of the indexes using the
-	 * values provided as replacement of the dictionary values, it is assumed that the target matrix Block have the same
-	 * number of columns and at least the number of rows ru.
-	 * 
-	 * The offset of indexes makes it possible to decompress parts of the compressed column group like say rows 10 to
-	 * 20, into row 0 to 10 in the target matrix.
-	 * 
-	 * @param target The target matrixBlock to decompress into
-	 * @param rl     The row to start at
-	 * @param ru     The row to end at
-	 * @param offT   The offset into the target to decompress to.
-	 * @param safe   Boolean specifying if the operation should be safe, aka counting nnz.
-	 */
-	public void decompressToBlock(MatrixBlock target, int rl, int ru, int offT,  boolean safe) {
-		if(safe)
-			decompressToBlockSafe(target, rl, ru, offT);
-		else
-			decompressToBlockUnSafe(target, rl, ru, offT);
-	}
+	public abstract void decompressToBlockSafe(MatrixBlock target, int rl, int ru, int offT);
 
 	/**
-	 * Decompress the contents of this column group into the specified full matrix block while managing the number of
-	 * non zeros.
+	 * Decompress the contents of the columngroup unsafely, meaning that it does not count nonzero values.
 	 * 
 	 * @param target a matrix block where the columns covered by this column group have not yet been filled in.
 	 * @param rl     row lower
 	 * @param ru     row upper
-	 * @param offT   Offset into target to assign from
 	 */
-	public abstract void decompressToBlockSafe(MatrixBlock target, int rl, int ru, int offT);
+	public void decompressToBlockUnSafe(MatrixBlock target, int rl, int ru) {
+		decompressToBlockUnSafe(target, rl, ru, rl);
+	}
 
 	/**
 	 * Decompress the contents of the columngroup unsafely, meaning that it does not count nonzero values.
@@ -279,136 +187,31 @@ public abstract class AColGroup implements Serializable {
 	 */
 	public abstract void decompressToBlockUnSafe(MatrixBlock target, int rl, int ru, int offT);
 
-	/**
-	 * Decompress the contents of this column group into uncompressed packed columns
-	 * 
-	 * @param target          a dense matrix block. The block must have enough space to hold the contents of this column
-	 *                        group.
-	 * @param colIndexTargets array that maps column indices in the original matrix block to columns of target.
-	 */
-	public abstract void decompressToBlock(MatrixBlock target, int[] colIndexTargets);
-
-	/**
-	 * Decompress an entire column into the target matrix block. This decompression maintain the number of non zeros.
-	 * This method assumes that the Matrix block that is decompressed into has a column for the values to decompress
-	 * into.
-	 * 
-	 * @param target    Target matrix block to decompress into.
-	 * @param colIndex  The column index to decompress.
-	 * @param colGroups The list of column groups to decompress.
-	 */
-	public static void decompressColumnToBlock(MatrixBlock target, int colIndex, List<AColGroup> colGroups) {
-		for(AColGroup g : colGroups) {
-			int groupColIndex = Arrays.binarySearch(g._colIndexes, colIndex);
-			if(groupColIndex >= 0) {
-				g.decompressColumnToBlock(target, groupColIndex);
-			}
-		}
-	}
-
-	/**
-	 * Find all column groups with the given index and decompress them into the target double array summing the values.
-	 * 
-	 * If the column is not found nothing is decompressed.
-	 * 
-	 * @param target    The target column array to decompress into
-	 * @param colIndex  The Column index to find in the list of column groups
-	 * @param colGroups The column Groups to search in.
-	 */
-	public static void decompressColumnToArray(double[] target, int colIndex, List<AColGroup> colGroups) {
-		for(AColGroup g : colGroups) {
-			int groupColIndex = Arrays.binarySearch(g._colIndexes, colIndex);
-			if(groupColIndex >= 0) {
-				g.decompressColumnToBlock(target, groupColIndex, 0, g.getNumRows());
-			}
-		}
-	}
+	// /**
+	//  * Decompress the contents of this column group into uncompressed packed columns
+	//  * 
+	//  * @param target          a dense matrix block. The block must have enough space to hold the contents of this column
+	//  *                        group.
+	//  * @param colIndexTargets array that maps column indices in the original matrix block to columns of target.
+	//  */
+	// public abstract void decompressToBlock(MatrixBlock target, int[] colIndexTargets);
 
 	/**
 	 * Decompress part of the col groups into the target matrix block, this decompression maintain the number of non
 	 * zeros.
 	 * 
 	 * @param target    The Target matrix block to decompress into
-	 * @param colIndex  The column index to decompress.
 	 * @param rl        The row to start the decompression from
 	 * @param ru        The row to end the decompression at
 	 * @param colGroups The list of column groups to decompress.
 	 */
-	public static void decompressColumnToBlock(MatrixBlock target, int colIndex, int rl, int ru,
+	public final static void decompressColumnToBlockUnSafe(MatrixBlock target, int rl, int ru,
 		List<AColGroup> colGroups) {
-		for(AColGroup g : colGroups) {
-			int groupColIndex = Arrays.binarySearch(g._colIndexes, colIndex);
-			if(groupColIndex >= 0) {
-				g.decompressColumnToBlock(target, groupColIndex, rl, ru);
-			}
-		}
-	}
-
-	/**
-	 * Decompress part of the col groups into the target matrix block, this decompression maintain the number of non
-	 * zeros.
-	 * 
-	 * @param target    The Target matrix block to decompress into
-	 * @param rl        The row to start the decompression from
-	 * @param ru        The row to end the decompression at
-	 * @param colGroups The list of column groups to decompress.
-	 */
-	public static void decompressColumnToBlockUnSafe(MatrixBlock target, int rl, int ru, List<AColGroup> colGroups) {
 		for(AColGroup g : colGroups)
 			g.decompressToBlockUnSafe(target, rl, ru, rl);
 	}
 
 	/**
-	 * Decompress part of the col groups into the target dense double array. This assumes that the double array is a row
-	 * linearized matrix double array.
-	 * 
-	 * This is much faster than decompressing into a target matrix block since nnz is not managed.
-	 * 
-	 * @param target    Target double array to decompress into
-	 * @param colIndex  The column index to decompress.
-	 * @param rl        The row to start decompression from
-	 * @param ru        The row to end the decompression at
-	 * @param colGroups The list of column groups to decompress.
-	 */
-	public static void decompressColumnToBlock(double[] target, int colIndex, int rl, int ru,
-		List<AColGroup> colGroups) {
-		for(AColGroup g : colGroups) {
-			int groupColIndex = Arrays.binarySearch(g._colIndexes, colIndex);
-			if(groupColIndex >= 0) {
-				g.decompressColumnToBlock(target, groupColIndex, rl, ru);
-			}
-		}
-	}
-
-	/**
-	 * Decompress to block.
-	 * 
-	 * @param target dense output vector
-	 * @param colpos column to decompress, error if larger or equal numCols
-	 */
-	public abstract void decompressColumnToBlock(MatrixBlock target, int colpos);
-
-	/**
-	 * Decompress to block.
-	 * 
-	 * @param target dense output vector
-	 * @param colpos column to decompress, error if larger or equal numCols
-	 * @param rl     the Row to start decompression from
-	 * @param ru     the Row to end decompression at
-	 */
-	public abstract void decompressColumnToBlock(MatrixBlock target, int colpos, int rl, int ru);
-
-	/**
-	 * Decompress to dense array.
-	 * 
-	 * @param target dense output vector double array.
-	 * @param colpos column to decompress, error if larger or equal numCols
-	 * @param rl     the Row to start decompression from
-	 * @param ru     the Row to end decompression at
-	 */
-	public abstract void decompressColumnToBlock(double[] target, int colpos, int rl, int ru);
-
-	/**
 	 * Serializes column group to data output.
 	 * 
 	 * @param out data output
@@ -477,7 +280,7 @@ public abstract class AColGroup implements Serializable {
 	public abstract AColGroup rightMultByMatrix(MatrixBlock right);
 
 	/**
-	 * Do a transposed self matrix multiplication, but only with this column group.
+	 * Do a transposed self matrix multiplication on the left side, t(x) %*% x, but only with this column group.
 	 * 
 	 * This gives better performance since there is no need to iterate through all the rows of the matrix, but the
 	 * execution can be limited to its number of distinct values.
@@ -490,13 +293,29 @@ public abstract class AColGroup implements Serializable {
 	public abstract void tsmm(double[] result, int numColumns);
 
 	/**
+	 * Do a transposed self matrix multiplication on the left side, t(x) %*% x, but only with this column group, and
+	 * only on a subset of the columns contained in this column group.
+	 * 
+	 * This gives better performance since there is no need to iterate through all the rows of the matrix, but the
+	 * execution can be limited to its number of distinct values.
+	 * 
+	 * Note that it only calculates the upper triangle.
+	 * 
+	 * @param result     A row major dense allocation of a matrixBlock, of size [numColumns x numColumns]
+	 * @param numColumns The number of columns in the row major result matrix.
+	 * @param idxStart   The starting index in the _colIndexes.
+	 * @param idxEnd     The ending index in the _colIndexes.
+	 */
+	public abstract void tsmm(double[] result, int numColumns, int idxStart, int idxEnd);
+
+	/**
 	 * Left multiply with this column group
 	 * 
 	 * @param matrix The matrix to multiply with on the left
 	 * @param result The result to output the values into, always dense for the purpose of the column groups
 	 *               parallelizing
 	 */
-	public void leftMultByMatrix(MatrixBlock matrix, MatrixBlock result) {
+	public final void leftMultByMatrix(MatrixBlock matrix, MatrixBlock result) {
 		leftMultByMatrix(matrix, result, 0, matrix.getNumRows());
 	}
 
@@ -570,14 +389,6 @@ public abstract class AColGroup implements Serializable {
 	public abstract void countNonZerosPerRow(int[] rnnz, int rl, int ru);
 
 	/**
-	 * Base class for column group row iterators. We do not implement the default Iterator interface in order to avoid
-	 * unnecessary value copies per group.
-	 */
-	protected abstract class ColGroupRowIterator {
-		public abstract void next(double[] buff, int rowIx, int segIx, boolean last);
-	}
-
-	/**
 	 * Is Lossy
 	 * 
 	 * @return returns if the ColGroup is compressed in a lossy manner.
@@ -603,7 +414,7 @@ public abstract class AColGroup implements Serializable {
 	 * @return A cloned Column Group, with a copied pointer to the old column groups index structure, but reduced
 	 *         dictionary and _columnIndexes correctly aligned with the expected sliced compressed matrix.
 	 */
-	public AColGroup sliceColumns(int cl, int cu) {
+	public final AColGroup sliceColumns(int cl, int cu) {
 		AColGroup ret = (cu - cl == 1) ? sliceColumn(cl) : sliceMultiColumns(cl, cu);
 		return ret;
 	}
@@ -615,7 +426,7 @@ public abstract class AColGroup implements Serializable {
 	 * @return A new column group that is a single column, if the column requested is not in this column group null is
 	 *         returned.
 	 */
-	public AColGroup sliceColumn(int col) {
+	public final AColGroup sliceColumn(int col) {
 		int idx = Arrays.binarySearch(_colIndexes, col);
 		if(idx >= 0)
 			return sliceSingleColumn(idx);
@@ -631,7 +442,7 @@ public abstract class AColGroup implements Serializable {
 	 * @return A column group of this containing the columns specified, returns null if the columns specified is not
 	 *         contained in the column group
 	 */
-	protected AColGroup sliceMultiColumns(int cl, int cu) {
+	protected final AColGroup sliceMultiColumns(int cl, int cu) {
 		int idStart = 0;
 		int idEnd = 0;
 		for(int i = 0; i < _colIndexes.length; i++) {
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupCompressed.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupCompressed.java
index 3b598e6..a643c45 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupCompressed.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupCompressed.java
@@ -24,7 +24,6 @@ import org.apache.sysds.runtime.functionobjects.Builtin;
 import org.apache.sysds.runtime.functionobjects.Builtin.BuiltinCode;
 import org.apache.sysds.runtime.functionobjects.KahanPlus;
 import org.apache.sysds.runtime.functionobjects.KahanPlusSq;
-import org.apache.sysds.runtime.functionobjects.Mean;
 import org.apache.sysds.runtime.functionobjects.Plus;
 import org.apache.sysds.runtime.functionobjects.ReduceAll;
 import org.apache.sysds.runtime.functionobjects.ReduceCol;
@@ -64,20 +63,13 @@ public abstract class ColGroupCompressed extends AColGroup {
 
 	public abstract boolean isLossy();
 
-	/**
-	 * if -1 is returned it means false, otherwise it returns an index where the zero tuple can be found.
-	 * 
-	 * @return A Index where the zero tuple can be found.
-	 */
-	protected abstract int containsAllZeroTuple();
-
 	protected abstract double computeMxx(double c, Builtin builtin);
 
 	protected abstract void computeColMxx(double[] c, Builtin builtin);
 
 	protected abstract void computeSum(double[] c, boolean square);
 
-	protected abstract void computeRowSums(double[] c, boolean square, int rl, int ru, boolean mean);
+	protected abstract void computeRowSums(double[] c, boolean square, int rl, int ru);
 
 	protected abstract void computeColSums(double[] c, boolean square);
 
@@ -102,30 +94,31 @@ public abstract class ColGroupCompressed extends AColGroup {
 
 	@Override
 	public final void unaryAggregateOperations(AggregateUnaryOperator op, double[] c, int rl, int ru) {
-		// sum and sumsq (reduceall/reducerow over tuples and counts)
 		if(op.aggOp.increOp.fn instanceof Plus || op.aggOp.increOp.fn instanceof KahanPlus ||
 			op.aggOp.increOp.fn instanceof KahanPlusSq) {
 			boolean square = op.aggOp.increOp.fn instanceof KahanPlusSq;
-			boolean mean = op.aggOp.increOp.fn instanceof Mean;
 			if(op.indexFn instanceof ReduceAll)
 				computeSum(c, square);
 			else if(op.indexFn instanceof ReduceCol)
-				computeRowSums(c, square, rl, ru, mean);
+				computeRowSums(c, square, rl, ru);
 			else if(op.indexFn instanceof ReduceRow)
 				computeColSums(c, square);
 		}
-		// min and max (reduceall/reducerow over tuples only)
-		else if(op.aggOp.increOp.fn instanceof Builtin &&
-			(((Builtin) op.aggOp.increOp.fn).getBuiltinCode() == BuiltinCode.MAX ||
-				((Builtin) op.aggOp.increOp.fn).getBuiltinCode() == BuiltinCode.MIN)) {
-			Builtin builtin = (Builtin) op.aggOp.increOp.fn;
+		else if(op.aggOp.increOp.fn instanceof Builtin) {
+			Builtin bop = (Builtin) op.aggOp.increOp.fn;
+			BuiltinCode bopC = bop.getBuiltinCode();
+			if(bopC == BuiltinCode.MAX || bopC == BuiltinCode.MIN) {
+				if(op.indexFn instanceof ReduceAll)
+					c[0] = computeMxx(c[0], bop);
+				else if(op.indexFn instanceof ReduceCol)
+					computeRowMxx(c, bop, rl, ru);
+				else if(op.indexFn instanceof ReduceRow)
+					computeColMxx(c, bop);
+			}
+			else {
+				throw new DMLScriptException("unsupported builtin type: " + bop);
+			}
 
-			if(op.indexFn instanceof ReduceAll)
-				c[0] = computeMxx(c[0], builtin);
-			else if(op.indexFn instanceof ReduceCol)
-				computeRowMxx(c, builtin, rl, ru);
-			else if(op.indexFn instanceof ReduceRow)
-				computeColMxx(c, builtin);
 		}
 		else {
 			throw new DMLScriptException("Unknown UnaryAggregate operator on CompressedMatrixBlock");
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConst.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConst.java
index d439c4e..019c6e1 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConst.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConst.java
@@ -25,8 +25,6 @@ import org.apache.commons.lang.NotImplementedException;
 import org.apache.sysds.runtime.DMLCompressionException;
 import org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary;
 import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary;
-import org.apache.sysds.runtime.compress.colgroup.pre.ArrPreAggregate;
-import org.apache.sysds.runtime.compress.colgroup.pre.IPreAggregate;
 import org.apache.sysds.runtime.data.SparseBlock;
 import org.apache.sysds.runtime.functionobjects.Builtin;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
@@ -46,20 +44,6 @@ public class ColGroupConst extends ColGroupValue {
 		super(numRows);
 	}
 
-	public static ColGroupConst genColGroupConst(int numRows, int numCols, double value) {
-
-		int[] colIndices = new int[numCols];
-		for(int i = 0; i < numCols; i++)
-			colIndices[i] = i;
-
-		double[] values = new double[numCols];
-		for(int i = 0; i < numCols; i++)
-			values[i] = value;
-
-		ADictionary dict = new Dictionary(values);
-		return new ColGroupConst(colIndices, numRows, dict);
-	}
-
 	/**
 	 * Constructs an Constant Colum Group, that contains only one tuple, with the given value.
 	 * 
@@ -84,18 +68,13 @@ public class ColGroupConst extends ColGroupValue {
 	}
 
 	@Override
-	protected void computeRowSums(double[] c, boolean square, int rl, int ru, boolean mean) {
+	protected void computeRowSums(double[] c, boolean square, int rl, int ru) {
 		double vals = _dict.sumAllRowsToDouble(square, _colIndexes.length)[0];
 		for(int rix = rl; rix < ru; rix++)
 			c[rix] += vals;
 	}
 
 	@Override
-	protected void computeColSums(double[] c, boolean square) {
-		_dict.colSum(c, getCounts(), _colIndexes, square);
-	}
-
-	@Override
 	protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru) {
 		double value = _dict.aggregateTuples(builtin, _colIndexes.length)[0];
 		for(int i = rl; i < ru; i++)
@@ -113,15 +92,9 @@ public class ColGroupConst extends ColGroupValue {
 	}
 
 	@Override
-	public void decompressToBlockSafe(MatrixBlock target, int rl, int ru, int offT) {
-		decompressToBlockUnSafe(target, rl, ru, offT);
-		target.setNonZeros(_colIndexes.length * target.getNumRows() + target.getNonZeros());
-	}
-
-	@Override
-	public void decompressToBlockUnSafe(MatrixBlock target, int rl, int ru, int offT) {
+	protected void decompressToBlockUnSafeDenseDictionary(MatrixBlock target, int rl, int ru, int offT,
+		double[] values) {
 		double[] c = target.getDenseBlockValues();
-		double[] values = getValues();
 		offT = offT * target.getNumColumns();
 		for(int i = rl; i < ru; i++, offT += target.getNumColumns())
 			for(int j = 0; j < _colIndexes.length; j++)
@@ -129,113 +102,156 @@ public class ColGroupConst extends ColGroupValue {
 	}
 
 	@Override
-	public void decompressToBlock(MatrixBlock target, int[] colIndexTargets) {
-		int ncol = getNumCols();
-		double[] values = getValues();
-		for(int i = 0; i < _numRows; i++)
-			for(int colIx = 0; colIx < ncol; colIx++) {
-				int origMatrixColIx = getColIndex(colIx);
-				int col = colIndexTargets[origMatrixColIx];
-				double cellVal = values[colIx];
-				target.quickSetValue(i, col, target.quickGetValue(i, col) + cellVal);
-			}
-
+	protected void decompressToBlockUnSafeSparseDictionary(MatrixBlock target, int rl, int ru, int offT,
+		SparseBlock values) {
+		throw new NotImplementedException();
 	}
 
-	@Override
-	public void decompressColumnToBlock(MatrixBlock target, int colPos) {
-		double[] c = target.getDenseBlockValues();
-		double v = _dict.getValue(colPos);
-		if(v != 0)
-			for(int i = 0; i < c.length; i++)
-				c[i] += v;
+	// @Override
+	// public void decompressToBlock(MatrixBlock target, int[] colIndexTargets) {
+	// 	int ncol = getNumCols();
+	// 	double[] values = getValues();
+	// 	for(int i = 0; i < _numRows; i++)
+	// 		for(int colIx = 0; colIx < ncol; colIx++) {
+	// 			int origMatrixColIx = _colIndexes[colIx];
+	// 			int col = colIndexTargets[origMatrixColIx];
+	// 			double cellVal = values[colIx];
+	// 			target.quickSetValue(i, col, target.quickGetValue(i, col) + cellVal);
+	// 		}
 
-		target.setNonZeros(_numRows);
+	// }
 
-	}
+	// @Override
+	// public void decompressColumnToBlock(MatrixBlock target, int colPos) {
+	// 	double[] c = target.getDenseBlockValues();
+	// 	double v = _dict.getValue(colPos);
+	// 	if(v != 0)
+	// 		for(int i = 0; i < c.length; i++)
+	// 			c[i] += v;
 
-	@Override
-	public void decompressColumnToBlock(MatrixBlock target, int colPos, int rl, int ru) {
-		double[] c = target.getDenseBlockValues();
-		double v = _dict.getValue(colPos);
-		final int length = ru - rl;
-		if(v != 0)
-			for(int i = 0; i < length; i++)
-				c[i] += v;
+	// 	target.setNonZeros(_numRows);
 
-		target.setNonZeros(_numRows);
-	}
+	// }
 
-	@Override
-	public void decompressColumnToBlock(double[] c, int colPos, int rl, int ru) {
-		double v = _dict.getValue(colPos);
-		final int length = ru - rl;
-		if(v != 0)
-			for(int i = 0; i < length; i++)
-				c[i] += v;
+	// @Override
+	// public void decompressColumnToBlock(MatrixBlock target, int colPos, int rl, int ru) {
+	// 	double[] c = target.getDenseBlockValues();
+	// 	double v = _dict.getValue(colPos);
+	// 	final int length = ru - rl;
+	// 	if(v != 0)
+	// 		for(int i = 0; i < length; i++)
+	// 			c[i] += v;
 
-	}
+	// 	target.setNonZeros(_numRows);
+	// }
+
+	// @Override
+	// public void decompressColumnToBlock(double[] c, int colPos, int rl, int ru) {
+	// 	double v = _dict.getValue(colPos);
+	// 	final int length = ru - rl;
+	// 	if(v != 0)
+	// 		for(int i = 0; i < length; i++)
+	// 			c[i] += v;
+
+	// }
 
 	@Override
 	public double get(int r, int c) {
 		return _dict.getValue(Arrays.binarySearch(_colIndexes, c));
 	}
 
-	public double[] preAggregate(double[] a, int row) {
-		return new double[] {preAggregateSingle(a, row)};
-	}
+	// @Override
+	// public double[] preAggregate(double[] a, int row) {
+	// return new double[] {preAggregateSingle(a, row)};
+	// }
 
-	public double[] preAggregateSparse(SparseBlock sb, int row) {
-		return new double[] {preAggregateSparseSingle(sb, row)};
-	}
+	// @Override
+	// public double[] preAggregateSparse(SparseBlock sb, int row) {
+	// return new double[] {preAggregateSparseSingle(sb, row)};
+	// }
 
-	public double preAggregateSparseSingle(SparseBlock sb, int row) {
-		double v = 0;
-		double[] sparseV = sb.values(row);
-		for(int i = sb.pos(row); i < sb.pos(row) + sb.size(row); i++) {
-			v += sparseV[i];
-		}
-		return v;
-	}
-
-	private double preAggregateSingle(double[] a, int row) {
-		double vals = 0;
-		for(int off = _numRows * row; off < _numRows * row + _numRows; off++)
-			vals += a[off];
-		return vals;
+	@Override
+	protected void preAggregate(MatrixBlock m, MatrixBlock preAgg, int rl, int ru) {
+		if(m.isInSparseFormat())
+			preAggregateSparse(m.getSparseBlock(), preAgg, rl, ru);
+		else
+			preAggregateDense(m, preAgg, rl, ru);
 	}
 
-	@Override
-	public void leftMultByMatrix(MatrixBlock a, MatrixBlock c, int rl, int ru) {
-		final double[] cV = c.getDenseBlockValues();
-		final double[] values = getValues();
-		if(values == null  || a.isEmpty())
-			return;
-		else if(a.isInSparseFormat()) {
-			SparseBlock sb = a.getSparseBlock();
-			for(int i = rl; i < ru; i++) {
-
-				if(!sb.isEmpty(i)) {
-					double v = preAggregateSparseSingle(sb, i);
-					int offC = i * c.getNumColumns();
-					for(int j = 0; j < _colIndexes.length; j++)
-						cV[offC + _colIndexes[j]] += v * values[j];
-
-				}
+	private void preAggregateDense(MatrixBlock m, MatrixBlock preAgg, int rl, int ru) {
+		final double[] preAV = preAgg.getDenseBlockValues();
+		final double[] mV = m.getDenseBlockValues();
+		final int numVals = getNumValues();
+		for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += numVals) {
+			for(int rc = 0, offLeft = rowLeft * _numRows; rc < _numRows; rc++, offLeft++) {
+				preAV[offOut] += mV[offLeft];
 			}
 		}
-		else {
-			double[] aV = a.getDenseBlockValues();
-			for(int i = rl; i < ru; i++) {
-				double preAggVals = preAggregateSingle(aV, i);
-				int offC = i * c.getNumColumns();
-				for(int j = 0; j < _colIndexes.length; j++)
-					cV[offC + _colIndexes[j]] += preAggVals * values[j];
+	}
 
+	private void preAggregateSparse(SparseBlock sb, MatrixBlock preAgg, int rl, int ru) {
+		final double[] preAV = preAgg.getDenseBlockValues();
+		final int numVals = getNumValues();
+		for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += numVals) {
+			if(sb.isEmpty(rowLeft))
+				continue;
+			final int apos = sb.pos(rowLeft);
+			final int alen = sb.size(rowLeft) + apos;
+			final double[] avals = sb.values(rowLeft);
+			for(int j = apos; j < alen; j++) {
+				preAV[offOut] += avals[j];
 			}
 		}
 	}
 
+
+	// public double preAggregateSparseSingle(SparseBlock sb, int row) {
+	// double v = 0;
+	// double[] sparseV = sb.values(row);
+	// for(int i = sb.pos(row); i < sb.pos(row) + sb.size(row); i++) {
+	// v += sparseV[i];
+	// }
+	// return v;
+	// }
+
+	// private double preAggregateSingle(double[] a, int row) {
+	// double vals = 0;
+	// for(int off = _numRows * row; off < _numRows * row + _numRows; off++)
+	// vals += a[off];
+	// return vals;
+	// }
+
+	// @Override
+	// public void leftMultByMatrix(MatrixBlock a, MatrixBlock c, int rl, int ru) {
+	// final double[] cV = c.getDenseBlockValues();
+	// final double[] values = getValues();
+	// if(values == null || a.isEmpty())
+	// return;
+	// else if(a.isInSparseFormat()) {
+	// SparseBlock sb = a.getSparseBlock();
+	// for(int i = rl; i < ru; i++) {
+
+	// if(!sb.isEmpty(i)) {
+	// double v = preAggregateSparseSingle(sb, i);
+	// int offC = i * c.getNumColumns();
+	// for(int j = 0; j < _colIndexes.length; j++)
+	// cV[offC + _colIndexes[j]] += v * values[j];
+
+	// }
+	// }
+	// }
+	// else {
+	// double[] aV = a.getDenseBlockValues();
+	// for(int i = rl; i < ru; i++) {
+	// double preAggVals = preAggregateSingle(aV, i);
+	// int offC = i * c.getNumColumns();
+	// for(int j = 0; j < _colIndexes.length; j++)
+	// cV[offC + _colIndexes[j]] += preAggVals * values[j];
+
+	// }
+	// }
+	// }
+
 	@Override
 	public AColGroup scalarOperation(ScalarOperator op) {
 		return new ColGroupConst(_colIndexes, _numRows, applyScalarOp(op));
@@ -264,40 +280,40 @@ public class ColGroupConst extends ColGroupValue {
 		throw new NotImplementedException("This function should not be called");
 	}
 
-	@Override
-	public IPreAggregate preAggregateDDC(ColGroupDDC lhs) {
-		return new ArrPreAggregate(lhs.getCounts());
-	}
+	// @Override
+	// public IPreAggregate preAggregateDDC(ColGroupDDC lhs) {
+	// 	return new ArrPreAggregate(lhs.getCounts());
+	// }
 
-	@Override
-	public IPreAggregate preAggregateSDC(ColGroupSDC lhs) {
-		return new ArrPreAggregate(lhs.getCounts());
-	}
+	// @Override
+	// public IPreAggregate preAggregateSDC(ColGroupSDC lhs) {
+	// 	return new ArrPreAggregate(lhs.getCounts());
+	// }
 
-	@Override
-	public IPreAggregate preAggregateSDCSingle(ColGroupSDCSingle lhs) {
-		return new ArrPreAggregate(lhs.getCounts());
-	}
+	// @Override
+	// public IPreAggregate preAggregateSDCSingle(ColGroupSDCSingle lhs) {
+	// 	return new ArrPreAggregate(lhs.getCounts());
+	// }
 
-	@Override
-	public IPreAggregate preAggregateSDCZeros(ColGroupSDCZeros lhs) {
-		return new ArrPreAggregate(lhs.getCounts());
-	}
+	// @Override
+	// public IPreAggregate preAggregateSDCZeros(ColGroupSDCZeros lhs) {
+	// 	return new ArrPreAggregate(lhs.getCounts());
+	// }
 
-	@Override
-	public IPreAggregate preAggregateSDCSingleZeros(ColGroupSDCSingleZeros lhs) {
-		return new ArrPreAggregate(lhs.getCounts());
-	}
+	// @Override
+	// public IPreAggregate preAggregateSDCSingleZeros(ColGroupSDCSingleZeros lhs) {
+	// 	return new ArrPreAggregate(lhs.getCounts());
+	// }
 
-	@Override
-	public IPreAggregate preAggregateOLE(ColGroupOLE lhs) {
-		return new ArrPreAggregate(lhs.getCounts());
-	}
+	// @Override
+	// public IPreAggregate preAggregateOLE(ColGroupOLE lhs) {
+	// 	return new ArrPreAggregate(lhs.getCounts());
+	// }
 
-	@Override
-	public IPreAggregate preAggregateRLE(ColGroupRLE lhs) {
-		return new ArrPreAggregate(lhs.getCounts());
-	}
+	// @Override
+	// public IPreAggregate preAggregateRLE(ColGroupRLE lhs) {
+	// 	return new ArrPreAggregate(lhs.getCounts());
+	// }
 
 	@Override
 	public Dictionary preAggregateThatDDCStructure(ColGroupDDC that, Dictionary ret) {
@@ -325,17 +341,7 @@ public class ColGroupConst extends ColGroupValue {
 	}
 
 	@Override
-	protected int containsAllZeroTuple() {
-		return -1;
-	}
-
-	@Override
 	protected boolean sameIndexStructure(ColGroupCompressed that) {
 		return that instanceof ColGroupEmpty || that instanceof ColGroupConst;
 	}
-
-	@Override
-	public MatrixBlock preAggregate(MatrixBlock m, int rl, int ru) {
-		throw new NotImplementedException();
-	}
 }
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java
index 42cf04b..6cdbe4e 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java
@@ -24,17 +24,11 @@ import java.io.DataOutput;
 import java.io.IOException;
 import java.util.Arrays;
 
-import org.apache.sysds.runtime.compress.CompressionSettings;
 import org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary;
 import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary;
-import org.apache.sysds.runtime.compress.colgroup.dictionary.MatrixBlockDictionary;
 import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
 import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory;
 import org.apache.sysds.runtime.compress.colgroup.offset.AIterator;
-import org.apache.sysds.runtime.compress.colgroup.pre.IPreAggregate;
-import org.apache.sysds.runtime.compress.colgroup.pre.PreAggregateFactory;
-import org.apache.sysds.runtime.data.DenseBlock;
-import org.apache.sysds.runtime.data.DenseBlockFP64;
 import org.apache.sysds.runtime.data.SparseBlock;
 import org.apache.sysds.runtime.functionobjects.Builtin;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
@@ -65,32 +59,10 @@ public class ColGroupDDC extends ColGroupValue {
 	}
 
 	@Override
-	public void decompressToBlockSafe(MatrixBlock target, int rl, int ru, int offT) {
-		decompressToBlockUnSafe(target, rl, ru, offT);
-		target.setNonZeros(target.getNonZeros() + _numRows * _colIndexes.length);
-	}
-
-	@Override
-	public void decompressToBlockUnSafe(MatrixBlock target, int rl, int ru, int offT) {
-		final int nCol = _colIndexes.length;
+	protected void decompressToBlockUnSafeSparseDictionary(MatrixBlock target, int rl, int ru, int offT,
+		SparseBlock sb) {
 		final int tCol = target.getNumColumns();
 		final double[] c = target.getDenseBlockValues();
-		if(_dict instanceof MatrixBlockDictionary) {
-			MatrixBlock dmb = ((MatrixBlockDictionary) _dict).getMatrixBlock();
-			if(dmb.isEmpty())
-				return;
-			else if(dmb.isInSparseFormat())
-				decompressToBlockUnsafeSparse(c, rl, ru, offT, dmb.getSparseBlock(), tCol, nCol);
-			else
-				decompressToBlockUnsafeDense(c, rl, ru, offT, dmb.getDenseBlockValues(), tCol, nCol);
-		}
-		else
-			decompressToBlockUnsafeDense(c, rl, ru, offT, getValues(), tCol, nCol);
-
-	}
-
-	private void decompressToBlockUnsafeSparse(double[] c, int rl, int ru, int offT, SparseBlock sb, int tCol,
-		int nCol) {
 		offT = offT * tCol;
 		for(int i = rl; i < ru; i++, offT += tCol) {
 			final int rowIndex = _data.getIndex(i);
@@ -106,9 +78,12 @@ public class ColGroupDDC extends ColGroupValue {
 		}
 	}
 
-	private void decompressToBlockUnsafeDense(double[] c, int rl, int ru, int offT, double[] values, int tCol,
-		int nCol) {
-		// final double[] values = getValues();
+	@Override
+	protected void decompressToBlockUnSafeDenseDictionary(MatrixBlock target, int rl, int ru, int offT,
+		double[] values) {
+		final int nCol = _colIndexes.length;
+		final int tCol = target.getNumColumns();
+		final double[] c = target.getDenseBlockValues();
 		offT = offT * tCol;
 
 		for(int i = rl; i < ru; i++, offT += tCol) {
@@ -118,67 +93,67 @@ public class ColGroupDDC extends ColGroupValue {
 		}
 	}
 
-	@Override
-	public void decompressToBlock(MatrixBlock target, int[] colIndexTargets) {
-		int ncol = getNumCols();
-		double[] dictionary = getValues();
-		for(int i = 0; i < _numRows; i++) {
-			int rowIndex = _data.getIndex(i) * ncol;
-			for(int colIx = 0; colIx < ncol; colIx++) {
-				int origMatrixColIx = getColIndex(colIx);
-				int col = colIndexTargets[origMatrixColIx];
-				double cellVal = dictionary[rowIndex + colIx];
-				target.quickSetValue(i, col, target.quickGetValue(i, col) + cellVal);
-			}
-
-		}
-	}
-
-	@Override
-	public void decompressColumnToBlock(MatrixBlock target, int colpos) {
-		int ncol = getNumCols();
-		double[] c = target.getDenseBlockValues();
-		double[] values = getValues();
-		int nnz = 0;
-		for(int i = 0; i < _numRows; i++) {
-			int index = _data.getIndex(i);
-			if(index < getNumValues())
-				nnz += ((c[i] += values[(index) * ncol + colpos]) != 0) ? 1 : 0;
-			else
-				nnz++;
-
-		}
-		target.setNonZeros(nnz);
-	}
-
-	@Override
-	public void decompressColumnToBlock(MatrixBlock target, int colpos, int rl, int ru) {
-		int ncol = getNumCols();
-		double[] c = target.getDenseBlockValues();
-		double[] values = getValues();
-		final int numValues = getNumValues();
-		int nnz = 0;
-		for(int i = 0, r = rl; i < ru - rl; i++, r++) {
-			int index = _data.getIndex(r);
-			if(index < numValues)
-				nnz += ((c[i] += values[(index) * ncol + colpos]) != 0) ? 1 : 0;
-			else
-				nnz++;
-		}
-		target.setNonZeros(nnz);
-	}
-
-	@Override
-	public void decompressColumnToBlock(double[] c, int colpos, int rl, int ru) {
-		int ncol = getNumCols();
-		double[] values = getValues();
-		final int numValues = getNumValues();
-		for(int i = 0, r = rl; i < ru - rl; i++, r++) {
-			int index = _data.getIndex(r);
-			if(index < numValues)
-				c[i] += values[(index) * ncol + colpos];
-		}
-	}
+	// @Override
+	// public void decompressToBlock(MatrixBlock target, int[] colIndexTargets) {
+	// 	int ncol = getNumCols();
+	// 	double[] dictionary = getValues();
+	// 	for(int i = 0; i < _numRows; i++) {
+	// 		int rowIndex = _data.getIndex(i) * ncol;
+	// 		for(int colIx = 0; colIx < ncol; colIx++) {
+	// 			int origMatrixColIx = _colIndexes[colIx];
+	// 			int col = colIndexTargets[origMatrixColIx];
+	// 			double cellVal = dictionary[rowIndex + colIx];
+	// 			target.quickSetValue(i, col, target.quickGetValue(i, col) + cellVal);
+	// 		}
+
+	// 	}
+	// }
+
+	// @Override
+	// public void decompressColumnToBlock(MatrixBlock target, int colpos) {
+	// 	int ncol = getNumCols();
+	// 	double[] c = target.getDenseBlockValues();
+	// 	double[] values = getValues();
+	// 	int nnz = 0;
+	// 	for(int i = 0; i < _numRows; i++) {
+	// 		int index = _data.getIndex(i);
+	// 		if(index < getNumValues())
+	// 			nnz += ((c[i] += values[(index) * ncol + colpos]) != 0) ? 1 : 0;
+	// 		else
+	// 			nnz++;
+
+	// 	}
+	// 	target.setNonZeros(nnz);
+	// }
+
+	// @Override
+	// public void decompressColumnToBlock(MatrixBlock target, int colpos, int rl, int ru) {
+	// 	int ncol = getNumCols();
+	// 	double[] c = target.getDenseBlockValues();
+	// 	double[] values = getValues();
+	// 	final int numValues = getNumValues();
+	// 	int nnz = 0;
+	// 	for(int i = 0, r = rl; i < ru - rl; i++, r++) {
+	// 		int index = _data.getIndex(r);
+	// 		if(index < numValues)
+	// 			nnz += ((c[i] += values[(index) * ncol + colpos]) != 0) ? 1 : 0;
+	// 		else
+	// 			nnz++;
+	// 	}
+	// 	target.setNonZeros(nnz);
+	// }
+
+	// @Override
+	// public void decompressColumnToBlock(double[] c, int colpos, int rl, int ru) {
+	// 	int ncol = getNumCols();
+	// 	double[] values = getValues();
+	// 	final int numValues = getNumValues();
+	// 	for(int i = 0, r = rl; i < ru - rl; i++, r++) {
+	// 		int index = _data.getIndex(r);
+	// 		if(index < numValues)
+	// 			c[i] += values[(index) * ncol + colpos];
+	// 	}
+	// }
 
 	@Override
 	public double get(int r, int c) {
@@ -214,7 +189,7 @@ public class ColGroupDDC extends ColGroupValue {
 	}
 
 	@Override
-	protected void computeRowSums(double[] c, boolean square, int rl, int ru, boolean mean) {
+	protected void computeRowSums(double[] c, boolean square, int rl, int ru) {
 		double[] vals = _dict.sumAllRowsToDouble(square, _colIndexes.length);
 		for(int rix = rl; rix < ru; rix++)
 			c[rix] += vals[_data.getIndex(rix)];
@@ -242,54 +217,89 @@ public class ColGroupDDC extends ColGroupValue {
 		return counts;
 	}
 
-	@Override
-	public double[] preAggregate(double[] a, int row) {
-		double[] vals = allocDVector(getNumValues(), true);
-		if(row > 0)
-			for(int i = 0, off = _numRows * row; i < _numRows; i++, off++)
-				vals[_data.getIndex(i)] += a[off];
-		else
-			for(int i = 0; i < _numRows; i++)
-				vals[_data.getIndex(i)] += a[i];
+	// @Override
+	// public double[] preAggregate(double[] a, int row) {
+	// double[] vals = allocDVector(getNumValues(), true);
+	// if(row > 0)
+	// for(int i = 0, off = _numRows * row; i < _numRows; i++, off++)
+	// vals[_data.getIndex(i)] += a[off];
+	// else
+	// for(int i = 0; i < _numRows; i++)
+	// vals[_data.getIndex(i)] += a[i];
 
-		return vals;
-	}
+	// return vals;
+	// }
 
-	@Override
-	public double[] preAggregateSparse(SparseBlock sb, int row) {
+	// @Override
+	// public double[] preAggregateSparse(SparseBlock sb, int row) {
 
-		double[] vals = allocDVector(getNumValues(), true);
-		int[] indexes = sb.indexes(row);
-		double[] sparseV = sb.values(row);
-		for(int i = sb.pos(row); i < sb.size(row) + sb.pos(row); i++)
-			vals[_data.getIndex(indexes[i])] += sparseV[i];
-		return vals;
+	// double[] vals = allocDVector(getNumValues(), true);
+	// int[] indexes = sb.indexes(row);
+	// double[] sparseV = sb.values(row);
+	// for(int i = sb.pos(row); i < sb.size(row) + sb.pos(row); i++)
+	// vals[_data.getIndex(indexes[i])] += sparseV[i];
+	// return vals;
 
-	}
+	// }
 
 	@Override
-	public MatrixBlock preAggregate(MatrixBlock m, int rl, int ru) {
-
-		final int retCols = getNumValues();
-		final int retRows = ru - rl;
-		final double[] vals = allocDVector(retRows * retCols, true);
-		final DenseBlock retB = new DenseBlockFP64(new int[] {retRows, retCols}, vals);
-		final MatrixBlock ret = new MatrixBlock(retRows, retCols, retB);
+	protected void preAggregate(MatrixBlock m, MatrixBlock preAgg, int rl, int ru) { // accumulates rows [rl, ru) of m into the caller-supplied preAgg
+		if(m.isInSparseFormat()) // pick the helper that matches m's storage format
+			preAggregateSparse(m.getSparseBlock(), preAgg, rl, ru);
+		else
+			preAggregateDense(m, preAgg, rl, ru); // reads m through getDenseBlockValues()
+	}
 
+	/** Adds every cell of dense rows [rl, ru) of m to the preAgg slot selected by the mapped index of its column. */
+	private void preAggregateDense(MatrixBlock m, MatrixBlock preAgg, int rl, int ru) {
 		final double[] mV = m.getDenseBlockValues();
+		final double[] out = preAgg.getDenseBlockValues();
+		final int stride = getNumValues();
+		for(int r = rl, outPos = 0; r < ru; r++, outPos += stride) {
+			for(int i = 0; i < _numRows; i++) // row r of m is read starting at r * _numRows
+				out[outPos + _data.getIndex(i)] += mV[r * _numRows + i];
+		}
+	}
 
-		ret.setNonZeros(retRows * retCols);
-		for(int k = rl; k < ru; k++) {
-			final int offT = ret.getNumColumns() * k;
-			final int offM = m.getNumColumns() * k;
-			for(int i = 0; i < _numRows; i++) {
-				int index = _data.getIndex(i);
-				vals[offT + index] += mV[offM + i];
+	/** Adds each stored entry of sparse rows [rl, ru) to the preAgg slot selected by the mapped index of its column. */
+	private void preAggregateSparse(SparseBlock sb, MatrixBlock preAgg, int rl, int ru) {
+		final double[] out = preAgg.getDenseBlockValues();
+		final int stride = getNumValues();
+		for(int r = rl, outPos = 0; r < ru; r++, outPos += stride) {
+			if(sb.isEmpty(r)) continue; // an empty row contributes nothing
+			final int start = sb.pos(r);
+			final int end = start + sb.size(r);
+			final int[] cols = sb.indexes(r);
+			final double[] rowVals = sb.values(r);
+			for(int k = start; k < end; k++) {
+				out[outPos + _data.getIndex(cols[k])] += rowVals[k];
 			}
 		}
-		return ret;
 	}
 
+	// @Override
+	// public MatrixBlock preAggregate(MatrixBlock m, int rl, int ru) {
+
+	// final int retCols = getNumValues();
+	// final int retRows = ru - rl;
+	// final double[] vals = allocDVector(retRows * retCols, true);
+	// final DenseBlock retB = new DenseBlockFP64(new int[] {retRows, retCols}, vals);
+	// final MatrixBlock ret = new MatrixBlock(retRows, retCols, retB);
+
+	// final double[] mV = m.getDenseBlockValues();
+
+	// ret.setNonZeros(retRows * retCols);
+	// for(int k = rl; k < ru; k++) {
+	// final int offT = ret.getNumColumns() * k;
+	// final int offM = m.getNumColumns() * k;
+	// for(int i = 0; i < _numRows; i++) {
+	// int index = _data.getIndex(i);
+	// vals[offT + index] += mV[offM + i];
+	// }
+	// }
+	// return ret;
+	// }
+
 	/**
 	 * Generic get value for byte-length-agnostic access to first column.
 	 * 
@@ -325,158 +335,158 @@ public class ColGroupDDC extends ColGroupValue {
 		_data.set(r, code);
 	}
 
-	@Override
-	public IPreAggregate preAggregateDDC(ColGroupDDC lhs) {
-		final int nCol = lhs.getNumValues();
-		final int rhsNV = this.getNumValues();
-		final int retSize = nCol * rhsNV;
-		IPreAggregate ag = PreAggregateFactory.ag(retSize);
-		// int[] m = _data.materializeMultiplied(nCol);
-		for(int i = 0; i < this._numRows; i++)
-			ag.increment(lhs._data.getIndex(i) + this._data.getIndex(i) * nCol);
-
-		return ag;
-	}
-
-	@Override
-	public IPreAggregate preAggregateSDC(ColGroupSDC lhs) {
-		final int nCol = lhs.getNumValues();
-		final int rhsNV = this.getNumValues();
-		final int retSize = nCol * rhsNV;
-		IPreAggregate ag = PreAggregateFactory.ag(retSize);
-
-		AIterator lIt = lhs._indexes.getIterator();
-		final int offsetToDefault = nCol - 1;
-
-		int i = 0;
-
-		int col;
-		for(; i < this._numRows && lIt.hasNext(); i++) {
-			int row = this._data.getIndex(i);
-			if(lIt.value() == i)
-				col = lhs._data.getIndex(lIt.getDataIndexAndIncrement());
-
-			else
-				col = offsetToDefault;
-			ag.increment(col + row * nCol);
-		}
-		col = offsetToDefault;
-		for(; i < this._numRows; i++) {
-			int row = this._data.getIndex(i);
-			ag.increment(col + row * nCol);
-		}
-
-		return ag;
-	}
-
-	@Override
-	public IPreAggregate preAggregateSDCSingle(ColGroupSDCSingle lhs) {
-		final int nCol = lhs.getNumValues();
-		final int rhsNV = this.getNumValues();
-		final int retSize = nCol * rhsNV;
-		final IPreAggregate ag = PreAggregateFactory.ag(retSize);
-		final AIterator lIt = lhs._indexes.getIterator();
-
-		int i = 0;
-
-		int col;
-		for(; i < this._numRows && lIt.hasNext(); i++) {
-			int row = this._data.getIndex(i);
-			if(lIt.value() == i) {
-				col = 1;
-				lIt.next();
-			}
-			else
-				col = 0;
-			ag.increment(col + row * nCol);
-		}
-
-		for(; i < this._numRows; i++)
-			ag.increment(this._data.getIndex(i) * nCol);
-
-		return ag;
-	}
-
-	@Override
-	public IPreAggregate preAggregateSDCZeros(ColGroupSDCZeros lhs) {
-		final int nCol = lhs.getNumValues();
-		final int rhsNV = this.getNumValues();
-		final int retSize = nCol * rhsNV;
-		final IPreAggregate ag = PreAggregateFactory.ag(retSize);
-		final AIterator lIt = lhs._indexes.getIterator();
-
-		while(lIt.hasNext()) {
-			int row = this._data.getIndex(lIt.value());
-			int col = lhs._data.getIndex(lIt.getDataIndexAndIncrement());
-			ag.increment(col + row * nCol);
-		}
-
-		return ag;
-	}
-
-	@Override
-	public IPreAggregate preAggregateSDCSingleZeros(ColGroupSDCSingleZeros lhs) {
-		final int nCol = lhs.getNumValues();
-		final int rhsNV = this.getNumValues();
-		final int retSize = nCol * rhsNV;
-		IPreAggregate ag = PreAggregateFactory.ag(retSize);
-
-		final AIterator lIt = lhs._indexes.getIterator();
-
-		while(lIt.hasNext()) {
-			int row = this._data.getIndex(lIt.value());
-			lIt.next();
-			ag.increment(row);
-		}
-
-		return ag;
-	}
-
-	@Override
-	public IPreAggregate preAggregateOLE(ColGroupOLE lhs) {
-		final int NVR = this.getNumValues();
-		final int NVL = lhs.getNumValues();
-		final int retSize = NVR * NVL;
-		final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
-		IPreAggregate ag = PreAggregateFactory.ag(retSize);
-
-		for(int kl = 0; kl < NVL; kl++) {
-			final int bOffL = lhs._ptr[kl];
-			final int bLenL = lhs.len(kl);
-			for(int bixL = 0, offL = 0, sLenL = 0; bixL < bLenL; bixL += sLenL + 1, offL += blksz) {
-				sLenL = lhs._data[bOffL + bixL];
-				for(int i = 1; i <= sLenL; i++) {
-					int idx = this._data.getIndex(offL + lhs._data[bOffL + bixL + i]);
-					ag.increment(kl + idx * NVL);
-				}
-			}
-		}
-
-		return ag;
-	}
-
-	@Override
-	public IPreAggregate preAggregateRLE(ColGroupRLE lhs) {
-		final int NVR = this.getNumValues();
-		final int NVL = lhs.getNumValues();
-		final int retSize = NVR * NVL;
-		IPreAggregate ag = PreAggregateFactory.ag(retSize);
-
-		for(int kl = 0; kl < NVL; kl++) {
-			final int boffL = lhs._ptr[kl];
-			final int blenL = lhs.len(kl);
-			for(int bixL = 0, startL = 0, lenL = 0; bixL < blenL && startL < _numRows; startL += lenL, bixL += 2) {
-				startL += lhs._data[boffL + bixL];
-				lenL = lhs._data[boffL + bixL + 1];
-				final int endL = startL + lenL;
-				for(int i = startL; i < endL; i++) {
-					int kr = _data.getIndex(i) * NVL;
-					ag.increment(kl + kr);
-				}
-			}
-		}
-		return ag;
-	}
+	// @Override
+	// public IPreAggregate preAggregateDDC(ColGroupDDC lhs) {
+	// 	final int nCol = lhs.getNumValues();
+	// 	final int rhsNV = this.getNumValues();
+	// 	final int retSize = nCol * rhsNV;
+	// 	IPreAggregate ag = PreAggregateFactory.ag(retSize);
+	// 	// int[] m = _data.materializeMultiplied(nCol);
+	// 	for(int i = 0; i < this._numRows; i++)
+	// 		ag.increment(lhs._data.getIndex(i) + this._data.getIndex(i) * nCol);
+
+	// 	return ag;
+	// }
+
+	// @Override
+	// public IPreAggregate preAggregateSDC(ColGroupSDC lhs) {
+	// 	final int nCol = lhs.getNumValues();
+	// 	final int rhsNV = this.getNumValues();
+	// 	final int retSize = nCol * rhsNV;
+	// 	IPreAggregate ag = PreAggregateFactory.ag(retSize);
+
+	// 	AIterator lIt = lhs._indexes.getIterator();
+	// 	final int offsetToDefault = nCol - 1;
+
+	// 	int i = 0;
+
+	// 	int col;
+	// 	for(; i < this._numRows && lIt.hasNext(); i++) {
+	// 		int row = this._data.getIndex(i);
+	// 		if(lIt.value() == i)
+	// 			col = lhs._data.getIndex(lIt.getDataIndexAndIncrement());
+
+	// 		else
+	// 			col = offsetToDefault;
+	// 		ag.increment(col + row * nCol);
+	// 	}
+	// 	col = offsetToDefault;
+	// 	for(; i < this._numRows; i++) {
+	// 		int row = this._data.getIndex(i);
+	// 		ag.increment(col + row * nCol);
+	// 	}
+
+	// 	return ag;
+	// }
+
+	// @Override
+	// public IPreAggregate preAggregateSDCSingle(ColGroupSDCSingle lhs) {
+	// 	final int nCol = lhs.getNumValues();
+	// 	final int rhsNV = this.getNumValues();
+	// 	final int retSize = nCol * rhsNV;
+	// 	final IPreAggregate ag = PreAggregateFactory.ag(retSize);
+	// 	final AIterator lIt = lhs._indexes.getIterator();
+
+	// 	int i = 0;
+
+	// 	int col;
+	// 	for(; i < this._numRows && lIt.hasNext(); i++) {
+	// 		int row = this._data.getIndex(i);
+	// 		if(lIt.value() == i) {
+	// 			col = 1;
+	// 			lIt.next();
+	// 		}
+	// 		else
+	// 			col = 0;
+	// 		ag.increment(col + row * nCol);
+	// 	}
+
+	// 	for(; i < this._numRows; i++)
+	// 		ag.increment(this._data.getIndex(i) * nCol);
+
+	// 	return ag;
+	// }
+
+	// @Override
+	// public IPreAggregate preAggregateSDCZeros(ColGroupSDCZeros lhs) {
+	// 	final int nCol = lhs.getNumValues();
+	// 	final int rhsNV = this.getNumValues();
+	// 	final int retSize = nCol * rhsNV;
+	// 	final IPreAggregate ag = PreAggregateFactory.ag(retSize);
+	// 	final AIterator lIt = lhs._indexes.getIterator();
+
+	// 	while(lIt.hasNext()) {
+	// 		int row = this._data.getIndex(lIt.value());
+	// 		int col = lhs._data.getIndex(lIt.getDataIndexAndIncrement());
+	// 		ag.increment(col + row * nCol);
+	// 	}
+
+	// 	return ag;
+	// }
+
+	// @Override
+	// public IPreAggregate preAggregateSDCSingleZeros(ColGroupSDCSingleZeros lhs) {
+	// 	final int nCol = lhs.getNumValues();
+	// 	final int rhsNV = this.getNumValues();
+	// 	final int retSize = nCol * rhsNV;
+	// 	IPreAggregate ag = PreAggregateFactory.ag(retSize);
+
+	// 	final AIterator lIt = lhs._indexes.getIterator();
+
+	// 	while(lIt.hasNext()) {
+	// 		int row = this._data.getIndex(lIt.value());
+	// 		lIt.next();
+	// 		ag.increment(row);
+	// 	}
+
+	// 	return ag;
+	// }
+
+	// @Override
+	// public IPreAggregate preAggregateOLE(ColGroupOLE lhs) {
+	// 	final int NVR = this.getNumValues();
+	// 	final int NVL = lhs.getNumValues();
+	// 	final int retSize = NVR * NVL;
+	// 	final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
+	// 	IPreAggregate ag = PreAggregateFactory.ag(retSize);
+
+	// 	for(int kl = 0; kl < NVL; kl++) {
+	// 		final int bOffL = lhs._ptr[kl];
+	// 		final int bLenL = lhs.len(kl);
+	// 		for(int bixL = 0, offL = 0, sLenL = 0; bixL < bLenL; bixL += sLenL + 1, offL += blksz) {
+	// 			sLenL = lhs._data[bOffL + bixL];
+	// 			for(int i = 1; i <= sLenL; i++) {
+	// 				int idx = this._data.getIndex(offL + lhs._data[bOffL + bixL + i]);
+	// 				ag.increment(kl + idx * NVL);
+	// 			}
+	// 		}
+	// 	}
+
+	// 	return ag;
+	// }
+
+	// @Override
+	// public IPreAggregate preAggregateRLE(ColGroupRLE lhs) {
+	// 	final int NVR = this.getNumValues();
+	// 	final int NVL = lhs.getNumValues();
+	// 	final int retSize = NVR * NVL;
+	// 	IPreAggregate ag = PreAggregateFactory.ag(retSize);
+
+	// 	for(int kl = 0; kl < NVL; kl++) {
+	// 		final int boffL = lhs._ptr[kl];
+	// 		final int blenL = lhs.len(kl);
+	// 		for(int bixL = 0, startL = 0, lenL = 0; bixL < blenL && startL < _numRows; startL += lenL, bixL += 2) {
+	// 			startL += lhs._data[boffL + bixL];
+	// 			lenL = lhs._data[boffL + bixL + 1];
+	// 			final int endL = startL + lenL;
+	// 			for(int i = startL; i < endL; i++) {
+	// 				int kr = _data.getIndex(i) * NVL;
+	// 				ag.increment(kl + kr);
+	// 			}
+	// 		}
+	// 	}
+	// 	return ag;
+	// }
 
 	@Override
 	public Dictionary preAggregateThatDDCStructure(ColGroupDDC that, Dictionary ret) {
@@ -615,7 +625,7 @@ public class ColGroupDDC extends ColGroupValue {
 	@Override
 	public void readFields(DataInput in) throws IOException {
 		super.readFields(in);
-		_data = MapToFactory.readIn(in, getNumValues());
+		_data = MapToFactory.readIn(in); // the mapping is read right after the superclass fields, from the stream alone
 	}
 
 	@Override
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupEmpty.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupEmpty.java
index 11adb66..e1ade47 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupEmpty.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupEmpty.java
@@ -57,7 +57,7 @@ public class ColGroupEmpty extends ColGroupCompressed {
 	}
 
 	@Override
-	protected void computeRowSums(double[] c, boolean square, int rl, int ru, boolean mean) {
+	protected void computeRowSums(double[] c, boolean square, int rl, int ru) {
 		// do nothing
 	}
 
@@ -83,7 +83,6 @@ public class ColGroupEmpty extends ColGroupCompressed {
 		return ColGroupType.EMPTY;
 	}
 
-
 	@Override
 	public void decompressToBlockSafe(MatrixBlock target, int rl, int ru, int offT) {
 		// do nothing.
@@ -94,25 +93,25 @@ public class ColGroupEmpty extends ColGroupCompressed {
 		// do nothing.
 	}
 
-	@Override
-	public void decompressToBlock(MatrixBlock target, int[] colIndexTargets) {
-		// do nothing.
-	}
+	// @Override
+	// public void decompressToBlock(MatrixBlock target, int[] colIndexTargets) {
+	// 	// do nothing.
+	// }
 
-	@Override
-	public void decompressColumnToBlock(MatrixBlock target, int colpos) {
-		// do nothing.
-	}
+	// @Override
+	// public void decompressColumnToBlock(MatrixBlock target, int colpos) {
+	// 	// do nothing.
+	// }
 
-	@Override
-	public void decompressColumnToBlock(MatrixBlock target, int colpos, int rl, int ru) {
-		// do nothing.
-	}
+	// @Override
+	// public void decompressColumnToBlock(MatrixBlock target, int colpos, int rl, int ru) {
+	// 	// do nothing.
+	// }
 
-	@Override
-	public void decompressColumnToBlock(double[] c, int colpos, int rl, int ru) {
-		// do nothing.
-	}
+	// @Override
+	// public void decompressColumnToBlock(double[] c, int colpos, int rl, int ru) {
+	// 	// do nothing.
+	// }
 
 	@Override
 	public double get(int r, int c) {
@@ -130,7 +129,7 @@ public class ColGroupEmpty extends ColGroupCompressed {
 		if(val0 == 0)
 			return this;
 		return new ColGroupConst(_colIndexes, _numRows,
-			new Dictionary(new double[_colIndexes.length]).applyScalarOp(op, val0, _colIndexes.length));
+			new Dictionary(new double[_colIndexes.length]).apply(op));
 	}
 
 	@Override
@@ -138,7 +137,7 @@ public class ColGroupEmpty extends ColGroupCompressed {
 		if(sparseSafe)
 			return this;
 		return new ColGroupConst(_colIndexes, _numRows,
-			new Dictionary(new double[_colIndexes.length]).applyBinaryRowOp(op, v, sparseSafe, _colIndexes, left));
+			new Dictionary(new double[_colIndexes.length]).applyBinaryRowOp(op, v, true, _colIndexes, left));
 	}
 
 	@Override
@@ -167,11 +166,6 @@ public class ColGroupEmpty extends ColGroupCompressed {
 	}
 
 	@Override
-	protected int containsAllZeroTuple() {
-		return 0;
-	}
-
-	@Override
 	protected double computeMxx(double c, Builtin builtin) {
 		return builtin.execute(c, 0);
 	}
@@ -200,7 +194,11 @@ public class ColGroupEmpty extends ColGroupCompressed {
 	@Override
 	public void tsmm(double[] result, int numColumns) {
 		// do nothing
+	}
 
+	@Override
+	public void tsmm(double[] result, int numColumns, int idxStart, int idxEnd) {
+		// Intentionally a no-op, like the two-argument tsmm above: result is left untouched for any index range.
 	}
 
 	@Override
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java
index ffcc2fb..32cebf8 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java
@@ -37,11 +37,12 @@ import org.apache.sysds.runtime.compress.CompressionSettings;
 import org.apache.sysds.runtime.compress.cocode.PlanningCoCoder.PartitionerType;
 import org.apache.sysds.runtime.compress.colgroup.AColGroup.CompressionType;
 import org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary;
 import org.apache.sysds.runtime.compress.colgroup.dictionary.DictionaryFactory;
+import org.apache.sysds.runtime.compress.colgroup.insertionsort.AInsertionSorter;
+import org.apache.sysds.runtime.compress.colgroup.insertionsort.InsertionSorterFactory;
 import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
 import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory;
-import org.apache.sysds.runtime.compress.colgroup.tree.AInsertionSorter;
-import org.apache.sysds.runtime.compress.colgroup.tree.InsertionSorterFactory;
 import org.apache.sysds.runtime.compress.estim.CompressedSizeEstimator;
 import org.apache.sysds.runtime.compress.estim.CompressedSizeEstimatorExact;
 import org.apache.sysds.runtime.compress.estim.CompressedSizeInfo;
@@ -56,7 +57,7 @@ import org.apache.sysds.runtime.util.CommonThreadPool;
 /**
  * Factory pattern for constructing ColGroups.
  */
-public class ColGroupFactory {
+public final class ColGroupFactory {
 	private static final Log LOG = LogFactory.getLog(ColGroupFactory.class.getName());
 
 	/**
@@ -156,7 +157,8 @@ public class ColGroupFactory {
 	private static Collection<AColGroup> compressColGroup(MatrixBlock in, int[] colIndexes,
 		CompressionSettings compSettings) {
 		if(in.isEmpty())
-			return Collections.singletonList(new ColGroupEmpty(colIndexes, compSettings.transposed ? in.getNumColumns(): in.getNumRows()));
+			return Collections.singletonList(
+				new ColGroupEmpty(colIndexes, compSettings.transposed ? in.getNumColumns() : in.getNumRows()));
 		else if(in.isInSparseFormat() && compSettings.transposed) {
 			final SparseBlock sb = in.getSparseBlock();
 			for(int col : colIndexes)
@@ -283,7 +285,7 @@ public class ColGroupFactory {
 	public static AColGroup compress(int[] colIndexes, int rlen, ABitmap ubm, CompressionType compType,
 		CompressionSettings cs, MatrixBlock rawMatrixBlock, double tupleSparsity) {
 
-		if(compType == CompressionType.UNCOMPRESSED && cs.columnPartitioner == PartitionerType.COST_MATRIX_MULT)
+		if(compType == CompressionType.UNCOMPRESSED && PartitionerType.isCostBased(cs.columnPartitioner))
 			compType = CompressionType.DDC;
 
 		final IntArrayList[] of = ubm.getOffsetList();
@@ -355,31 +357,34 @@ public class ColGroupFactory {
 
 	private static AColGroup setupMultiValueZeroColGroup(int[] colIndexes, ABitmap ubm, int numRows, ADictionary dict) {
 		IntArrayList[] offsets = ubm.getOffsetList();
+		try { // any failure below is rethrown with the affected columns named
+			final int numOffsets = (int) ubm.getNumOffsets(); // narrowing cast from the long returned by getNumOffsets (see the (long) comparison in compressDDC)
+			AInsertionSorter s = InsertionSorterFactory.create(numOffsets, numRows, offsets); // the factory receives the offset lists directly
+			int[] _indexes = s.getIndexes();
+			AMapToData _data = s.getData();
 
-		final int numOffsets = (int) ubm.getNumOffsets();
-		AInsertionSorter s = InsertionSorterFactory.create(numOffsets, offsets.length, numRows);
-		s.insert(offsets);
-		int[] _indexes = s.getIndexes();
-		AMapToData _data = s.getData();
-
-		return new ColGroupSDCZeros(colIndexes, numRows, dict, _indexes, _data, null);
+			return new ColGroupSDCZeros(colIndexes, numRows, dict, _indexes, _data, null);
+		}
+		catch(Exception e) { // broad on purpose: the original exception is kept as the cause
+			throw new DMLCompressionException(
+				"Failed to construct SDC Zero Group with columns :" + Arrays.toString(colIndexes), e);
+		}
 	}
 
 	private static AColGroup setupMultiValueColGroup(int[] colIndexes, int numZeros, int largestOffset, ABitmap ubm,
 		int numRows, int largestIndex, ADictionary dict) {
-		IntArrayList[] offsets = ubm.getOffsetList();
-
-		AInsertionSorter s = InsertionSorterFactory.create(numRows - largestOffset, offsets.length, numRows);
-		s.insert(offsets, largestIndex);
-		int[] _indexes = s.getIndexes();
-		AMapToData _data = s.getData();
 		try {
+			IntArrayList[] offsets = ubm.getOffsetList(); // fetched inside the try so a failure here is wrapped as well
+
+			AInsertionSorter s = InsertionSorterFactory.create(numRows - largestOffset, numRows, offsets, largestIndex); // sized as numRows - largestOffset; largestIndex is handed to the factory
+			int[] _indexes = s.getIndexes();
+			AMapToData _data = s.getData();
 			AColGroup ret = new ColGroupSDC(colIndexes, numRows, dict, _indexes, _data, null);
 			return ret;
 		}
 		catch(Exception e) {
-			LOG.error(Arrays.toString(_indexes));
-			throw new DMLCompressionException(e);
+			throw new DMLCompressionException( // names the columns and keeps the original exception as the cause
+				"Failed to construct SDC Group with columns :" + Arrays.toString(colIndexes), e);
 		}
 
 	}
@@ -406,24 +411,10 @@ public class ColGroupFactory {
 
 	private static AColGroup compressDDC(int[] colIndexes, int rlen, ABitmap ubm, CompressionSettings cs,
 		double tupleSparsity) {
-
-		boolean _zeros = ubm.getNumOffsets() < (long) rlen;
-		ADictionary dict = (_zeros) ? DictionaryFactory.createWithAppendedZeroTuple(ubm,
-			tupleSparsity) : DictionaryFactory.create(ubm, tupleSparsity);
-		int numVals = ubm.getNumValues();
-		AMapToData _data = MapToFactory.create(rlen, numVals + (_zeros ? 1 : 0));
-		if(_zeros)
-			_data.fill(numVals);
-
-		for(int i = 0; i < numVals; i++) {
-			IntArrayList tmpList = ubm.getOffsetsList(i);
-			final int sz = tmpList.size();
-			for(int k = 0; k < sz; k++)
-				_data.set(tmpList.get(k), i);
-		}
-
-		return new ColGroupDDC(colIndexes, rlen, dict, _data, null);
-
+		boolean zeros = ubm.getNumOffsets() < (long) rlen;
+		ADictionary dict = DictionaryFactory.create(ubm, tupleSparsity, zeros);
+		AMapToData data = MapToFactory.create(rlen, zeros, ubm.getOffsetList());
+		return new ColGroupDDC(colIndexes, rlen, dict, data, null);
 	}
 
 	private static AColGroup compressOLE(int[] colIndexes, int rlen, ABitmap ubm, CompressionSettings cs,
@@ -469,4 +460,21 @@ public class ColGroupFactory {
 		rle._colIndexes = colIndexes;
 		return rle;
 	}
+
+	public static AColGroup genColGroupConst(int numRows, int numCols, double value) {
+
+		int[] colIndices = new int[numCols];
+		for(int i = 0; i < numCols; i++)
+			colIndices[i] = i;
+
+		if(value == 0)
+			return new ColGroupEmpty(colIndices, numRows);
+
+		double[] values = new double[numCols];
+		for(int i = 0; i < numCols; i++)
+			values[i] = value;
+
+		ADictionary dict = new Dictionary(values);
+		return new ColGroupConst(colIndices, numRows, dict);
+	}
 }
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOLE.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOLE.java
index 712a574..e00422b 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOLE.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOLE.java
@@ -26,10 +26,6 @@ import org.apache.sysds.runtime.DMLCompressionException;
 import org.apache.sysds.runtime.compress.CompressionSettings;
 import org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary;
 import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary;
-import org.apache.sysds.runtime.compress.colgroup.offset.AIterator;
-import org.apache.sysds.runtime.compress.colgroup.pre.IPreAggregate;
-import org.apache.sysds.runtime.compress.colgroup.pre.PreAggregateFactory;
-import org.apache.sysds.runtime.compress.utils.LinearAlgebraUtils;
 import org.apache.sysds.runtime.data.SparseBlock;
 import org.apache.sysds.runtime.functionobjects.Builtin;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
@@ -70,58 +66,14 @@ public class ColGroupOLE extends ColGroupOffset {
 	}
 
 	@Override
-	public void decompressToBlockSafe(MatrixBlock target, int rl, int ru, int offT) {
-
-		final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
-		final int numCols = getNumCols();
-		final int numVals = getNumValues();
-		final double[] values = getValues();
-
-		// cache blocking config and position array
-		int[] apos = skipScan(numVals, rl);
-		double[] c = target.getDenseBlockValues();
-		// cache conscious append via horizontal scans
-		for(int bi = (rl / blksz) * blksz; bi < ru; bi += blksz) {
-			for(int k = 0, off = 0; k < numVals; k++, off += numCols) {
-				int boff = _ptr[k];
-				int blen = len(k);
-				int bix = apos[k];
-
-				if(bix >= blen)
-					continue;
-				int len = _data[boff + bix];
-				int pos = boff + bix + 1;
-				for(int i = pos; i < pos + len; i++) {
-					int row = bi + _data[i];
-					if(row >= rl && row < ru) {
-						int rix = row - (rl - offT);
-						int rc = rix * target.getNumColumns();
-						for(int j = 0; j < numCols; j++) {
-							double v = c[rc + _colIndexes[j]];
-							double nv = c[rc + _colIndexes[j]] + values[off + j];
-							if(v == 0.0 && nv != 0.0) {
-								target.setNonZeros(target.getNonZeros() + 1);
-							}
-							c[rc + _colIndexes[j]] = nv;
-
-						}
-					}
-				}
-				apos[k] += len + 1;
-			}
-		}
-	}
-
-	@Override
-	public void decompressToBlockUnSafe(MatrixBlock target, int rl, int ru, int offT) {
-
+	protected void decompressToBlockUnSafeDenseDictionary(MatrixBlock target, int rl, int ru, int offT,
+		double[] values) {
 		final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
 		final int numCols = getNumCols();
 		final int numVals = getNumValues();
 		final int offOut = (rl - offT);
 		final int targetCols = target.getNumColumns();
-		final double[] values = getValues();
-
+		
 		// cache blocking config and position array
 		int[] apos = skipScan(numVals, rl);
 		double[] c = target.getDenseBlockValues();
@@ -153,141 +105,147 @@ public class ColGroupOLE extends ColGroupOffset {
 	}
 
 	@Override
-	public void decompressToBlock(MatrixBlock target, int[] colixTargets) {
-		final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
-		final int numCols = getNumCols();
-		final int numVals = getNumValues();
-		final double[] values = getValues();
-
-		// cache blocking config and position array
-		int[] apos = new int[numVals];
-		int[] cix = new int[numCols];
-
-		// prepare target col indexes
-		for(int j = 0; j < numCols; j++)
-			cix[j] = colixTargets[_colIndexes[j]];
-
-		// cache conscious append via horizontal scans
-		for(int bi = 0; bi < _numRows; bi += blksz) {
-			for(int k = 0, off = 0; k < numVals; k++, off += numCols) {
-				int boff = _ptr[k];
-				int blen = len(k);
-				int bix = apos[k];
-				if(bix >= blen)
-					continue;
-				int len = _data[boff + bix];
-				int pos = boff + bix + 1;
-				for(int i = pos; i < pos + len; i++)
-					for(int j = 0, rix = bi + _data[i]; j < numCols; j++)
-						if(values[off + j] != 0) {
-							double v = target.quickGetValue(rix, _colIndexes[j]);
-							target.setValue(rix, cix[j], values[off + j] + v);
-						}
-				apos[k] += len + 1;
-			}
-		}
-	}
-
-	@Override
-	public void decompressColumnToBlock(MatrixBlock target, int colpos) {
-		final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
-		int numCols = getNumCols();
-		int numVals = getNumValues();
-		double[] c = target.getDenseBlockValues();
-		double[] values = getValues();
-
-		// cache blocking config and position array
-		int[] apos = new int[numVals];
-
-		// cache conscious append via horizontal scans
-		int nnz = 0;
-		for(int bi = 0; bi < _numRows; bi += blksz) {
-			// Arrays.fill(c, bi, Math.min(bi + blksz, _numRows), 0);
-			for(int k = 0, off = 0; k < numVals; k++, off += numCols) {
-
-				int boff = _ptr[k];
-				int blen = len(k);
-				int bix = apos[k];
-				if(bix >= blen)
-					continue;
-				int len = _data[boff + bix];
-				int pos = boff + bix + 1;
-				for(int i = pos; i < pos + len; i++) {
-					c[bi + _data[i]] += values[off + colpos];
-					nnz++;
-				}
-				apos[k] += len + 1;
-			}
-		}
-		target.setNonZeros(nnz);
-	}
-
-	@Override
-	public void decompressColumnToBlock(MatrixBlock target, int colpos, int rl, int ru) {
-		final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
-		int numCols = getNumCols();
-		int numVals = getNumValues();
-		double[] c = target.getDenseBlockValues();
-		double[] values = getValues();
-
-		// cache blocking config and position array
-		int[] apos = skipScan(numVals, rl);
-
-		// cache conscious append via horizontal scans
-		int nnz = 0;
-		for(int bi = (rl / blksz) * blksz; bi < ru; bi += blksz) {
-			for(int k = 0, off = 0; k < numVals; k++, off += numCols) {
-
-				int boff = _ptr[k];
-				int blen = len(k);
-				int bix = apos[k];
-				if(bix >= blen)
-					continue;
-				int len = _data[boff + bix];
-				int pos = boff + bix + 1;
-				for(int i = pos; i < pos + len; i++) {
-					int index = bi + _data[i];
-					if(index >= rl && index < ru) {
-						c[index - rl] += values[off + colpos];
-						nnz++;
-					}
-				}
-				apos[k] += len + 1;
-			}
-		}
-		target.setNonZeros(nnz);
+	protected void decompressToBlockUnSafeSparseDictionary(MatrixBlock target, int rl, int ru, int offT,
+		SparseBlock values) {
+		throw new NotImplementedException();
 	}
 
-	@Override
-	public void decompressColumnToBlock(double[] c, int colpos, int rl, int ru) {
-		final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
-		int numCols = getNumCols();
-		int numVals = getNumValues();
-		double[] values = getValues();
+	// @Override
+	// public void decompressToBlock(MatrixBlock target, int[] colixTargets) {
+	// 	final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
+	// 	final int numCols = getNumCols();
+	// 	final int numVals = getNumValues();
+	// 	final double[] values = getValues();
+
+	// 	// cache blocking config and position array
+	// 	int[] apos = new int[numVals];
+	// 	int[] cix = new int[numCols];
+
+	// 	// prepare target col indexes
+	// 	for(int j = 0; j < numCols; j++)
+	// 		cix[j] = colixTargets[_colIndexes[j]];
+
+	// 	// cache conscious append via horizontal scans
+	// 	for(int bi = 0; bi < _numRows; bi += blksz) {
+	// 		for(int k = 0, off = 0; k < numVals; k++, off += numCols) {
+	// 			int boff = _ptr[k];
+	// 			int blen = len(k);
+	// 			int bix = apos[k];
+	// 			if(bix >= blen)
+	// 				continue;
+	// 			int len = _data[boff + bix];
+	// 			int pos = boff + bix + 1;
+	// 			for(int i = pos; i < pos + len; i++)
+	// 				for(int j = 0, rix = bi + _data[i]; j < numCols; j++)
+	// 					if(values[off + j] != 0) {
+	// 						double v = target.quickGetValue(rix, _colIndexes[j]);
+	// 						target.setValue(rix, cix[j], values[off + j] + v);
+	// 					}
+	// 			apos[k] += len + 1;
+	// 		}
+	// 	}
+	// }
 
-		// cache blocking config and position array
-		int[] apos = skipScan(numVals, rl);
+	// @Override
+	// public void decompressColumnToBlock(MatrixBlock target, int colpos) {
+	// 	final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
+	// 	int numCols = getNumCols();
+	// 	int numVals = getNumValues();
+	// 	double[] c = target.getDenseBlockValues();
+	// 	double[] values = getValues();
+
+	// 	// cache blocking config and position array
+	// 	int[] apos = new int[numVals];
+
+	// 	// cache conscious append via horizontal scans
+	// 	int nnz = 0;
+	// 	for(int bi = 0; bi < _numRows; bi += blksz) {
+	// 		// Arrays.fill(c, bi, Math.min(bi + blksz, _numRows), 0);
+	// 		for(int k = 0, off = 0; k < numVals; k++, off += numCols) {
+
+	// 			int boff = _ptr[k];
+	// 			int blen = len(k);
+	// 			int bix = apos[k];
+	// 			if(bix >= blen)
+	// 				continue;
+	// 			int len = _data[boff + bix];
+	// 			int pos = boff + bix + 1;
+	// 			for(int i = pos; i < pos + len; i++) {
+	// 				c[bi + _data[i]] += values[off + colpos];
+	// 				nnz++;
+	// 			}
+	// 			apos[k] += len + 1;
+	// 		}
+	// 	}
+	// 	target.setNonZeros(nnz);
+	// }
 
-		// cache conscious append via horizontal scans
-		for(int bi = (rl / blksz) * blksz; bi < ru; bi += blksz) {
-			for(int k = 0, off = 0; k < numVals; k++, off += numCols) {
+	// @Override
+	// public void decompressColumnToBlock(MatrixBlock target, int colpos, int rl, int ru) {
+	// 	final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
+	// 	int numCols = getNumCols();
+	// 	int numVals = getNumValues();
+	// 	double[] c = target.getDenseBlockValues();
+	// 	double[] values = getValues();
+
+	// 	// cache blocking config and position array
+	// 	int[] apos = skipScan(numVals, rl);
+
+	// 	// cache conscious append via horizontal scans
+	// 	int nnz = 0;
+	// 	for(int bi = (rl / blksz) * blksz; bi < ru; bi += blksz) {
+	// 		for(int k = 0, off = 0; k < numVals; k++, off += numCols) {
+
+	// 			int boff = _ptr[k];
+	// 			int blen = len(k);
+	// 			int bix = apos[k];
+	// 			if(bix >= blen)
+	// 				continue;
+	// 			int len = _data[boff + bix];
+	// 			int pos = boff + bix + 1;
+	// 			for(int i = pos; i < pos + len; i++) {
+	// 				int index = bi + _data[i];
+	// 				if(index >= rl && index < ru) {
+	// 					c[index - rl] += values[off + colpos];
+	// 					nnz++;
+	// 				}
+	// 			}
+	// 			apos[k] += len + 1;
+	// 		}
+	// 	}
+	// 	target.setNonZeros(nnz);
+	// }
 
-				int boff = _ptr[k];
-				int blen = len(k);
-				int bix = apos[k];
-				if(bix >= blen)
-					continue;
-				int len = _data[boff + bix];
-				int pos = boff + bix + 1;
-				for(int i = pos; i < pos + len; i++) {
-					int index = bi + _data[i];
-					if(index >= rl && index < ru)
-						c[index - rl] += values[off + colpos];
-				}
-				apos[k] += len + 1;
-			}
-		}
-	}
+	// @Override
+	// public void decompressColumnToBlock(double[] c, int colpos, int rl, int ru) {
+	// 	final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
+	// 	int numCols = getNumCols();
+	// 	int numVals = getNumValues();
+	// 	double[] values = getValues();
+
+	// 	// cache blocking config and position array
+	// 	int[] apos = skipScan(numVals, rl);
+
+	// 	// cache conscious append via horizontal scans
+	// 	for(int bi = (rl / blksz) * blksz; bi < ru; bi += blksz) {
+	// 		for(int k = 0, off = 0; k < numVals; k++, off += numCols) {
+
+	// 			int boff = _ptr[k];
+	// 			int blen = len(k);
+	// 			int bix = apos[k];
+	// 			if(bix >= blen)
+	// 				continue;
+	// 			int len = _data[boff + bix];
+	// 			int pos = boff + bix + 1;
+	// 			for(int i = pos; i < pos + len; i++) {
+	// 				int index = bi + _data[i];
+	// 				if(index >= rl && index < ru)
+	// 					c[index - rl] += values[off + colpos];
+	// 			}
+	// 			apos[k] += len + 1;
+	// 		}
+	// 	}
+	// }
 
 	@Override
 	public int[] getCounts(int[] counts) {
@@ -703,7 +661,7 @@ public class ColGroupOLE extends ColGroupOffset {
 	// }
 
 	@Override
-	protected final void computeRowSums(double[] c, boolean square, int rl, int ru, boolean mean) {
+	protected void computeRowSums(double[] c, boolean square, int rl, int ru) {
 
 		final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
 		final int numVals = getNumValues();
@@ -947,46 +905,51 @@ public class ColGroupOLE extends ColGroupOffset {
 		return sb.toString();
 	}
 
-	@Override
-	public double[] preAggregate(double[] a, int row) {
-		final int numVals = getNumValues();
-		final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
-		final int blksz2 = CompressionSettings.BITMAP_BLOCK_SZ * 2;
-
-		int[] apos = allocIVector(numVals, true);
-		double[] cvals = allocDVector(numVals, true);
-		int off = row * _numRows;
-		for(int ai = 0; ai < _numRows; ai += blksz2) {
-			int aimax = Math.min(ai + blksz2, _numRows);
-
-			// horizontal segment scan, incl pos maintenance
-			for(int k = 0; k < numVals; k++) {
-				int boff = _ptr[k];
-				int blen = len(k);
-				int bix = apos[k];
-				double vsum = 0;
-
-				for(int ii = ai; ii < aimax && bix < blen; ii += blksz) {
-					// prepare length, start, and end pos
-					int len = _data[boff + bix];
-					int pos = boff + bix + 1;
-
-					// iterate over bitmap blocks and compute partial results (a[i]*1)
-					vsum += LinearAlgebraUtils.vectSum(a, _data, ii + off, pos, len);
-					bix += len + 1;
-				}
-
-				apos[k] = bix;
-				cvals[k] += vsum;
-			}
-		}
+	// @Override
+	// public double[] preAggregate(double[] a, int row) {
+	// 	final int numVals = getNumValues();
+	// 	final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
+	// 	final int blksz2 = CompressionSettings.BITMAP_BLOCK_SZ * 2;
+
+	// 	int[] apos = allocIVector(numVals, true);
+	// 	double[] cvals = allocDVector(numVals, true);
+	// 	int off = row * _numRows;
+	// 	for(int ai = 0; ai < _numRows; ai += blksz2) {
+	// 		int aimax = Math.min(ai + blksz2, _numRows);
+
+	// 		// horizontal segment scan, incl pos maintenance
+	// 		for(int k = 0; k < numVals; k++) {
+	// 			int boff = _ptr[k];
+	// 			int blen = len(k);
+	// 			int bix = apos[k];
+	// 			double vsum = 0;
+
+	// 			for(int ii = ai; ii < aimax && bix < blen; ii += blksz) {
+	// 				// prepare length, start, and end pos
+	// 				int len = _data[boff + bix];
+	// 				int pos = boff + bix + 1;
+
+	// 				// iterate over bitmap blocks and compute partial results (a[i]*1)
+	// 				vsum += LinearAlgebraUtils.vectSum(a, _data, ii + off, pos, len);
+	// 				bix += len + 1;
+	// 			}
+
+	// 			apos[k] = bix;
+	// 			cvals[k] += vsum;
+	// 		}
+	// 	}
+
+	// 	return cvals;
+	// }
 
-		return cvals;
-	}
+	// @Override
+	// public double[] preAggregateSparse(SparseBlock sb, int row) {
+	// 	return null;
+	// }
 
 	@Override
-	public double[] preAggregateSparse(SparseBlock sb, int row) {
-		return null;
+	protected void preAggregate(MatrixBlock m, MatrixBlock preAgg, int rl, int ru){
+		throw new NotImplementedException();
 	}
 
 	@Override
@@ -1052,138 +1015,138 @@ public class ColGroupOLE extends ColGroupOffset {
 		return encodedBlocks;
 	}
 
-	@Override
-	public IPreAggregate preAggregateDDC(ColGroupDDC lhs) {
-		final int NVR = this.getNumValues();
-		final int NVL = lhs.getNumValues();
-		final int retSize = NVR * NVL;
-		final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
-		IPreAggregate ag = PreAggregateFactory.ag(retSize);
-
-		for(int kr = 0; kr < NVR; kr++) {
-			final int bOffR = this._ptr[kr];
-			final int bLenR = this.len(kr);
-			final int krOff = kr * NVL;
-			for(int bixR = 0, offR = 0, sLenR = 0; bixR < bLenR; bixR += sLenR + 1, offR += blksz) {
-				sLenR = this._data[bOffR + bixR];
-				for(int j = 1; j <= sLenR; j++) {
-					int idx = lhs._data.getIndex(offR + this._data[bOffR + bixR + j]);
-					ag.increment(idx + krOff);
-				}
-			}
-		}
-
-		return ag;
-	}
-
-	@Override
-	public IPreAggregate preAggregateSDC(ColGroupSDC lhs) {
-		final int NVR = this.getNumValues();
-		final int NVL = lhs.getNumValues();
-		final int retSize = NVR * NVL;
-		final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
-		IPreAggregate ag = PreAggregateFactory.ag(retSize);
-
-		final int defL = NVL - 1;
-
-		for(int kr = 0; kr < NVR; kr++) {
-			AIterator lIt = lhs._indexes.getIterator();
-			final int bOffR = this._ptr[kr];
-			final int bLenR = this.len(kr);
-			final int krOff = kr * NVL;
-			for(int bixR = 0, offR = 0, sLenR = 0; bixR < bLenR; bixR += sLenR + 1, offR += blksz) {
-				sLenR = this._data[bOffR + bixR];
-				for(int j = 1; j <= sLenR; j++) {
-					final int row = offR + this._data[bOffR + bixR + j];
-					lIt.skipTo(row);
-					if(lIt.value() == row)
-						ag.increment(lhs.getIndex(lIt.getDataIndexAndIncrement()) + krOff);
-					else
-						ag.increment(defL + krOff);
-				}
-			}
-		}
-
-		return ag;
-	}
-
-	@Override
-	public IPreAggregate preAggregateSDCSingle(ColGroupSDCSingle lhs) {
-		throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
-			+ this.getClass().getSimpleName());
-	}
+	// @Override
+	// public IPreAggregate preAggregateDDC(ColGroupDDC lhs) {
+	// 	final int NVR = this.getNumValues();
+	// 	final int NVL = lhs.getNumValues();
+	// 	final int retSize = NVR * NVL;
+	// 	final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
+	// 	IPreAggregate ag = PreAggregateFactory.ag(retSize);
+
+	// 	for(int kr = 0; kr < NVR; kr++) {
+	// 		final int bOffR = this._ptr[kr];
+	// 		final int bLenR = this.len(kr);
+	// 		final int krOff = kr * NVL;
+	// 		for(int bixR = 0, offR = 0, sLenR = 0; bixR < bLenR; bixR += sLenR + 1, offR += blksz) {
+	// 			sLenR = this._data[bOffR + bixR];
+	// 			for(int j = 1; j <= sLenR; j++) {
+	// 				int idx = lhs._data.getIndex(offR + this._data[bOffR + bixR + j]);
+	// 				ag.increment(idx + krOff);
+	// 			}
+	// 		}
+	// 	}
+
+	// 	return ag;
+	// }
 
-	@Override
-	public IPreAggregate preAggregateSDCZeros(ColGroupSDCZeros lhs) {
-		final int NVR = this.getNumValues();
-		final int NVL = lhs.getNumValues();
-		final int retSize = NVR * NVL;
-		final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
-		final IPreAggregate ag = PreAggregateFactory.ag(retSize);
-
-		for(int kr = 0; kr < NVR; kr++) {
-			final AIterator lIt = lhs._indexes.getIterator();
-			final int bOffR = this._ptr[kr];
-			final int bLenR = this.len(kr);
-			final int krOff = kr * NVL;
-			for(int bixR = 0, offR = 0, sLenR = 0; lIt.hasNext() && bixR < bLenR; bixR += sLenR + 1, offR += blksz) {
-				sLenR = this._data[bOffR + bixR];
-				for(int j = 1; lIt.hasNext() && j <= sLenR; j++) {
-					final int row = offR + this._data[bOffR + bixR + j];
-					lIt.skipTo(row);
-					if(lIt.value() == row)
-						ag.increment(lhs.getIndex(lIt.getDataIndexAndIncrement()) + krOff);
-				}
-			}
-		}
+	// @Override
+	// public IPreAggregate preAggregateSDC(ColGroupSDC lhs) {
+	// 	final int NVR = this.getNumValues();
+	// 	final int NVL = lhs.getNumValues();
+	// 	final int retSize = NVR * NVL;
+	// 	final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
+	// 	IPreAggregate ag = PreAggregateFactory.ag(retSize);
+
+	// 	final int defL = NVL - 1;
+
+	// 	for(int kr = 0; kr < NVR; kr++) {
+	// 		AIterator lIt = lhs._indexes.getIterator();
+	// 		final int bOffR = this._ptr[kr];
+	// 		final int bLenR = this.len(kr);
+	// 		final int krOff = kr * NVL;
+	// 		for(int bixR = 0, offR = 0, sLenR = 0; bixR < bLenR; bixR += sLenR + 1, offR += blksz) {
+	// 			sLenR = this._data[bOffR + bixR];
+	// 			for(int j = 1; j <= sLenR; j++) {
+	// 				final int row = offR + this._data[bOffR + bixR + j];
+	// 				lIt.skipTo(row);
+	// 				if(lIt.value() == row)
+	// 					ag.increment(lhs.getIndex(lIt.getDataIndexAndIncrement()) + krOff);
+	// 				else
+	// 					ag.increment(defL + krOff);
+	// 			}
+	// 		}
+	// 	}
+
+	// 	return ag;
+	// }
 
-		return ag;
-	}
+	// @Override
+	// public IPreAggregate preAggregateSDCSingle(ColGroupSDCSingle lhs) {
+	// 	throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
+	// 		+ this.getClass().getSimpleName());
+	// }
 
-	@Override
-	public IPreAggregate preAggregateSDCSingleZeros(ColGroupSDCSingleZeros lhs) {
-		throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
-			+ this.getClass().getSimpleName());
-	}
+	// @Override
+	// public IPreAggregate preAggregateSDCZeros(ColGroupSDCZeros lhs) {
+	// 	final int NVR = this.getNumValues();
+	// 	final int NVL = lhs.getNumValues();
+	// 	final int retSize = NVR * NVL;
+	// 	final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
+	// 	final IPreAggregate ag = PreAggregateFactory.ag(retSize);
+
+	// 	for(int kr = 0; kr < NVR; kr++) {
+	// 		final AIterator lIt = lhs._indexes.getIterator();
+	// 		final int bOffR = this._ptr[kr];
+	// 		final int bLenR = this.len(kr);
+	// 		final int krOff = kr * NVL;
+	// 		for(int bixR = 0, offR = 0, sLenR = 0; lIt.hasNext() && bixR < bLenR; bixR += sLenR + 1, offR += blksz) {
+	// 			sLenR = this._data[bOffR + bixR];
+	// 			for(int j = 1; lIt.hasNext() && j <= sLenR; j++) {
+	// 				final int row = offR + this._data[bOffR + bixR + j];
+	// 				lIt.skipTo(row);
+	// 				if(lIt.value() == row)
+	// 					ag.increment(lhs.getIndex(lIt.getDataIndexAndIncrement()) + krOff);
+	// 			}
+	// 		}
+	// 	}
+
+	// 	return ag;
+	// }
 
-	@Override
-	public IPreAggregate preAggregateOLE(ColGroupOLE lhs) {
-		final int NVR = this.getNumValues();
-		final int NVL = lhs.getNumValues();
-		final int retSize = NVR * NVL;
-		final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
-		IPreAggregate ag = PreAggregateFactory.ag(retSize);
-
-		for(int kl = 0; kl < NVL; kl++) {
-			final int bOffL = lhs._ptr[kl];
-			final int bLenL = lhs.len(kl);
-			for(int bixL = 0, offL = 0, sLenL = 0; bixL < bLenL; bixL += sLenL + 1, offL += blksz) {
-				sLenL = lhs._data[bOffL + bixL];
-				for(int i = 1; i <= sLenL; i++) {
-					final int col = offL + lhs._data[bOffL + bixL + i];
-					for(int kr = 0; kr < NVR; kr++) {
-						final int bOffR = this._ptr[kr];
-						final int bLenR = this.len(kr);
-						final int krOff = kr * NVL;
-						for(int bixR = 0, offR = 0, sLenR = 0; bixR < bLenR; bixR += sLenR + 1, offR += blksz) {
-							sLenR = this._data[bOffR + bixR];
-							for(int j = 1; j <= sLenR; j++)
-								if(col == offR + this._data[bOffR + bixR + j])
-									ag.increment(kl + krOff);
-						}
-					}
-				}
-			}
-		}
+	// @Override
+	// public IPreAggregate preAggregateSDCSingleZeros(ColGroupSDCSingleZeros lhs) {
+	// 	throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
+	// 		+ this.getClass().getSimpleName());
+	// }
 
-		return ag;
-	}
+	// @Override
+	// public IPreAggregate preAggregateOLE(ColGroupOLE lhs) {
+	// 	final int NVR = this.getNumValues();
+	// 	final int NVL = lhs.getNumValues();
+	// 	final int retSize = NVR * NVL;
+	// 	final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
+	// 	IPreAggregate ag = PreAggregateFactory.ag(retSize);
+
+	// 	for(int kl = 0; kl < NVL; kl++) {
+	// 		final int bOffL = lhs._ptr[kl];
+	// 		final int bLenL = lhs.len(kl);
+	// 		for(int bixL = 0, offL = 0, sLenL = 0; bixL < bLenL; bixL += sLenL + 1, offL += blksz) {
+	// 			sLenL = lhs._data[bOffL + bixL];
+	// 			for(int i = 1; i <= sLenL; i++) {
+	// 				final int col = offL + lhs._data[bOffL + bixL + i];
+	// 				for(int kr = 0; kr < NVR; kr++) {
+	// 					final int bOffR = this._ptr[kr];
+	// 					final int bLenR = this.len(kr);
+	// 					final int krOff = kr * NVL;
+	// 					for(int bixR = 0, offR = 0, sLenR = 0; bixR < bLenR; bixR += sLenR + 1, offR += blksz) {
+	// 						sLenR = this._data[bOffR + bixR];
+	// 						for(int j = 1; j <= sLenR; j++)
+	// 							if(col == offR + this._data[bOffR + bixR + j])
+	// 								ag.increment(kl + krOff);
+	// 					}
+	// 				}
+	// 			}
+	// 		}
+	// 	}
+
+	// 	return ag;
+	// }
 
-	@Override
-	public IPreAggregate preAggregateRLE(ColGroupRLE lhs) {
-		throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
-			+ this.getClass().getSimpleName());
-	}
+	// @Override
+	// public IPreAggregate preAggregateRLE(ColGroupRLE lhs) {
+	// 	throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
+	// 		+ this.getClass().getSimpleName());
+	// }
 
 	@Override
 	public Dictionary preAggregateThatDDCStructure(ColGroupDDC that, Dictionary ret) {
@@ -1209,9 +1172,4 @@ public class ColGroupOLE extends ColGroupOffset {
 	public Dictionary preAggregateThatSDCSingleStructure(ColGroupSDCSingle that, Dictionary ret, boolean preModified) {
 		throw new NotImplementedException();
 	}
-
-	@Override
-	public MatrixBlock preAggregate(MatrixBlock m, int rl, int ru) {
-		throw new NotImplementedException();
-	}
 }
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupRLE.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupRLE.java
index 1c81fca..b7dd658 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupRLE.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupRLE.java
@@ -27,9 +27,6 @@ import org.apache.commons.lang.NotImplementedException;
 import org.apache.sysds.runtime.compress.CompressionSettings;
 import org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary;
 import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary;
-import org.apache.sysds.runtime.compress.colgroup.pre.IPreAggregate;
-import org.apache.sysds.runtime.compress.colgroup.pre.PreAggregateFactory;
-import org.apache.sysds.runtime.compress.utils.LinearAlgebraUtils;
 import org.apache.sysds.runtime.data.SparseBlock;
 import org.apache.sysds.runtime.functionobjects.Builtin;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
@@ -68,55 +65,11 @@ public class ColGroupRLE extends ColGroupOffset {
 	}
 
 	@Override
-	public void decompressToBlockSafe(MatrixBlock target, int rl, int ru, int offT) {
+	protected void decompressToBlockUnSafeDenseDictionary(MatrixBlock target, int rl, int ru, int offT,
+		double[] values) {
 		final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
 		final int numCols = getNumCols();
 		final int numVals = getNumValues();
-		final double[] values = getValues();
-
-		// position and start offset arrays
-		int[] astart = new int[numVals];
-		int[] apos = skipScan(numVals, rl, astart);
-
-		double[] c = target.getDenseBlockValues();
-		// cache conscious append via horizontal scans
-		for(int bi = rl; bi < ru; bi += blksz) {
-			int bimax = Math.min(bi + blksz, ru);
-			for(int k = 0, off = 0; k < numVals; k++, off += numCols) {
-				int boff = _ptr[k];
-				int blen = len(k);
-				int bix = apos[k];
-				int start = astart[k];
-				for(; bix < blen & start < bimax; bix += 2) {
-					start += _data[boff + bix];
-					int len = _data[boff + bix + 1];
-					for(int i = Math.max(rl, start) - (rl - offT); i < Math.min(start + len, ru) - (rl - offT); i++) {
-
-						int rc = i * target.getNumColumns();
-						for(int j = 0; j < numCols; j++) {
-							double v = c[rc + _colIndexes[j]];
-							double nv = c[rc + _colIndexes[j]] + values[off + j];
-							if(v == 0.0 && nv != 0.0) {
-								target.setNonZeros(target.getNonZeros() + 1);
-							}
-							c[rc + _colIndexes[j]] = nv;
-
-						}
-					}
-					start += len;
-				}
-				apos[k] = bix;
-				astart[k] = start;
-			}
-		}
-	}
-
-	@Override
-	public void decompressToBlockUnSafe(MatrixBlock target, int rl, int ru, int offT) {
-		final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
-		final int numCols = getNumCols();
-		final int numVals = getNumValues();
-		final double[] values = getValues();
 
 		// position and start offset arrays
 		int[] astart = new int[numVals];
@@ -150,172 +103,178 @@ public class ColGroupRLE extends ColGroupOffset {
 	}
 
 	@Override
-	public void decompressToBlock(MatrixBlock target, int[] colixTargets) {
-		// if(getNumValues() > 1) {
-		final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
-		final int numCols = getNumCols();
-		final int numVals = getNumValues();
-		final double[] values = getValues();
-
-		// position and start offset arrays
-		int[] apos = new int[numVals];
-		int[] astart = new int[numVals];
-		int[] cix = new int[numCols];
-
-		// prepare target col indexes
-		for(int j = 0; j < numCols; j++)
-			cix[j] = colixTargets[_colIndexes[j]];
-
-		// cache conscious append via horizontal scans
-		for(int bi = 0; bi < _numRows; bi += blksz) {
-			int bimax = Math.min(bi + blksz, _numRows);
-			for(int k = 0, off = 0; k < numVals; k++, off += numCols) {
-				int boff = _ptr[k];
-				int blen = len(k);
-				int bix = apos[k];
-				if(bix >= blen)
-					continue;
-				int start = astart[k];
-				for(; bix < blen & start < bimax; bix += 2) {
-					start += _data[boff + bix];
-					int len = _data[boff + bix + 1];
-					for(int i = start; i < start + len; i++)
-						for(int j = 0; j < numCols; j++)
-							if(values[off + j] != 0) {
-								double v = target.quickGetValue(i, _colIndexes[j]);
-								target.setValue(i, _colIndexes[j], values[off + j] + v);
-							}
-
-					start += len;
-				}
-				apos[k] = bix;
-				astart[k] = start;
-			}
-		}
-		// }
-		// else {
-		// // call generic decompression with decoder
-		// super.decompressToBlock(target, colixTargets);
-		// }
-	}
-
-	@Override
-	public void decompressColumnToBlock(MatrixBlock target, int colpos) {
-		final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
-		final int numCols = getNumCols();
-		final int numVals = getNumValues();
-		double[] c = target.getDenseBlockValues();
-		final double[] values = getValues();
-
-		// position and start offset arrays
-		int[] astart = new int[numVals];
-		int[] apos = allocIVector(numVals, true);
-
-		// cache conscious append via horizontal scans
-		int nnz = 0;
-		for(int bi = 0; bi < _numRows; bi += blksz) {
-			int bimax = Math.min(bi + blksz, _numRows);
-			// Arrays.fill(c, bi, bimax, 0);
-			for(int k = 0, off = 0; k < numVals; k++, off += numCols) {
-				int boff = _ptr[k];
-				int blen = len(k);
-				int bix = apos[k];
-				if(bix >= blen)
-					continue;
-				int start = astart[k];
-				for(; bix < blen & start < bimax; bix += 2) {
-					start += _data[boff + bix];
-					int len = _data[boff + bix + 1];
-					for(int i = start; i < start + len; i++)
-						c[i] += values[off + colpos];
-					nnz += len;
-					start += len;
-				}
-				apos[k] = bix;
-				astart[k] = start;
-			}
-		}
-		target.setNonZeros(nnz);
-	}
-
-	@Override
-	public void decompressColumnToBlock(MatrixBlock target, int colpos, int rl, int ru) {
-		final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
-		final int numCols = getNumCols();
-		final int numVals = getNumValues();
-		double[] c = target.getDenseBlockValues();
-		final double[] values = getValues();
-
-		// position and start offset arrays
-		int[] astart = new int[numVals];
-		int[] apos = allocIVector(numVals, true);
-
-		// cache conscious append via horizontal scans
-		int nnz = 0;
-		for(int bi = (rl / blksz) * blksz; bi < ru; bi += blksz) {
-			int bimax = Math.min(bi + blksz, ru);
-			for(int k = 0, off = 0; k < numVals; k++, off += numCols) {
-				int boff = _ptr[k];
-				int blen = len(k);
-				int bix = apos[k];
-				if(bix >= blen)
-					continue;
-				int start = astart[k];
-				for(; bix < blen & start < bimax; bix += 2) {
-					start += _data[boff + bix];
-					int len = _data[boff + bix + 1];
-					if(start + len >= rl) {
-						int offsetStart = Math.max(start, rl);
-						for(int i = offsetStart; i < Math.min(start + len, bimax); i++)
-							c[i - rl] += values[off + colpos];
-						nnz += len - (offsetStart - start);
-					}
-					start += len;
-				}
-				apos[k] = bix;
-				astart[k] = start;
-			}
-		}
-		target.setNonZeros(nnz);
+	protected void decompressToBlockUnSafeSparseDictionary(MatrixBlock target, int rl, int ru, int offT,
+		SparseBlock values) {
+		throw new NotImplementedException();
 	}
 
-	@Override
-	public void decompressColumnToBlock(double[] c, int colpos, int rl, int ru) {
-		final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
-		final int numCols = getNumCols();
-		final int numVals = getNumValues();
-		final double[] values = getValues();
+	// @Override
+	// public void decompressToBlock(MatrixBlock target, int[] colixTargets) {
+	// 	// if(getNumValues() > 1) {
+	// 	final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
+	// 	final int numCols = getNumCols();
+	// 	final int numVals = getNumValues();
+	// 	final double[] values = getValues();
+
+	// 	// position and start offset arrays
+	// 	int[] apos = new int[numVals];
+	// 	int[] astart = new int[numVals];
+	// 	int[] cix = new int[numCols];
+
+	// 	// prepare target col indexes
+	// 	for(int j = 0; j < numCols; j++)
+	// 		cix[j] = colixTargets[_colIndexes[j]];
+
+	// 	// cache conscious append via horizontal scans
+	// 	for(int bi = 0; bi < _numRows; bi += blksz) {
+	// 		int bimax = Math.min(bi + blksz, _numRows);
+	// 		for(int k = 0, off = 0; k < numVals; k++, off += numCols) {
+	// 			int boff = _ptr[k];
+	// 			int blen = len(k);
+	// 			int bix = apos[k];
+	// 			if(bix >= blen)
+	// 				continue;
+	// 			int start = astart[k];
+	// 			for(; bix < blen & start < bimax; bix += 2) {
+	// 				start += _data[boff + bix];
+	// 				int len = _data[boff + bix + 1];
+	// 				for(int i = start; i < start + len; i++)
+	// 					for(int j = 0; j < numCols; j++)
+	// 						if(values[off + j] != 0) {
+	// 							double v = target.quickGetValue(i, _colIndexes[j]);
+	// 							target.setValue(i, _colIndexes[j], values[off + j] + v);
+	// 						}
+
+	// 				start += len;
+	// 			}
+	// 			apos[k] = bix;
+	// 			astart[k] = start;
+	// 		}
+	// 	}
+	// 	// }
+	// 	// else {
+	// 	// // call generic decompression with decoder
+	// 	// super.decompressToBlock(target, colixTargets);
+	// 	// }
+	// }
 
-		// position and start offset arrays
-		int[] astart = new int[numVals];
-		int[] apos = allocIVector(numVals, true);
+	// @Override
+	// public void decompressColumnToBlock(MatrixBlock target, int colpos) {
+	// 	final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
+	// 	final int numCols = getNumCols();
+	// 	final int numVals = getNumValues();
+	// 	double[] c = target.getDenseBlockValues();
+	// 	final double[] values = getValues();
+
+	// 	// position and start offset arrays
+	// 	int[] astart = new int[numVals];
+	// 	int[] apos = allocIVector(numVals, true);
+
+	// 	// cache conscious append via horizontal scans
+	// 	int nnz = 0;
+	// 	for(int bi = 0; bi < _numRows; bi += blksz) {
+	// 		int bimax = Math.min(bi + blksz, _numRows);
+	// 		// Arrays.fill(c, bi, bimax, 0);
+	// 		for(int k = 0, off = 0; k < numVals; k++, off += numCols) {
+	// 			int boff = _ptr[k];
+	// 			int blen = len(k);
+	// 			int bix = apos[k];
+	// 			if(bix >= blen)
+	// 				continue;
+	// 			int start = astart[k];
+	// 			for(; bix < blen & start < bimax; bix += 2) {
+	// 				start += _data[boff + bix];
+	// 				int len = _data[boff + bix + 1];
+	// 				for(int i = start; i < start + len; i++)
+	// 					c[i] += values[off + colpos];
+	// 				nnz += len;
+	// 				start += len;
+	// 			}
+	// 			apos[k] = bix;
+	// 			astart[k] = start;
+	// 		}
+	// 	}
+	// 	target.setNonZeros(nnz);
+	// }
 
-		// cache conscious append via horizontal scans
+	// @Override
+	// public void decompressColumnToBlock(MatrixBlock target, int colpos, int rl, int ru) {
+	// 	final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
+	// 	final int numCols = getNumCols();
+	// 	final int numVals = getNumValues();
+	// 	double[] c = target.getDenseBlockValues();
+	// 	final double[] values = getValues();
+
+	// 	// position and start offset arrays
+	// 	int[] astart = new int[numVals];
+	// 	int[] apos = allocIVector(numVals, true);
+
+	// 	// cache conscious append via horizontal scans
+	// 	int nnz = 0;
+	// 	for(int bi = (rl / blksz) * blksz; bi < ru; bi += blksz) {
+	// 		int bimax = Math.min(bi + blksz, ru);
+	// 		for(int k = 0, off = 0; k < numVals; k++, off += numCols) {
+	// 			int boff = _ptr[k];
+	// 			int blen = len(k);
+	// 			int bix = apos[k];
+	// 			if(bix >= blen)
+	// 				continue;
+	// 			int start = astart[k];
+	// 			for(; bix < blen & start < bimax; bix += 2) {
+	// 				start += _data[boff + bix];
+	// 				int len = _data[boff + bix + 1];
+	// 				if(start + len >= rl) {
+	// 					int offsetStart = Math.max(start, rl);
+	// 					for(int i = offsetStart; i < Math.min(start + len, bimax); i++)
+	// 						c[i - rl] += values[off + colpos];
+	// 					nnz += len - (offsetStart - start);
+	// 				}
+	// 				start += len;
+	// 			}
+	// 			apos[k] = bix;
+	// 			astart[k] = start;
+	// 		}
+	// 	}
+	// 	target.setNonZeros(nnz);
+	// }
 
-		for(int bi = (rl / blksz) * blksz; bi < ru; bi += blksz) {
-			int bimax = Math.min(bi + blksz, ru);
-			for(int k = 0, off = 0; k < numVals; k++, off += numCols) {
-				int boff = _ptr[k];
-				int blen = len(k);
-				int bix = apos[k];
-				if(bix >= blen)
-					continue;
-				int start = astart[k];
-				for(; bix < blen & start < bimax; bix += 2) {
-					start += _data[boff + bix];
-					int len = _data[boff + bix + 1];
-					if(start + len >= rl) {
-						int offsetStart = Math.max(start, rl);
-						for(int i = offsetStart; i < Math.min(start + len, bimax); i++)
-							c[i - rl] += values[off + colpos];
-					}
-					start += len;
-				}
-				apos[k] = bix;
-				astart[k] = start;
-			}
-		}
-	}
+	// @Override
+	// public void decompressColumnToBlock(double[] c, int colpos, int rl, int ru) {
+	// 	final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
+	// 	final int numCols = getNumCols();
+	// 	final int numVals = getNumValues();
+	// 	final double[] values = getValues();
+
+	// 	// position and start offset arrays
+	// 	int[] astart = new int[numVals];
+	// 	int[] apos = allocIVector(numVals, true);
+
+	// 	// cache conscious append via horizontal scans
+
+	// 	for(int bi = (rl / blksz) * blksz; bi < ru; bi += blksz) {
+	// 		int bimax = Math.min(bi + blksz, ru);
+	// 		for(int k = 0, off = 0; k < numVals; k++, off += numCols) {
+	// 			int boff = _ptr[k];
+	// 			int blen = len(k);
+	// 			int bix = apos[k];
+	// 			if(bix >= blen)
+	// 				continue;
+	// 			int start = astart[k];
+	// 			for(; bix < blen & start < bimax; bix += 2) {
+	// 				start += _data[boff + bix];
+	// 				int len = _data[boff + bix + 1];
+	// 				if(start + len >= rl) {
+	// 					int offsetStart = Math.max(start, rl);
+	// 					for(int i = offsetStart; i < Math.min(start + len, bimax); i++)
+	// 						c[i - rl] += values[off + colpos];
+	// 				}
+	// 				start += len;
+	// 			}
+	// 			apos[k] = bix;
+	// 			astart[k] = start;
+	// 		}
+	// 	}
+	// }
 
 	@Override
 	public int[] getCounts(int[] counts) {
@@ -658,7 +617,7 @@ public class ColGroupRLE extends ColGroupOffset {
 	}
 
 	@Override
-	protected final void computeRowSums(double[] c, boolean square, int rl, int ru, boolean mean) {
+	protected void computeRowSums(double[] c, boolean square, int rl, int ru) {
 
 		final int numVals = getNumValues();
 
@@ -903,46 +862,51 @@ public class ColGroupRLE extends ColGroupOffset {
 		return new Pair<>(apos, astart);
 	}
 
-	@Override
-	public double[] preAggregate(double[] a, int row) {
-		final int numVals = getNumValues();
-		final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
-		// current pos per OLs / output values
-		int[] astart = new int[numVals];
-		int[] apos = allocIVector(numVals, true);
-		double[] cvals = allocDVector(numVals, true);
-		int off = row * _numRows;
-
-		// step 2: cache conscious matrix-vector via horizontal scans
-		for(int ai = 0; ai < _numRows; ai += blksz) {
-			int aimax = Math.min(ai + blksz, _numRows);
-
-			// horizontal scan, incl pos maintenance
-			for(int k = 0; k < numVals; k++) {
-				int boff = _ptr[k];
-				int blen = len(k);
-				int bix = apos[k];
-				int start = astart[k];
-
-				// compute partial results, not aligned
-				while(bix < blen & start < aimax) {
-					start += _data[boff + bix];
-					int len = _data[boff + bix + 1];
-					cvals[k] += LinearAlgebraUtils.vectSum(a, start + off, len);
-					start += len;
-					bix += 2;
-				}
-
-				apos[k] = bix;
-				astart[k] = start;
-			}
-		}
-		return cvals;
-	}
+	// @Override
+	// public double[] preAggregate(double[] a, int row) {
+	// 	final int numVals = getNumValues();
+	// 	final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
+	// 	// current pos per OLs / output values
+	// 	int[] astart = new int[numVals];
+	// 	int[] apos = allocIVector(numVals, true);
+	// 	double[] cvals = allocDVector(numVals, true);
+	// 	int off = row * _numRows;
+
+	// 	// step 2: cache conscious matrix-vector via horizontal scans
+	// 	for(int ai = 0; ai < _numRows; ai += blksz) {
+	// 		int aimax = Math.min(ai + blksz, _numRows);
+
+	// 		// horizontal scan, incl pos maintenance
+	// 		for(int k = 0; k < numVals; k++) {
+	// 			int boff = _ptr[k];
+	// 			int blen = len(k);
+	// 			int bix = apos[k];
+	// 			int start = astart[k];
+
+	// 			// compute partial results, not aligned
+	// 			while(bix < blen & start < aimax) {
+	// 				start += _data[boff + bix];
+	// 				int len = _data[boff + bix + 1];
+	// 				cvals[k] += LinearAlgebraUtils.vectSum(a, start + off, len);
+	// 				start += len;
+	// 				bix += 2;
+	// 			}
+
+	// 			apos[k] = bix;
+	// 			astart[k] = start;
+	// 		}
+	// 	}
+	// 	return cvals;
+	// }
 
+	// @Override
+	// public double[] preAggregateSparse(SparseBlock sb, int row) {
+	// 	return null;
+	// }
+	
 	@Override
-	public double[] preAggregateSparse(SparseBlock sb, int row) {
-		return null;
+	public void preAggregate(MatrixBlock m, MatrixBlock preAgg, int rl, int ru){
+		throw new NotImplementedException();
 	}
 
 	@Override
@@ -1053,93 +1017,93 @@ public class ColGroupRLE extends ColGroupOffset {
 		return ret;
 	}
 
-	@Override
-	public IPreAggregate preAggregateDDC(ColGroupDDC lhs) {
-		final int NVR = this.getNumValues();
-		final int NVL = lhs.getNumValues();
-		final int retSize = NVR * NVL;
-		IPreAggregate ag = PreAggregateFactory.ag(retSize);
-
-		for(int kr = 0; kr < NVR; kr++) {
-			final int boffL = _ptr[kr];
-			final int blenL = len(kr);
-			final int offKr = kr * NVL;
-			for(int bixL = 0, startL = 0, lenL = 0; bixL < blenL && startL < _numRows; startL += lenL, bixL += 2) {
-				startL += _data[boffL + bixL];
-				lenL = _data[boffL + bixL + 1];
-				final int endL = startL + lenL;
-				for(int i = startL; i < endL; i++)
-					ag.increment(lhs._data.getIndex(i) + offKr);
-
-			}
-		}
-		return ag;
-	}
+	// @Override
+	// public IPreAggregate preAggregateDDC(ColGroupDDC lhs) {
+	// 	final int NVR = this.getNumValues();
+	// 	final int NVL = lhs.getNumValues();
+	// 	final int retSize = NVR * NVL;
+	// 	IPreAggregate ag = PreAggregateFactory.ag(retSize);
+
+	// 	for(int kr = 0; kr < NVR; kr++) {
+	// 		final int boffL = _ptr[kr];
+	// 		final int blenL = len(kr);
+	// 		final int offKr = kr * NVL;
+	// 		for(int bixL = 0, startL = 0, lenL = 0; bixL < blenL && startL < _numRows; startL += lenL, bixL += 2) {
+	// 			startL += _data[boffL + bixL];
+	// 			lenL = _data[boffL + bixL + 1];
+	// 			final int endL = startL + lenL;
+	// 			for(int i = startL; i < endL; i++)
+	// 				ag.increment(lhs._data.getIndex(i) + offKr);
+
+	// 		}
+	// 	}
+	// 	return ag;
+	// }
 
-	@Override
-	public IPreAggregate preAggregateSDC(ColGroupSDC lhs) {
-		throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
-			+ this.getClass().getSimpleName());
-	}
+	// @Override
+	// public IPreAggregate preAggregateSDC(ColGroupSDC lhs) {
+	// 	throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
+	// 		+ this.getClass().getSimpleName());
+	// }
 
-	@Override
-	public IPreAggregate preAggregateSDCSingle(ColGroupSDCSingle lhs) {
-		throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
-			+ this.getClass().getSimpleName());
-	}
+	// @Override
+	// public IPreAggregate preAggregateSDCSingle(ColGroupSDCSingle lhs) {
+	// 	throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
+	// 		+ this.getClass().getSimpleName());
+	// }
 
-	@Override
-	public IPreAggregate preAggregateSDCZeros(ColGroupSDCZeros lhs) {
-		throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
-			+ this.getClass().getSimpleName());
-	}
+	// @Override
+	// public IPreAggregate preAggregateSDCZeros(ColGroupSDCZeros lhs) {
+	// 	throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
+	// 		+ this.getClass().getSimpleName());
+	// }
 
-	@Override
-	public IPreAggregate preAggregateSDCSingleZeros(ColGroupSDCSingleZeros lhs) {
-		throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
-			+ this.getClass().getSimpleName());
-	}
+	// @Override
+	// public IPreAggregate preAggregateSDCSingleZeros(ColGroupSDCSingleZeros lhs) {
+	// 	throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
+	// 		+ this.getClass().getSimpleName());
+	// }
 
-	@Override
-	public IPreAggregate preAggregateOLE(ColGroupOLE lhs) {
-		throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
-			+ this.getClass().getSimpleName());
-	}
+	// @Override
+	// public IPreAggregate preAggregateOLE(ColGroupOLE lhs) {
+	// 	throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
+	// 		+ this.getClass().getSimpleName());
+	// }
 
-	@Override
-	public IPreAggregate preAggregateRLE(ColGroupRLE lhs) {
-		final int NVR = this.getNumValues();
-		final int NVL = lhs.getNumValues();
-		final int retSize = NVR * NVL;
-		IPreAggregate ag = PreAggregateFactory.ag(retSize);
-
-		for(int kl = 0; kl < NVL; kl++) {
-			final int boffL = lhs._ptr[kl];
-			final int blenL = lhs.len(kl);
-			for(int bixL = 0, startL = 0, lenL = 0; bixL < blenL && startL < _numRows; startL += lenL, bixL += 2) {
-				startL += lhs._data[boffL + bixL];
-				lenL = lhs._data[boffL + bixL + 1];
-				final int endL = startL + lenL;
-				for(int kr = 0; kr < NVR; kr++) {
-					final int boffR = _ptr[kr];
-					final int blenR = len(kr);
-					final int krOff = kr * NVL;
-					for(int bixR = 0, startR = 0, lenR = 0; bixR < blenR & startR < endL; startR += lenR, bixR += 2) {
-						startR += _data[boffR + bixR];
-						lenR = _data[boffR + bixR + 1];
-						final int endR = startR + lenR;
-						if(startL < endR && startR < endL) {
-							final int endOverlap = Math.min(endR, endL);
-							final int startOverlap = Math.max(startL, startR);
-							final int lenOverlap = endOverlap - startOverlap;
-							ag.increment(kl + krOff, lenOverlap);
-						}
-					}
-				}
-			}
-		}
-		return ag;
-	}
+	// @Override
+	// public IPreAggregate preAggregateRLE(ColGroupRLE lhs) {
+	// 	final int NVR = this.getNumValues();
+	// 	final int NVL = lhs.getNumValues();
+	// 	final int retSize = NVR * NVL;
+	// 	IPreAggregate ag = PreAggregateFactory.ag(retSize);
+
+	// 	for(int kl = 0; kl < NVL; kl++) {
+	// 		final int boffL = lhs._ptr[kl];
+	// 		final int blenL = lhs.len(kl);
+	// 		for(int bixL = 0, startL = 0, lenL = 0; bixL < blenL && startL < _numRows; startL += lenL, bixL += 2) {
+	// 			startL += lhs._data[boffL + bixL];
+	// 			lenL = lhs._data[boffL + bixL + 1];
+	// 			final int endL = startL + lenL;
+	// 			for(int kr = 0; kr < NVR; kr++) {
+	// 				final int boffR = _ptr[kr];
+	// 				final int blenR = len(kr);
+	// 				final int krOff = kr * NVL;
+	// 				for(int bixR = 0, startR = 0, lenR = 0; bixR < blenR & startR < endL; startR += lenR, bixR += 2) {
+	// 					startR += _data[boffR + bixR];
+	// 					lenR = _data[boffR + bixR + 1];
+	// 					final int endR = startR + lenR;
+	// 					if(startL < endR && startR < endL) {
+	// 						final int endOverlap = Math.min(endR, endL);
+	// 						final int startOverlap = Math.max(startL, startR);
+	// 						final int lenOverlap = endOverlap - startOverlap;
+	// 						ag.increment(kl + krOff, lenOverlap);
+	// 					}
+	// 				}
+	// 			}
+	// 		}
+	// 	}
+	// 	return ag;
+	// }
 
 	@Override
 	public Dictionary preAggregateThatDDCStructure(ColGroupDDC that, Dictionary ret) {
@@ -1162,12 +1126,8 @@ public class ColGroupRLE extends ColGroupOffset {
 	}
 
 	@Override
-	public Dictionary preAggregateThatSDCSingleStructure(ColGroupSDCSingle that, Dictionary ret, boolean preModified){
+	public Dictionary preAggregateThatSDCSingleStructure(ColGroupSDCSingle that, Dictionary ret, boolean preModified) {
 		throw new NotImplementedException();
 	}
 
-	@Override
-	public MatrixBlock preAggregate(MatrixBlock m, int rl, int ru) {
-		throw new NotImplementedException();
-	}
 }
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java
index 769b2fe..f56d6db 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java
@@ -25,7 +25,6 @@ import java.io.IOException;
 import java.util.Arrays;
 
 import org.apache.commons.lang.NotImplementedException;
-import org.apache.sysds.runtime.compress.CompressionSettings;
 import org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary;
 import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary;
 import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
@@ -33,8 +32,6 @@ import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory;
 import org.apache.sysds.runtime.compress.colgroup.offset.AIterator;
 import org.apache.sysds.runtime.compress.colgroup.offset.AOffset;
 import org.apache.sysds.runtime.compress.colgroup.offset.OffsetFactory;
-import org.apache.sysds.runtime.compress.colgroup.pre.IPreAggregate;
-import org.apache.sysds.runtime.compress.colgroup.pre.PreAggregateFactory;
 import org.apache.sysds.runtime.data.SparseBlock;
 import org.apache.sysds.runtime.functionobjects.Builtin;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
@@ -97,24 +94,16 @@ public class ColGroupSDC extends ColGroupValue {
 	}
 
 	@Override
-	public void decompressToBlockSafe(MatrixBlock target, int rl, int ru, int offT) {
-		decompressToBlockUnSafe(target, rl, ru, offT);
-		target.setNonZeros(getNumberNonZeros());
-	}
-
-	@Override
-	public void decompressToBlockUnSafe(MatrixBlock target, int rl, int ru, int offT) {
-
+	protected void decompressToBlockUnSafeDenseDictionary(MatrixBlock target, int rl, int ru, int offT,
+		double[] values) {
 		final int nCol = _colIndexes.length;
 		final int tCol = target.getNumColumns();
-		final double[] values = getValues();
 		final int offsetToDefault = values.length - nCol;
 
 		double[] c = target.getDenseBlockValues();
 		offT = offT * tCol;
 		int i = rl;
-		AIterator it = _indexes.getIterator();
-		it.skipTo(rl);
+		AIterator it = _indexes.getIterator(rl);
 		for(; i < ru && it.hasNext(); i++, offT += tCol) {
 			if(it.value() == i) {
 				int offset = _data.getIndex(it.getDataIndexAndIncrement()) * nCol;
@@ -133,57 +122,97 @@ public class ColGroupSDC extends ColGroupValue {
 	}
 
 	@Override
-	public void decompressToBlock(MatrixBlock target, int[] colIndexTargets) {
-		throw new NotImplementedException("Not Implemented");
-	}
-
-	@Override
-	public void decompressColumnToBlock(MatrixBlock target, int colPos) {
-		final double[] c = target.getDenseBlockValues();
-		final double[] values = getValues();
-		final double defaultVal = values[values.length - _colIndexes.length + colPos];
-		int i = 0;
-		final AIterator it = _indexes.getIterator();
-		for(; i < _numRows && it.hasNext(); i++) {
-			if(it.value() == i)
-				c[i] += values[_data.getIndex(it.getDataIndexAndIncrement()) * _colIndexes.length + colPos];
-			else
-				c[i] += defaultVal;
+	protected void decompressToBlockUnSafeSparseDictionary(MatrixBlock target, int rl, int ru, int offT,
+		SparseBlock sb) {
+		final int tCol = target.getNumColumns();
+		final int offsetToDefault = sb.numRows() - 1;
+		if(sb.isEmpty(offsetToDefault)) {
+			throw new NotImplementedException("Implement a SDCZeros decompress if this is the case");
 		}
-		for(; i < _numRows; i++)
-			c[i] += defaultVal;
-
-		target.setNonZeros(getNumberNonZeros() / _colIndexes.length);
-	}
 
-	@Override
-	public void decompressColumnToBlock(MatrixBlock target, int colpos, int rl, int ru) {
-		throw new NotImplementedException("Not Implemented");
-	}
-
-	@Override
-	public void decompressColumnToBlock(double[] c, int colpos, int rl, int ru) {
-		final int nCol = _colIndexes.length;
-		final double[] values = getValues();
-		final int offsetToDefault = values.length - nCol + colpos;
-		final AIterator it = _indexes.getIterator();
+		final int defApos = sb.pos(offsetToDefault);
+		final int defAlen = sb.size(offsetToDefault) + defApos;
+		final double[] defAvals = sb.values(offsetToDefault);
+		final int[] defAix = sb.indexes(offsetToDefault);
 
-		int offT = 0;
+		double[] c = target.getDenseBlockValues();
+		offT = offT * tCol;
 		int i = rl;
-		it.skipTo(rl);
-
-		for(; i < ru && it.hasNext(); i++, offT++) {
+		AIterator it = _indexes.getIterator(rl);
+		for(; i < ru && it.hasNext(); i++, offT += tCol) {
 			if(it.value() == i) {
-				int offset = _data.getIndex(it.getDataIndexAndIncrement()) * nCol;
-				c[offT] += values[offset + colpos];
+				int dictIndex = _data.getIndex(it.getDataIndexAndIncrement());
+				if(sb.isEmpty(dictIndex))
+					continue;
+				final int apos = sb.pos(dictIndex);
+				final int alen = sb.size(dictIndex) + apos;
+				final double[] avals = sb.values(dictIndex);
+				final int[] aix = sb.indexes(dictIndex);
+				for(int j = apos; j < alen; j++)
+					c[offT + _colIndexes[aix[j]]] += avals[j];
 			}
 			else
-				c[offT] += values[offsetToDefault];
+				for(int j = defApos; j < defAlen; j++)
+					c[offT + _colIndexes[defAix[j]]] += defAvals[j];
 		}
 
-		for(; i < ru; i++, offT++)
-			c[offT] += values[offsetToDefault];
-	}
+		for(; i < ru; i++, offT += tCol)
+			for(int j = defApos; j < defAlen; j++)
+				c[offT + _colIndexes[defAix[j]]] += defAvals[j];
+
+	}
+
+	// @Override
+	// public void decompressToBlock(MatrixBlock target, int[] colIndexTargets) {
+	// 	throw new NotImplementedException("Not Implemented");
+	// }
+
+	// @Override
+	// public void decompressColumnToBlock(MatrixBlock target, int colPos) {
+	// 	final double[] c = target.getDenseBlockValues();
+	// 	final double[] values = getValues();
+	// 	final double defaultVal = values[values.length - _colIndexes.length + colPos];
+	// 	int i = 0;
+	// 	final AIterator it = _indexes.getIterator();
+	// 	for(; i < _numRows && it.hasNext(); i++) {
+	// 		if(it.value() == i)
+	// 			c[i] += values[_data.getIndex(it.getDataIndexAndIncrement()) * _colIndexes.length + colPos];
+	// 		else
+	// 			c[i] += defaultVal;
+	// 	}
+	// 	for(; i < _numRows; i++)
+	// 		c[i] += defaultVal;
+
+	// 	target.setNonZeros(getNumberNonZeros() / _colIndexes.length);
+	// }
+
+	// @Override
+	// public void decompressColumnToBlock(MatrixBlock target, int colpos, int rl, int ru) {
+	// 	throw new NotImplementedException("Not Implemented");
+	// }
+
+	// @Override
+	// public void decompressColumnToBlock(double[] c, int colpos, int rl, int ru) {
+	// 	final int nCol = _colIndexes.length;
+	// 	final double[] values = getValues();
+	// 	final int offsetToDefault = values.length - nCol + colpos;
+	// 	final AIterator it = _indexes.getIterator(rl);
+
+	// 	int offT = 0;
+	// 	int i = rl;
+
+	// 	for(; i < ru && it.hasNext(); i++, offT++) {
+	// 		if(it.value() == i) {
+	// 			int offset = _data.getIndex(it.getDataIndexAndIncrement()) * nCol;
+	// 			c[offT] += values[offset + colpos];
+	// 		}
+	// 		else
+	// 			c[offT] += values[offsetToDefault];
+	// 	}
+
+	// 	for(; i < ru; i++, offT++)
+	// 		c[offT] += values[offsetToDefault];
+	// }
 
 	@Override
 	public double get(int r, int c) {
@@ -192,13 +221,13 @@ public class ColGroupSDC extends ColGroupValue {
 		if(ix < 0)
 			throw new RuntimeException("Column index " + c + " not in group.");
 
-		// // get value
-		AIterator it = _indexes.getIterator();
-		it.skipTo(r);
+		// get value
+		AIterator it = _indexes.getIterator(r);
+		final int nCol = _colIndexes.length;
 		if(it.value() == r)
-			return _dict.getValue(_data.getIndex(it.getDataIndex()) * _colIndexes.length + ix);
+			return _dict.getValue(_data.getIndex(it.getDataIndex()) * nCol + ix);
 		else
-			return _dict.getValue(getNumValues() * _colIndexes.length - _colIndexes.length + ix);
+			return _dict.getValue(getNumValues() * nCol - nCol + ix);
 	}
 
 	@Override
@@ -207,14 +236,13 @@ public class ColGroupSDC extends ColGroupValue {
 	}
 
 	@Override
-	protected void computeRowSums(double[] c, boolean square, int rl, int ru, boolean mean) {
+	protected void computeRowSums(double[] c, boolean square, int rl, int ru) {
 		final int numVals = getNumValues();
 		// // pre-aggregate nnz per value tuple
 		double[] vals = _dict.sumAllRowsToDouble(square, _colIndexes.length);
 
 		int rix = rl;
-		AIterator it = _indexes.getIterator();
-		it.skipTo(rl);
+		AIterator it = _indexes.getIterator(rl);
 		for(; rix < ru && it.hasNext(); rix++) {
 			if(it.value() != rix)
 				c[rix] += vals[numVals - 1];
@@ -235,8 +263,7 @@ public class ColGroupSDC extends ColGroupValue {
 
 		double[] vals = _dict.aggregateTuples(builtin, _colIndexes.length);
 
-		AIterator it = _indexes.getIterator();
-		it.skipTo(rl);
+		AIterator it = _indexes.getIterator(rl);
 
 		int rix = rl;
 		for(; rix < ru && it.hasNext(); rix++) {
@@ -262,8 +289,7 @@ public class ColGroupSDC extends ColGroupValue {
 		final int def = counts.length - 1;
 
 		int i = rl;
-		AIterator it = _indexes.getIterator();
-		it.skipTo(rl);
+		AIterator it = _indexes.getIterator(rl);
 
 		for(; i < ru && it.hasNext(); i++) {
 			if(i == it.value())
@@ -282,52 +308,92 @@ public class ColGroupSDC extends ColGroupValue {
 		return _data.getIndex(r);
 	}
 
-	public double[] preAggregate(double[] a, int row) {
-		final int numVals = getNumValues();
-		final double[] vals = allocDVector(numVals, true);
-		final AIterator it = _indexes.getIterator();
-		final int def = numVals - 1;
-
-		int i = 0;
+	// @Override
+	// public double[] preAggregate(double[] a, int row) {
+	// final int numVals = getNumValues();
+	// final double[] vals = allocDVector(numVals, true);
+	// final AIterator it = _indexes.getIterator();
+	// final int def = numVals - 1;
+
+	// int i = 0;
+
+	// if(row > 0) {
+	// int offA = _numRows * row;
+	// for(; i < _numRows && it.hasNext(); i++, offA++)
+	// if(it.value() == i)
+	// vals[_data.getIndex(it.getDataIndexAndIncrement())] += a[offA];
+	// else
+	// vals[def] += a[offA];
+	// for(; i < _numRows; i++, offA++)
+	// vals[def] += a[offA];
+	// }
+	// else {
+	// for(; i < _numRows && it.hasNext(); i++)
+	// if(it.value() == i)
+	// vals[_data.getIndex(it.getDataIndexAndIncrement())] += a[i];
+	// else
+	// vals[def] += a[i];
+	// for(; i < _numRows; i++)
+	// vals[def] += a[i];
+	// }
+	// return vals;
+	// }
+
+	@Override
+	public void preAggregate(MatrixBlock m, MatrixBlock preAgg, int rl, int ru) {
+		if(m.isInSparseFormat()) // sparse input: only the stored entries of each row in [rl, ru) are visited
+			preAggregateSparse(m.getSparseBlock(), preAgg, rl, ru);
+		else // dense input: values are read directly from the dense block array of m
+			preAggregateDense(m, preAgg, rl, ru);
+	}
 
-		if(row > 0) {
-			int offA = _numRows * row;
-			for(; i < _numRows && it.hasNext(); i++, offA++)
-				if(it.value() == i)
-					vals[_data.getIndex(it.getDataIndexAndIncrement())] += a[offA];
-				else
-					vals[def] += a[offA];
-			for(; i < _numRows; i++, offA++)
-				vals[def] += a[offA];
-		}
-		else {
-			for(; i < _numRows && it.hasNext(); i++)
-				if(it.value() == i)
-					vals[_data.getIndex(it.getDataIndexAndIncrement())] += a[i];
+	private void preAggregateDense(MatrixBlock m, MatrixBlock preAgg, int rl, int ru) {
+		final double[] preAV = preAgg.getDenseBlockValues();
+		final double[] mV = m.getDenseBlockValues();
+		final int numVals = getNumValues();
+		for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += numVals) {
+			final int def = offOut + numVals - 1;
+			final AIterator it = _indexes.getIterator();
+			int rc = 0;
+			int offLeft = rowLeft * _numRows;
+			for(; rc < _numRows && it.hasNext(); rc++, offLeft++) {
+				if(it.value() == rc)
+					preAV[offOut + _data.getIndex(it.getDataIndexAndIncrement())] += mV[offLeft];
 				else
-					vals[def] += a[i];
-			for(; i < _numRows; i++)
-				vals[def] += a[i];
+					preAV[def] += mV[offLeft];
+			}
+
+			for(; rc < _numRows; rc++, offLeft++) {
+				preAV[def] += mV[offLeft];
+			}
 		}
-		return vals;
 	}
 
-	@Override
-	public double[] preAggregateSparse(SparseBlock sb, int row) {
+	private void preAggregateSparse(SparseBlock sb, MatrixBlock preAgg, int rl, int ru) {
+		final double[] preAV = preAgg.getDenseBlockValues();
 		final int numVals = getNumValues();
-		final double[] vals = allocDVector(numVals, true);
-		final int[] indexes = sb.indexes(row);
-		final double[] sparseV = sb.values(row);
-		final AIterator it = _indexes.getIterator();
+		for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += numVals) {
+			if(sb.isEmpty(rowLeft))
+				continue;
+			final AIterator it = _indexes.getIterator();
+			final int def = offOut + numVals - 1;
+			final int apos = sb.pos(rowLeft);
+			final int alen = sb.size(rowLeft) + apos;
+			final int[] aix = sb.indexes(rowLeft);
+			final double[] avals = sb.values(rowLeft);
+			int j = apos;
+			for(; j < alen && it.hasNext(); j++) {
+				it.skipTo(aix[j]);
+				if(it.value() == aix[j])
+					preAV[offOut + _data.getIndex(it.getDataIndexAndIncrement())] += avals[j];
+				else
+					preAV[def] += avals[j];
+			}
 
-		for(int i = sb.pos(row); i < sb.size(row) + sb.pos(row); i++) {
-			it.skipTo(indexes[i]);
-			if(it.value() == indexes[i])
-				vals[getIndex(it.getDataIndexAndIncrement())] += sparseV[i];
-			else
-				vals[numVals - 1] += sparseV[i];
+			for(; j < alen; j++) {
+				preAV[def] += avals[j];
+			}
 		}
-		return vals;
 	}
 
 	@Override
@@ -360,7 +426,7 @@ public class ColGroupSDC extends ColGroupValue {
 	public void readFields(DataInput in) throws IOException {
 		super.readFields(in);
 		_indexes = OffsetFactory.readIn(in);
-		_data = MapToFactory.readIn(in, getNumValues());
+		_data = MapToFactory.readIn(in);
 	}
 
 	@Override
@@ -394,214 +460,214 @@ public class ColGroupSDC extends ColGroupValue {
 		return sb.toString();
 	}
 
-	@Override
-	public IPreAggregate preAggregateDDC(ColGroupDDC lhs) {
-		final int nCol = lhs.getNumValues();
-		final int rhsNV = this.getNumValues();
-		final int retSize = nCol * rhsNV;
-		final IPreAggregate ag = PreAggregateFactory.ag(retSize);
-		final AIterator it = _indexes.getIterator();
-		final int offsetToDefault = this.getNumValues() - 1;
-
-		int i = 0;
-
-		int row;
-		for(; i < this._numRows && it.hasNext(); i++) {
-			int col = lhs._data.getIndex(i);
-			if(it.value() == i)
-				row = getIndex(it.getDataIndexAndIncrement());
-			else
-				row = offsetToDefault;
-			ag.increment(col + row * nCol);
-		}
-		row = offsetToDefault;
-		for(; i < this._numRows; i++) {
-			int col = lhs._data.getIndex(i);
-			ag.increment(col + row * nCol);
-		}
-
-		return ag;
-	}
-
-	@Override
-	public IPreAggregate preAggregateSDC(ColGroupSDC lhs) {
-		final int lhsNV = lhs.getNumValues();
-		final int rhsNV = this.getNumValues();
-		final int retSize = lhsNV * rhsNV;
-		final int nCol = lhs.getNumValues();
-		IPreAggregate ag = PreAggregateFactory.ag(retSize);
-
-		final int defL = lhsNV - 1;
-		final int defR = rhsNV - 1;
-
-		AIterator lIt = lhs._indexes.getIterator();
-		AIterator rIt = _indexes.getIterator();
-
-		int i = 0;
-		int col;
-		int row;
-		for(; i < this._numRows && lIt.hasNext() && rIt.hasNext(); i++) {
-			if(lIt.value() == i)
-				col = lhs.getIndex(lIt.getDataIndexAndIncrement());
-			else
-				col = defL;
-			if(rIt.value() == i)
-				row = this.getIndex(rIt.getDataIndexAndIncrement());
-			else
-				row = defR;
-			ag.increment(col + row * nCol);
-		}
-
-		if(lIt.hasNext()) {
-			row = defR;
-			for(; i < this._numRows && lIt.hasNext(); i++) {
-				if(lIt.value() == i)
-					col = lhs.getIndex(lIt.getDataIndexAndIncrement());
-				else
-					col = defL;
-
-				ag.increment(col + row * nCol);
-			}
-		}
-
-		if(rIt.hasNext()) {
-			col = defL;
-			for(; i < this._numRows && rIt.hasNext(); i++) {
-				if(rIt.value() == i)
-					row = this.getIndex(rIt.getDataIndexAndIncrement());
-				else
-					row = defR;
-				ag.increment(col + row * nCol);
-			}
-		}
-
-		ag.increment(defL + defR * nCol, this._numRows - i);
-
-		return ag;
-	}
-
-	@Override
-	public IPreAggregate preAggregateSDCSingle(ColGroupSDCSingle lhs) {
-		final int lhsNV = lhs.getNumValues();
-		final int rhsNV = this.getNumValues();
-		final int retSize = lhsNV * rhsNV;
-		final int nCol = lhs.getNumValues();
-		final IPreAggregate ag = PreAggregateFactory.ag(retSize);
-		final int defR = rhsNV - 1;
-		final AIterator lIt = lhs._indexes.getIterator();
-		final AIterator rIt = _indexes.getIterator();
-
-		int i = 0;
-		int col;
-		int row;
-		for(; i < this._numRows && lIt.hasNext() && rIt.hasNext(); i++) {
-			if(lIt.value() == i) {
-				col = 1;
-				lIt.next();
-			}
-			else
-				col = 0;
-			if(rIt.value() == i)
-				row = this.getIndex(rIt.getDataIndexAndIncrement());
-			else
-				row = defR;
-			ag.increment(col + row * nCol);
-		}
-
-		if(lIt.hasNext()) {
-			row = defR;
-			for(; i < this._numRows && lIt.hasNext(); i++) {
-				if(lIt.value() == i) {
-					col = 1;
-					lIt.next();
-				}
-				else
-					col = 0;
-
-				ag.increment(col + row * nCol);
-			}
-		}
-
-		if(rIt.hasNext()) {
-			for(; i < this._numRows && rIt.hasNext(); i++) {
-				if(rIt.value() == i)
-					row = this.getIndex(rIt.getDataIndexAndIncrement());
-				else
-					row = defR;
-				ag.increment(row * nCol);
-			}
-		}
-
-		ag.increment(defR * nCol, this._numRows - i);
-
-		return ag;
-	}
-
-	@Override
-	public IPreAggregate preAggregateSDCZeros(ColGroupSDCZeros lhs) {
-		final int rhsNV = this.getNumValues();
-		final int nCol = lhs.getNumValues();
-		final int defR = (rhsNV - 1) * nCol;
-		final int retSize = nCol * rhsNV;
-		final IPreAggregate ag = PreAggregateFactory.ag(retSize);
-		final AIterator lIt = lhs._indexes.getIterator();
-		final AIterator rIt = _indexes.getIterator();
-
-		while(lIt.hasNext() && rIt.hasNext())
-			if(lIt.value() == rIt.value())
-				ag.increment(lhs.getIndex(lIt.getDataIndexAndIncrement()) +
-					this.getIndex(rIt.getDataIndexAndIncrement()) * nCol);
-			else if(lIt.value() > rIt.value())
-				rIt.next();
-			else
-				ag.increment(lhs.getIndex(lIt.getDataIndexAndIncrement()) + defR);
-
-		while(lIt.hasNext())
-			ag.increment(lhs.getIndex(lIt.getDataIndexAndIncrement()) + defR);
-		return ag;
-	}
-
-	@Override
-	public IPreAggregate preAggregateSDCSingleZeros(ColGroupSDCSingleZeros lhs) {
-		throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
-			+ this.getClass().getSimpleName());
-	}
-
-	@Override
-	public IPreAggregate preAggregateOLE(ColGroupOLE lhs) {
-		final int NVR = this.getNumValues();
-		final int NVL = lhs.getNumValues();
-		final int retSize = NVR * NVL;
-		final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
-		IPreAggregate ag = PreAggregateFactory.ag(retSize);
-
-		final int defR = (NVR - 1) * NVL;
-
-		for(int kl = 0; kl < NVL; kl++) {
-			AIterator it = _indexes.getIterator();
-			final int bOffL = lhs._ptr[kl];
-			final int bLenL = lhs.len(kl);
-			for(int bixL = 0, offL = 0, sLenL = 0; bixL < bLenL; bixL += sLenL + 1, offL += blksz) {
-				sLenL = lhs._data[bOffL + bixL];
-				for(int i = 1; i <= sLenL; i++) {
-					final int col = offL + lhs._data[bOffL + bixL + i];
-					it.skipTo(col);
-					if(it.value() == col)
-						ag.increment(kl + this.getIndex(it.getDataIndexAndIncrement()) * NVL);
-					else
-						ag.increment(kl + defR);
-
-				}
-			}
-		}
-		return ag;
-	}
-
-	@Override
-	public IPreAggregate preAggregateRLE(ColGroupRLE lhs) {
-		throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
-			+ this.getClass().getSimpleName());
-	}
+	// @Override
+	// public IPreAggregate preAggregateDDC(ColGroupDDC lhs) {
+	// 	final int nCol = lhs.getNumValues();
+	// 	final int rhsNV = this.getNumValues();
+	// 	final int retSize = nCol * rhsNV;
+	// 	final IPreAggregate ag = PreAggregateFactory.ag(retSize);
+	// 	final AIterator it = _indexes.getIterator();
+	// 	final int offsetToDefault = this.getNumValues() - 1;
+
+	// 	int i = 0;
+
+	// 	int row;
+	// 	for(; i < this._numRows && it.hasNext(); i++) {
+	// 		int col = lhs._data.getIndex(i);
+	// 		if(it.value() == i)
+	// 			row = getIndex(it.getDataIndexAndIncrement());
+	// 		else
+	// 			row = offsetToDefault;
+	// 		ag.increment(col + row * nCol);
+	// 	}
+	// 	row = offsetToDefault;
+	// 	for(; i < this._numRows; i++) {
+	// 		int col = lhs._data.getIndex(i);
+	// 		ag.increment(col + row * nCol);
+	// 	}
+
+	// 	return ag;
+	// }
+
+	// @Override
+	// public IPreAggregate preAggregateSDC(ColGroupSDC lhs) {
+	// 	final int lhsNV = lhs.getNumValues();
+	// 	final int rhsNV = this.getNumValues();
+	// 	final int retSize = lhsNV * rhsNV;
+	// 	final int nCol = lhs.getNumValues();
+	// 	IPreAggregate ag = PreAggregateFactory.ag(retSize);
+
+	// 	final int defL = lhsNV - 1;
+	// 	final int defR = rhsNV - 1;
+
+	// 	AIterator lIt = lhs._indexes.getIterator();
+	// 	AIterator rIt = _indexes.getIterator();
+
+	// 	int i = 0;
+	// 	int col;
+	// 	int row;
+	// 	for(; i < this._numRows && lIt.hasNext() && rIt.hasNext(); i++) {
+	// 		if(lIt.value() == i)
+	// 			col = lhs.getIndex(lIt.getDataIndexAndIncrement());
+	// 		else
+	// 			col = defL;
+	// 		if(rIt.value() == i)
+	// 			row = this.getIndex(rIt.getDataIndexAndIncrement());
+	// 		else
+	// 			row = defR;
+	// 		ag.increment(col + row * nCol);
+	// 	}
+
+	// 	if(lIt.hasNext()) {
+	// 		row = defR;
+	// 		for(; i < this._numRows && lIt.hasNext(); i++) {
+	// 			if(lIt.value() == i)
+	// 				col = lhs.getIndex(lIt.getDataIndexAndIncrement());
+	// 			else
+	// 				col = defL;
+
+	// 			ag.increment(col + row * nCol);
+	// 		}
+	// 	}
+
+	// 	if(rIt.hasNext()) {
+	// 		col = defL;
+	// 		for(; i < this._numRows && rIt.hasNext(); i++) {
+	// 			if(rIt.value() == i)
+	// 				row = this.getIndex(rIt.getDataIndexAndIncrement());
+	// 			else
+	// 				row = defR;
+	// 			ag.increment(col + row * nCol);
+	// 		}
+	// 	}
+
+	// 	ag.increment(defL + defR * nCol, this._numRows - i);
+
+	// 	return ag;
+	// }
+
+	// @Override
+	// public IPreAggregate preAggregateSDCSingle(ColGroupSDCSingle lhs) {
+	// 	final int lhsNV = lhs.getNumValues();
+	// 	final int rhsNV = this.getNumValues();
+	// 	final int retSize = lhsNV * rhsNV;
+	// 	final int nCol = lhs.getNumValues();
+	// 	final IPreAggregate ag = PreAggregateFactory.ag(retSize);
+	// 	final int defR = rhsNV - 1;
+	// 	final AIterator lIt = lhs._indexes.getIterator();
+	// 	final AIterator rIt = _indexes.getIterator();
+
+	// 	int i = 0;
+	// 	int col;
+	// 	int row;
+	// 	for(; i < this._numRows && lIt.hasNext() && rIt.hasNext(); i++) {
+	// 		if(lIt.value() == i) {
+	// 			col = 1;
+	// 			lIt.next();
+	// 		}
+	// 		else
+	// 			col = 0;
+	// 		if(rIt.value() == i)
+	// 			row = this.getIndex(rIt.getDataIndexAndIncrement());
+	// 		else
+	// 			row = defR;
+	// 		ag.increment(col + row * nCol);
+	// 	}
+
+	// 	if(lIt.hasNext()) {
+	// 		row = defR;
+	// 		for(; i < this._numRows && lIt.hasNext(); i++) {
+	// 			if(lIt.value() == i) {
+	// 				col = 1;
+	// 				lIt.next();
+	// 			}
+	// 			else
+	// 				col = 0;
+
+	// 			ag.increment(col + row * nCol);
+	// 		}
+	// 	}
+
+	// 	if(rIt.hasNext()) {
+	// 		for(; i < this._numRows && rIt.hasNext(); i++) {
+	// 			if(rIt.value() == i)
+	// 				row = this.getIndex(rIt.getDataIndexAndIncrement());
+	// 			else
+	// 				row = defR;
+	// 			ag.increment(row * nCol);
+	// 		}
+	// 	}
+
+	// 	ag.increment(defR * nCol, this._numRows - i);
+
+	// 	return ag;
+	// }
+
+	// @Override
+	// public IPreAggregate preAggregateSDCZeros(ColGroupSDCZeros lhs) {
+	// 	final int rhsNV = this.getNumValues();
+	// 	final int nCol = lhs.getNumValues();
+	// 	final int defR = (rhsNV - 1) * nCol;
+	// 	final int retSize = nCol * rhsNV;
+	// 	final IPreAggregate ag = PreAggregateFactory.ag(retSize);
+	// 	final AIterator lIt = lhs._indexes.getIterator();
+	// 	final AIterator rIt = _indexes.getIterator();
+
+	// 	while(lIt.hasNext() && rIt.hasNext())
+	// 		if(lIt.value() == rIt.value())
+	// 			ag.increment(lhs.getIndex(lIt.getDataIndexAndIncrement()) +
+	// 				this.getIndex(rIt.getDataIndexAndIncrement()) * nCol);
+	// 		else if(lIt.value() > rIt.value())
+	// 			rIt.next();
+	// 		else
+	// 			ag.increment(lhs.getIndex(lIt.getDataIndexAndIncrement()) + defR);
+
+	// 	while(lIt.hasNext())
+	// 		ag.increment(lhs.getIndex(lIt.getDataIndexAndIncrement()) + defR);
+	// 	return ag;
+	// }
+
+	// @Override
+	// public IPreAggregate preAggregateSDCSingleZeros(ColGroupSDCSingleZeros lhs) {
+	// 	throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
+	// 		+ this.getClass().getSimpleName());
+	// }
+
+	// @Override
+	// public IPreAggregate preAggregateOLE(ColGroupOLE lhs) {
+	// 	final int NVR = this.getNumValues();
+	// 	final int NVL = lhs.getNumValues();
+	// 	final int retSize = NVR * NVL;
+	// 	final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
+	// 	IPreAggregate ag = PreAggregateFactory.ag(retSize);
+
+	// 	final int defR = (NVR - 1) * NVL;
+
+	// 	for(int kl = 0; kl < NVL; kl++) {
+	// 		AIterator it = _indexes.getIterator();
+	// 		final int bOffL = lhs._ptr[kl];
+	// 		final int bLenL = lhs.len(kl);
+	// 		for(int bixL = 0, offL = 0, sLenL = 0; bixL < bLenL; bixL += sLenL + 1, offL += blksz) {
+	// 			sLenL = lhs._data[bOffL + bixL];
+	// 			for(int i = 1; i <= sLenL; i++) {
+	// 				final int col = offL + lhs._data[bOffL + bixL + i];
+	// 				it.skipTo(col);
+	// 				if(it.value() == col)
+	// 					ag.increment(kl + this.getIndex(it.getDataIndexAndIncrement()) * NVL);
+	// 				else
+	// 					ag.increment(kl + defR);
+
+	// 			}
+	// 		}
+	// 	}
+	// 	return ag;
+	// }
+
+	// @Override
+	// public IPreAggregate preAggregateRLE(ColGroupRLE lhs) {
+	// 	throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
+	// 		+ this.getClass().getSimpleName());
+	// }
 
 	@Override
 	public Dictionary preAggregateThatDDCStructure(ColGroupDDC that, Dictionary ret) {
@@ -746,9 +812,4 @@ public class ColGroupSDC extends ColGroupValue {
 		return ret;
 	}
 
-	@Override
-	public MatrixBlock preAggregate(MatrixBlock m, int rl, int ru) {
-		throw new NotImplementedException();
-	}
-
 }
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java
index 493afac..d749ea5 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java
@@ -30,8 +30,6 @@ import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary;
 import org.apache.sysds.runtime.compress.colgroup.offset.AIterator;
 import org.apache.sysds.runtime.compress.colgroup.offset.AOffset;
 import org.apache.sysds.runtime.compress.colgroup.offset.OffsetFactory;
-import org.apache.sysds.runtime.compress.colgroup.pre.IPreAggregate;
-import org.apache.sysds.runtime.compress.colgroup.pre.PreAggregateFactory;
 import org.apache.sysds.runtime.data.SparseBlock;
 import org.apache.sysds.runtime.functionobjects.Builtin;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
@@ -86,17 +84,10 @@ public class ColGroupSDCSingle extends ColGroupValue {
 	}
 
 	@Override
-	public void decompressToBlockSafe(MatrixBlock target, int rl, int ru, int offT) {
-		decompressToBlockUnSafe(target, rl, ru, offT);
-		target.setNonZeros(_numRows * _colIndexes.length + target.getNonZeros());
-	}
-
-	@Override
-	public void decompressToBlockUnSafe(MatrixBlock target, int rl, int ru, int offT) {
-
+	protected void decompressToBlockUnSafeDenseDictionary(MatrixBlock target, int rl, int ru, int offT,
+		double[] values) {
 		final int nCol = _colIndexes.length;
 		final int tCol = target.getNumColumns();
-		final double[] values = getValues();
 		final int offsetToDefault = values.length - nCol;
 
 		double[] c = target.getDenseBlockValues();
@@ -121,62 +112,68 @@ public class ColGroupSDCSingle extends ColGroupValue {
 	}
 
 	@Override
-	public void decompressToBlock(MatrixBlock target, int[] colIndexTargets) {
-		throw new NotImplementedException("Not Implemented");
-	}
-
-	@Override
-	public void decompressColumnToBlock(MatrixBlock target, int colpos) {
-		final double[] c = target.getDenseBlockValues();
-		final double[] values = getValues();
-		final int offsetToDefault = _colIndexes.length;
-		final AIterator it = _indexes.getIterator();
-		final double v1 = values[offsetToDefault + colpos];
-		final double v2 = values[colpos];
-
-		int i = 0;
-		for(; i < _numRows && it.hasNext(); i++) {
-			if(it.value() == i) {
-				c[i] += v1;
-				it.next();
-			}
-			else
-				c[i] += v2;
-		}
-		for(; i < _numRows; i++)
-			c[i] += v2;
-
-		target.setNonZeros(getNumberNonZeros() / _colIndexes.length);
+	protected void decompressToBlockUnSafeSparseDictionary(MatrixBlock target, int rl, int ru, int offT,
+		SparseBlock values) {
+		throw new NotImplementedException();
 	}
 
-	@Override
-	public void decompressColumnToBlock(MatrixBlock target, int colpos, int rl, int ru) {
-		throw new NotImplementedException("Not Implemented");
-	}
-
-	@Override
-	public void decompressColumnToBlock(double[] c, int colpos, int rl, int ru) {
-		final int nCol = _colIndexes.length;
-		final double[] values = getValues();
-		final int offsetToDefault = values.length - nCol;
-		final AIterator it = _indexes.getIterator();
-
-		int offT = 0;
-		int i = rl;
-		it.skipTo(rl);
-
-		for(; i < ru && it.hasNext(); i++, offT++) {
-			if(it.value() == i) {
-				it.next();
-				c[offT] += values[colpos];
-			}
-			else
-				c[offT] += values[offsetToDefault + colpos];
-		}
-
-		for(; i < ru; i++, offT++)
-			c[offT] += values[offsetToDefault + colpos];
-	}
+	// @Override
+	// public void decompressToBlock(MatrixBlock target, int[] colIndexTargets) {
+	// 	throw new NotImplementedException("Not Implemented");
+	// }
+
+	// @Override
+	// public void decompressColumnToBlock(MatrixBlock target, int colpos) {
+	// 	final double[] c = target.getDenseBlockValues();
+	// 	final double[] values = getValues();
+	// 	final int offsetToDefault = _colIndexes.length;
+	// 	final AIterator it = _indexes.getIterator();
+	// 	final double v1 = values[offsetToDefault + colpos];
+	// 	final double v2 = values[colpos];
+
+	// 	int i = 0;
+	// 	for(; i < _numRows && it.hasNext(); i++) {
+	// 		if(it.value() == i) {
+	// 			c[i] += v1;
+	// 			it.next();
+	// 		}
+	// 		else
+	// 			c[i] += v2;
+	// 	}
+	// 	for(; i < _numRows; i++)
+	// 		c[i] += v2;
+
+	// 	target.setNonZeros(getNumberNonZeros() / _colIndexes.length);
+	// }
+
+	// @Override
+	// public void decompressColumnToBlock(MatrixBlock target, int colpos, int rl, int ru) {
+	// 	throw new NotImplementedException("Not Implemented");
+	// }
+
+	// @Override
+	// public void decompressColumnToBlock(double[] c, int colpos, int rl, int ru) {
+	// 	final int nCol = _colIndexes.length;
+	// 	final double[] values = getValues();
+	// 	final int offsetToDefault = values.length - nCol;
+	// 	final AIterator it = _indexes.getIterator();
+
+	// 	int offT = 0;
+	// 	int i = rl;
+	// 	it.skipTo(rl);
+
+	// 	for(; i < ru && it.hasNext(); i++, offT++) {
+	// 		if(it.value() == i) {
+	// 			it.next();
+	// 			c[offT] += values[colpos];
+	// 		}
+	// 		else
+	// 			c[offT] += values[offsetToDefault + colpos];
+	// 	}
+
+	// 	for(; i < ru; i++, offT++)
+	// 		c[offT] += values[offsetToDefault + colpos];
+	// }
 
 	@Override
 	public double get(int r, int c) {
@@ -185,12 +182,11 @@ public class ColGroupSDCSingle extends ColGroupValue {
 		if(ix < 0)
 			throw new RuntimeException("Column index " + c + " not in group.");
 
-		AIterator it = _indexes.getIterator();
-		it.skipTo(r);
+		AIterator it = _indexes.getIterator(r);
 		if(it.value() == r)
-			return _dict.getValue(ix + c);
+			return _dict.getValue(ix);
 		else
-			return _dict.getValue(_colIndexes.length + c);
+			return _dict.getValue(_colIndexes.length + ix);
 
 	}
 
@@ -200,7 +196,7 @@ public class ColGroupSDCSingle extends ColGroupValue {
 	}
 
 	@Override
-	protected void computeRowSums(double[] c, boolean square, int rl, int ru, boolean mean) {
+	protected void computeRowSums(double[] c, boolean square, int rl, int ru) {
 
 		// // pre-aggregate nnz per value tuple
 		final double[] vals = _dict.sumAllRowsToDouble(square, _colIndexes.length);
@@ -265,57 +261,115 @@ public class ColGroupSDCSingle extends ColGroupValue {
 		return counts;
 	}
 
-	public double[] preAggregate(double[] a, int row) {
-		final int numVals = getNumValues();
-		final double[] vals = allocDVector(numVals, true);
-		final AIterator it = _indexes.getIterator();
-
-		int i = 0;
+	// @Override
+	// public double[] preAggregate(double[] a, int row) {
+	// final int numVals = getNumValues();
+	// final double[] vals = allocDVector(numVals, true);
+	// final AIterator it = _indexes.getIterator();
+
+	// int i = 0;
+
+	// if(row > 0) {
+	// int offA = _numRows * row;
+	// for(; i < _numRows && it.hasNext(); i++, offA++)
+	// if(it.value() == i) {
+	// it.next();
+	// vals[0] += a[offA];
+	// }
+	// else
+	// vals[1] += a[offA];
+	// for(; i < _numRows; i++, offA++)
+	// vals[1] += a[offA];
+	// }
+	// else {
+	// for(; i < _numRows && it.hasNext(); i++)
+	// if(it.value() == i) {
+	// it.next();
+	// vals[0] += a[i];
+	// }
+	// else
+	// vals[1] += a[i];
+	// for(; i < _numRows; i++)
+	// vals[1] += a[i];
+	// }
+
+	// return vals;
+	// }
+
+	// @Override
+	// public double[] preAggregateSparse(SparseBlock sb, int row) {
+	// final int numVals = getNumValues();
+	// final double[] vals = allocDVector(numVals, true);
+	// final int[] indexes = sb.indexes(row);
+	// final double[] sparseV = sb.values(row);
+	// final AIterator it = _indexes.getIterator();
+
+	// for(int i = sb.pos(row); i < sb.size(row) + sb.pos(row); i++) {
+	// it.skipTo(indexes[i]);
+	// if(it.value() == indexes[i]) {
+	// vals[0] += sparseV[i];
+	// it.next();
+	// }
+	// else
+	// vals[1] += sparseV[i];
+	// }
+	// return vals;
+	// }
+
+	@Override
+	protected void preAggregate(MatrixBlock m, MatrixBlock preAgg, int rl, int ru) {
+		if(m.isInSparseFormat()) // sparse input: only the stored entries of each row in [rl, ru) are visited
+			preAggregateSparse(m.getSparseBlock(), preAgg, rl, ru);
+		else // dense input: values are read directly from the dense block array of m
+			preAggregateDense(m, preAgg, rl, ru);
+	}
 
-		if(row > 0) {
-			int offA = _numRows * row;
-			for(; i < _numRows && it.hasNext(); i++, offA++)
-				if(it.value() == i){
-					it.next();
-					vals[0] += a[offA];
-				}
-				else
-					vals[1] += a[offA];
-			for(; i < _numRows; i++, offA++)
-				vals[1] += a[offA];
-		}
-		else {
-			for(; i < _numRows && it.hasNext(); i++)
-				if(it.value() == i){
+	private void preAggregateDense(MatrixBlock m, MatrixBlock preAgg, int rl, int ru) {
+		final double[] preAV = preAgg.getDenseBlockValues();
+		final double[] mV = m.getDenseBlockValues();
+		final int numVals = getNumValues();
+		for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += numVals) {
+			final AIterator it = _indexes.getIterator();
+			final int def = offOut + 1; // output slot for rows that are not listed in _indexes
+			int rc = 0;
+			int offLeft = rowLeft * _numRows;
+			for(; rc < _numRows && it.hasNext(); rc++, offLeft++) { // stop once the offsets are exhausted
+				if(it.value() == rc) {
+					preAV[offOut] += mV[offLeft];
 					it.next();
-					vals[0] += a[i];
 				}
 				else
-					vals[1] += a[i];
-			for(; i < _numRows; i++)
-				vals[1] += a[i];
+					preAV[def] += mV[offLeft];
+			}
+			for(; rc < _numRows; rc++, offLeft++) { // no offsets left: every remaining row goes to def
+				preAV[def] += mV[offLeft];
+			}
 		}
-
-		return vals;
 	}
 
-	public double[] preAggregateSparse(SparseBlock sb, int row) {
+	private void preAggregateSparse(SparseBlock sb, MatrixBlock preAgg, int rl, int ru) {
+		final double[] preAV = preAgg.getDenseBlockValues();
 		final int numVals = getNumValues();
-		final double[] vals = allocDVector(numVals, true);
-		final int[] indexes = sb.indexes(row);
-		final double[] sparseV = sb.values(row);
-		final AIterator it = _indexes.getIterator();
+		for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += numVals) {
+			if(sb.isEmpty(rowLeft))
+				continue;
+			final AIterator it = _indexes.getIterator();
+			final int apos = sb.pos(rowLeft);
+			final int alen = sb.size(rowLeft) + apos;
+			final int[] aix = sb.indexes(rowLeft);
+			final double[] avals = sb.values(rowLeft);
+			final int def = offOut + 1;
+			for(int j = apos; j < alen; j++) {
+				it.skipTo(aix[j]);
+				if(it.value() == aix[j]) {
+					preAV[offOut] += avals[j];
+					it.next();
+				}
+				else
+					preAV[def] += avals[j];
 
-		for(int i = sb.pos(row); i < sb.size(row) + sb.pos(row); i++) {
-			it.skipTo(indexes[i]);
-			if(it.value() == indexes[i]) {
-				vals[0] += sparseV[i];
-				it.next();
 			}
-			else
-				vals[1] += sparseV[i];
 		}
-		return vals;
 	}
 
 	@Override
@@ -374,181 +428,181 @@ public class ColGroupSDCSingle extends ColGroupValue {
 		return sb.toString();
 	}
 
-	@Override
-	public IPreAggregate preAggregateDDC(ColGroupDDC lhs) {
-		final int rhsNV = this.getNumValues();
-		final int nCol = lhs.getNumValues();
-		final int retSize = nCol * rhsNV;
-		final IPreAggregate ag = PreAggregateFactory.ag(retSize);
-		final AIterator it = _indexes.getIterator();
-
-		int i = 0;
-
-		int row;
-		for(; i < this._numRows && it.hasNext(); i++) {
-			int col = lhs._data.getIndex(i);
-			if(it.value() == i) {
-				row = 0;
-				it.next();
-			}
-			else
-				row = 1;
-
-			if(col < lhs.getNumValues())
-				ag.increment(col + row * nCol);
-		}
-		row = 0;
-		for(; i < this._numRows; i++) {
-			int col = lhs._data.getIndex(i);
-			if(col < lhs.getNumValues())
-				ag.increment(col + row * nCol);
-		}
-
-		return ag;
-	}
-
-	@Override
-	public IPreAggregate preAggregateSDC(ColGroupSDC lhs) {
-		final int lhsNV = lhs.getNumValues();
-		final int rhsNV = this.getNumValues();
-		final int retSize = lhsNV * rhsNV;
-		final int nCol = lhs.getNumValues();
-		final IPreAggregate ag = PreAggregateFactory.ag(retSize);
-		final int defL = lhsNV - 1;
-		final AIterator lIt = lhs._indexes.getIterator();
-		final AIterator rIt = _indexes.getIterator();
-
-		int i = 0;
-		int col;
-		int row;
-		for(; i < this._numRows && lIt.hasNext() && rIt.hasNext(); i++) {
-			if(lIt.value() == i)
-				col = lhs.getIndex(lIt.getDataIndexAndIncrement());
-			else
-				col = defL;
-			if(rIt.value() == i) {
-				row = 0;
-				rIt.next();
-			}
-			else
-				row = 1;
-			ag.increment(col + row * nCol);
-		}
-
-		if(lIt.hasNext()) {
-			row = 0;
-			for(; i < this._numRows && lIt.hasNext(); i++) {
-				if(lIt.value() == i)
-					col = lhs.getIndex(lIt.getDataIndexAndIncrement());
-				else
-					col = defL;
-
-				ag.increment(col + row * nCol);
-			}
-		}
-
-		if(rIt.hasNext()) {
-			col = defL;
-			for(; i < this._numRows && rIt.hasNext(); i++) {
-				if(rIt.value() == i) {
-					row = 0;
-					rIt.next();
-				}
-				else
-					row = 1;
-				ag.increment(col + row * nCol);
-			}
-		}
-
-		ag.increment(defL, this._numRows - i);
-
-		return ag;
-	}
-
-	@Override
-	public IPreAggregate preAggregateSDCSingle(ColGroupSDCSingle lhs) {
-		final int lhsNV = lhs.getNumValues();
-		final int rhsNV = this.getNumValues();
-		final int retSize = lhsNV * rhsNV;
-		final int nCol = lhs.getNumValues();
-		IPreAggregate ag = PreAggregateFactory.ag(retSize);
-		;
-		final AIterator lIt = lhs._indexes.getIterator();
-		final AIterator rIt = _indexes.getIterator();
-
-		int i = 0;
-		int col;
-		int row;
-		for(; i < this._numRows && lIt.hasNext() && rIt.hasNext(); i++) {
-			if(lIt.value() == i) {
-				col = 1;
-				lIt.next();
-			}
-			else
-				col = 0;
-			if(rIt.value() == i) {
-				row = 1;
-				rIt.next();
-			}
-			else
-				row = 0;
-			ag.increment(col + row * nCol);
-		}
-
-		if(lIt.hasNext()) {
-			row = 1;
-			for(; i < _numRows && lIt.hasNext(); i++) {
-				if(lIt.value() == i) {
-					col = 1;
-					lIt.next();
-				}
-				else
-					col = 0;
-
-				ag.increment(col + row * nCol);
-			}
-		}
-
-		if(rIt.hasNext()) {
-			col = 1;
-			for(; i < _numRows && rIt.hasNext(); i++) {
-				if(rIt.value() == i) {
-					row = 1;
-					rIt.next();
-				}
-				else
-					row = 0;
-				ag.increment(col + row * nCol);
-			}
-		}
-
-		ag.increment(0, _numRows - i);
-		return ag;
-	}
-
-	@Override
-	public IPreAggregate preAggregateSDCZeros(ColGroupSDCZeros lhs) {
-		throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
-			+ this.getClass().getSimpleName());
-	}
-
-	@Override
-	public IPreAggregate preAggregateSDCSingleZeros(ColGroupSDCSingleZeros lhs) {
-		throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
-			+ this.getClass().getSimpleName());
-	}
-
-	@Override
-	public IPreAggregate preAggregateOLE(ColGroupOLE lhs) {
-		throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
-			+ this.getClass().getSimpleName());
-	}
-
-	@Override
-	public IPreAggregate preAggregateRLE(ColGroupRLE lhs) {
-		throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
-			+ this.getClass().getSimpleName());
-	}
+	// @Override
+	// public IPreAggregate preAggregateDDC(ColGroupDDC lhs) {
+	// 	final int rhsNV = this.getNumValues();
+	// 	final int nCol = lhs.getNumValues();
+	// 	final int retSize = nCol * rhsNV;
+	// 	final IPreAggregate ag = PreAggregateFactory.ag(retSize);
+	// 	final AIterator it = _indexes.getIterator();
+
+	// 	int i = 0;
+
+	// 	int row;
+	// 	for(; i < this._numRows && it.hasNext(); i++) {
+	// 		int col = lhs._data.getIndex(i);
+	// 		if(it.value() == i) {
+	// 			row = 0;
+	// 			it.next();
+	// 		}
+	// 		else
+	// 			row = 1;
+
+	// 		if(col < lhs.getNumValues())
+	// 			ag.increment(col + row * nCol);
+	// 	}
+	// 	row = 0;
+	// 	for(; i < this._numRows; i++) {
+	// 		int col = lhs._data.getIndex(i);
+	// 		if(col < lhs.getNumValues())
+	// 			ag.increment(col + row * nCol);
+	// 	}
+
+	// 	return ag;
+	// }
+
+	// @Override
+	// public IPreAggregate preAggregateSDC(ColGroupSDC lhs) {
+	// 	final int lhsNV = lhs.getNumValues();
+	// 	final int rhsNV = this.getNumValues();
+	// 	final int retSize = lhsNV * rhsNV;
+	// 	final int nCol = lhs.getNumValues();
+	// 	final IPreAggregate ag = PreAggregateFactory.ag(retSize);
+	// 	final int defL = lhsNV - 1;
+	// 	final AIterator lIt = lhs._indexes.getIterator();
+	// 	final AIterator rIt = _indexes.getIterator();
+
+	// 	int i = 0;
+	// 	int col;
+	// 	int row;
+	// 	for(; i < this._numRows && lIt.hasNext() && rIt.hasNext(); i++) {
+	// 		if(lIt.value() == i)
+	// 			col = lhs.getIndex(lIt.getDataIndexAndIncrement());
+	// 		else
+	// 			col = defL;
+	// 		if(rIt.value() == i) {
+	// 			row = 0;
+	// 			rIt.next();
+	// 		}
+	// 		else
+	// 			row = 1;
+	// 		ag.increment(col + row * nCol);
+	// 	}
+
+	// 	if(lIt.hasNext()) {
+	// 		row = 0;
+	// 		for(; i < this._numRows && lIt.hasNext(); i++) {
+	// 			if(lIt.value() == i)
+	// 				col = lhs.getIndex(lIt.getDataIndexAndIncrement());
+	// 			else
+	// 				col = defL;
+
+	// 			ag.increment(col + row * nCol);
+	// 		}
+	// 	}
+
+	// 	if(rIt.hasNext()) {
+	// 		col = defL;
+	// 		for(; i < this._numRows && rIt.hasNext(); i++) {
+	// 			if(rIt.value() == i) {
+	// 				row = 0;
+	// 				rIt.next();
+	// 			}
+	// 			else
+	// 				row = 1;
+	// 			ag.increment(col + row * nCol);
+	// 		}
+	// 	}
+
+	// 	ag.increment(defL, this._numRows - i);
+
+	// 	return ag;
+	// }
+
+	// @Override
+	// public IPreAggregate preAggregateSDCSingle(ColGroupSDCSingle lhs) {
+	// 	final int lhsNV = lhs.getNumValues();
+	// 	final int rhsNV = this.getNumValues();
+	// 	final int retSize = lhsNV * rhsNV;
+	// 	final int nCol = lhs.getNumValues();
+	// 	IPreAggregate ag = PreAggregateFactory.ag(retSize);
+	// 	;
+	// 	final AIterator lIt = lhs._indexes.getIterator();
+	// 	final AIterator rIt = _indexes.getIterator();
+
+	// 	int i = 0;
+	// 	int col;
+	// 	int row;
+	// 	for(; i < this._numRows && lIt.hasNext() && rIt.hasNext(); i++) {
+	// 		if(lIt.value() == i) {
+	// 			col = 1;
+	// 			lIt.next();
+	// 		}
+	// 		else
+	// 			col = 0;
+	// 		if(rIt.value() == i) {
+	// 			row = 1;
+	// 			rIt.next();
+	// 		}
+	// 		else
+	// 			row = 0;
+	// 		ag.increment(col + row * nCol);
+	// 	}
+
+	// 	if(lIt.hasNext()) {
+	// 		row = 1;
+	// 		for(; i < _numRows && lIt.hasNext(); i++) {
+	// 			if(lIt.value() == i) {
+	// 				col = 1;
+	// 				lIt.next();
+	// 			}
+	// 			else
+	// 				col = 0;
+
+	// 			ag.increment(col + row * nCol);
+	// 		}
+	// 	}
+
+	// 	if(rIt.hasNext()) {
+	// 		col = 1;
+	// 		for(; i < _numRows && rIt.hasNext(); i++) {
+	// 			if(rIt.value() == i) {
+	// 				row = 1;
+	// 				rIt.next();
+	// 			}
+	// 			else
+	// 				row = 0;
+	// 			ag.increment(col + row * nCol);
+	// 		}
+	// 	}
+
+	// 	ag.increment(0, _numRows - i);
+	// 	return ag;
+	// }
+
+	// @Override
+	// public IPreAggregate preAggregateSDCZeros(ColGroupSDCZeros lhs) {
+	// 	throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
+	// 		+ this.getClass().getSimpleName());
+	// }
+
+	// @Override
+	// public IPreAggregate preAggregateSDCSingleZeros(ColGroupSDCSingleZeros lhs) {
+	// 	throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
+	// 		+ this.getClass().getSimpleName());
+	// }
+
+	// @Override
+	// public IPreAggregate preAggregateOLE(ColGroupOLE lhs) {
+	// 	throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
+	// 		+ this.getClass().getSimpleName());
+	// }
+
+	// @Override
+	// public IPreAggregate preAggregateRLE(ColGroupRLE lhs) {
+	// 	throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
+	// 		+ this.getClass().getSimpleName());
+	// }
 
 	@Override
 	public Dictionary preAggregateThatDDCStructure(ColGroupDDC that, Dictionary ret) {
@@ -585,7 +639,7 @@ public class ColGroupSDCSingle extends ColGroupValue {
 				}
 				else if(itThat.value() < itThis.value())
 					itThat.next();
-				else{
+				else {
 					itThis.next();
 					// that._dict.addToEntry(ret, defThat, 0, nCol);
 				}
@@ -677,8 +731,4 @@ public class ColGroupSDCSingle extends ColGroupValue {
 
 	}
 
-	@Override
-	public MatrixBlock preAggregate(MatrixBlock m, int rl, int ru) {
-		throw new NotImplementedException();
-	}
 }
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingleZeros.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingleZeros.java
index 7e68e17..ff9da0e 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingleZeros.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingleZeros.java
@@ -30,8 +30,6 @@ import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary;
 import org.apache.sysds.runtime.compress.colgroup.offset.AIterator;
 import org.apache.sysds.runtime.compress.colgroup.offset.AOffset;
 import org.apache.sysds.runtime.compress.colgroup.offset.OffsetFactory;
-import org.apache.sysds.runtime.compress.colgroup.pre.IPreAggregate;
-import org.apache.sysds.runtime.compress.colgroup.pre.PreAggregateFactory;
 import org.apache.sysds.runtime.data.SparseBlock;
 import org.apache.sysds.runtime.functionobjects.Builtin;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
@@ -88,21 +86,14 @@ public class ColGroupSDCSingleZeros extends ColGroupValue {
 	}
 
 	@Override
-	public void decompressToBlockSafe(MatrixBlock target, int rl, int ru, int offT) {
-		decompressToBlockUnSafe(target, rl, ru, offT);
-		target.setNonZeros(_indexes.getSize() * _colIndexes.length + target.getNonZeros());
-	}
-
-	@Override
-	public void decompressToBlockUnSafe(MatrixBlock target, int rl, int ru, int offT) {
+	protected void decompressToBlockUnSafeDenseDictionary(MatrixBlock target, int rl, int ru, int offT,
+		double[] values) {
 		final int nCol = _colIndexes.length;
 		final int tCol = target.getNumColumns();
-		final double[] values = getValues();
 		final int offTCorrected = offT - rl;
 		final double[] c = target.getDenseBlockValues();
 
-		AIterator it = _indexes.getIterator();
-		it.skipTo(rl);
+		AIterator it = _indexes.getIterator(rl);
 
 		while(it.hasNext() && it.value() < ru) {
 			int rc = (offTCorrected + it.value()) * tCol;
@@ -114,37 +105,42 @@ public class ColGroupSDCSingleZeros extends ColGroupValue {
 	}
 
 	@Override
-	public void decompressToBlock(MatrixBlock target, int[] colIndexTargets) {
-		throw new NotImplementedException("Not Implemented");
+	protected void decompressToBlockUnSafeSparseDictionary(MatrixBlock target, int rl, int ru, int offT,
+		SparseBlock values) {
+		throw new NotImplementedException();
 	}
 
-	@Override
-	public void decompressColumnToBlock(MatrixBlock target, int colpos) {
-		final double[] c = target.getDenseBlockValues();
-		final double[] values = getValues();
-		final AIterator it = _indexes.getIterator();
-		while(it.hasNext()) {
-			c[it.value()] += values[_colIndexes.length + colpos];
-			it.next();
-		}
-		target.setNonZeros(getNumberNonZeros() / _colIndexes.length);
-	}
+	// @Override
+	// public void decompressToBlock(MatrixBlock target, int[] colIndexTargets) {
+	// 	throw new NotImplementedException("Not Implemented");
+	// }
 
-	@Override
-	public void decompressColumnToBlock(MatrixBlock target, int colpos, int rl, int ru) {
-		throw new NotImplementedException("Not Implemented");
-	}
+	// @Override
+	// public void decompressColumnToBlock(MatrixBlock target, int colpos) {
+	// 	final double[] c = target.getDenseBlockValues();
+	// 	final double[] values = getValues();
+	// 	final AIterator it = _indexes.getIterator();
+	// 	while(it.hasNext()) {
+	// 		c[it.value()] += values[_colIndexes.length + colpos];
+	// 		it.next();
+	// 	}
+	// 	target.setNonZeros(getNumberNonZeros() / _colIndexes.length);
+	// }
 
-	@Override
-	public void decompressColumnToBlock(double[] c, int colpos, int rl, int ru) {
-		final double[] values = getValues();
-		final AIterator it = _indexes.getIterator();
-		it.skipTo(rl);
-		while(it.hasNext() && it.value() < ru) {
-			c[it.value() - rl] += values[colpos];
-			it.next();
-		}
-	}
+	// @Override
+	// public void decompressColumnToBlock(MatrixBlock target, int colpos, int rl, int ru) {
+	// 	throw new NotImplementedException("Not Implemented");
+	// }
+
+	// @Override
+	// public void decompressColumnToBlock(double[] c, int colpos, int rl, int ru) {
+	// 	final double[] values = getValues();
+	// 	final AIterator it = _indexes.getIterator(rl);
+	// 	while(it.hasNext() && it.value() < ru) {
+	// 		c[it.value() - rl] += values[colpos];
+	// 		it.next();
+	// 	}
+	// }
 
 	@Override
 	public double get(int r, int c) {
@@ -152,8 +148,7 @@ public class ColGroupSDCSingleZeros extends ColGroupValue {
 		if(ix < 0)
 			throw new RuntimeException("Column index " + c + " not in group.");
 
-		final AIterator it = _indexes.getIterator();
-		it.skipTo(r);
+		final AIterator it = _indexes.getIterator(r);
 		if(it.value() == r)
 			return _dict.getValue(ix);
 		else
@@ -164,8 +159,7 @@ public class ColGroupSDCSingleZeros extends ColGroupValue {
 	@Override
 	public void countNonZerosPerRow(int[] rnnz, int rl, int ru) {
 		final int nCol = _colIndexes.length;
-		final AIterator it = _indexes.getIterator();
-		it.skipTo(rl);
+		final AIterator it = _indexes.getIterator(rl);
 		while(it.hasNext() && it.value() < ru) {
 			rnnz[it.value() - rl] += nCol;
 			it.next();
@@ -173,10 +167,9 @@ public class ColGroupSDCSingleZeros extends ColGroupValue {
 	}
 
 	@Override
-	protected void computeRowSums(double[] c, boolean square, int rl, int ru, boolean mean) {
+	protected void computeRowSums(double[] c, boolean square, int rl, int ru) {
 		final double vals = _dict.sumAllRowsToDouble(square, _colIndexes.length)[0];
-		final AIterator it = _indexes.getIterator();
-		it.skipTo(rl);
+		final AIterator it = _indexes.getIterator(rl);
 		while(it.hasNext() && it.value() < ru) {
 			c[it.value()] += vals;
 			it.next();
@@ -187,8 +180,7 @@ public class ColGroupSDCSingleZeros extends ColGroupValue {
 	@Override
 	protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru) {
 		final double vals = _dict.aggregateTuples(builtin, _colIndexes.length)[0];
-		final AIterator it = _indexes.getIterator();
-		it.skipTo(rl);
+		final AIterator it = _indexes.getIterator(rl);
 		int rix = rl;
 		for(; rix < ru && it.hasNext(); rix++) {
 			if(it.value() != rix)
@@ -213,8 +205,7 @@ public class ColGroupSDCSingleZeros extends ColGroupValue {
 
 	@Override
 	public int[] getCounts(int rl, int ru, int[] counts) {
-		final AIterator it = _indexes.getIterator();
-		it.skipTo(rl);
+		final AIterator it = _indexes.getIterator(rl);
 
 		while(it.hasNext() && it.value() < ru) {
 			it.next();
@@ -226,46 +217,90 @@ public class ColGroupSDCSingleZeros extends ColGroupValue {
 		return counts;
 	}
 
-	public double[] preAggregate(double[] a, int aRows) {
-		final double[] vals = allocDVector(getNumValues(), true);
-		final AIterator it = _indexes.getIterator();
-		if(aRows > 0) {
-			final int offT = _numRows * aRows;
-			while(it.hasNext()) {
-				final int i = it.value();
-				vals[0] += a[i + offT];
-				it.next();
-			}
-		}
+	// @Override
+	// public double[] preAggregate(double[] a, int row) {
+	// 	final double[] vals = allocDVector(getNumValues(), true);
+	// 	final AIterator it = _indexes.getIterator();
+	// 	if(row > 0) {
+	// 		final int offT = _numRows * row;
+	// 		while(it.hasNext()) {
+	// 			final int i = it.value();
+	// 			vals[0] += a[i + offT];
+	// 			it.next();
+	// 		}
+	// 	}
+	// 	else
+	// 		while(it.hasNext()) {
+	// 			final int i = it.value();
+	// 			vals[0] += a[i];
+	// 			it.next();
+	// 		}
+
+	// 	return vals;
+	// }
+
+	// @Override
+	// public double[] preAggregateSparse(SparseBlock sb, int row) {
+	// 	final double[] vals = allocDVector(getNumValues(), true);
+	// 	final int[] sbIndexes = sb.indexes(row);
+	// 	final double[] sparseV = sb.values(row);
+	// 	final AIterator it = _indexes.getIterator();
+	// 	final int sbEnd = sb.size(row) + sb.pos(row);
+
+	// 	int sbP = sb.pos(row);
+
+	// 	while(it.hasNext() && sbP < sbEnd) {
+	// 		if(it.value() == sbIndexes[sbP])
+	// 			vals[0] += sparseV[sbP++];
+	// 		if(sbP < sbEnd)
+	// 			it.skipTo(sbIndexes[sbP]);
+	// 		while(sbP < sbEnd && sbIndexes[sbP] < it.value())
+	// 			sbP++;
+	// 	}
+
+	// 	return vals;
+	// }
+
+	@Override
+	protected void preAggregate(MatrixBlock m, MatrixBlock preAgg, int rl, int ru){
+		if(m.isInSparseFormat())
+			preAggregateSparse(m.getSparseBlock(), preAgg, rl, ru);
 		else
+			preAggregateDense(m, preAgg, rl, ru);
+	}
+
+	private void preAggregateDense(MatrixBlock m, MatrixBlock preAgg, int rl, int ru) { // dense path: sums cells of m at this group's offsets, one output row per row of m in [rl, ru)
+		final double[] preAV = preAgg.getDenseBlockValues();
+		final double[] mV = m.getDenseBlockValues();
+		final int numVals = getNumValues();
+		for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += numVals) { // offOut is 0 for row rl and advances by numVals per row
+			final AIterator it = _indexes.getIterator(); // iterator is recreated for every row of m
+			final int offLeft = rowLeft * _numRows; // NOTE(review): assumes each row of m spans _numRows cells in mV - confirm
 			while(it.hasNext()) {
 				final int i = it.value();
-				vals[0] += a[i];
+				preAV[offOut] += mV[offLeft + i]; // only slot offOut of this output row is written
 				it.next();
 			}
-
-		return vals;
+		}
 	}
 
-	public double[] preAggregateSparse(SparseBlock sb, int row) {
-		final double[] vals = allocDVector(getNumValues(), true);
-		final int[] sbIndexes = sb.indexes(row);
-		final double[] sparseV = sb.values(row);
-		final AIterator it = _indexes.getIterator();
-		final int sbEnd = sb.size(row) + sb.pos(row);
-
-		int sbP = sb.pos(row);
-
-		while(it.hasNext() && sbP < sbEnd) {
-			if(it.value() == sbIndexes[sbP])
-				vals[0] += sparseV[sbP++];
-			if(sbP < sbEnd)
-				it.skipTo(sbIndexes[sbP]);
-			while(sbP < sbEnd && sbIndexes[sbP] < it.value())
-				sbP++;
+	private void preAggregateSparse(SparseBlock sb, MatrixBlock preAgg, int rl, int ru) { // sparse path: sums non-zeros of sb whose column index equals an offset of this group
+		final double[] preAV = preAgg.getDenseBlockValues();
+		final int numVals = getNumValues();
+		for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += numVals) { // offOut is 0 for row rl and advances by numVals per row
+			if(sb.isEmpty(rowLeft))
+				continue; // empty row adds nothing; offOut still advances through the loop update
+			final AIterator it = _indexes.getIterator(); // iterator is recreated for every non-empty row
+			final int apos = sb.pos(rowLeft);
+			final int alen = sb.size(rowLeft) + apos; // exclusive end position in aix/avals, not a length
+			final int[] aix = sb.indexes(rowLeft);
+			final double[] avals = sb.values(rowLeft);
+			for(int j = apos; j < alen; j++) {
+				it.skipTo(aix[j]); // NOTE(review): presumably moves to the first offset >= aix[j]; behavior past the last offset is not visible here - confirm
+				if(it.value() == aix[j])
+					preAV[offOut] += avals[j]; // only slot offOut of this output row is written
+			}
+		}
-
-		return vals;
 	}
 
 	@Override
@@ -300,12 +335,12 @@ public class ColGroupSDCSingleZeros extends ColGroupValue {
 	}
 
 	// private ADictionary swapEntries(ADictionary aDictionary) {
-	// 	double[] values = aDictionary.getValues().clone();
-	// 	double[] swap = new double[_colIndexes.length];
-	// 	System.arraycopy(values, 0, swap, 0, _colIndexes.length);
-	// 	System.arraycopy(values, _colIndexes.length, values, 0, _colIndexes.length);
-	// 	System.arraycopy(swap, 0, values, _colIndexes.length, _colIndexes.length);
-	// 	return new Dictionary(values);
+	// double[] values = aDictionary.getValues().clone();
+	// double[] swap = new double[_colIndexes.length];
+	// System.arraycopy(values, 0, swap, 0, _colIndexes.length);
+	// System.arraycopy(values, _colIndexes.length, values, 0, _colIndexes.length);
+	// System.arraycopy(swap, 0, values, _colIndexes.length, _colIndexes.length);
+	// return new Dictionary(values);
 	// }
 
 	@Override
@@ -346,87 +381,87 @@ public class ColGroupSDCSingleZeros extends ColGroupValue {
 		return sb.toString();
 	}
 
-	@Override
-	public IPreAggregate preAggregateDDC(ColGroupDDC lhs) {
-		final int rhsNV = this.getNumValues();
-		final int nCol = lhs.getNumValues();
-		final int retSize = nCol * rhsNV;
-		final IPreAggregate ag = PreAggregateFactory.ag(retSize);
-		final AIterator it = _indexes.getIterator();
-
-		while(it.hasNext()) {
-			final int col = lhs._data.getIndex(it.value());
-			ag.increment(col);
-		}
-		return ag;
-	}
+	// @Override
+	// public IPreAggregate preAggregateDDC(ColGroupDDC lhs) {
+	// 	final int rhsNV = this.getNumValues();
+	// 	final int nCol = lhs.getNumValues();
+	// 	final int retSize = nCol * rhsNV;
+	// 	final IPreAggregate ag = PreAggregateFactory.ag(retSize);
+	// 	final AIterator it = _indexes.getIterator();
 
-	@Override
-	public IPreAggregate preAggregateSDC(ColGroupSDC lhs) {
-		throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
-			+ this.getClass().getSimpleName());
-	}
-
-	@Override
-	public IPreAggregate preAggregateSDCSingle(ColGroupSDCSingle lhs) {
-		throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
-			+ this.getClass().getSimpleName());
-	}
+	// 	while(it.hasNext()) {
+	// 		final int col = lhs._data.getIndex(it.value());
+	// 		ag.increment(col);
+	// 	}
+	// 	return ag;
+	// }
 
-	@Override
-	public IPreAggregate preAggregateSDCZeros(ColGroupSDCZeros lhs) {
-		final int rhsNV = this.getNumValues();
-		final int nCol = lhs.getNumValues();
-		final int retSize = nCol * rhsNV;
-
-		final IPreAggregate ag = PreAggregateFactory.ag(retSize);
-		final AIterator lIt = lhs._indexes.getIterator();
-		final AIterator rIt = this._indexes.getIterator();
-
-		while(lIt.hasNext() && rIt.hasNext())
-			if(lIt.value() == rIt.value()) {
-				ag.increment(lhs.getIndex(lIt.getDataIndexAndIncrement()));
-				rIt.next();
-			}
-			else if(lIt.value() < rIt.value())
-				lIt.next();
-			else
-				rIt.next();
+	// @Override
+	// public IPreAggregate preAggregateSDC(ColGroupSDC lhs) {
+	// 	throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
+	// 		+ this.getClass().getSimpleName());
+	// }
 
-		return ag;
-	}
+	// @Override
+	// public IPreAggregate preAggregateSDCSingle(ColGroupSDCSingle lhs) {
+	// 	throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
+	// 		+ this.getClass().getSimpleName());
+	// }
 
-	@Override
-	public IPreAggregate preAggregateSDCSingleZeros(ColGroupSDCSingleZeros lhs) {
-		// we always know that there is only one value in each column group.
-		int[] ret = new int[1];
-		final AIterator lIt = lhs._indexes.getIterator();
-		final AIterator rIt = this._indexes.getIterator();
-		while(lIt.hasNext() && rIt.hasNext())
-			if(lIt.value() == rIt.value()) {
-				ret[0]++;
-				lIt.next();
-				rIt.next();
-			}
-			else if(lIt.value() < rIt.value())
-				lIt.next();
-			else
-				rIt.next();
+	// @Override
+	// public IPreAggregate preAggregateSDCZeros(ColGroupSDCZeros lhs) {
+	// 	final int rhsNV = this.getNumValues();
+	// 	final int nCol = lhs.getNumValues();
+	// 	final int retSize = nCol * rhsNV;
+
+	// 	final IPreAggregate ag = PreAggregateFactory.ag(retSize);
+	// 	final AIterator lIt = lhs._indexes.getIterator();
+	// 	final AIterator rIt = this._indexes.getIterator();
+
+	// 	while(lIt.hasNext() && rIt.hasNext())
+	// 		if(lIt.value() == rIt.value()) {
+	// 			ag.increment(lhs.getIndex(lIt.getDataIndexAndIncrement()));
+	// 			rIt.next();
+	// 		}
+	// 		else if(lIt.value() < rIt.value())
+	// 			lIt.next();
+	// 		else
+	// 			rIt.next();
+
+	// 	return ag;
+	// }
 
-		return PreAggregateFactory.ag(ret);
-	}
+	// @Override
+	// public IPreAggregate preAggregateSDCSingleZeros(ColGroupSDCSingleZeros lhs) {
+	// 	// we always know that there is only one value in each column group.
+	// 	int[] ret = new int[1];
+	// 	final AIterator lIt = lhs._indexes.getIterator();
+	// 	final AIterator rIt = this._indexes.getIterator();
+	// 	while(lIt.hasNext() && rIt.hasNext())
+	// 		if(lIt.value() == rIt.value()) {
+	// 			ret[0]++;
+	// 			lIt.next();
+	// 			rIt.next();
+	// 		}
+	// 		else if(lIt.value() < rIt.value())
+	// 			lIt.next();
+	// 		else
+	// 			rIt.next();
+
+	// 	return PreAggregateFactory.ag(ret);
+	// }
 
-	@Override
-	public IPreAggregate preAggregateOLE(ColGroupOLE lhs) {
-		throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
-			+ this.getClass().getSimpleName());
-	}
+	// @Override
+	// public IPreAggregate preAggregateOLE(ColGroupOLE lhs) {
+	// 	throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
+	// 		+ this.getClass().getSimpleName());
+	// }
 
-	@Override
-	public IPreAggregate preAggregateRLE(ColGroupRLE lhs) {
-		throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
-			+ this.getClass().getSimpleName());
-	}
+	// @Override
+	// public IPreAggregate preAggregateRLE(ColGroupRLE lhs) {
+	// 	throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
+	// 		+ this.getClass().getSimpleName());
+	// }
 
 	@Override
 	public Dictionary preAggregateThatDDCStructure(ColGroupDDC that, Dictionary ret) {
@@ -440,7 +475,19 @@ public class ColGroupSDCSingleZeros extends ColGroupValue {
 
 	@Override
 	public Dictionary preAggregateThatSDCZerosStructure(ColGroupSDCZeros that, Dictionary ret) {
-		throw new NotImplementedException();
+		final AIterator itThat = that._indexes.getIterator();
+		final AIterator itThis = this._indexes.getIterator();
+		final int nCol = that._colIndexes.length;
+		// Walk the offsets of that; a row contributes only if this group's offsets contain it too.
+		while(itThat.hasNext() && itThis.hasNext()) {
+			final int v = itThat.value();
+			if(v == itThis.skipTo(v))
+				that._dict.addToEntry(ret, that.getIndex(itThat.getDataIndex()), 0, nCol);
+			// that always steps forward here; this is only moved through skipTo above.
+			itThat.next();
+		}
+
+		return ret;
 	}
 
 	@Override
@@ -464,8 +511,4 @@ public class ColGroupSDCSingleZeros extends ColGroupValue {
 		throw new NotImplementedException();
 	}
 
-	@Override
-	public MatrixBlock preAggregate(MatrixBlock m, int rl, int ru) {
-		throw new NotImplementedException();
-	}
 }
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCZeros.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCZeros.java
index f04f1a3..036d40e 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCZeros.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCZeros.java
@@ -25,8 +25,6 @@ import java.io.IOException;
 import java.util.Arrays;
 
 import org.apache.commons.lang.NotImplementedException;
-import org.apache.sysds.runtime.DMLCompressionException;
-import org.apache.sysds.runtime.compress.CompressionSettings;
 import org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary;
 import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary;
 import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
@@ -34,8 +32,6 @@ import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory;
 import org.apache.sysds.runtime.compress.colgroup.offset.AIterator;
 import org.apache.sysds.runtime.compress.colgroup.offset.AOffset;
 import org.apache.sysds.runtime.compress.colgroup.offset.OffsetFactory;
-import org.apache.sysds.runtime.compress.colgroup.pre.IPreAggregate;
-import org.apache.sysds.runtime.compress.colgroup.pre.PreAggregateFactory;
 import org.apache.sysds.runtime.data.SparseBlock;
 import org.apache.sysds.runtime.functionobjects.Builtin;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
@@ -76,19 +72,14 @@ public class ColGroupSDCZeros extends ColGroupValue {
 	protected ColGroupSDCZeros(int[] colIndices, int numRows, ADictionary dict, int[] indexes, AMapToData data,
 		int[] cachedCounts) {
 		super(colIndices, numRows, dict, cachedCounts);
-		if(data == null)
-			throw new DMLCompressionException("data null input In SDC Construction");
 		_indexes = OffsetFactory.create(indexes, numRows);
 		_data = data;
 		_zeros = true;
-
 	}
 
 	protected ColGroupSDCZeros(int[] colIndices, int numRows, ADictionary dict, AOffset offsets, AMapToData data,
 		int[] cachedCounts) {
 		super(colIndices, numRows, dict, cachedCounts);
-		if(data == null)
-			throw new DMLCompressionException("data null input In SDC Construction");
 		_indexes = offsets;
 		_data = data;
 		_zeros = true;
@@ -105,22 +96,14 @@ public class ColGroupSDCZeros extends ColGroupValue {
 	}
 
 	@Override
-	public void decompressToBlockSafe(MatrixBlock target, int rl, int ru, int offT) {
-		decompressToBlockUnSafe(target, rl, ru, offT);
-		target.setNonZeros(getNumberNonZeros());
-	}
-
-	@Override
-	public void decompressToBlockUnSafe(MatrixBlock target, int rl, int ru, int offT) {
+	protected void decompressToBlockUnSafeDenseDictionary(MatrixBlock target, int rl, int ru, int offT,
+		double[] values) {
 		final int nCol = _colIndexes.length;
 		final int tCol = target.getNumColumns();
-		final double[] values = getValues();
 		final int offTCorrected = offT - rl;
 		final double[] c = target.getDenseBlockValues();
-
-		AIterator it = _indexes.getIterator();
-		it.skipTo(rl);
-
+		AIterator it = _indexes.getIterator(rl);
+		offT = offT * tCol;
 		while(it.hasNext() && it.value() < ru) {
 			int rc = (offTCorrected + it.value()) * tCol;
 			int offC = getIndex(it.getDataIndexAndIncrement()) * nCol;
@@ -131,33 +114,55 @@ public class ColGroupSDCZeros extends ColGroupValue {
 	}
 
 	@Override
-	public void decompressToBlock(MatrixBlock target, int[] colIndexTargets) {
-		throw new NotImplementedException("Not Implemented");
-	}
+	protected void decompressToBlockUnSafeSparseDictionary(MatrixBlock target, int rl, int ru, int offT,
+		SparseBlock sb) {
 
-	@Override
-	public void decompressColumnToBlock(MatrixBlock target, int colpos) {
+		final int tCol = target.getNumColumns();
+		final int offTCorrected = offT - rl;
 		final double[] c = target.getDenseBlockValues();
-		final double[] values = getValues();
-		final AIterator it = _indexes.getIterator();
-		while(it.hasNext())
-			c[it.value()] += values[getIndex(it.getDataIndexAndIncrement()) * _colIndexes.length + colpos];
-		target.setNonZeros(getNumberNonZeros() / _colIndexes.length);
+		AIterator it = _indexes.getIterator(rl);
+		while(it.hasNext() && it.value() < ru) {
+			final int rc = (offTCorrected + it.value()) * tCol;
+			final int dictIndex = getIndex(it.getDataIndexAndIncrement());
+			if(sb.isEmpty(dictIndex))
+				continue;
+
+			final int apos = sb.pos(dictIndex);
+			final int alen = sb.size(dictIndex) + apos;
+			final double[] avals = sb.values(dictIndex);
+			final int[] aix = sb.indexes(dictIndex);
+			for(int j = apos; j < alen; j++)
+				c[rc + _colIndexes[aix[j]]] += avals[j];
+		}
 	}
 
-	@Override
-	public void decompressColumnToBlock(MatrixBlock target, int colpos, int rl, int ru) {
-		throw new NotImplementedException("Not Implemented");
-	}
+	// @Override
+	// public void decompressToBlock(MatrixBlock target, int[] colIndexTargets) {
+	// throw new NotImplementedException();
+	// }
 
-	@Override
-	public void decompressColumnToBlock(double[] c, int colpos, int rl, int ru) {
-		final double[] values = getValues();
-		final AIterator it = _indexes.getIterator();
-		it.skipTo(rl);
-		while(it.hasNext() && it.value() < ru)
-			c[it.value() - rl] += values[getIndex(it.getDataIndexAndIncrement()) * _colIndexes.length + colpos];
-	}
+	// @Override
+	// public void decompressColumnToBlock(MatrixBlock target, int colpos) {
+	// final double[] c = target.getDenseBlockValues();
+	// final double[] values = getValues();
+	// final AIterator it = _indexes.getIterator();
+	// while(it.hasNext())
+	// c[it.value()] += values[getIndex(it.getDataIndexAndIncrement()) * _colIndexes.length + colpos];
+	// target.setNonZeros(getNumberNonZeros() / _colIndexes.length);
+	// }
+
+	// @Override
+	// public void decompressColumnToBlock(MatrixBlock target, int colpos, int rl, int ru) {
+	// throw new NotImplementedException();
+	// }
+
+	// @Override
+	// public void decompressColumnToBlock(double[] c, int colpos, int rl, int ru) {
+	// final double[] values = getValues();
+	// final AIterator it = _indexes.getIterator(rl);
+	// while(it.hasNext() && it.value() < ru)
+	// c[it.value() - rl] += values[getIndex(it.getDataIndexAndIncrement()) * _colIndexes.length + colpos];
+	// }
 
 	@Override
 	public double get(int r, int c) {
@@ -165,8 +170,7 @@ public class ColGroupSDCZeros extends ColGroupValue {
 		if(ix < 0)
 			throw new RuntimeException("Column index " + c + " not in group.");
 
-		final AIterator it = _indexes.getIterator();
-		it.skipTo(r);
+		final AIterator it = _indexes.getIterator(r);
 		if(it.value() == r)
 			return _dict.getValue(getIndex(it.getDataIndex()) * _colIndexes.length + ix);
 		else
@@ -177,8 +181,7 @@ public class ColGroupSDCZeros extends ColGroupValue {
 	@Override
 	public void countNonZerosPerRow(int[] rnnz, int rl, int ru) {
 		final int nCol = _colIndexes.length;
-		final AIterator it = _indexes.getIterator();
-		it.skipTo(rl);
+		final AIterator it = _indexes.getIterator(rl);
 		while(it.hasNext() && it.value() < ru) {
 			rnnz[it.value() - rl] += nCol;
 			it.next();
@@ -186,10 +189,9 @@ public class ColGroupSDCZeros extends ColGroupValue {
 	}
 
 	@Override
-	protected void computeRowSums(double[] c, boolean square, int rl, int ru, boolean mean) {
+	protected void computeRowSums(double[] c, boolean square, int rl, int ru) {
 		final double[] vals = _dict.sumAllRowsToDouble(square, _colIndexes.length);
-		final AIterator it = _indexes.getIterator();
-		it.skipTo(rl);
+		final AIterator it = _indexes.getIterator(rl);
 		while(it.hasNext() && it.value() < ru)
 			c[it.value()] += vals[getIndex(it.getDataIndexAndIncrement())];
 	}
@@ -197,8 +199,7 @@ public class ColGroupSDCZeros extends ColGroupValue {
 	@Override
 	protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru) {
 		final double[] vals = _dict.aggregateTuples(builtin, _colIndexes.length);
-		final AIterator it = _indexes.getIterator();
-		it.skipTo(rl);
+		final AIterator it = _indexes.getIterator(rl);
 
 		int rix = rl;
 		for(; rix < ru && it.hasNext(); rix++) {
@@ -222,8 +223,7 @@ public class ColGroupSDCZeros extends ColGroupValue {
 	public int[] getCounts(int rl, int ru, int[] counts) {
 
 		int i = rl;
-		final AIterator it = _indexes.getIterator();
-		it.skipTo(rl);
+		final AIterator it = _indexes.getIterator(rl);
 
 		int zeros = 0;
 		while(it.hasNext() && it.value() < ru) {
@@ -242,46 +242,87 @@ public class ColGroupSDCZeros extends ColGroupValue {
 		return _data.getIndex(r);
 	}
 
-	@Override
-	public double[] preAggregate(double[] a, int aRows) {
-		final double[] vals = allocDVector(getNumValues(), true);
-		final AIterator it = _indexes.getIterator();
-		if(aRows > 0) {
-			final int offT = _numRows * aRows;
-			while(it.hasNext()) {
-				final int i = it.value();
-				vals[getIndex(it.getDataIndexAndIncrement())] += a[i + offT];
-			}
-		}
+	// @Override
+	// public double[] preAggregate(double[] a, int aRows) {
+	// final double[] vals = allocDVector(getNumValues(), true);
+	// final AIterator it = _indexes.getIterator();
+	// if(aRows > 0) {
+	// final int offT = _numRows * aRows;
+	// while(it.hasNext()) {
+	// final int i = it.value();
+	// vals[getIndex(it.getDataIndexAndIncrement())] += a[i + offT];
+	// }
+	// }
+	// else
+	// while(it.hasNext()) {
+	// final int i = it.value();
+	// vals[getIndex(it.getDataIndexAndIncrement())] += a[i];
+	// }
+
+	// return vals;
+	// }
+
+	// @Override
+	// public double[] preAggregateSparse(SparseBlock sb, int row) {
+	// final double[] vals = allocDVector(getNumValues(), true);
+	// final int[] sbIndexes = sb.indexes(row);
+	// final double[] sparseV = sb.values(row);
+	// final AIterator it = _indexes.getIterator();
+	// final int sbEnd = sb.size(row) + sb.pos(row);
+
+	// int sbP = sb.pos(row);
+
+	// while(it.hasNext() && sbP < sbEnd) {
+	// if(it.value() == sbIndexes[sbP])
+	// vals[getIndex(it.getDataIndexAndIncrement())] += sparseV[sbP++];
+	// if(sbP < sbEnd)
+	// it.skipTo(sbIndexes[sbP]);
+	// while(sbP < sbEnd && sbIndexes[sbP] < it.value())
+	// sbP++;
+	// }
+
+	// return vals;
+	// }
+
+	@Override
+	protected void preAggregate(MatrixBlock m, MatrixBlock preAgg, int rl, int ru) { // dispatches on the storage format of m; rows [rl, ru) of m are aggregated into preAgg
+		if(m.isInSparseFormat())
+			preAggregateSparse(m.getSparseBlock(), preAgg, rl, ru);
 		else
+			preAggregateDense(m, preAgg, rl, ru);
+	}
+
+	private void preAggregateDense(MatrixBlock m, MatrixBlock preAgg, int rl, int ru) { // for each row of m in [rl, ru), sums m's dense values into preAgg grouped by dictionary index
+		final double[] preAV = preAgg.getDenseBlockValues();
+		final double[] mV = m.getDenseBlockValues();
+		final int numVals = getNumValues();
+		for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += numVals) { // output row 0 corresponds to rl; each output row holds numVals entries
+			final AIterator it = _indexes.getIterator();
+			final int offLeft = rowLeft * _numRows; // mV is indexed with a row stride of _numRows
 			while(it.hasNext()) {
 				final int i = it.value();
-				vals[getIndex(it.getDataIndexAndIncrement())] += a[i];
+				preAV[offOut + getIndex(it.getDataIndexAndIncrement())] += mV[offLeft + i];
 			}
-
-		return vals;
+		}
 	}
 
-	@Override
-	public double[] preAggregateSparse(SparseBlock sb, int row) {
-		final double[] vals = allocDVector(getNumValues(), true);
-		final int[] sbIndexes = sb.indexes(row);
-		final double[] sparseV = sb.values(row);
-		final AIterator it = _indexes.getIterator();
-		final int sbEnd = sb.size(row) + sb.pos(row);
-
-		int sbP = sb.pos(row);
-
-		while(it.hasNext() && sbP < sbEnd) {
-			if(it.value() == sbIndexes[sbP])
-				vals[getIndex(it.getDataIndexAndIncrement())] += sparseV[sbP++];
-			if(sbP < sbEnd)
-				it.skipTo(sbIndexes[sbP]);
-			while(sbP < sbEnd && sbIndexes[sbP] < it.value())
-				sbP++;
+	private void preAggregateSparse(SparseBlock sb, MatrixBlock preAgg, int rl, int ru) { // sparse counterpart: only non-empty rows of sb in [rl, ru) contribute to preAgg
+		final double[] preAV = preAgg.getDenseBlockValues();
+		final int numVals = getNumValues();
+		for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += numVals) { // offOut still advances for skipped rows (loop update runs on continue)
+			if(sb.isEmpty(rowLeft))
+				continue;
+			final AIterator it = _indexes.getIterator();
+			final int apos = sb.pos(rowLeft);
+			final int alen = sb.size(rowLeft) + apos; // exclusive end position in aix/avals, not a length
+			final int[] aix = sb.indexes(rowLeft);
+			final double[] avals = sb.values(rowLeft);
+			for(int j = apos; j < alen; j++) {
+				it.skipTo(aix[j]); // NOTE(review): no it.hasNext() guard - assumes skipTo()/value() stay valid once the offsets are exhausted; confirm
+				if(it.value() == aix[j])
+					preAV[offOut + _data.getIndex(it.getDataIndexAndIncrement())] += avals[j];
+			}
 		}
-
-		return vals;
 	}
 
 	@Override
@@ -325,7 +366,7 @@ public class ColGroupSDCZeros extends ColGroupValue {
 	public void readFields(DataInput in) throws IOException {
 		super.readFields(in);
 		_indexes = OffsetFactory.readIn(in);
-		_data = MapToFactory.readIn(in, getNumValues());
+		_data = MapToFactory.readIn(in); // read order: super fields, offsets, then mapping; presumably must mirror the write order - confirm
 	}
 
 	@Override
@@ -358,132 +399,132 @@ public class ColGroupSDCZeros extends ColGroupValue {
 		return sb.toString();
 	}
 
-	@Override
-	public IPreAggregate preAggregateDDC(ColGroupDDC lhs) {
-		final int rhsNV = this.getNumValues();
-		final int nCol = lhs.getNumValues();
-		final int retSize = nCol * rhsNV;
-		final IPreAggregate ag = PreAggregateFactory.ag(retSize);
-		final AIterator it = _indexes.getIterator();
-
-		while(it.hasNext()) {
-			final int col = lhs._data.getIndex(it.value());
-			final int row = getIndex(it.getDataIndexAndIncrement());
-			ag.increment(col + row * nCol);
-		}
-		return ag;
-	}
-
-	@Override
-	public IPreAggregate preAggregateSDC(ColGroupSDC lhs) {
-		final int rhsNV = this.getNumValues();
-		final int nCol = lhs.getNumValues();
-
-		final int defL = nCol - 1;
-		final int retSize = nCol * rhsNV;
-
-		IPreAggregate ag = PreAggregateFactory.ag(retSize);
-
-		AIterator lIt = lhs._indexes.getIterator();
-		AIterator rIt = this._indexes.getIterator();
-
-		while(lIt.hasNext() && rIt.hasNext())
-			if(lIt.value() == rIt.value())
-				ag.increment(
-					lhs.getIndex(lIt.getDataIndexAndIncrement()) + getIndex(rIt.getDataIndexAndIncrement()) * nCol);
-			else if(lIt.value() > rIt.value())
-				ag.increment(defL + getIndex(rIt.getDataIndexAndIncrement()) * nCol);
-			else
-				lIt.next();
-
-		while(rIt.hasNext())
-			ag.increment(defL + getIndex(rIt.getDataIndexAndIncrement()) * nCol);
-
-		return ag;
-	}
-
-	@Override
-	public IPreAggregate preAggregateSDCSingle(ColGroupSDCSingle lhs) {
-		throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
-			+ this.getClass().getSimpleName());
-	}
-
-	@Override
-	public IPreAggregate preAggregateSDCZeros(ColGroupSDCZeros lhs) {
-		final int rhsNV = this.getNumValues();
-		final int nCol = lhs.getNumValues();
-		final int retSize = nCol * rhsNV;
-
-		final IPreAggregate ag = PreAggregateFactory.ag(retSize);
-
-		final AIterator lIt = lhs._indexes.getIterator();
-		final AIterator rIt = _indexes.getIterator();
-
-		while(lIt.hasNext() && rIt.hasNext())
-			if(lIt.value() == rIt.value())
-				ag.increment(
-					lhs.getIndex(lIt.getDataIndexAndIncrement()) + getIndex(rIt.getDataIndexAndIncrement()) * nCol);
-			else if(lIt.value() < rIt.value())
-				lIt.next();
-			else
-				rIt.next();
-
-		return ag;
-	}
-
-	@Override
-	public IPreAggregate preAggregateSDCSingleZeros(ColGroupSDCSingleZeros lhs) {
-		final int rhsNV = this.getNumValues();
-		final int nCol = lhs.getNumValues();
-		final int retSize = nCol * rhsNV;
-		final IPreAggregate ag = PreAggregateFactory.ag(retSize);
-		final AIterator lIt = lhs._indexes.getIterator();
-		final AIterator rIt = _indexes.getIterator();
-
-		while(lIt.hasNext() && rIt.hasNext())
-			if(lIt.value() == rIt.value()) {
-				ag.increment(getIndex(rIt.getDataIndexAndIncrement()));
-				lIt.next();
-			}
-			else if(lIt.value() < rIt.value())
-				lIt.next();
-			else
-				rIt.next();
-
-		return ag;
-	}
-
-	@Override
-	public IPreAggregate preAggregateOLE(ColGroupOLE lhs) {
-		final int NVR = this.getNumValues();
-		final int NVL = lhs.getNumValues();
-		final int retSize = NVR * NVL;
-		final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
-		final IPreAggregate ag = PreAggregateFactory.ag(retSize);
-
-		for(int kl = 0; kl < NVL; kl++) {
-			final AIterator rIt = _indexes.getIterator();
-			final int bOffL = lhs._ptr[kl];
-			final int bLenL = lhs.len(kl);
-			for(int bixL = 0, offL = 0, sLenL = 0; rIt.hasNext() && bixL < bLenL; bixL += sLenL + 1, offL += blksz) {
-				sLenL = lhs._data[bOffL + bixL];
-				for(int i = 1; rIt.hasNext() && i <= sLenL; i++) {
-					final int col = offL + lhs._data[bOffL + bixL + i];
-					rIt.skipTo(col);
-					if(rIt.value() == col)
-						ag.increment(kl + getIndex(rIt.getDataIndexAndIncrement()) * NVL);
-
-				}
-			}
-		}
-		return ag;
-	}
-
-	@Override
-	public IPreAggregate preAggregateRLE(ColGroupRLE lhs) {
-		throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
-			+ this.getClass().getSimpleName());
-	}
+	// @Override
+	// public IPreAggregate preAggregateDDC(ColGroupDDC lhs) {
+	// final int rhsNV = this.getNumValues();
+	// final int nCol = lhs.getNumValues();
+	// final int retSize = nCol * rhsNV;
+	// final IPreAggregate ag = PreAggregateFactory.ag(retSize);
+	// final AIterator it = _indexes.getIterator();
+
+	// while(it.hasNext()) {
+	// final int col = lhs._data.getIndex(it.value());
+	// final int row = getIndex(it.getDataIndexAndIncrement());
+	// ag.increment(col + row * nCol);
+	// }
+	// return ag;
+	// }
+
+	// @Override
+	// public IPreAggregate preAggregateSDC(ColGroupSDC lhs) {
+	// final int rhsNV = this.getNumValues();
+	// final int nCol = lhs.getNumValues();
+
+	// final int defL = nCol - 1;
+	// final int retSize = nCol * rhsNV;
+
+	// IPreAggregate ag = PreAggregateFactory.ag(retSize);
+
+	// AIterator lIt = lhs._indexes.getIterator();
+	// AIterator rIt = this._indexes.getIterator();
+
+	// while(lIt.hasNext() && rIt.hasNext())
+	// if(lIt.value() == rIt.value())
+	// ag.increment(
+	// lhs.getIndex(lIt.getDataIndexAndIncrement()) + getIndex(rIt.getDataIndexAndIncrement()) * nCol);
+	// else if(lIt.value() > rIt.value())
+	// ag.increment(defL + getIndex(rIt.getDataIndexAndIncrement()) * nCol);
+	// else
+	// lIt.next();
+
+	// while(rIt.hasNext())
+	// ag.increment(defL + getIndex(rIt.getDataIndexAndIncrement()) * nCol);
+
+	// return ag;
+	// }
+
+	// @Override
+	// public IPreAggregate preAggregateSDCSingle(ColGroupSDCSingle lhs) {
+	// throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
+	// + this.getClass().getSimpleName());
+	// }
+
+	// @Override
+	// public IPreAggregate preAggregateSDCZeros(ColGroupSDCZeros lhs) {
+	// final int rhsNV = this.getNumValues();
+	// final int nCol = lhs.getNumValues();
+	// final int retSize = nCol * rhsNV;
+
+	// final IPreAggregate ag = PreAggregateFactory.ag(retSize);
+
+	// final AIterator lIt = lhs._indexes.getIterator();
+	// final AIterator rIt = _indexes.getIterator();
+
+	// while(lIt.hasNext() && rIt.hasNext())
+	// if(lIt.value() == rIt.value())
+	// ag.increment(
+	// lhs.getIndex(lIt.getDataIndexAndIncrement()) + getIndex(rIt.getDataIndexAndIncrement()) * nCol);
+	// else if(lIt.value() < rIt.value())
+	// lIt.next();
+	// else
+	// rIt.next();
+
+	// return ag;
+	// }
+
+	// @Override
+	// public IPreAggregate preAggregateSDCSingleZeros(ColGroupSDCSingleZeros lhs) {
+	// final int rhsNV = this.getNumValues();
+	// final int nCol = lhs.getNumValues();
+	// final int retSize = nCol * rhsNV;
+	// final IPreAggregate ag = PreAggregateFactory.ag(retSize);
+	// final AIterator lIt = lhs._indexes.getIterator();
+	// final AIterator rIt = _indexes.getIterator();
+
+	// while(lIt.hasNext() && rIt.hasNext())
+	// if(lIt.value() == rIt.value()) {
+	// ag.increment(getIndex(rIt.getDataIndexAndIncrement()));
+	// lIt.next();
+	// }
+	// else if(lIt.value() < rIt.value())
+	// lIt.next();
+	// else
+	// rIt.next();
+
+	// return ag;
+	// }
+
+	// @Override
+	// public IPreAggregate preAggregateOLE(ColGroupOLE lhs) {
+	// final int NVR = this.getNumValues();
+	// final int NVL = lhs.getNumValues();
+	// final int retSize = NVR * NVL;
+	// final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
+	// final IPreAggregate ag = PreAggregateFactory.ag(retSize);
+
+	// for(int kl = 0; kl < NVL; kl++) {
+	// final AIterator rIt = _indexes.getIterator();
+	// final int bOffL = lhs._ptr[kl];
+	// final int bLenL = lhs.len(kl);
+	// for(int bixL = 0, offL = 0, sLenL = 0; rIt.hasNext() && bixL < bLenL; bixL += sLenL + 1, offL += blksz) {
+	// sLenL = lhs._data[bOffL + bixL];
+	// for(int i = 1; rIt.hasNext() && i <= sLenL; i++) {
+	// final int col = offL + lhs._data[bOffL + bixL + i];
+	// rIt.skipTo(col);
+	// if(rIt.value() == col)
+	// ag.increment(kl + getIndex(rIt.getDataIndexAndIncrement()) * NVL);
+
+	// }
+	// }
+	// }
+	// return ag;
+	// }
+
+	// @Override
+	// public IPreAggregate preAggregateRLE(ColGroupRLE lhs) {
+	// throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
+	// + this.getClass().getSimpleName());
+	// }
 
 	@Override
 	public Dictionary preAggregateThatDDCStructure(ColGroupDDC that, Dictionary ret) {
@@ -549,8 +590,4 @@ public class ColGroupSDCZeros extends ColGroupValue {
 		throw new NotImplementedException();
 	}
 
-	@Override
-	public MatrixBlock preAggregate(MatrixBlock m, int rl, int ru) {
-		throw new NotImplementedException();
-	}
 }
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSizes.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSizes.java
index faa3bb3..8f5d59d 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSizes.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSizes.java
@@ -28,7 +28,7 @@ import org.apache.sysds.runtime.compress.colgroup.offset.OffsetFactory;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.apache.sysds.utils.MemoryEstimates;
 
-public class ColGroupSizes {
+public final class ColGroupSizes {
 	protected static final Log LOG = LogFactory.getLog(ColGroupSizes.class.getName());
 
 	public static long estimateInMemorySizeGroup(int nrColumns) {
@@ -44,7 +44,6 @@ public class ColGroupSizes {
 	public static long estimateInMemorySizeGroupValue(int nrColumns, int nrValues, double tupleSparsity,
 		boolean lossy) {
 		long size = estimateInMoemorySizeCompressedColumn(nrColumns);
-		// LOG.error("MemorySize Group Value: " + nrColumns + " " + nrValues + " " + lossy);
 		size += 8; // Dictionary Reference.
 		size += 8; // Counts reference
 		size += 1; // _zeros boolean reference
@@ -56,7 +55,6 @@ public class ColGroupSizes {
 
 	public static long estimateInMemorySizeDDC(int nrCols, int numTuples, int dataLength, double tupleSparsity,
 		boolean lossy) {
-		// LOG.error("Arguments for DDC memory Estimate " + nrCols + " " + numTuples + " " + dataLength + " " + lossy);
 		long size = estimateInMemorySizeGroupValue(nrCols, numTuples, tupleSparsity, lossy);
 		size += MapToFactory.estimateInMemorySize(dataLength, numTuples);
 		return size;
@@ -64,7 +62,6 @@ public class ColGroupSizes {
 
 	public static long estimateInMemorySizeOffset(int nrColumns, int nrValues, int pointers, int offsetLength,
 		double tupleSparsity, boolean lossy) {
-		// LOG.error("Offset Size: " + nrColumns + " " + nrValues + " " + pointers + " " + offsetLength);
 		long size = estimateInMemorySizeGroupValue(nrColumns, nrValues, tupleSparsity, lossy);
 		size += MemoryEstimates.intArrayCost(pointers);
 		size += MemoryEstimates.charArrayCost(offsetLength);
@@ -73,17 +70,14 @@ public class ColGroupSizes {
 
 	public static long estimateInMemorySizeOLE(int nrColumns, int nrValues, int offsetLength, int nrRows,
 		double tupleSparsity, boolean lossy) {
-		// LOG.error(nrColumns + " " + nrValues + " " + offsetLength + " " + nrRows + " " + lossy);
 		nrColumns = nrColumns > 0 ? nrColumns : 1;
 		offsetLength += (nrRows / CompressionSettings.BITMAP_BLOCK_SZ) * 2;
-		long size = estimateInMemorySizeOffset(nrColumns, nrValues, nrValues  + 1, offsetLength,
-			tupleSparsity, lossy);
+		long size = estimateInMemorySizeOffset(nrColumns, nrValues, nrValues + 1, offsetLength, tupleSparsity, lossy); // nrValues + 1 is the pointer count
 		return size;
 	}
 
 	public static long estimateInMemorySizeRLE(int nrColumns, int nrValues, int nrRuns, int nrRows,
 		double tupleSparsity, boolean lossy) {
-		// LOG.error("RLE Size: " + nrColumns + " " + nrValues + " " + nrRuns + " " + nrRows);
 		int offsetLength = (nrRuns) * 2;
 		long size = estimateInMemorySizeOffset(nrColumns, nrValues, (nrValues) + 1, offsetLength, tupleSparsity, lossy);
 
@@ -94,10 +88,9 @@ public class ColGroupSizes {
 		boolean largestOffIsZero, boolean containNoZeroValues, double tupleSparsity, boolean lossy) {
 		long size = estimateInMemorySizeGroupValue(nrColumns,
 			nrValues + (largestOffIsZero || containNoZeroValues ? 0 : 1), tupleSparsity, lossy);
-		// LOG.error("SDC Estimation values: " + nrColumns + " " + nrValues + " " + nrRows + " " + largestOff);
-		size += OffsetFactory.estimateInMemorySize(nrRows - largestOff - 1, nrRows);
+		size += OffsetFactory.estimateInMemorySize(nrRows - largestOff, nrRows);
 		if(nrValues > 1)
-			size += MapToFactory.estimateInMemorySize(nrRows - largestOff, nrValues);
+			size += MapToFactory.estimateInMemorySize(nrRows - largestOff, nrValues - 1);
 		return size;
 	}
 
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUncompressed.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUncompressed.java
index 562e4d8..6e8524e 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUncompressed.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUncompressed.java
@@ -25,6 +25,7 @@ import java.io.IOException;
 import java.util.Arrays;
 
 import org.apache.commons.lang.NotImplementedException;
+import org.apache.sysds.runtime.DMLCompressionException;
 import org.apache.sysds.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
 import org.apache.sysds.runtime.data.DenseBlock;
 import org.apache.sysds.runtime.data.DenseBlockFP64;
@@ -77,7 +78,7 @@ public class ColGroupUncompressed extends AColGroup {
 			final int col = colIndicesList[0];
 			if(transposed) {
 				_data = rawBlock.slice(col, col, 0, rawBlock.getNumColumns() - 1);
-				_data = LibMatrixReorg.transposeInPlace(_data, 1);
+				_data = LibMatrixReorg.transposeInPlace(_data, InfrastructureAnalyzer.getLocalParallelism());
 			}
 			else
 				_data = rawBlock.slice(0, rawBlock.getNumRows() - 1, col, col);
@@ -158,42 +159,8 @@ public class ColGroupUncompressed extends AColGroup {
 
 	@Override
 	public void decompressToBlockSafe(MatrixBlock target, int rl, int ru, int offT) {
-		double[] c = target.getDenseBlockValues();
-		final int nCol = _colIndexes.length;
-		final int tCol = target.getNumColumns();
-		long nnz = 0;
-		if(_data.isEmpty())
-			return;
-		else if(_data.isInSparseFormat()) {
-			SparseBlock sb = _data.getSparseBlock();
-			for(int row = rl; row < ru; row++, offT += tCol) {
-				if(!sb.isEmpty(row)) {
-					int apos = sb.pos(row);
-					int alen = sb.size(row) + apos;
-					int[] aix = sb.indexes(row);
-					double[] avals = sb.values(row);
-					nnz += alen;
-					for(int col = apos; col < alen; col++) {
-						c[_colIndexes[aix[col]] + offT] += avals[col];
-					}
-				}
-			}
-		}
-		else {
-			double[] values = _data.getDenseBlockValues();
-			offT = offT * tCol;
-			int offS = rl * nCol;
-			for(int row = rl; row < ru; row++, offT += tCol, offS += nCol) {
-				for(int j = 0; j < nCol; j++) {
-					final double v = values[offS + j];
-					if(v != 0) {
-						c[offT + _colIndexes[j]] += v;
-						nnz++;
-					}
-				}
-			}
-		}
-		target.setNonZeros(nnz + target.getNonZeros());
+		decompressToBlockUnSafe(target, rl, ru, offT); // the unsafe variant writes the values; this wrapper only adds the non-zero bookkeeping
+		target.setNonZeros(_data.getNonZeros() + target.getNonZeros()); // NOTE(review): adds the nnz of the whole group even if [rl, ru) is a sub-range (removed code counted inside the row loops) - confirm callers
 	}
 
 	@Override
@@ -228,43 +195,43 @@ public class ColGroupUncompressed extends AColGroup {
 		}
 	}
 
-	@Override
-	public void decompressToBlock(MatrixBlock target, int[] colIndexTargets) {
-		throw new NotImplementedException("Not Implemented");
-	}
+	// @Override
+	// public void decompressToBlock(MatrixBlock target, int[] colIndexTargets) {
+	// 	throw new NotImplementedException("Not Implemented");
+	// }
 
-	@Override
-	public void decompressColumnToBlock(MatrixBlock target, int colpos) {
-		double[] c = target.getDenseBlockValues();
-		int nnz = 0;
-		int off = colpos;
-		if(_data.isInSparseFormat()) {
-			for(int i = 0; i < _data.getNumRows(); i++) {
-				c[i] += _data.quickGetValue(i, colpos);
-				if(c[i] != 0)
-					nnz++;
-			}
-		}
-		else {
-			double[] denseValues = _data.getDenseBlockValues();
-			for(int i = 0; i < _data.getNumRows(); i++, off += _colIndexes.length) {
-				c[i] += denseValues[off];
-				if(c[i] != 0)
-					nnz++;
-			}
-		}
-		target.setNonZeros(nnz);
-	}
+	// @Override
+	// public void decompressColumnToBlock(MatrixBlock target, int colpos) {
+	// 	double[] c = target.getDenseBlockValues();
+	// 	int nnz = 0;
+	// 	int off = colpos;
+	// 	if(_data.isInSparseFormat()) {
+	// 		for(int i = 0; i < _data.getNumRows(); i++) {
+	// 			c[i] += _data.quickGetValue(i, colpos);
+	// 			if(c[i] != 0)
+	// 				nnz++;
+	// 		}
+	// 	}
+	// 	else {
+	// 		double[] denseValues = _data.getDenseBlockValues();
+	// 		for(int i = 0; i < _data.getNumRows(); i++, off += _colIndexes.length) {
+	// 			c[i] += denseValues[off];
+	// 			if(c[i] != 0)
+	// 				nnz++;
+	// 		}
+	// 	}
+	// 	target.setNonZeros(nnz);
+	// }
 
-	@Override
-	public void decompressColumnToBlock(MatrixBlock target, int colpos, int rl, int ru) {
-		throw new NotImplementedException("Not Implemented");
-	}
+	// @Override
+	// public void decompressColumnToBlock(MatrixBlock target, int colpos, int rl, int ru) {
+	// 	throw new NotImplementedException("Not Implemented");
+	// }
 
-	@Override
-	public void decompressColumnToBlock(double[] target, int colpos, int rl, int ru) {
-		throw new NotImplementedException("Not Implemented");
-	}
+	// @Override
+	// public void decompressColumnToBlock(double[] target, int colpos, int rl, int ru) {
+	// 	throw new NotImplementedException("Not Implemented");
+	// }
 
 	@Override
 	public double get(int r, int c) {
@@ -275,46 +242,7 @@ public class ColGroupUncompressed extends AColGroup {
 			return _data.quickGetValue(r, ix);
 	}
 
-	// @Override
-	// public void rightMultByVector(double[] b, double[] c, int rl, int ru, double[] dictVals) {
-	// throw new NotImplementedException("Should not be called use other matrix function");
-	// }
-
-	public void rightMultByVector(MatrixBlock vector, MatrixBlock result, int rl, int ru) {
-		// Pull out the relevant rows of the vector
-		int clen = _colIndexes.length;
-
-		MatrixBlock shortVector = new MatrixBlock(clen, 1, false);
-		shortVector.allocateDenseBlock();
-		double[] b = shortVector.getDenseBlockValues();
-		for(int colIx = 0; colIx < clen; colIx++)
-			b[colIx] = vector.quickGetValue(_colIndexes[colIx], 0);
-		shortVector.recomputeNonZeros();
-
-		// Multiply the selected columns by the appropriate parts of the vector
-		LibMatrixMult.matrixMult(_data, shortVector, result, rl, ru);
-	}
-
-	public void rightMultByMatrix(MatrixBlock matrix, MatrixBlock result, int rl, int ru) {
-		// Pull out the relevant rows of the vector
-
-		int clen = _colIndexes.length;
-		MatrixBlock subMatrix = new MatrixBlock(clen, matrix.getNumColumns(), false);
-		subMatrix.allocateDenseBlock();
-		double[] b = subMatrix.getDenseBlockValues();
-
-		for(int colIx = 0; colIx < clen; colIx++) {
-			int row = _colIndexes[colIx];
-			for(int col = 0; col < matrix.getNumColumns(); col++)
-				b[colIx * matrix.getNumColumns() + col] = matrix.quickGetValue(row, col);
-		}
-
-		subMatrix.setNonZeros(clen * matrix.getNumColumns());
-
-		// // Multiply the selected columns by the appropriate parts of the vector
-		LibMatrixMult.matrixMult(_data, subMatrix, result);
-	}
-
+	@Override
 	public void leftMultByMatrix(MatrixBlock matrix, MatrixBlock result, int rl, int ru) {
 
 		final MatrixBlock tmpRet = new MatrixBlock(ru - rl, _data.getNumColumns(), false);
@@ -328,14 +256,16 @@ public class ColGroupUncompressed extends AColGroup {
 		else if(tmpRet.isInSparseFormat()) {
 			final SparseBlock sb = tmpRet.getSparseBlock();
 			for(int rowIdx = 0; rowIdx < ru - rl; rowIdx++, offT += result.getNumColumns()) {
-				if(!sb.isEmpty(rowIdx)) {
-					final int apos = sb.pos(rowIdx);
-					final int alen = sb.size(rowIdx) + apos;
-					final int[] aix = sb.indexes(rowIdx);
-					final double[] avals = sb.values(rowIdx);
-					for(int col = apos; col < alen; col++)
-						resV[offT + _colIndexes[aix[col]]] += avals[col];
-				}
+				if(sb.isEmpty(rowIdx))
+					continue;
+
+				final int apos = sb.pos(rowIdx);
+				final int alen = sb.size(rowIdx) + apos;
+				final int[] aix = sb.indexes(rowIdx);
+				final double[] avals = sb.values(rowIdx);
+				for(int col = apos; col < alen; col++)
+					resV[offT + _colIndexes[aix[col]]] += avals[col];
+
 			}
 		}
 		else {
@@ -372,7 +302,7 @@ public class ColGroupUncompressed extends AColGroup {
 		else
 			_data.binaryOperations(op, that, resultBlock);
 
-		return new ColGroupUncompressed(_colIndexes, resultBlock, false);
+		return new ColGroupUncompressed(_colIndexes, resultBlock);
 	}
 
 	public void unaryAggregateOperations(AggregateUnaryOperator op, double[] ret) {
@@ -531,11 +461,17 @@ public class ColGroupUncompressed extends AColGroup {
 		}
 		else {
 			double[] tmpV = tmp.getDenseBlockValues();
-			for(int row = 0, offRet = 0, offTmp = 0; row < tCol; row++, offRet += numColumns, offTmp += tCol)
+			for(int row = 0, offTmp = 0; row < tCol; row++, offTmp += tCol) {
+				final int offRet = _colIndexes[row] * numColumns;
 				for(int col = row; col < tCol; col++)
 					result[offRet + _colIndexes[col]] += tmpV[offTmp + col];
+			}
 		}
+	}
 
+	@Override
+	public void tsmm(double[] result, int numColumns, int idxStart, int idxEnd) { // index-range variant of tsmm
+		throw new NotImplementedException(); // not supported for this group type yet: every call throws
+	}
 
 	@Override
@@ -543,7 +479,7 @@ public class ColGroupUncompressed extends AColGroup {
 		MatrixBlock newData = new MatrixBlock(_data.getNumRows(), _data.getNumColumns(), _data.isInSparseFormat());
 		// _data.copy(newData);
 		newData.copy(_data);
-		return new ColGroupUncompressed(_colIndexes, newData, false);
+		return new ColGroupUncompressed(_colIndexes, newData);
 	}
 
 	@Override
@@ -572,7 +508,7 @@ public class ColGroupUncompressed extends AColGroup {
 			return;
 		if(lhs instanceof ColGroupUncompressed) {
 			ColGroupUncompressed lhsUC = (ColGroupUncompressed) lhs;
-			MatrixBlock tmpRet = new MatrixBlock(_colIndexes.length, _colIndexes.length, 0);
+			MatrixBlock tmpRet = new MatrixBlock(lhs.getNumCols(), _colIndexes.length, 0);
 
 			if(lhsUC._data == this._data) {
 
@@ -583,7 +519,7 @@ public class ColGroupUncompressed extends AColGroup {
 				LOG.warn("Inefficient Left Matrix Multiplication with transpose of left hand side : t(l) %*% r");
 				MatrixBlock lhData = lhsUC._data;
 				MatrixBlock transposed = new MatrixBlock(lhData.getNumColumns(), lhData.getNumRows(), false);
-				LibMatrixReorg.transpose(lhData, transposed);
+				LibMatrixReorg.transpose(lhData, transposed, InfrastructureAnalyzer.getLocalParallelism());
 				transposed.setNonZeros(lhData.getNonZeros());
 				// do transposed left hand side, matrix multiplication.
 				LibMatrixMult.matrixMult(transposed, this._data, tmpRet);
@@ -594,46 +530,64 @@ public class ColGroupUncompressed extends AColGroup {
 				return;
 			else if(tmpRet.isInSparseFormat()) {
 				SparseBlock sb = tmpRet.getSparseBlock();
-				for(int rowIdx = 0, offT = 0; rowIdx < tmpRet.getNumRows(); rowIdx++, offT += result.getNumColumns()) {
-					if(!sb.isEmpty(rowIdx)) {
-						final int apos = sb.pos(rowIdx);
-						final int alen = sb.size(rowIdx) + apos;
-						final int[] aix = sb.indexes(rowIdx);
-						final double[] avals = sb.values(rowIdx);
-						for(int col = apos; col < alen; col++)
-							resV[offT + _colIndexes[aix[col]]] += avals[col];
-					}
+				for(int row = 0; row < lhs._colIndexes.length; row++) {
+					if(sb.isEmpty(row))
+						continue;
+					final int apos = sb.pos(row);
+					final int alen = sb.size(row) + apos;
+					final int[] aix = sb.indexes(row);
+					final double[] avals = sb.values(row);
+					final int offRes = lhs._colIndexes[row] * result.getNumColumns();
+					for(int col = apos; col < alen; col++)
+						resV[offRes + _colIndexes[aix[col]]] += avals[col];
 				}
 			}
 			else {
 				double[] tmpRetV = tmpRet.getDenseBlockValues();
-				for(int j = 0, offTemp = 0, offT = 0;
-					j < tmpRet.getNumRows();
-					j++, offTemp += _colIndexes.length, offT += result.getNumColumns()) {
-					for(int i = 0; i < _colIndexes.length; i++)
-						resV[offT + _colIndexes[i]] += tmpRetV[offTemp + i];
+				for(int row = 0; row < lhs._colIndexes.length; row++) {
+					final int offRes = lhs._colIndexes[row] * result.getNumColumns();
+					final int offTmp = _colIndexes.length * row; // tmpRet has _colIndexes.length columns, so that is its dense row stride
+					for(int col = 0; col < _colIndexes.length; col++) {
+						resV[offRes + _colIndexes[col]] += tmpRetV[offTmp + col];
+					}
 				}
 			}
-
 		}
 		else {
-
-			LOG.warn("Inefficient transpose of uncompressed to fit to"
+			LOG.warn("\nInefficient transpose of uncompressed to fit to"
 				+ " t(AColGroup) %*% UncompressedColGroup mult by colGroup uncompressed column"
-				+ " Currently solved by t(t(Uncompressed) %*% AColGroup");
-			MatrixBlock tmpTransposedResult = new MatrixBlock(result.getNumColumns(), result.getNumRows(), false);
+				+ "\nCurrently solved by t(t(Uncompressed) %*% AColGroup)");
+			MatrixBlock ucCG = getData();
+			// make a function that allows the result of the mult to be directly output to a temporary matrix.
+			MatrixBlock tmpTransposedResult = new MatrixBlock(ucCG.getNumColumns(), result.getNumColumns(), false);
 			tmpTransposedResult.allocateDenseBlock();
 
-			MatrixBlock ucCG = getData();
 			MatrixBlock tmp = new MatrixBlock(ucCG.getNumColumns(), ucCG.getNumRows(), ucCG.isInSparseFormat());
 			LibMatrixReorg.transpose(ucCG, tmp, InfrastructureAnalyzer.getLocalParallelism());
 			lhs.leftMultByMatrix(tmp, tmpTransposedResult);
+			tmpTransposedResult.setNonZeros(ucCG.getNumColumns() * result.getNumColumns());
 
 			final double[] resV = result.getDenseBlockValues();
-			final double[] tmpV = tmpTransposedResult.getDenseBlockValues();
-			for(int row = 0; row < result.getNumRows(); row++) {
-				for(int col = 0; col < result.getNumColumns(); col++) {
-					resV[row * result.getNumColumns() + col] += tmpV[col * result.getNumRows() + row];
+			final int[] lhsC = lhs._colIndexes;
+			final int[] rhsC = _colIndexes;
+
+			// allocate the resulting matrix into the correct result indexes.
+			// Note that the intermediate matrix is transposed, therefore the indexes are different than a normal
+			// allocation.
+
+			if(tmpTransposedResult.isEmpty())
+				return;
+			else if(tmpTransposedResult.isInSparseFormat())
+				throw new NotImplementedException();
+			else {
+				final double[] tmpV = tmpTransposedResult.getDenseBlockValues();
+				final int nCol = result.getNumColumns();
+
+				for(int row = 0; row < rhsC.length; row++) {
+					final int offR = rhsC[row];
+					final int offT = row * nCol;
+					for(int col = 0; col < lhsC.length; col++)
+						resV[offR + lhsC[col] * nCol] += tmpV[offT + lhsC[col]];
 				}
 			}
 		}
@@ -646,20 +600,52 @@ public class ColGroupUncompressed extends AColGroup {
 
 	@Override
 	protected AColGroup sliceMultiColumns(int idStart, int idEnd, int[] outputCols) {
-		MatrixBlock newData = _data.slice(0, _data.getNumRows() - 1, idStart, idEnd - 1, true);
-		if(newData.isEmpty())
-			return new ColGroupEmpty(outputCols, newData.getNumRows());
-		return new ColGroupUncompressed(outputCols, newData, false);
+		try {
+			MatrixBlock newData = _data.slice(0, _data.getNumRows() - 1, idStart, idEnd - 1, true);
+			if(newData.isEmpty())
+				return new ColGroupEmpty(outputCols, newData.getNumRows());
+			return new ColGroupUncompressed(outputCols, newData);
+		}
+		catch(Exception e) {
+			throw new DMLCompressionException("Error in slicing of uncompressed column group", e);
+		}
 	}
 
 	@Override
 	public AColGroup rightMultByMatrix(MatrixBlock right) {
-		int[] outputCols = new int[right.getNumColumns()];
+		final int nColR = right.getNumColumns();
+		int[] outputCols = new int[nColR];
 		for(int i = 0; i < outputCols.length; i++)
 			outputCols[i] = i;
-		MatrixBlock out = new MatrixBlock(_data.getNumRows(), right.getNumColumns(), true);
-		LibMatrixMult.matrixMult(_data, right, out, InfrastructureAnalyzer.getLocalParallelism());
-		return new ColGroupUncompressed(outputCols, out, false);
+		if(_data.isEmpty() || right.isEmpty())
+			return new ColGroupEmpty(outputCols, _data.getNumRows());
+		MatrixBlock subBlockRight;
+
+		if(right.isInSparseFormat()) {
+			subBlockRight = new MatrixBlock(_data.getNumColumns(), nColR, true);
+			subBlockRight.allocateSparseRowsBlock();
+			final SparseBlock sbR = right.getSparseBlock();
+			final SparseBlock subR = subBlockRight.getSparseBlock();
+			for(int i = 0; i < _colIndexes.length; i++)
+				subR.set(i, sbR.get(_colIndexes[i]), false);
+		}
+		else {
+			subBlockRight = new MatrixBlock(_data.getNumColumns(), nColR, false);
+			subBlockRight.allocateDenseBlock();
+			final double[] sbr = subBlockRight.getDenseBlockValues();
+			final double[] rightV = right.getDenseBlockValues();
+			for(int i = 0; i < _colIndexes.length; i++) {
+				final int offSubBlock = i * nColR;
+				final int offRight = _colIndexes[i] * nColR;
+				System.arraycopy(rightV, offRight, sbr, offSubBlock, nColR);
+			}
+		}
+		// Hack: mark the sub block as fully dense (rows * cols) to force computation without counting all non zeros.
+		subBlockRight.setNonZeros(_data.getNumColumns() * nColR);
+		MatrixBlock out = new MatrixBlock(_data.getNumRows(), nColR, false);
+		LibMatrixMult.matrixMult(_data, subBlockRight, out, InfrastructureAnalyzer.getLocalParallelism());
+		return new ColGroupUncompressed(outputCols, out);
+
 	}
 
 	@Override
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupValue.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupValue.java
index 42bf407..af9ab37 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupValue.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupValue.java
@@ -34,16 +34,16 @@ import org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary;
 import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary;
 import org.apache.sysds.runtime.compress.colgroup.dictionary.DictionaryFactory;
 import org.apache.sysds.runtime.compress.colgroup.dictionary.MatrixBlockDictionary;
-import org.apache.sysds.runtime.compress.colgroup.pre.ArrPreAggregate;
-import org.apache.sysds.runtime.compress.colgroup.pre.IPreAggregate;
 import org.apache.sysds.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
 import org.apache.sysds.runtime.data.DenseBlock;
 import org.apache.sysds.runtime.data.DenseBlockFP64;
 import org.apache.sysds.runtime.data.SparseBlock;
 import org.apache.sysds.runtime.functionobjects.Builtin;
+import org.apache.sysds.runtime.instructions.InstructionUtils;
 import org.apache.sysds.runtime.matrix.data.LibMatrixMult;
 import org.apache.sysds.runtime.matrix.data.LibMatrixReorg;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.matrix.operators.AggregateUnaryOperator;
 import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
 import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
 
@@ -54,7 +54,7 @@ import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
 public abstract class ColGroupValue extends ColGroupCompressed implements Cloneable {
 	private static final long serialVersionUID = 3786247536054353658L;
 
-	/** thread-local pairs of reusable temporary vectors for positions and values */
+	/** Thread-local pairs of reusable temporary vectors for positions and values */
 	private static ThreadLocal<Pair<int[], double[]>> memPool = new ThreadLocal<Pair<int[], double[]>>() {
 		@Override
 		protected Pair<int[], double[]> initialValue() {
@@ -85,11 +85,34 @@ public abstract class ColGroupValue extends ColGroupCompressed implements Clonea
 	}
 
 	@Override
-	public void decompressToBlock(MatrixBlock target, int rl, int ru) {
-		decompressToBlock(target, rl, ru, rl);
+	public final void decompressToBlockSafe(MatrixBlock target, int rl, int ru, int offT) {
+		decompressToBlockUnSafe(target, rl, ru, offT);
+		target.setNonZeros(getNumberNonZeros() + target.getNonZeros());
 	}
 
 	@Override
+	public void decompressToBlockUnSafe(MatrixBlock target, int rl, int ru, int offT) {
+		if(_dict instanceof MatrixBlockDictionary) {
+			final MatrixBlockDictionary md = (MatrixBlockDictionary) _dict;
+			final MatrixBlock mb = md.getMatrixBlock();
+			if(mb.isEmpty())
+				return;
+			else if(mb.isInSparseFormat())
+				decompressToBlockUnSafeSparseDictionary(target, rl, ru, offT, mb.getSparseBlock());
+			else
+				decompressToBlockUnSafeDenseDictionary(target, rl, ru, offT, mb.getDenseBlockValues());
+		}
+		else
+			decompressToBlockUnSafeDenseDictionary(target, rl, ru, offT, _dict.getValues());
+	}
+
+	protected abstract void decompressToBlockUnSafeSparseDictionary(MatrixBlock target, int rl, int ru, int offT,
+		SparseBlock sb);
+
+	protected abstract void decompressToBlockUnSafeDenseDictionary(MatrixBlock target, int rl, int ru, int offT,
+		double[] values);
+
+	@Override
 	public final int getNumValues() {
 		return _dict.getNumberOfValues(_colIndexes.length);
 	}
@@ -108,12 +131,12 @@ public abstract class ColGroupValue extends ColGroupCompressed implements Clonea
 		_dict.addMaxAndMin(ret, _colIndexes);
 	}
 
-	protected void setDictionary(ADictionary dict) {
+	protected final void setDictionary(ADictionary dict) {
 		_dict = dict;
 	}
 
 	@Override
-	public MatrixBlock getValuesAsBlock() {
+	public final MatrixBlock getValuesAsBlock() {
 		_dict = _dict.getAsMatrixBlockDictionary(_colIndexes.length);
 		MatrixBlock ret = ((MatrixBlockDictionary) _dict).getMatrixBlock();
 		if(_zeros) {
@@ -175,11 +198,6 @@ public abstract class ColGroupValue extends ColGroupCompressed implements Clonea
 		return true;
 	}
 
-	@Override
-	protected int containsAllZeroTuple() {
-		return _dict.hasZeroTuple(_colIndexes.length);
-	}
-
 	protected final double sumValues(int valIx, double[] b, double[] dictVals) {
 		final int numCols = getNumCols();
 		final int valOff = valIx * numCols;
@@ -198,31 +216,6 @@ public abstract class ColGroupValue extends ColGroupCompressed implements Clonea
 		return val;
 	}
 
-	protected double[] preaggValues(int numVals, double[] b, double[] dictVals) {
-		return preaggValues(numVals, b, false, dictVals, 0);
-	}
-
-	protected double[] preaggValues(int numVals, double[] b, double[] dictVals, int off) {
-		return preaggValues(numVals, b, false, dictVals, off);
-	}
-
-	protected double[] preaggValues(int numVals, double[] b, boolean allocNew, double[] dictVals, int off) {
-		// + 1 to enable containing a zero value. which we have added at the length of
-		// the arrays index.
-		double[] ret = allocNew ? new double[numVals + 1] : allocDVector(numVals + 1, true);
-
-		if(_colIndexes.length == 1) {
-			for(int k = 0; k < numVals; k++)
-				ret[k] = dictVals[k] * b[_colIndexes[0] + off];
-		}
-		else {
-			for(int k = 0; k < numVals; k++)
-				ret[k] = sumValues(k, b, dictVals, off);
-		}
-
-		return ret;
-	}
-
 	private int[] getAggregateColumnsSetDense(double[] b, int cl, int cu, int cut) {
 		Set<Integer> aggregateColumnsSet = new HashSet<>();
 		final int retCols = (cu - cl);
@@ -243,28 +236,7 @@ public abstract class ColGroupValue extends ColGroupCompressed implements Clonea
 		return aggregateColumns;
 	}
 
-	private Pair<int[], double[]> preaggValuesFromDense(final int numVals, final double[] b, final int cl, final int cu,
-		final int cut) {
-
-		final int[] aggregateColumns = getAggregateColumnsSetDense(b, cl, cu, cut);
-		final double[] ret = new double[numVals * aggregateColumns.length];
-
-		for(int k = 0, off = 0;
-			k < numVals * _colIndexes.length;
-			k += _colIndexes.length, off += aggregateColumns.length) {
-			for(int h = 0; h < _colIndexes.length; h++) {
-				int idb = _colIndexes[h] * cut;
-				double v = _dict.getValue(k + h);
-				if(v != 0)
-					for(int i = 0; i < aggregateColumns.length; i++)
-						ret[off + i] += v * b[idb + aggregateColumns[i]];
-			}
-		}
-
-		return new ImmutablePair<>(aggregateColumns, ret);
-	}
-
-	private int[] getAggregateColumnsSetSparse(SparseBlock b) {
+	private int[] getAggregateColumnsSetSparse(SparseBlock b, int retCols) {
 		Set<Integer> aggregateColumnsSet = new HashSet<>();
 
 		for(int h = 0; h < _colIndexes.length; h++) {
@@ -275,6 +247,8 @@ public abstract class ColGroupValue extends ColGroupCompressed implements Clonea
 					aggregateColumnsSet.add(sIndexes[i]);
 				}
 			}
+			if(aggregateColumnsSet.size() == retCols)
+				break;
 		}
 
 		int[] aggregateColumns = aggregateColumnsSet.stream().mapToInt(x -> x).toArray();
@@ -282,12 +256,9 @@ public abstract class ColGroupValue extends ColGroupCompressed implements Clonea
 		return aggregateColumns;
 	}
 
-	private Pair<int[], double[]> preaggValuesFromSparse(int numVals, SparseBlock b, int cl, int cu, int cut) {
-
-		int[] aggregateColumns = getAggregateColumnsSetSparse(b);
-
-		double[] ret = new double[numVals * aggregateColumns.length];
-
+	private double[] preaggValuesFromSparse(int numVals, SparseBlock b, int[] aggregateColumns, int cl, int cu,
+		int cut) {
+		final double[] ret = new double[numVals * aggregateColumns.length];
 		for(int h = 0; h < _colIndexes.length; h++) {
 			int colIdx = _colIndexes[h];
 			if(!b.isEmpty(colIdx)) {
@@ -306,23 +277,10 @@ public abstract class ColGroupValue extends ColGroupCompressed implements Clonea
 				}
 			}
 		}
-		return new ImmutablePair<>(aggregateColumns, ret);
-	}
-
-	public Pair<int[], double[]> preaggForRightMultiplyValues(int numVals, MatrixBlock b, int cl, int cu, int cut) {
-		return b.isInSparseFormat() ? preaggValuesFromSparse(numVals, b.getSparseBlock(), cl, cu,
-			cut) : preaggValuesFromDense(numVals, b.getDenseBlockValues(), cl, cu, cut);
+		return ret;
 	}
 
-	// protected static double[] sparsePreaggValues(int numVals, double v, boolean allocNew, ADictionary dict) {
-	// double[] ret = allocNew ? new double[numVals + 1] : allocDVector(numVals + 1, true);
-
-	// for(int k = 0; k < numVals; k++)
-	// ret[k] = dictVals[k] * v;
-	// return ret;
-	// }
-
-	protected double computeMxx(double c, Builtin builtin) {
+	protected final double computeMxx(double c, Builtin builtin) {
 		if(_zeros)
 			c = builtin.execute(c, 0);
 		if(_dict != null)
@@ -331,7 +289,7 @@ public abstract class ColGroupValue extends ColGroupCompressed implements Clonea
 			return c;
 	}
 
-	protected void computeColMxx(double[] c, Builtin builtin) {
+	protected final void computeColMxx(double[] c, Builtin builtin) {
 		if(_zeros) {
 			for(int x = 0; x < _colIndexes.length; x++)
 				c[_colIndexes[x]] = builtin.execute(c[_colIndexes[x]], 0);
@@ -346,7 +304,7 @@ public abstract class ColGroupValue extends ColGroupCompressed implements Clonea
 	 * @param op scalar operation to perform
 	 * @return transformed copy of value metadata for this column group
 	 */
-	protected ADictionary applyScalarOp(ScalarOperator op) {
+	protected final ADictionary applyScalarOp(ScalarOperator op) {
 		return _dict.clone().apply(op);
 	}
 
@@ -362,7 +320,7 @@ public abstract class ColGroupValue extends ColGroupCompressed implements Clonea
 	 *                appended.
 	 * @return The new Dictionary containing the values.
 	 */
-	protected ADictionary applyScalarOp(ScalarOperator op, double newVal, int numCols) {
+	protected final ADictionary applyScalarOp(ScalarOperator op, double newVal, int numCols) {
 		return _dict.applyScalarOp(op, newVal, numCols);
 	}
 
@@ -375,18 +333,11 @@ public abstract class ColGroupValue extends ColGroupCompressed implements Clonea
 	 * @param left       Specify which side the operation is executed on.
 	 * @return The new Dictionary with values.
 	 */
-	public ADictionary applyBinaryRowOp(BinaryOperator op, double[] v, boolean sparseSafe, boolean left) {
+	protected final ADictionary applyBinaryRowOp(BinaryOperator op, double[] v, boolean sparseSafe, boolean left) {
 		return sparseSafe ? _dict.clone().applyBinaryRowOp(op, v, sparseSafe, _colIndexes, left) : _dict
 			.applyBinaryRowOp(op, v, sparseSafe, _colIndexes, left);
 	}
 
-	protected void setandExecute(double[] c, boolean square, double val, int rix) {
-		if(square)
-			c[rix] += val * val;
-		else
-			c[rix] += val;
-	}
-
 	public static void setupThreadLocalMemory(int len) {
 		if(memPool.get() == null || memPool.get().getLeft().length < len) {
 			Pair<int[], double[]> p = new ImmutablePair<>(new int[len], new double[len]);
@@ -400,7 +351,6 @@ public abstract class ColGroupValue extends ColGroupCompressed implements Clonea
 
 	protected static double[] allocDVector(int len, boolean reset) {
 		Pair<int[], double[]> p = memPool.get();
-
 		// sanity check for missing setup
 		if(p == null) {
 			return new double[len];
@@ -449,14 +399,13 @@ public abstract class ColGroupValue extends ColGroupCompressed implements Clonea
 	}
 
 	@Override
-	public boolean isLossy() {
+	public final boolean isLossy() {
 		return _dict.isLossy();
 	}
 
 	@Override
 	public void readFields(DataInput in) throws IOException {
 		super.readFields(in);
-
 		_zeros = in.readBoolean();
 		_dict = DictionaryFactory.read(in);
 	}
@@ -465,9 +414,7 @@ public abstract class ColGroupValue extends ColGroupCompressed implements Clonea
 	public void write(DataOutput out) throws IOException {
 		super.write(out);
 		out.writeBoolean(_zeros);
-
 		_dict.write(out);
-
 	}
 
 	@Override
@@ -487,7 +434,7 @@ public abstract class ColGroupValue extends ColGroupCompressed implements Clonea
 
 	public abstract int[] getCounts(int rl, int ru, int[] out);
 
-	protected void computeSum(double[] c, boolean square) {
+	protected final void computeSum(double[] c, boolean square) {
 		if(_dict != null)
 			if(square)
 				c[0] += _dict.sumsq(getCounts(), _colIndexes.length);
@@ -495,82 +442,47 @@ public abstract class ColGroupValue extends ColGroupCompressed implements Clonea
 				c[0] += _dict.sum(getCounts(), _colIndexes.length);
 	}
 
-	protected abstract void computeRowSums(double[] c, boolean square, int rl, int ru, boolean mean);
-
-	protected void computeColSums(double[] c, boolean square) {
+	protected final void computeColSums(double[] c, boolean square) {
 		_dict.colSum(c, getCounts(), _colIndexes, square);
 	}
 
-	protected abstract void computeRowMxx(double[] c, Builtin builtin, int rl, int ru);
-
-	protected Object clone() throws CloneNotSupportedException {
-		return super.clone();
-	}
-
-	public AColGroup copyAndSet(double[] newDictionary) {
+	protected Object clone() {
 		try {
-			ColGroupValue clone = (ColGroupValue) this.clone();
-			clone.setDictionary(new Dictionary(newDictionary));
-			return clone;
+			return super.clone();
 		}
 		catch(CloneNotSupportedException e) {
-			e.printStackTrace();
+			throw new DMLCompressionException("Error while cloning: " + getClass().getSimpleName(), e);
 		}
-		return null;
+	}
+
+	public AColGroup copyAndSet(double[] newDictionary) {
+		return copyAndSet(new Dictionary(newDictionary));
 	}
 
 	public AColGroup copyAndSet(ADictionary newDictionary) {
-		try {
-			ColGroupValue clone = (ColGroupValue) this.clone();
-			clone.setDictionary(newDictionary);
-			return clone;
-		}
-		catch(CloneNotSupportedException e) {
-			e.printStackTrace();
-		}
-		return null;
+		ColGroupValue clone = (ColGroupValue) this.clone();
+		clone.setDictionary(newDictionary);
+		return clone;
 	}
 
 	public AColGroup copyAndSet(int[] colIndexes, double[] newDictionary) {
-		try {
-			ColGroupValue clone = (ColGroupValue) this.clone();
-			clone.setDictionary(new Dictionary(newDictionary));
-			clone.setColIndices(colIndexes);
-			return clone;
-		}
-		catch(CloneNotSupportedException e) {
-			e.printStackTrace();
-		}
-		return null;
+		return copyAndSet(colIndexes, new Dictionary(newDictionary));
 	}
 
 	public AColGroup copyAndSet(int[] colIndexes, ADictionary newDictionary) {
-		try {
-			ColGroupValue clone = (ColGroupValue) this.clone();
-			clone.setDictionary(newDictionary);
-			clone.setColIndices(colIndexes);
-			return clone;
-		}
-		catch(CloneNotSupportedException e) {
-			e.printStackTrace();
-		}
-		return null;
+		ColGroupValue clone = (ColGroupValue) this.clone();
+		clone.setDictionary(newDictionary);
+		clone.setColIndices(colIndexes);
+		return clone;
 	}
 
 	@Override
 	public ColGroupValue copy() {
-		try {
-			ColGroupValue clone = (ColGroupValue) this.clone();
-			return clone;
-		}
-		catch(CloneNotSupportedException e) {
-			e.printStackTrace();
-		}
-		return null;
+		return (ColGroupValue) this.clone();
 	}
 
 	@Override
-	protected AColGroup sliceSingleColumn(int idx) {
+	protected final AColGroup sliceSingleColumn(int idx) {
 		ColGroupValue ret = (ColGroupValue) copy();
 		ret._colIndexes = new int[] {0};
 		if(ret._dict != null)
@@ -583,7 +495,7 @@ public abstract class ColGroupValue extends ColGroupCompressed implements Clonea
 	}
 
 	@Override
-	protected AColGroup sliceMultiColumns(int idStart, int idEnd, int[] outputCols) {
+	protected final AColGroup sliceMultiColumns(int idStart, int idEnd, int[] outputCols) {
 
 		ColGroupValue ret = (ColGroupValue) copy();
 		ret._dict = ret._dict != null ? ret._dict.sliceOutColumnRange(idStart, idEnd, _colIndexes.length) : null;
@@ -593,139 +505,83 @@ public abstract class ColGroupValue extends ColGroupCompressed implements Clonea
 	}
 
 	/**
-	 * Post scale for left Multiplication
-	 * 
-	 * @param dictValues The dictionary values materialized as double array.
-	 * @param vals       The values aggregated from the left side row vector.
-	 * @param c          The output matrix
-	 * @param numVals    The number of values contained in the dictionary.
-	 */
-	protected void postScaling(double[] dictValues, double[] vals, MatrixBlock c, int numVals) {
-		postScaling(dictValues, vals, c, numVals, 0);
-	}
-
-	/**
-	 * Post scale for left Multiplication
-	 * 
-	 * @param dictValues The dictionary values materialized as double array.
-	 * @param vals       The values aggregated from the left side row vector.
-	 * @param c          The output matrix
-	 * @param numVals    The number of values contained in the dictionary.
-	 * @param row        The row index in the output c to assign the result to.
-	 * @param totalCols  The total number of columns in c.
-	 */
-	protected void postScaling(double[] dictValues, double[] vals, MatrixBlock c, int numVals, int row) {
-		final int ncol = getNumCols();
-		int valOff = 0;
-		final double[] cv = c.getDenseBlockValues();
-		final int totalCols = c.getNumColumns();
-
-		for(int k = 0; k < numVals; k++) {
-			double aval = vals[k];
-			for(int j = 0; j < ncol; j++) {
-				int colIx = _colIndexes[j] + row * totalCols;
-				cv[colIx] += aval * dictValues[valOff++];
-			}
-		}
-	}
-
-	/**
-	 * Pre aggregate a vector
+	 * Pre aggregate for left Multiplication
 	 * 
-	 * @param a The vector to aggregate
-	 * @return The pre-aggregated values.
+	 * @param m  The matrixBlock to pre aggregate
+	 * @param rl Start row
+	 * @param ru End row
+	 * @return The Pre aggregated values contained in a MatrixBlock
 	 */
-	public double[] preAggregate(double[] a) {
-		return preAggregate(a, 0);
-	}
-
-	public abstract MatrixBlock preAggregate(MatrixBlock m, int rl, int ru);
-
-	/**
-	 * Pre aggregates for left multiplication
-	 * 
-	 * @param a   The input dense vector or matrix to aggregate
-	 * @param row The row index to aggregate
-	 * @return The pre-aggregated values.
-	 */
-	public abstract double[] preAggregate(double[] a, int row);
-
-	/**
-	 * Pre aggregate for left multiplication
-	 * 
-	 * @param sb The vector to aggregate
-	 * @return The pre-aggregated values.
-	 */
-	public double[] preAggregate(SparseBlock sb) {
-		return preAggregateSparseWithCheck(sb, 0);
-	}
-
-	private double[] preAggregateSparseWithCheck(SparseBlock sb, int row) {
-		if(sb != null && !sb.isEmpty(row))
-			return preAggregateSparse(sb, row);
-		else
-			return null;
+	protected final MatrixBlock preAggregate(MatrixBlock m, int rl, int ru) {
+		final int numVals = getNumValues();
+		final int lhsRows = ru - rl;
+		final double[] vals = allocDVector(lhsRows * numVals, true);
+		final DenseBlock retB = new DenseBlockFP64(new int[] {lhsRows, numVals}, vals);
+		MatrixBlock preAgg = new MatrixBlock(lhsRows, numVals, retB);
+		preAggregate(m, preAgg, rl, ru);
+		preAgg.recomputeNonZeros();
+		return preAgg;
 	}
 
 	/**
-	 * Pre aggregate for left multiplication of sparse vector or matrix.
+	 * Pre aggregate for left Multiplication.
 	 * 
-	 * @param sb  The input sparse vector or matrix to aggregate
-	 * @param row The row index to aggregate
-	 * @return The pre-aggregated values.
+	 * @param m      Matrix to preAggregate
+	 * @param preAgg Matrix to preAggregate into
+	 * @param rl     Start row
+	 * @param ru     End row
 	 */
-	public abstract double[] preAggregateSparse(SparseBlock sb, int row);
+	protected abstract void preAggregate(MatrixBlock m, MatrixBlock preAgg, int rl, int ru);
 
 	public abstract int getIndexStructureHash();
 
-	public IPreAggregate preAggregate(ColGroupValue lhs) {
-		IPreAggregate r = preCallAggregate(lhs);
-		return r;
-	}
-
-	public IPreAggregate preCallAggregate(ColGroupValue lhs) {
-		// LOG.error(lhs.getClass().getSimpleName() + " in " + this.getClass().getSimpleName() + " "
-		// + Arrays.toString(lhs.getColIndices()) + " " + Arrays.toString(this.getColIndices()));
-
-		if(lhs instanceof ColGroupDDC)
-			return preAggregateDDC((ColGroupDDC) lhs);
-		else if(lhs instanceof ColGroupSDC)
-			return preAggregateSDC((ColGroupSDC) lhs);
-		else if(lhs instanceof ColGroupSDCSingle)
-			return preAggregateSDCSingle((ColGroupSDCSingle) lhs);
-		else if(lhs instanceof ColGroupSDCZeros)
-			return preAggregateSDCZeros((ColGroupSDCZeros) lhs);
-		else if(lhs instanceof ColGroupSDCSingleZeros)
-			return preAggregateSDCSingleZeros((ColGroupSDCSingleZeros) lhs);
-		else if(lhs instanceof ColGroupOLE)
-			return preAggregateOLE((ColGroupOLE) lhs);
-		else if(lhs instanceof ColGroupRLE)
-			return preAggregateRLE((ColGroupRLE) lhs);
-		else if(lhs instanceof ColGroupConst)
-			return preAggregateCONST((ColGroupConst) lhs);
+	// private IPreAggregate preAggregate(ColGroupValue lhs) {
+	// IPreAggregate r = preCallAggregate(lhs);
+	// return r;
+	// }
 
-		throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
-			+ this.getClass().getSimpleName());
-	}
+	// private IPreAggregate preCallAggregate(ColGroupValue lhs) {
+	// // (lhs.getClass().getSimpleName() + " in " + this.getClass().getSimpleName() + " "
+	// // + Arrays.toString(lhs.getColIndices()) + " " + Arrays.toString(this.getColIndices()));
+
+	// if(lhs instanceof ColGroupDDC)
+	// return preAggregateDDC((ColGroupDDC) lhs);
+	// else if(lhs instanceof ColGroupSDC)
+	// return preAggregateSDC((ColGroupSDC) lhs);
+	// else if(lhs instanceof ColGroupSDCSingle)
+	// return preAggregateSDCSingle((ColGroupSDCSingle) lhs);
+	// else if(lhs instanceof ColGroupSDCZeros)
+	// return preAggregateSDCZeros((ColGroupSDCZeros) lhs);
+	// else if(lhs instanceof ColGroupSDCSingleZeros)
+	// return preAggregateSDCSingleZeros((ColGroupSDCSingleZeros) lhs);
+	// else if(lhs instanceof ColGroupOLE)
+	// return preAggregateOLE((ColGroupOLE) lhs);
+	// else if(lhs instanceof ColGroupRLE)
+	// return preAggregateRLE((ColGroupRLE) lhs);
+	// else if(lhs instanceof ColGroupConst)
+	// return preAggregateCONST((ColGroupConst) lhs);
+
+	// throw new NotImplementedException("Not supported pre aggregate of :" + lhs.getClass().getSimpleName() + " in "
+	// + this.getClass().getSimpleName());
+	// }
 
-	public IPreAggregate preAggregateCONST(ColGroupConst lhs) {
-		// LOG.error(Arrays.toString(getCounts()));
-		return new ArrPreAggregate(getCounts());
-	}
+	// public IPreAggregate preAggregateCONST(ColGroupConst lhs) {
+	// 	return new ArrPreAggregate(getCounts());
+	// }
 
-	public abstract IPreAggregate preAggregateDDC(ColGroupDDC lhs);
+	// public abstract IPreAggregate preAggregateDDC(ColGroupDDC lhs);
 
-	public abstract IPreAggregate preAggregateSDC(ColGroupSDC lhs);
+	// public abstract IPreAggregate preAggregateSDC(ColGroupSDC lhs);
 
-	public abstract IPreAggregate preAggregateSDCSingle(ColGroupSDCSingle lhs);
+	// public abstract IPreAggregate preAggregateSDCSingle(ColGroupSDCSingle lhs);
 
-	public abstract IPreAggregate preAggregateSDCZeros(ColGroupSDCZeros lhs);
+	// public abstract IPreAggregate preAggregateSDCZeros(ColGroupSDCZeros lhs);
 
-	public abstract IPreAggregate preAggregateSDCSingleZeros(ColGroupSDCSingleZeros lhs);
+	// public abstract IPreAggregate preAggregateSDCSingleZeros(ColGroupSDCSingleZeros lhs);
 
-	public abstract IPreAggregate preAggregateOLE(ColGroupOLE lhs);
+	// public abstract IPreAggregate preAggregateOLE(ColGroupOLE lhs);
 
-	public abstract IPreAggregate preAggregateRLE(ColGroupRLE lhs);
+	// public abstract IPreAggregate preAggregateRLE(ColGroupRLE lhs);
 
 	/**
 	 * Pre aggregate into a dictionary. It is assumed that "that" have more distinct values than, "this".
@@ -734,13 +590,10 @@ public abstract class ColGroupValue extends ColGroupCompressed implements Clonea
 	 * @param preModify specifies if the matrix in this
 	 * @return A aggregate dictionary
 	 */
-	public Dictionary preAggregateThatIndexStructure(ColGroupValue that, boolean preModify) {
+	public final Dictionary preAggregateThatIndexStructure(ColGroupValue that, boolean preModify) {
 		int outputLength = that._colIndexes.length * this.getNumValues();
 		Dictionary ret = new Dictionary(new double[outputLength]);
 
-		// if(preModify)
-		// LOG.error(preModify + " " + that.getClass().getSimpleName() + " in " + this.getClass().getSimpleName());
-
 		if(that instanceof ColGroupDDC)
 			return preAggregateThatDDCStructure((ColGroupDDC) that, ret);
 		else if(that instanceof ColGroupSDC)
@@ -775,20 +628,13 @@ public abstract class ColGroupValue extends ColGroupCompressed implements Clonea
 	}
 
 	@Override
-	public void leftMultByAColGroup(AColGroup lhs, MatrixBlock result) {
+	public final void leftMultByAColGroup(AColGroup lhs, MatrixBlock result) {
 		if(lhs instanceof ColGroupEmpty)
 			return;
 		else if(lhs instanceof ColGroupValue)
 			leftMultByColGroupValue((ColGroupValue) lhs, result);
-		else if(lhs instanceof ColGroupUncompressed) {
-			LOG.warn("Inefficient transpose of uncompressed to fit to "
-				+ "template need t(UnCompressedColGroup) %*% AColGroup support");
-			MatrixBlock ucCG = ((ColGroupUncompressed) lhs).getData();
-			MatrixBlock tmp = new MatrixBlock(ucCG.getNumColumns(), ucCG.getNumRows(), ucCG.isInSparseFormat());
-			LibMatrixReorg.transpose(ucCG, tmp, InfrastructureAnalyzer.getLocalParallelism());
-			leftMultByMatrix(tmp, result);
-
-		}
+		else if(lhs instanceof ColGroupUncompressed)
+			leftMultByUncompressedColGroup((ColGroupUncompressed) lhs, result);
 		else
 			throw new DMLCompressionException(
 				"Not supported left multiplication with A ColGroup of type: " + lhs.getClass().getSimpleName());
@@ -812,15 +658,14 @@ public abstract class ColGroupValue extends ColGroupCompressed implements Clonea
 			}
 			else
 				matrixMultDictionariesAndOutputToColIndexesWithScaling(lhs._dict, this._dict, lhs._colIndexes,
-					this._colIndexes, resV, numCols, getCounts());
-
+					this._colIndexes, result, getCounts());
 		}
 		else if(lhs instanceof ColGroupConst || this instanceof ColGroupConst) {
 			ADictionary r = this instanceof ColGroupConst ? this._dict : new Dictionary(
 				this._dict.colSum(getCounts(), rCol));
 			ADictionary l = lhs instanceof ColGroupConst ? lhs._dict : new Dictionary(
 				lhs._dict.colSum(lhs.getCounts(), lCol));
-			matrixMultDictionariesAndOutputToColIndexes(l, r, lhs._colIndexes, this._colIndexes, resV, numCols);
+			matrixMultDictionariesAndOutputToColIndexes(l, r, lhs._colIndexes, this._colIndexes, result);
 		}
 		else {
 			int[] countsRight = getCounts();
@@ -841,8 +686,8 @@ public abstract class ColGroupValue extends ColGroupCompressed implements Clonea
 				ColGroupValue thisM = (mct != null) ? (ColGroupValue) this
 					.copyAndSet(this._dict.subtractTuple(mct)) : this;
 				Dictionary preAgg = lhs.preAggregateThatIndexStructure(thisM, true);
-				matrixMultDictionariesAndOutputToColIndexes(lhs._dict, preAgg, lhs._colIndexes, this._colIndexes, resV,
-					numCols);
+				matrixMultDictionariesAndOutputToColIndexes(lhs._dict, preAgg, lhs._colIndexes, this._colIndexes,
+					result);
 			}
 			else if(skipLeft > threshold && !(lhs instanceof ColGroupDDC)) {
 				double[] mct = lhs._dict.getMostCommonTuple(lhs.getCounts(), lCol);
@@ -852,43 +697,67 @@ public abstract class ColGroupValue extends ColGroupCompressed implements Clonea
 
 				ColGroupValue lhsM = (mct != null) ? (ColGroupValue) lhs.copyAndSet(lhs._dict.subtractTuple(mct)) : lhs;
 				Dictionary preAgg = this.preAggregateThatIndexStructure(lhsM, true);
-				matrixMultDictionariesAndOutputToColIndexes(preAgg, this._dict, lhs._colIndexes, this._colIndexes, resV,
-					numCols);
+				matrixMultDictionariesAndOutputToColIndexes(preAgg, this._dict, lhs._colIndexes, this._colIndexes,
+					result);
 			}
 			else if(nvR * rCol < nvL * lCol) {
 				Dictionary preAgg = lhs.preAggregateThatIndexStructure(this, false);
-				matrixMultDictionariesAndOutputToColIndexes(lhs._dict, preAgg, lhs._colIndexes, this._colIndexes, resV,
-					numCols);
+				matrixMultDictionariesAndOutputToColIndexes(lhs._dict, preAgg, lhs._colIndexes, this._colIndexes,
+					result);
 			}
 			else {
 				Dictionary preAgg = this.preAggregateThatIndexStructure(lhs, false);
-				matrixMultDictionariesAndOutputToColIndexes(preAgg, this._dict, lhs._colIndexes, this._colIndexes, resV,
-					numCols);
+				matrixMultDictionariesAndOutputToColIndexes(preAgg, this._dict, lhs._colIndexes, this._colIndexes,
+					result);
 			}
 		}
 	}
 
+	private void leftMultByUncompressedColGroup(ColGroupUncompressed lhs, MatrixBlock result) {
+		MatrixBlock ucCG = lhs.getData();
+		if(this instanceof ColGroupConst) {
+			AggregateUnaryOperator auop = InstructionUtils.parseBasicAggregateUnaryOperator("uac+", 1);
+			MatrixBlock tmp = ucCG.aggregateUnaryOperations(auop, new MatrixBlock(),
+				Math.max(ucCG.getNumRows(), ucCG.getNumColumns()), null, true);
+			ADictionary l = new MatrixBlockDictionary(tmp);
+			matrixMultDictionariesAndOutputToColIndexes(l, _dict, lhs._colIndexes, _colIndexes, result);
+		}
+		else {
+			LOG.warn("Inefficient transpose of uncompressed to fit to "
+				+ "template need t(UnCompressedColGroup) %*% AColGroup support");
+			MatrixBlock tmp = new MatrixBlock(ucCG.getNumColumns(), ucCG.getNumRows(), ucCG.isInSparseFormat());
+			LibMatrixReorg.transpose(ucCG, tmp, InfrastructureAnalyzer.getLocalParallelism());
+
+			leftMultByMatrix(tmp, result, lhs._colIndexes);
+		}
+	}
+
 	@Override
-	public void tsmm(double[] result, int numColumns) {
+	public final void tsmm(double[] result, int numColumns) {
 
-		// final int[] counts = getCounts();
+		final int[] counts = getCounts();
 
-		// _dict = _dict.getAsMatrixBlockDictionary(_colIndexes.length);
-		// if(_dict instanceof MatrixBlockDictionary) {
-		// 	MatrixBlockDictionary mbd = (MatrixBlockDictionary) _dict;
-		// 	MatrixBlock mb = mbd.getMatrixBlock();
-		// 	if(mb.isEmpty())
-		// 		return;
-		// 	else if(mb.isInSparseFormat())
-		// 		tsmmSparse(result, numColumns, mb.getSparseBlock(), counts);
-		// 	else
-		// 		tsmmDense(result, numColumns, mb.getDenseBlockValues(), counts);
-		// }
-		// else
-		// 	tsmmDense(result, numColumns, getValues(), counts);
+		_dict = _dict.getAsMatrixBlockDictionary(_colIndexes.length);
+		if(_dict instanceof MatrixBlockDictionary) {
+			MatrixBlockDictionary mbd = (MatrixBlockDictionary) _dict;
+			MatrixBlock mb = mbd.getMatrixBlock();
+			if(mb.isEmpty())
+				return;
+			else if(mb.isInSparseFormat())
+				tsmmSparse(result, numColumns, mb.getSparseBlock(), counts);
+			else
+				tsmmDense(result, numColumns, mb.getDenseBlockValues(), counts);
+		}
+		else
+			tsmmDense(result, numColumns, getValues(), counts);
 
 	}
 
+	@Override
+	public final void tsmm(double[] result, int numColumns, int idxStart, int idxEnd) {
+		throw new NotImplementedException();
+	}
+
 	private void tsmmDense(double[] result, int numColumns, double[] values, int[] counts) {
 		if(values == null)
 			return;
@@ -926,24 +795,24 @@ public abstract class ColGroupValue extends ColGroupCompressed implements Clonea
 	}
 
 	@Override
-	public boolean containsValue(double pattern) {
+	public final boolean containsValue(double pattern) {
 		return _dict.containsValue(pattern);
 	}
 
 	@Override
-	public long getNumberNonZeros() {
+	public final long getNumberNonZeros() {
 		int[] counts = getCounts();
 		return _dict.getNumberNonZeros(counts, _colIndexes.length);
 	}
 
 	private static void matrixMultDictionariesAndOutputToColIndexesWithScaling(final ADictionary left,
-		final ADictionary right, final int[] leftRows, final int[] rightColumns, final double[] result,
-		final int outCols, final int[] counts) {
+		final ADictionary right, final int[] leftRows, final int[] rightColumns, final MatrixBlock result,
+		final int[] counts) {
 		final boolean modifyRight = right.getInMemorySize() > left.getInMemorySize();
 		ADictionary rightM = modifyRight ? right.scaleTuples(counts, rightColumns.length) : right;
 		ADictionary leftM = modifyRight ? left : left.scaleTuples(counts, leftRows.length);
 
-		matrixMultDictionariesAndOutputToColIndexes(leftM, rightM, leftRows, rightColumns, result, outCols);
+		matrixMultDictionariesAndOutputToColIndexes(leftM, rightM, leftRows, rightColumns, result);
 
 	}
 
@@ -1020,11 +889,10 @@ public abstract class ColGroupValue extends ColGroupCompressed implements Clonea
 	 * @param right     The right side dictionary
 	 * @param rowsLeft  The number of rows and the row indexes on the left hand side
 	 * @param colsRight The number of columns and the column indexes on the right hand side
-	 * @param result    The result matrix to put the results into, linearized row major
-	 * @param outCols   The output columns count, to know how much to offset into with results.
+	 * @param result    The result matrix to put the results into.
 	 */
 	private static void matrixMultDictionariesAndOutputToColIndexes(ADictionary left, ADictionary right, int[] rowsLeft,
-		int[] colsRight, double[] result, int outCols) {
+		int[] colsRight, MatrixBlock result) {
 
 		try {
 			double[] leftV = null;
@@ -1033,22 +901,36 @@ public abstract class ColGroupValue extends ColGroupCompressed implements Clonea
 			if(left instanceof MatrixBlockDictionary) {
 				MatrixBlockDictionary leftD = left.getAsMatrixBlockDictionary(rowsLeft.length);
 				MatrixBlock leftMB = leftD.getMatrixBlock();
-				if(leftMB.isEmpty())
+				if(leftMB.isEmpty()) {
+					LOG.error("Left is empty: " + leftMB);
 					return;
+				}
 				else if(right instanceof MatrixBlockDictionary) {
 					MatrixBlockDictionary rightD = right.getAsMatrixBlockDictionary(colsRight.length);
 					MatrixBlock rightMB = rightD.getMatrixBlock();
 					if(rightMB.isEmpty())
 						return;
-					else if(rightMB.isInSparseFormat() && leftMB.isInSparseFormat()) {
+					else if(rightMB.isInSparseFormat() && leftMB.isInSparseFormat())
 						throw new NotImplementedException("Not Supported sparse sparse dictionary multiplication");
-					}
+					else if(rightMB.isInSparseFormat())
+						matrixMultDictionariesAndOutputToColIndecesDenseSparse(leftMB.getDenseBlockValues(),
+							rightMB.getSparseBlock(), rowsLeft, colsRight, result);
+					else if(leftMB.isInSparseFormat())
+						matrixMultDictionariesAndOutputToColIndecesSparseDense(leftMB.getSparseBlock(),
+							rightMB.getDenseBlockValues(), rowsLeft, colsRight, result);
+					else
+						matrixMultDictionariesAndOutputToColIndexesDenseDense(leftMB.getDenseBlockValues(),
+							rightMB.getDenseBlockValues(), rowsLeft, colsRight, result);
+					return;
 				}
 				else if(leftMB.isInSparseFormat()) {
 					matrixMultDictionariesAndOutputToColIndecesSparseDense(leftMB.getSparseBlock(), right.getValues(),
-						rowsLeft, colsRight, result, outCols);
+						rowsLeft, colsRight, result);
 					return;
 				}
+				else {
+					leftV = leftMB.getDenseBlockValues();
+				}
 			}
 			else {
 				leftV = left.getValues();
@@ -1058,24 +940,25 @@ public abstract class ColGroupValue extends ColGroupCompressed implements Clonea
 				MatrixBlockDictionary rightD = right.getAsMatrixBlockDictionary(colsRight.length);
 				MatrixBlock rightMB = rightD.getMatrixBlock();
 
-				if(rightMB.isEmpty())
+				if(rightMB.isEmpty()) {
+					LOG.error("Right is empty: " + rightMB);
 					return;
+				}
 				else if(rightMB.isInSparseFormat()) {
 					matrixMultDictionariesAndOutputToColIndecesDenseSparse(leftV, rightMB.getSparseBlock(), rowsLeft,
-						colsRight, result, outCols);
+						colsRight, result);
 					return;
 				}
+				else {
+					rightV = rightMB.getDenseBlockValues();
+				}
 			}
 			else {
 				rightV = right.getValues();
 			}
 
-			if(leftV != null && rightV != null) {
-				// default if there was not sparse found;
-				LOG.warn("Inefficient forced dense values");
-				matrixMultDictionariesAndOutputToColIndexesDenseDense(leftV, rightV, rowsLeft, colsRight, result,
-					outCols);
-			}
+			if(leftV != null && rightV != null)
+				matrixMultDictionariesAndOutputToColIndexesDenseDense(leftV, rightV, rowsLeft, colsRight, result);
 
 		}
 		catch(Exception e) {
@@ -1088,25 +971,29 @@ public abstract class ColGroupValue extends ColGroupCompressed implements Clonea
 	}
 
 	private static void matrixMultDictionariesAndOutputToColIndexesDenseDense(double[] left, double[] right,
-		int[] rowsLeft, int[] colsRight, double[] result, int outCols) {
+		int[] rowsLeft, int[] colsRight, MatrixBlock result) {
 		final int commonDim = Math.min(left.length / rowsLeft.length, right.length / colsRight.length);
+
+		final double[] resV = result.getDenseBlockValues();
 		for(int k = 0; k < commonDim; k++) {
 			final int offL = k * rowsLeft.length;
 			final int offR = k * colsRight.length;
 			for(int i = 0; i < rowsLeft.length; i++) {
-				final int offOut = rowsLeft[i] * outCols;
+				final int offOut = rowsLeft[i] * result.getNumColumns();
 				final double vl = left[offL + i];
 				if(vl != 0)
 					for(int j = 0; j < colsRight.length; j++) {
 						final double vr = right[offR + j];
-						result[offOut + colsRight[j]] += vl * vr;
+						resV[offOut + colsRight[j]] += vl * vr;
 					}
 			}
 		}
 	}
 
 	private static void matrixMultDictionariesAndOutputToColIndecesSparseDense(SparseBlock left, double[] right,
-		int[] rowsLeft, int[] colsRight, double[] result, int outCols) {
+		int[] rowsLeft, int[] colsRight, MatrixBlock result) {
+
+		final double[] resV = result.getDenseBlockValues();
 		final int commonDim = Math.min(left.numRows(), right.length / colsRight.length);
 		for(int i = 0; i < commonDim; i++) {
 			if(left.isEmpty(i))
@@ -1117,16 +1004,17 @@ public abstract class ColGroupValue extends ColGroupCompressed implements Clonea
 			final double[] leftVals = left.values(i);
 			final int offRight = i * colsRight.length;
 			for(int k = apos; k < alen; k++) {
-				final int offOut = rowsLeft[aix[k]] * outCols;
+				final int offOut = rowsLeft[aix[k]] * result.getNumColumns();
 				final double v = leftVals[k];
 				for(int j = 0; j < colsRight.length; j++)
-					result[offOut + colsRight[j]] += v * right[offRight + j];
+					resV[offOut + colsRight[j]] += v * right[offRight + j];
 			}
 		}
 	}
 
 	private static void matrixMultDictionariesAndOutputToColIndecesDenseSparse(double[] left, SparseBlock right,
-		int[] rowsLeft, int[] colsRight, double[] result, int outCols) {
+		int[] rowsLeft, int[] colsRight, MatrixBlock result) {
+		final double[] resV = result.getDenseBlockValues();
 		final int commonDim = Math.min(left.length / rowsLeft.length, right.numRows());
 		for(int i = 0; i < commonDim; i++) {
 			if(right.isEmpty(i))
@@ -1137,18 +1025,18 @@ public abstract class ColGroupValue extends ColGroupCompressed implements Clonea
 			final double[] rightVals = right.values(i);
 			final int offLeft = i * rowsLeft.length;
 			for(int j = 0; j < rowsLeft.length; j++) {
-				final int offOut = rowsLeft[j] * outCols;
+				final int offOut = rowsLeft[j] * result.getNumColumns();
 				final double v = left[offLeft + j];
 				if(v != 0)
 					for(int k = apos; k < alen; k++) {
-						result[offOut + colsRight[aix[k]]] += v * rightVals[k];
+						resV[offOut + colsRight[aix[k]]] += v * rightVals[k];
 					}
 			}
 		}
 	}
 
 	@Override
-	public boolean isDense() {
+	public final boolean isDense() {
 		return !_zeros;
 	}
 
@@ -1157,69 +1045,145 @@ public abstract class ColGroupValue extends ColGroupCompressed implements Clonea
 	 * 
 	 * @param matrix matrix to left multiply
 	 * @param result matrix block result
-	 * @param values The materialized values contained in the ColGroupValue
 	 * @param rl     The row to start the matrix multiplication from
 	 * @param ru     The row to stop the matrix multiplication at.
 	 */
 	@Override
-	public void leftMultByMatrix(MatrixBlock matrix, MatrixBlock result, int rl, int ru) {
-		final int numVals = getNumValues();
+	public final void leftMultByMatrix(MatrixBlock matrix, MatrixBlock result, int rl, int ru) {
+		try {
+			if(matrix.isEmpty())
+				return;
+
+			MatrixBlock tmpRes = leftMultByMatrixIntermediateMatrix(matrix, rl, ru);
+
+			addMatrixToResult(tmpRes, result, rl, ru);
+
+		}
+		catch(Exception e) {
+			throw new DMLCompressionException(this.getClass().getSimpleName() + " Failed to Left Matrix Multiply", e);
+		}
+	}
+
+	private MatrixBlock leftMultByMatrixIntermediateMatrix(MatrixBlock matrix, int rl, int ru) {
+		// Get dictionary.
+		MatrixBlock dictM = forceMatrixBlockDictionary().getMatrixBlock();
+		// Allocate temporary matrix to multiply into.
+		MatrixBlock tmpRes = new MatrixBlock(matrix.getNumRows(), _colIndexes.length, false);
+		// Pre aggregate the matrix into same size as dictionary
+		MatrixBlock preAgg = preAggregate(matrix, rl, ru);
+
+		LibMatrixMult.matrixMult(preAgg, dictM, tmpRes);
+		return tmpRes;
+	}
+
+	private void leftMultByMatrix(MatrixBlock matrix, MatrixBlock result, int[] outputRows) {
+		try {
+			if(matrix.isEmpty())
+				return;
+			MatrixBlock tmpRes = leftMultByMatrixIntermediateMatrix(matrix, 0, matrix.getNumRows());
+			addMatrixToResult(tmpRes, result, outputRows);
+
+		}
+		catch(Exception e) {
+			throw new DMLCompressionException(
+				this.getClass().getSimpleName() + " Failed to multiply with an uncompressed column group", e);
+		}
+	}
+
+	private MatrixBlockDictionary forceMatrixBlockDictionary() {
 		if(!(_dict instanceof MatrixBlockDictionary))
 			_dict = _dict.getAsMatrixBlockDictionary(_colIndexes.length);
+		return((MatrixBlockDictionary) _dict);
+	}
 
-		MatrixBlock dictM = ((MatrixBlockDictionary) _dict).getMatrixBlock();
-		dictM.examSparsity();
-		MatrixBlock tmpRes = new MatrixBlock(1, _colIndexes.length, false);
-		for(int i = rl; i < ru; i++) {
-			double[] vals = matrix.isInSparseFormat() ? preAggregateSparseWithCheck(matrix.getSparseBlock(),
-				i) : preAggregate(matrix.getDenseBlockValues(), i);
-			if(vals != null) {
-				DenseBlock preAggV = new DenseBlockFP64(new int[] {1, numVals}, vals);
-				MatrixBlock preAgg = new MatrixBlock(1, numVals, preAggV);
-				preAgg.setNonZeros(numVals);
-				// LOG.error("PreAgg Sparsity " + preAgg.getSparsity() + " nnz " + preAgg.getNonZeros());
-				LibMatrixMult.matrixMult(preAgg, dictM, tmpRes);
-				addVectorToResult(tmpRes, result, i);
-				tmpRes.reset();
+	private void addMatrixToResult(MatrixBlock tmp, MatrixBlock result, int rl, int ru) {
+		if(tmp.isEmpty())
+			return;
+		final double[] retV = result.getDenseBlockValues();
+		final int nColRet = result.getNumColumns();
+		if(tmp.isInSparseFormat()) {
+			SparseBlock sb = tmp.getSparseBlock();
+			for(int row = rl, offT = 0; row < ru; row++, offT++) {
+				final int apos = sb.pos(offT);
+				final int alen = sb.size(offT);
+				final int[] aix = sb.indexes(offT);
+				final double[] avals = sb.values(offT);
+				final int offR = row * nColRet;
+				for(int i = apos; i < apos + alen; i++) {
+					retV[offR + _colIndexes[aix[i]]] += avals[i];
+				}
+			}
+		}
+		else {
+			final double[] tmpV = tmp.getDenseBlockValues();
+			final int nCol = _colIndexes.length;
+			for(int row = rl, offT = 0; row < ru; row++, offT += nCol) {
+				final int offR = row * nColRet;
+				for(int col = 0; col < nCol; col++) {
+					retV[offR + _colIndexes[col]] += tmpV[offT + col];
+				}
 			}
 		}
 	}
 
-	private void addVectorToResult(MatrixBlock tmp, MatrixBlock result, int row) {
+	private void addMatrixToResult(MatrixBlock tmp, MatrixBlock result, int[] rowIndexes) {
 		if(tmp.isEmpty())
 			return;
 		final double[] retV = result.getDenseBlockValues();
 		final int nColRet = result.getNumColumns();
-		final int offR = row * nColRet;
 		if(tmp.isInSparseFormat()) {
-			final SparseBlock sb = tmp.getSparseBlock();
-			if(sb.isEmpty(0))
-				return;
-			final int apos = sb.pos(0);
-			final int alen = sb.size(0);
-			final int[] aix = sb.indexes(0);
-			final double[] avals = sb.values(0);
-			for(int i = apos; i < apos + alen; i++)
-				retV[offR + _colIndexes[aix[i]]] += avals[i];
-
+			SparseBlock sb = tmp.getSparseBlock();
+			for(int row = 0; row < rowIndexes.length; row++) {
+				final int apos = sb.pos(row);
+				final int alen = sb.size(row);
+				final int[] aix = sb.indexes(row);
+				final double[] avals = sb.values(row);
+				final int offR = rowIndexes[row] * nColRet;
+				for(int i = apos; i < apos + alen; i++) {
+					retV[offR + _colIndexes[aix[i]]] += avals[i];
+				}
+			}
 		}
 		else {
 			final double[] tmpV = tmp.getDenseBlockValues();
-			// final int nColTmp = tmp.getNumColumns();
-			// for(int row = rl, offT = 0, offR = rl * nColRet; row < ru; row++, offT += nColTmp, offR += nColRet) {
-			for(int col = 0; col < _colIndexes.length; col++)
-				retV[offR + _colIndexes[col]] += tmpV[col];
-
-			// }
+			final int nCol = _colIndexes.length;
+			for(int row = 0, offT = 0; row < rowIndexes.length; row++, offT += nCol) {
+				final int offR = rowIndexes[row] * nColRet;
+				for(int col = 0; col < nCol; col++) {
+					retV[offR + _colIndexes[col]] += tmpV[offT + col];
+				}
+			}
 		}
 	}
 
-	public AColGroup rightMultByMatrix(MatrixBlock right) {
-		Pair<int[], double[]> pre = preaggForRightMultiplyValues(getNumValues(), right, 0, right.getNumColumns(),
-			right.getNumColumns());
-		if(pre.getLeft().length > 0)
-			return copyAndSet(pre.getLeft(), pre.getRight());
-		return null;
+	public final AColGroup rightMultByMatrix(MatrixBlock right) {
+
+		if(right.isEmpty())
+			return null;
+		final int cl = 0;
+		final int cu = right.getNumColumns();
+		final int cut = right.getNumColumns();
+		final int nCol = right.getNumColumns();
+		final int numVals = getNumValues();
+		int[] agCols;
+		double[] ret;
+		if(right.isInSparseFormat()) {
+			final SparseBlock sb = right.getSparseBlock();
+			agCols = getAggregateColumnsSetSparse(sb, nCol);
+			if(agCols.length == 0)
+				return null;
+			ret = preaggValuesFromSparse(numVals, sb, agCols, cl, cu, cut);
+		}
+		else {
+			double[] rightV = right.getDenseBlockValues();
+			agCols = getAggregateColumnsSetDense(rightV, cl, cu, cut);
+			if(agCols.length == 0)
+				return null;
+			ret = new double[numVals * agCols.length];
+			_dict.preaggValuesFromDense(numVals, _colIndexes, agCols, rightV, ret, cut);
+		}
+
+		return copyAndSet(agCols, ret);
 	}
 
 	@Override
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/ADictionary.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/ADictionary.java
index e1d4152..df6f648 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/ADictionary.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/ADictionary.java
@@ -29,8 +29,7 @@ import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
 import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
 
 /**
- * This dictionary class aims to encapsulate the storage and operations over unique floating point values of a column
- * group.
+ * This dictionary class aims to encapsulate the storage and operations over unique tuple values of a column group.
  */
 public abstract class ADictionary {
 
@@ -52,15 +51,6 @@ public abstract class ADictionary {
 	public abstract double getValue(int i);
 
 	/**
-	 * Determines if the content has a zero tuple. meaning all values at a specific row are zero value. This is useful
-	 * information to find out if the dictionary is used in a dense context. To improve some specific operations.
-	 * 
-	 * @param nCol The number of columns in the dictionary.
-	 * @return The index at which the zero tuple is located.
-	 */
-	public abstract int hasZeroTuple(int nCol);
-
-	/**
 	 * Returns the memory usage of the dictionary.
 	 * 
 	 * @return a long value in number of bytes for the dictionary.
@@ -87,13 +77,6 @@ public abstract class ADictionary {
 	public abstract double[] aggregateTuples(Builtin fn, int nCol);
 
 	/**
-	 * returns the count of values contained in the dictionary.
-	 * 
-	 * @return an integer of count of values.
-	 */
-	public abstract int size();
-
-	/**
 	 * Applies the scalar operation on the dictionary. Note that this operation modifies the underlying data, and
 	 * normally require a copy of the original Dictionary to preserve old objects.
 	 * 
@@ -113,15 +96,46 @@ public abstract class ADictionary {
 	 */
 	public abstract ADictionary applyScalarOp(ScalarOperator op, double newVal, int numCols);
 
+	/**
+	 * Apply binary row operation on this dictionary.
+	 * 
+	 * @param op         The operation to this dictionary
+	 * @param v          The values to use on the left or right hand side, as selected by the left argument.
+	 * @param sparseSafe boolean specifying if the operation is safe, and therefore does not need to allocate an extended
+	 *                   dictionary
+	 * @param colIndexes The column indexes to consider inside v.
+	 * @param left       A Boolean specifying if the operation is done on the left or right side of the dictionary.
+	 * @return A new dictionary containing the updated values.
+	 */
 	public ADictionary applyBinaryRowOp(BinaryOperator op, double[] v, boolean sparseSafe, int[] colIndexes,
 		boolean left) {
 		return (left) ? applyBinaryRowOpLeft(op, v, sparseSafe, colIndexes) : applyBinaryRowOpRight(op, v, sparseSafe,
 			colIndexes);
 	}
 
+	/**
+	 * Apply binary row operation on this dictionary on the left side.
+	 * 
+	 * @param op         The operation to this dictionary
+	 * @param v          The values to use on the left hand side.
+	 * @param sparseSafe boolean specifying if the operation is safe, and therefore does not need to allocate an extended
+	 *                   dictionary
+	 * @param colIndexes The column indexes to consider inside v.
+	 * @return A new dictionary containing the updated values.
+	 */
 	public abstract ADictionary applyBinaryRowOpLeft(BinaryOperator op, double[] v, boolean sparseSafe,
 		int[] colIndexes);
 
+	/**
+	 * Apply binary row operation on this dictionary on the right side.
+	 * 
+	 * @param op         The operation to this dictionary
+	 * @param v          The values to use on the right hand side.
+	 * @param sparseSafe boolean specifying if the operation is safe, and therefore does not need to allocate an extended
+	 *                   dictionary
+	 * @param colIndexes The column indexes to consider inside v.
+	 * @return A new dictionary containing the updated values.
+	 */
 	public abstract ADictionary applyBinaryRowOpRight(BinaryOperator op, double[] v, boolean sparseSafe,
 		int[] colIndexes);
 
@@ -130,6 +144,12 @@ public abstract class ADictionary {
 	 */
 	public abstract ADictionary clone();
 
+	/**
+	 * Clone the dictionary, and extend size of the dictionary by a given length
+	 * 
+	 * @param len The length to extend the dictionary by; it is assumed this value is positive.
+	 * @return a clone of the dictionary, extended by len.
+	 */
 	public abstract ADictionary cloneAndExtend(int len);
 
 	/**
@@ -173,14 +193,6 @@ public abstract class ADictionary {
 	public abstract int getNumberOfValues(int ncol);
 
 	/**
-	 * Materializes a Zero tuple at the last index of the dictionary.
-	 * 
-	 * @param numCols The number of columns in the dictionary
-	 * @return the new Dictionary with materialized zero tuple.
-	 */
-	// public abstract IDictionary materializeZeroValue(int numCols);
-
-	/**
 	 * Method used as a pre-aggregate of each tuple in the dictionary, to single double values.
 	 * 
 	 * Note if the number of columns is one the actual dictionaries values are simply returned.
@@ -201,14 +213,50 @@ public abstract class ADictionary {
 	 */
 	public abstract double sumRow(int k, boolean square, int nrColumns);
 
+	/**
+	 * get the column sum of this dictionary only.
+	 * 
+	 * @param counts the counts of the values contained
+	 * @param nCol   The number of columns contained in each tuple.
+	 * @return the colSums of this column group.
+	 */
 	public abstract double[] colSum(int[] counts, int nCol);
 
+	/**
+	 * Get the column sum of the values contained in the dictionary
+	 * 
+	 * @param c          The output array allocated to contain all column groups output.
+	 * @param counts     The counts of the individual tuples.
+	 * @param colIndexes The columns indexes of the parent column group, this indicate where to put the column sum into
+	 *                   the c output.
+	 * @param square     Specify if the values should be squared
+	 */
 	public abstract void colSum(double[] c, int[] counts, int[] colIndexes, boolean square);
 
-	public abstract double sum(int[] counts, int ncol);
+	/**
+	 * Get the sum of the values contained in the dictionary
+	 * 
+	 * @param counts The counts of the individual tuples
+	 * @param nCol   The number of columns contained
+	 * @return The sum scaled by the counts provided.
+	 */
+	public abstract double sum(int[] counts, int nCol);
 
-	public abstract double sumsq(int[] counts, int ncol);
+	/**
+	 * Get the square sum of the values contained in the dictionary
+	 * 
+	 * @param counts The counts of the individual tuples
+	 * @param nCol   The number of columns contained
+	 * @return The square sum scaled by the counts provided.
+	 */
+	public abstract double sumsq(int[] counts, int nCol);
 
+	/**
+	 * Get a string representation of the dictionary, that considers the layout of the data.
+	 * 
+	 * @param colIndexes The number of columns in the dictionary.
+	 * @return A string that is nicer to print.
+	 */
 	public abstract String getString(int colIndexes);
 
 	/**
@@ -239,11 +287,17 @@ public abstract class ADictionary {
 	 */
 	public abstract ADictionary reExpandColumns(int max);
 
+	/**
+	 * Detect if the dictionary contains a specific value.
+	 * 
+	 * @param pattern The value to search for
+	 * @return true if the value is contained else false.
+	 */
 	public abstract boolean containsValue(double pattern);
 
 	/**
 	 * Calculate the number of non zeros in the dictionary. The number of non zeros should be scaled with the counts
-	 * given
+	 * given. This gives the exact number of non zero values in the parent column group.
 	 * 
 	 * @param counts The counts of each dictionary entry
 	 * @param nCol   The number of columns in this dictionary
@@ -251,8 +305,6 @@ public abstract class ADictionary {
 	 */
 	public abstract long getNumberNonZeros(int[] counts, int nCol);
 
-	public abstract long getNumberNonZerosContained();
-
 	/**
 	 * Copies and adds the dictionary entry from this dictionary to the d dictionary
 	 * 
@@ -269,6 +321,7 @@ public abstract class ADictionary {
 	 * returns null if that tuple is all zero values.
 	 * 
 	 * @param counts The counts of the individual tuples contained, managed by the column group.
+	 * @param nCol   The number of columns contained in this dictionary
 	 * @return a new double array containing the most common value
 	 */
 	public double[] getMostCommonTuple(int[] counts, int nCol) {
@@ -283,6 +336,13 @@ public abstract class ADictionary {
 		return getTuple(maxIndex, nCol);
 	}
 
+	/**
+	 * Get the values contained in a specific tuple of the dictionary.
+	 * 
+	 * @param index The index where the values are located
+	 * @param nCol  The number of columns contained in this dictionary
+	 * @return a materialized double array containing the tuple.
+	 */
 	public abstract double[] getTuple(int index, int nCol);
 
 	/**
@@ -298,6 +358,7 @@ public abstract class ADictionary {
 	 * Get this dictionary as a matrixBlock dictionary. This allows us to use optimized kernels coded elsewhere in the
 	 * system, such as matrix multiplication.
 	 * 
+	 * @param nCol The number of columns contained in this column group.
 	 * @return A Dictionary containing a MatrixBlock.
 	 */
 	public abstract MatrixBlockDictionary getAsMatrixBlockDictionary(int nCol);
@@ -310,4 +371,18 @@ public abstract class ADictionary {
 	 * @return A New dictionary (since we don't want to modify the underlying dictionary)
 	 */
 	public abstract ADictionary scaleTuples(int[] scaling, int nCol);
+
+	/**
+	 * Pre Aggregate values for right Matrix Multiplication.
+	 * 
+	 * @param numVals          The number of values contained in this dictionary
+	 * @param colIndexes       The column indexes that are associated with the parent column group
+	 * @param aggregateColumns The columns to aggregate; this is preprocessed to remove empty columns from
+	 *                         consideration
+	 * @param b                The values in the right hand side matrix
+	 * @param ret              The double array to put in the aggregate.
+	 * @param cut              The number of columns in b.
+	 */
+	public abstract void preaggValuesFromDense(final int numVals, final int[] colIndexes, final int[] aggregateColumns,
+		final double[] b, final double[] ret, final int cut);
 }
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/Dictionary.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/Dictionary.java
index 770557b..80872d5 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/Dictionary.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/Dictionary.java
@@ -71,19 +71,6 @@ public class Dictionary extends ADictionary {
 	}
 
 	@Override
-	public int hasZeroTuple(int nCol) {
-		int len = getNumberOfValues(nCol);
-		for(int i = 0, off = 0; i < len; i++, off += nCol) {
-			boolean allZeros = true;
-			for(int j = 0; j < nCol; j++)
-				allZeros &= (_values[off + j] == 0);
-			if(allZeros)
-				return i;
-		}
-		return -1;
-	}
-
-	@Override
 	public double aggregate(double init, Builtin fn) {
 		// full aggregate can disregard tuple boundaries
 		double ret = init;
@@ -122,7 +109,7 @@ public class Dictionary extends ADictionary {
 		double[] values = new double[_values.length + numCols];
 		for(int i = 0; i < _values.length; i++)
 			values[i] = op.executeScalar(_values[i]);
-		
+
 		Arrays.fill(values, _values.length, _values.length + numCols, newVal);
 		return new Dictionary(values);
 	}
@@ -208,13 +195,13 @@ public class Dictionary extends ADictionary {
 		return 1 + 4 + 8 * size();
 	}
 
-	public int size() {
-		return (_values == null) ? 0 : _values.length;
+	private int size() {
+		return _values.length;
 	}
 
 	@Override
 	public int getNumberOfValues(int nCol) {
-		return (_values == null) ? 0 : _values.length / nCol;
+		return _values.length / nCol;
 	}
 
 	@Override
@@ -234,8 +221,7 @@ public class Dictionary extends ADictionary {
 
 	@Override
 	public double sumRow(int k, boolean square, int nrColumns) {
-		if(_values == null)
-			return 0;
+
 		int valOff = k * nrColumns;
 		double res = 0.0;
 		if(!square) {
@@ -265,8 +251,6 @@ public class Dictionary extends ADictionary {
 
 	@Override
 	public void colSum(double[] c, int[] counts, int[] colIndexes, boolean square) {
-		if(_values == null)
-			return;
 		for(int k = 0; k < _values.length / colIndexes.length; k++) {
 			final int cntk = counts[k];
 			for(int j = 0; j < colIndexes.length; j++) {
@@ -282,8 +266,6 @@ public class Dictionary extends ADictionary {
 
 	@Override
 	public double sum(int[] counts, int ncol) {
-		if(_values == null)
-			return 0;
 		double out = 0;
 		int valOff = 0;
 		for(int k = 0; k < _values.length / ncol; k++) {
@@ -297,8 +279,6 @@ public class Dictionary extends ADictionary {
 
 	@Override
 	public double sumsq(int[] counts, int ncol) {
-		if(_values == null)
-			return 0;
 		double out = 0;
 		int valOff = 0;
 		for(int k = 0; k < _values.length / ncol; k++) {
@@ -322,8 +302,7 @@ public class Dictionary extends ADictionary {
 
 	@Override
 	public void addMaxAndMin(double[] ret, int[] colIndexes) {
-		if(_values == null || _values.length == 0)
-			return;
+
 		double[] mins = new double[colIndexes.length];
 		double[] maxs = new double[colIndexes.length];
 		for(int i = 0; i < colIndexes.length; i++) {
@@ -347,15 +326,14 @@ public class Dictionary extends ADictionary {
 		if(colIndexes == 1)
 			sb.append(Arrays.toString(_values));
 		else {
-			sb.append("[");
+			sb.append("[\n");
 			for(int i = 0; i < _values.length - 1; i++) {
 				sb.append(_values[i]);
-				sb.append((i) % (colIndexes) == colIndexes - 1 ? "\n: " : ", ");
-			}
-			if(_values != null && _values.length > 0) {
-				sb.append(_values[_values.length - 1]);
+				sb.append((i) % (colIndexes) == colIndexes - 1 ? "\nt" + i + ": " : ", ");
 			}
-			sb.append("]");
+			sb.append(_values[_values.length - 1]);
+
+			sb.append("\n]");
 		}
 		return sb.toString();
 	}
@@ -388,10 +366,6 @@ public class Dictionary extends ADictionary {
 
 	@Override
 	public boolean containsValue(double pattern) {
-
-		if(_values == null)
-			return false;
-
 		boolean NaNpattern = Double.isNaN(pattern);
 
 		if(NaNpattern) {
@@ -440,16 +414,6 @@ public class Dictionary extends ADictionary {
 	}
 
 	@Override
-	public long getNumberNonZerosContained() {
-		long count = 0;
-		for(double v : _values) {
-			if(v != 0.0)
-				count++;
-		}
-		return count;
-	}
-
-	@Override
 	public double[] getTuple(int index, int nCol) {
 
 		final double[] tuple = new double[nCol];
@@ -457,7 +421,7 @@ public class Dictionary extends ADictionary {
 		for(int i = index * nCol, off = 0; i < (index + 1) * nCol && i < _values.length; i++, off++) {
 			final double v = _values[i];
 			if(v != 0) {
-				tuple[off] = _values[i];
+				tuple[off] = v;
 				allZero = false;
 			}
 		}
@@ -479,6 +443,7 @@ public class Dictionary extends ADictionary {
 		final int nRow = _values.length / nCol;
 		DenseBlock dictV = new DenseBlockFP64(new int[] {nRow, nCol}, _values);
 		MatrixBlock dictM = new MatrixBlock(nRow, nCol, dictV);
+		dictM.getNonZeros();
 		dictM.examSparsity();
 		return new MatrixBlockDictionary(dictM);
 	}
@@ -506,4 +471,20 @@ public class Dictionary extends ADictionary {
 		}
 		return new Dictionary(scaledValues);
 	}
+
+	@Override
+	public void preaggValuesFromDense(int numVals, int[] colIndexes, int[] aggregateColumns, double[] b, double[] ret,
+		int cut) {
+		for(int k = 0, off = 0;
+			k < numVals * colIndexes.length;
+			k += colIndexes.length, off += aggregateColumns.length) {
+			for(int h = 0; h < colIndexes.length; h++) {
+				int idb = colIndexes[h] * cut;
+				double v = _values[k + h];
+				if(v != 0)
+					for(int i = 0; i < aggregateColumns.length; i++)
+						ret[off + i] += v * b[idb + aggregateColumns[i]];
+			}
+		}
+	}
 }
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/DictionaryFactory.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/DictionaryFactory.java
index 1fd47d8..c8bc14c 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/DictionaryFactory.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/DictionaryFactory.java
@@ -31,6 +31,7 @@ import org.apache.sysds.runtime.compress.utils.Bitmap;
 import org.apache.sysds.runtime.compress.utils.BitmapLossy;
 import org.apache.sysds.runtime.compress.utils.MultiColBitmap;
 import org.apache.sysds.runtime.data.SparseBlock;
+import org.apache.sysds.runtime.data.SparseRow;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 
 public class DictionaryFactory {
@@ -69,6 +70,10 @@ public class DictionaryFactory {
 		return create(ubm, 1.0);
 	}
 
+	public static ADictionary create(ABitmap ubm, double sparsity, boolean withZeroTuple) { // overload selecting the factory
+		return (withZeroTuple) ? createWithAppendedZeroTuple(ubm, sparsity) : create(ubm, sparsity);
+	}
+
 	public static ADictionary create(ABitmap ubm, double sparsity) {
 		if(ubm instanceof BitmapLossy)
 			return new QDictionary((BitmapLossy) ubm);
@@ -111,7 +116,6 @@ public class DictionaryFactory {
 	}
 
 	public static ADictionary createWithAppendedZeroTuple(ABitmap ubm, double sparsity) {
-		// Log.warn("Inefficient creation of dictionary, to then allocate again.");
 		final int nRows = ubm.getNumValues() + 1;
 		final int nCols = ubm.getNumColumns();
 		if(ubm instanceof Bitmap) {
@@ -149,37 +153,94 @@ public class DictionaryFactory {
 			throw new NotImplementedException(
 				"Not implemented creation of bitmap type : " + ubm.getClass().getSimpleName());
 		}
-
 	}
 
-	public static ADictionary moveFrequentToLastDictionaryEntry(ADictionary dict, ABitmap ubm, int numRows,
+	public static ADictionary moveFrequentToLastDictionaryEntry(ADictionary dict, ABitmap ubm, int nRow,
 		int largestIndex) {
-		LOG.warn("Inefficient creation of dictionary, to then allocate again to move one entry to end.");
-		final double[] dictValues = dict.getValues();
-		final int zeros = numRows - (int) ubm.getNumOffsets();
+		final int zeros = nRow - (int) ubm.getNumOffsets();
 		final int nCol = ubm.getNumColumns();
-		final int offsetToLargest = largestIndex * nCol;
+		final int largestIndexSize = ubm.getOffsetsList(largestIndex).size();
+		if(dict instanceof MatrixBlockDictionary) {
+			MatrixBlockDictionary mbd = (MatrixBlockDictionary) dict;
+			MatrixBlock mb = mbd.getMatrixBlock();
+			if(mb.isEmpty()) {
+				if(zeros == 0)
+					return dict;
+				else
+					return new MatrixBlockDictionary(new MatrixBlock(mb.getNumRows() + 1, mb.getNumColumns(), true));
+			}
+			else if(mb.isInSparseFormat()) {
+				MatrixBlockDictionary mbdn = moveToLastDictionaryEntrySparse(mb.getSparseBlock(), largestIndex, zeros,
+					nCol, largestIndexSize);
+				MatrixBlock mbn = mbdn.getMatrixBlock();
+				mbn.setNonZeros(mb.getNonZeros());
+				if(mbn.getNonZeros() == 0)
+					mbn.recomputeNonZeros();
+				return mbdn;
+			}
+			else
+				return moveToLastDictionaryEntryDense(mb.getDenseBlockValues(), largestIndex, zeros, nCol,
+					largestIndexSize);
+		}
+		else
+			return moveToLastDictionaryEntryDense(dict.getValues(), largestIndex, zeros, nCol, largestIndexSize);
+
+	}
+
+	private static MatrixBlockDictionary moveToLastDictionaryEntrySparse(SparseBlock sb, int indexToMove, int zeros,
+		int nCol, int largestIndexSize) {
+
+		if(zeros == 0) { // no extra row needed: rotate row indexToMove to the end, reusing sb itself
+			MatrixBlock ret = new MatrixBlock(sb.numRows(), nCol, true);
+			ret.setSparseBlock(sb);
+			final SparseRow swap = sb.get(indexToMove);
+			for(int i = indexToMove + 1; i < sb.numRows(); i++)
+				sb.set(i - 1, sb.get(i), false); // shift every later row up by one, in place on sb
+			sb.set(sb.numRows() - 1, swap, false);
+			return new MatrixBlockDictionary(ret);
+		}
+
+		MatrixBlock ret = new MatrixBlock(sb.numRows() + 1, nCol, true); // one row more than the input
+		ret.allocateSparseRowsBlock();
+		final SparseBlock retB = ret.getSparseBlock();
+		if(zeros > largestIndexSize) { // keep the input order; the appended last row is never set
+			for(int i = 0; i < sb.numRows(); i++)
+				retB.set(i, sb.get(i), false);
+		}
+		else { // row indexToMove goes to the new last index; row sb.numRows() - 1 of retB is never set
+			for(int i = 0; i < indexToMove; i++)
+				retB.set(i, sb.get(i), false);
+
+			retB.set(sb.numRows(), sb.get(indexToMove), false);
+			for(int i = indexToMove + 1; i < sb.numRows(); i++)
+				retB.set(i - 1, sb.get(i), false);
+		}
+		return new MatrixBlockDictionary(ret); // NOTE(review): nonZeros is not set on ret here; the visible caller sets it
+	}
+
+	private static ADictionary moveToLastDictionaryEntryDense(double[] values, int indexToMove, int zeros, int nCol,
+		int largestIndexSize) {
+		final int offsetToLargest = indexToMove * nCol;
 
 		if(zeros == 0) {
 			final double[] swap = new double[nCol];
-			System.arraycopy(dictValues, offsetToLargest, swap, 0, nCol);
-			for(int i = offsetToLargest; i < dictValues.length - nCol; i++) {
-				dictValues[i] = dictValues[i + nCol];
-			}
-			System.arraycopy(swap, 0, dictValues, dictValues.length - nCol, nCol);
-			return dict;
+			System.arraycopy(values, offsetToLargest, swap, 0, nCol);
+			for(int i = offsetToLargest; i < values.length - nCol; i++)
+				values[i] = values[i + nCol];
+
+			System.arraycopy(swap, 0, values, values.length - nCol, nCol);
+			return new Dictionary(values);
 		}
 
-		final int largestIndexSize = ubm.getOffsetsList(largestIndex).size();
-		final double[] newDict = new double[dictValues.length + nCol];
+		final double[] newDict = new double[values.length + nCol];
 
 		if(zeros > largestIndexSize)
-			System.arraycopy(dictValues, 0, newDict, 0, dictValues.length);
+			System.arraycopy(values, 0, newDict, 0, values.length);
 		else {
-			System.arraycopy(dictValues, 0, newDict, 0, offsetToLargest);
-			System.arraycopy(dictValues, offsetToLargest + nCol, newDict, offsetToLargest,
-				dictValues.length - offsetToLargest - nCol);
-			System.arraycopy(dictValues, offsetToLargest, newDict, newDict.length - nCol, nCol);
+			System.arraycopy(values, 0, newDict, 0, offsetToLargest);
+			System.arraycopy(values, offsetToLargest + nCol, newDict, offsetToLargest,
+				values.length - offsetToLargest - nCol);
+			System.arraycopy(values, offsetToLargest, newDict, newDict.length - nCol, nCol);
 		}
 		return new Dictionary(newDict);
 	}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java
index aae66e6..4c1ab04 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java
@@ -1,3 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
 package org.apache.sysds.runtime.compress.colgroup.dictionary;
 
 import java.io.DataInput;
@@ -5,6 +24,7 @@ import java.io.DataOutput;
 import java.io.IOException;
 
 import org.apache.commons.lang.NotImplementedException;
+import org.apache.sysds.runtime.DMLCompressionException;
 import org.apache.sysds.runtime.data.DenseBlockFP64;
 import org.apache.sysds.runtime.data.SparseBlock;
 import org.apache.sysds.runtime.functionobjects.Builtin;
@@ -17,595 +37,606 @@ import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
 
 public class MatrixBlockDictionary extends ADictionary {
 
-    private MatrixBlock _data;
-
-    public MatrixBlockDictionary(MatrixBlock data) {
-        _data = data;
-    }
-
-    public MatrixBlock getMatrixBlock() {
-        return _data;
-    }
-
-    @Override
-    public double[] getValues() {
-        LOG.warn("Inefficient force dense format.");
-        if(_data.isInSparseFormat())
-            _data.sparseToDense();
-        return _data.getDenseBlockValues();
-    }
-
-    @Override
-    public double getValue(int i) {
-        final int nCol = _data.getNumColumns();
-        LOG.warn("inefficient get value at index");
-        return _data.quickGetValue(i / nCol, i % nCol);
-    }
-
-    @Override
-    public int hasZeroTuple(int nCol) {
-        if(_data.isInSparseFormat()) {
-            SparseBlock sb = _data.getSparseBlock();
-            for(int i = 0; i < _data.getNumRows(); i++)
-                if(sb.isEmpty(i))
-                    return i;
-        }
-        else
-            throw new NotImplementedException();
-
-        return -1;
-    }
-
-    @Override
-    public long getInMemorySize() {
-        return 8 + _data.estimateSizeInMemory();
-    }
-
-    public static long getInMemorySize(int numberValues, int numberColumns, double sparsity) {
-        return 8 + MatrixBlock.estimateSizeInMemory(numberValues, numberColumns, sparsity);
-    }
-
-    @Override
-    public double aggregate(double init, Builtin fn) {
-        if(fn.getBuiltinCode() == BuiltinCode.MAX)
-            return fn.execute(init, _data.max());
-        else if(fn.getBuiltinCode() == BuiltinCode.MIN)
-            return fn.execute(init, _data.min());
-        else
-            throw new NotImplementedException();
-    }
-
-    @Override
-    public double[] aggregateTuples(Builtin fn, int nCol) {
-        double[] ret = new double[_data.getNumRows()];
-        if(_data.isEmpty())
-            return ret;
-        else if(_data.isInSparseFormat()) {
-            SparseBlock sb = _data.getSparseBlock();
-            for(int i = 0; i < _data.getNumRows(); i++) {
-                if(!sb.isEmpty(i)) {
-                    final int apos = sb.pos(i);
-                    final int alen = sb.size(i) + apos;
-                    final double[] avals = sb.values(i);
-                    ret[i] = avals[apos];
-                    for(int j = apos + 1; j < alen; j++)
-                        ret[i] = fn.execute(ret[i], avals[j]);
-
-                    if(sb.size(i) < _data.getNumColumns())
-                        ret[i] = fn.execute(ret[i], 0);
-                }
-                else
-                    ret[i] = fn.execute(ret[i], 0);
-            }
-        }
-        else if(nCol == 1)
-            return _data.getDenseBlockValues();
-        else {
-            double[] values = _data.getDenseBlockValues();
-            int off = 0;
-            for(int k = 0; k < _data.getNumRows(); k++) {
-                ret[k] = values[off++];
-                for(int j = 1; j < _data.getNumColumns(); j++)
-                    ret[k] = fn.execute(ret[k], values[off++]);
-            }
-        }
-        return ret;
-    }
-
-    @Override
-    public void aggregateCols(double[] c, Builtin fn, int[] colIndexes) {
-        if(_data.isEmpty()) {
-            for(int j = 0; j < colIndexes.length; j++) {
-                final int idx = colIndexes[j];
-                c[idx] = fn.execute(c[idx], 0);
-            }
-        }
-        else if(_data.isInSparseFormat()) {
-            MatrixBlock t = LibMatrixReorg.transposeInPlace(_data, 1);
-            if(!t.isInSparseFormat()) {
-                throw new NotImplementedException();
-            }
-            SparseBlock sbt = t.getSparseBlock();
-
-            for(int i = 0; i < _data.getNumColumns(); i++) {
-                final int idx = colIndexes[i];
-                if(!sbt.isEmpty(i)) {
-                    final int apos = sbt.pos(i);
-                    final int alen = sbt.size(i) + apos;
-                    final double[] avals = sbt.values(i);
-                    for(int j = apos; j < alen; j++)
-                        c[idx] = fn.execute(c[idx], avals[j]);
-                    if(avals.length != _data.getNumRows())
-                        c[idx] = fn.execute(c[idx], 0);
-                }
-                else
-                    c[idx] = fn.execute(c[idx], 0);
-            }
-        }
-        else {
-            double[] values = _data.getDenseBlockValues();
-            int off = 0;
-            for(int k = 0; k < _data.getNumRows(); k++) {
-                for(int j = 0; j < _data.getNumColumns(); j++) {
-                    final int idx = colIndexes[j];
-                    c[idx] = fn.execute(c[idx], values[off++]);
-                }
-            }
-        }
-    }
-
-    @Override
-    public int size() {
-        return (int) _data.getNonZeros();
-    }
-
-    @Override
-    public ADictionary apply(ScalarOperator op) {
-        MatrixBlock res = _data.scalarOperations(op, new MatrixBlock());
-        return new MatrixBlockDictionary(res);
-    }
-
-    @Override
-    public ADictionary applyScalarOp(ScalarOperator op, double newVal, int numCols) {
-        MatrixBlock res = _data.scalarOperations(op, new MatrixBlock());
-        final int lastRow = res.getNumRows();
-        MatrixBlock res2 = new MatrixBlock(lastRow + 1, res.getNumColumns(), true);
-        if(res.isEmpty()) {
-            for(int i = 0; i < numCols; i++)
-                res2.appendValue(lastRow, i, newVal);
-            return new MatrixBlockDictionary(res2);
-        }
-        else {
-            res.append(new MatrixBlock(1, numCols, newVal), res2, false);
-            return new MatrixBlockDictionary(res2);
-        }
-    }
-
-    @Override
-    public ADictionary applyBinaryRowOpLeft(BinaryOperator op, double[] v, boolean sparseSafe, int[] colIndexes) {
-        MatrixBlock rowVector = new MatrixBlock(1, colIndexes.length, false);
-        for(int i = 0; i < colIndexes.length; i++)
-            rowVector.quickSetValue(0, i, v[colIndexes[i]]);
-        MatrixBlock res = new MatrixBlock();
-        if(sparseSafe) {
-            rowVector.binaryOperations(op, _data, res);
-        }
-        else {
-            if(!_data.isInSparseFormat())
-                LOG.warn("Inefficient binary row op allocating Matrix multiple times");
-            MatrixBlock tmp = new MatrixBlock();
-            tmp = _data.append(new MatrixBlock(1, _data.getNumColumns(), 0), tmp, false);
-            rowVector.binaryOperations(op, tmp, res);
-
-        }
-        return new MatrixBlockDictionary(res);
-    }
-
-    @Override
-    public ADictionary applyBinaryRowOpRight(BinaryOperator op, double[] v, boolean sparseSafe, int[] colIndexes) {
-        MatrixBlock rowVector = new MatrixBlock(1, colIndexes.length, false);
-        for(int i = 0; i < colIndexes.length; i++)
-            rowVector.quickSetValue(0, i, v[colIndexes[i]]);
-        MatrixBlock res = new MatrixBlock();
-        if(sparseSafe) {
-            _data.binaryOperations(op, rowVector, res);
-        }
-        else {
-            if(!_data.isInSparseFormat())
-                LOG.warn("Inefficient binary row op allocating Matrix multiple times");
-            MatrixBlock tmp = new MatrixBlock();
-            tmp = _data.append(new MatrixBlock(1, _data.getNumColumns(), 0), tmp, false);
-            tmp.binaryOperations(op, rowVector, res);
-        }
-        return new MatrixBlockDictionary(res);
-    }
-
-    @Override
-    public ADictionary clone() {
-        MatrixBlock ret = new MatrixBlock();
-        ret.copy(_data);
-        return new MatrixBlockDictionary(ret);
-    }
-
-    @Override
-    public ADictionary cloneAndExtend(int len) {
-        throw new NotImplementedException();
-    }
-
-    @Override
-    public boolean isLossy() {
-        return false;
-    }
-
-    @Override
-    public int getNumberOfValues(int ncol) {
-        return _data.getNumRows();
-    }
-
-    @Override
-    public double[] sumAllRowsToDouble(boolean square, int nrColumns) {
-        double[] ret = new double[_data.getNumRows()];
-
-        if(_data.isEmpty())
-            return ret;
-        else if(_data.isInSparseFormat()) {
-            SparseBlock sb = _data.getSparseBlock();
-            for(int i = 0; i < _data.getNumRows(); i++) {
-                if(!sb.isEmpty(i)) {
-                    final int apos = sb.pos(i);
-                    final int alen = sb.size(i) + apos;
-                    final double[] avals = sb.values(i);
-                    for(int j = apos; j < alen; j++) {
-                        ret[i] += (square) ? avals[j] * avals[j] : avals[j];
-                    }
-                }
-            }
-        }
-        else {
-            double[] values = _data.getDenseBlockValues();
-            int off = 0;
-            for(int k = 0; k < _data.getNumRows(); k++) {
-                for(int j = 0; j < _data.getNumColumns(); j++) {
-                    final double v = values[off++];
-                    ret[k] += (square) ? v * v : v;
-                }
-            }
-        }
-        return ret;
-    }
-
-    @Override
-    public double sumRow(int k, boolean square, int nrColumns) {
-        throw new NotImplementedException();
-    }
-
-    @Override
-    public double[] colSum(int[] counts, int nCol) {
-        if(_data.isEmpty())
-            return null;
-        double[] ret = new double[nCol];
-        if(_data.isInSparseFormat()) {
-            SparseBlock sb = _data.getSparseBlock();
-            for(int i = 0; i < _data.getNumRows(); i++) {
-                if(!sb.isEmpty(i)) {
-                    // double tmpSum = 0;
-                    final int count = counts[i];
-                    final int apos = sb.pos(i);
-                    final int alen = sb.size(i) + apos;
-                    final int[] aix = sb.indexes(i);
-                    final double[] avals = sb.values(i);
-                    for(int j = apos; j < alen; j++) {
-                        ret[aix[j]] += count * avals[j];
-                    }
-                }
-            }
-        }
-        else {
-            double[] values = _data.getDenseBlockValues();
-            int off = 0;
-            for(int k = 0; k < _data.getNumRows(); k++) {
-                final int countK = counts[k];
-                for(int j = 0; j < _data.getNumColumns(); j++) {
-                    final double v = values[off++];
-                    ret[j] += v * countK;
-                }
-            }
-        }
-        return ret;
-    }
-
-    @Override
-    public void colSum(double[] c, int[] counts, int[] colIndexes, boolean square) {
-        if(_data.isEmpty())
-            return;
-        if(_data.isInSparseFormat()) {
-            SparseBlock sb = _data.getSparseBlock();
-            for(int i = 0; i < _data.getNumRows(); i++) {
-                if(!sb.isEmpty(i)) {
-                    // double tmpSum = 0;
-                    final int count = counts[i];
-                    final int apos = sb.pos(i);
-                    final int alen = sb.size(i) + apos;
-                    final int[] aix = sb.indexes(i);
-                    final double[] avals = sb.values(i);
-                    for(int j = apos; j < alen; j++) {
-                        c[colIndexes[aix[j]]] += square ? count * avals[j] * avals[j] : count * avals[j];
-                    }
-                }
-            }
-        }
-        else {
-            double[] values = _data.getDenseBlockValues();
-            int off = 0;
-            for(int k = 0; k < _data.getNumRows(); k++) {
-                final int countK = counts[k];
-                for(int j = 0; j < _data.getNumColumns(); j++) {
-                    final double v = values[off++];
-                    c[colIndexes[j]] += square ? v * v * countK : v * countK;
-                }
-            }
-        }
-    }
-
-    @Override
-    public double sum(int[] counts, int ncol) {
-        double tmpSum = 0;
-        if(_data.isEmpty())
-            return tmpSum;
-        if(_data.isInSparseFormat()) {
-            SparseBlock sb = _data.getSparseBlock();
-            for(int i = 0; i < _data.getNumRows(); i++) {
-                if(!sb.isEmpty(i)) {
-                    final int count = counts[i];
-                    final int apos = sb.pos(i);
-                    final int alen = sb.size(i) + apos;
-                    final double[] avals = sb.values(i);
-                    for(int j = apos; j < alen; j++) {
-                        tmpSum += count * avals[j];
-                    }
-                }
-            }
-        }
-        else {
-            double[] values = _data.getDenseBlockValues();
-            int off = 0;
-            for(int k = 0; k < _data.getNumRows(); k++) {
-                final int countK = counts[k];
-                for(int j = 0; j < _data.getNumColumns(); j++) {
-                    final double v = values[off++];
-                    tmpSum += v * countK;
-                }
-            }
-        }
-        return tmpSum;
-    }
-
-    @Override
-    public double sumsq(int[] counts, int ncol) {
-        double tmpSum = 0;
-        if(_data.isEmpty())
-            return tmpSum;
-        if(_data.isInSparseFormat()) {
-            SparseBlock sb = _data.getSparseBlock();
-            for(int i = 0; i < _data.getNumRows(); i++) {
-                if(!sb.isEmpty(i)) {
-                    final int count = counts[i];
-                    final int apos = sb.pos(i);
-                    final int alen = sb.size(i) + apos;
-                    final double[] avals = sb.values(i);
-                    for(int j = apos; j < alen; j++) {
-                        tmpSum += count * avals[j] * avals[j];
-                    }
-                }
-            }
-        }
-        else {
-            double[] values = _data.getDenseBlockValues();
-            int off = 0;
-            for(int k = 0; k < _data.getNumRows(); k++) {
-                final int countK = counts[k];
-                for(int j = 0; j < _data.getNumColumns(); j++) {
-                    final double v = values[off++];
-                    tmpSum += v * v * countK;
-                }
-            }
-        }
-        return tmpSum;
-    }
-
-    @Override
-    public String getString(int colIndexes) {
-        return _data.toString();
-    }
-
-    @Override
-    public void addMaxAndMin(double[] ret, int[] colIndexes) {
-        throw new NotImplementedException();
-    }
-
-    @Override
-    public ADictionary sliceOutColumnRange(int idxStart, int idxEnd, int previousNumberOfColumns) {
-        MatrixBlock retBlock = _data.slice(0, _data.getNumRows() - 1, idxStart, idxEnd - 1);
-        return new MatrixBlockDictionary(retBlock);
-    }
-
-    @Override
-    public ADictionary reExpandColumns(int max) {
-        throw new NotImplementedException();
-    }
-
-    @Override
-    public boolean containsValue(double pattern) {
-        return _data.containsValue(pattern);
-    }
-
-    @Override
-    public long getNumberNonZeros(int[] counts, int nCol) {
-        if(_data.isEmpty())
-            return 0;
-
-        long nnz = 0;
-        if(_data.isInSparseFormat()) {
-            SparseBlock sb = _data.getSparseBlock();
-            for(int i = 0; i < _data.getNumRows(); i++)
-                if(!sb.isEmpty(i))
-                    nnz += sb.size(i) * counts[i];
-
-        }
-        else {
-            double[] values = _data.getDenseBlockValues();
-            int off = 0;
-            for(int i = 0; i < _data.getNumRows(); i++) {
-                int countThisTuple = 0;
-                for(int j = 0; j < _data.getNumColumns(); j++) {
-                    double v = values[off++];
-                    if(v != 0)
-                        countThisTuple++;
-                }
-                nnz += countThisTuple * counts[i];
-            }
-        }
-        return nnz;
-    }
-
-    @Override
-    public long getNumberNonZerosContained() {
-        throw new NotImplementedException();
-    }
-
-    @Override
-    public void addToEntry(Dictionary d, int fr, int to, int nCol) {
-        double[] v = d.getValues();
-        if(_data.isEmpty())
-            return;
-        else if(_data.isInSparseFormat()) {
-            SparseBlock sb = _data.getSparseBlock();
-            if(sb.isEmpty(fr))
-                return;
-            final int apos = sb.pos(fr);
-            final int alen = sb.size(fr) + apos;
-            final int[] aix = sb.indexes(fr);
-            final double[] avals = sb.values(fr);
-            final int offsetTo = nCol * to;
-            for(int j = apos; j < alen; j++) {
-                v[offsetTo + aix[j]] += avals[j];
-            }
-        }
-        else {
-            final int sf = nCol * fr; // start from
-            final int ef = sf + nCol; // end from
-            final double[] thisV = _data.getDenseBlockValues();
-            for(int i = sf, j = nCol * to; i < ef; i++, j++) {
-                v[j] += thisV[i];
-            }
-        }
-    }
-
-    @Override
-    public double[] getTuple(int index, int nCol) {
-        if(_data.isEmpty() || index >= _data.getNumRows())
-            return null;
-        else if(_data.isInSparseFormat()) {
-            SparseBlock sb = _data.getSparseBlock();
-            if(sb.isEmpty(index))
-                return null;
-            double[] tuple = new double[nCol];
-            final int apos = sb.pos(index);
-            final int alen = sb.size(index) + apos;
-            final int[] aix = sb.indexes(index);
-            final double[] avals = sb.values(index);
-            for(int j = apos; j < alen; j++) {
-                tuple[aix[j]] = avals[j];
-            }
-            return tuple;
-        }
-        else {
-            double[] tuple = new double[nCol];
-            double[] values = _data.getDenseBlockValues();
-            int offset = index * nCol;
-            for(int i = 0; i < nCol; i++, offset++)
-                tuple[i] = values[offset];
-            return tuple;
-        }
-    }
-
-    @Override
-    public ADictionary subtractTuple(double[] tuple) {
-        DenseBlockFP64 b = new DenseBlockFP64(new int[] {1, tuple.length}, tuple);
-        MatrixBlock rowVector = new MatrixBlock(1, tuple.length, b);
-        MatrixBlock res = new MatrixBlock(_data.getNumColumns(), _data.getNumRows(), _data.isInSparseFormat());
-        _data.binaryOperations(new BinaryOperator(Minus.getMinusFnObject()), rowVector, res);
-        return new MatrixBlockDictionary(res);
-    }
-
-    @Override
-    public MatrixBlockDictionary getAsMatrixBlockDictionary(int nCol) {
-        // Simply return this.
-        return this;
-    }
-
-    @Override
-    public String toString() {
-        return "MatrixBlock Dictionary :" + _data.toString();
-    }
-
-    @Override
-    public ADictionary scaleTuples(int[] scaling, int nCol) {
-        if(_data.isEmpty()) {
-            throw new NotImplementedException("could return null here? or empty DictionaryMatrixBlock...");
-        }
-        else if(_data.isInSparseFormat()) {
-            MatrixBlock retBlock = new MatrixBlock(_data.getNumRows(), _data.getNumColumns(), true);
-            retBlock.allocateSparseRowsBlock(true);
-            SparseBlock sbRet = retBlock.getSparseBlock();
-            SparseBlock sbThis = _data.getSparseBlock();
-            for(int i = 0; i < _data.getNumRows(); i++) {
-                if(!sbThis.isEmpty(i)) {
-                    sbRet.set(i, sbThis.get(i), true);
-
-                    final int count = scaling[i];
-                    final int apos = sbRet.pos(i);
-                    final int alen = sbRet.size(i) + apos;
-                    final double[] avals = sbRet.values(i);
-                    for(int j = apos; j < alen; j++)
-                        avals[j] = count * avals[j];
-                }
-            }
-            retBlock.setNonZeros(_data.getNonZeros());
-            return new MatrixBlockDictionary(retBlock);
-        }
-        else {
-            final double[] _values = _data.getDenseBlockValues();
-            final double[] scaledValues = new double[_values.length];
-            int off = 0;
-            for(int tuple = 0; tuple < _values.length / nCol; tuple++) {
-                final int scale = scaling[tuple];
-                for(int v = 0; v < nCol; v++) {
-                    scaledValues[off] = _values[off] * scale;
-                    off++;
-                }
-            }
-            DenseBlockFP64 db = new DenseBlockFP64(new int[] {_data.getNumRows(), _data.getNumColumns()}, scaledValues);
-            MatrixBlock retBlock = new MatrixBlock(_data.getNumRows(), _data.getNumColumns(), db);
-            retBlock.setNonZeros(_data.getNonZeros());
-            return new MatrixBlockDictionary(retBlock);
-        }
-    }
-
-    @Override
-    public void write(DataOutput out) throws IOException {
-        out.writeByte(DictionaryFactory.Type.MATRIX_BLOCK_DICT.ordinal());
-        _data.write(out);
-    }
-
-    public static MatrixBlockDictionary read(DataInput in) throws IOException {
-        MatrixBlock ret = new MatrixBlock();
-        ret.readFields(in);
-        return new MatrixBlockDictionary(ret);
-    }
-
-    @Override
-    public long getExactSizeOnDisk() {
-        return 1 + _data.getExactSizeOnDisk();
-    }
+	private MatrixBlock _data;
+
+	public MatrixBlockDictionary(MatrixBlock data) {
+		_data = data;
+	}
+
+	public MatrixBlock getMatrixBlock() {
+		return _data;
+	}
+
+	@Override
+	public double[] getValues() {
+		throw new DMLCompressionException("Get Values should not be called when you have a MatrixBlockDictionary");
+	}
+
+	@Override
+	public double getValue(int i) { // i is a linearized cell index: row = i / nCol, col = i % nCol
+		final int nCol = _data.getNumColumns();
+		final int row = i / nCol;
+		if(row >= _data.getNumRows()) // valid rows are 0 .. numRows - 1; anything past the end reads as 0
+			return 0;
+		final int col = i % nCol;
+		return _data.quickGetValue(row, col);
+	}
+
+	@Override
+	public long getInMemorySize() {
+		return 8 + _data.estimateSizeInMemory();
+	}
+
+	public static long getInMemorySize(int numberValues, int numberColumns, double sparsity) {
+		return 8 + MatrixBlock.estimateSizeInMemory(numberValues, numberColumns, sparsity);
+	}
+
+	@Override
+	public double aggregate(double init, Builtin fn) {
+		if(fn.getBuiltinCode() == BuiltinCode.MAX)
+			return fn.execute(init, _data.max());
+		else if(fn.getBuiltinCode() == BuiltinCode.MIN)
+			return fn.execute(init, _data.min());
+		else
+			throw new NotImplementedException();
+	}
+
+	@Override
+	public double[] aggregateTuples(Builtin fn, int nCol) { // folds each dictionary row with fn into one value per row
+		double[] ret = new double[_data.getNumRows()];
+		if(_data.isEmpty())
+			return ret; // all-zero result for an empty block
+		else if(_data.isInSparseFormat()) {
+			SparseBlock sb = _data.getSparseBlock();
+			for(int i = 0; i < _data.getNumRows(); i++) {
+				if(!sb.isEmpty(i)) {
+					final int apos = sb.pos(i);
+					final int alen = sb.size(i) + apos; // end offset (exclusive) of row i in avals
+					final double[] avals = sb.values(i);
+					ret[i] = avals[apos]; // seed with the first stored value of the row
+					for(int j = apos + 1; j < alen; j++)
+						ret[i] = fn.execute(ret[i], avals[j]);
+
+					if(sb.size(i) < _data.getNumColumns()) // fewer stored cells than columns: fold in a 0 as well
+						ret[i] = fn.execute(ret[i], 0);
+				}
+				else
+					ret[i] = fn.execute(ret[i], 0);
+			}
+		}
+		else if(nCol == 1)
+			return _data.getDenseBlockValues(); // NOTE(review): returns the block's own array, not a copy - confirm callers do not mutate it
+		else {
+			double[] values = _data.getDenseBlockValues();
+			int off = 0; // running index into values, advanced once per cell read
+			for(int k = 0; k < _data.getNumRows(); k++) {
+				ret[k] = values[off++];
+				for(int j = 1; j < _data.getNumColumns(); j++)
+					ret[k] = fn.execute(ret[k], values[off++]);
+			}
+		}
+		return ret;
+	}
+
+	@Override
+	public void aggregateCols(double[] c, Builtin fn, int[] colIndexes) { // folds each dictionary column into c[colIndexes[j]]
+		if(_data.isEmpty()) {
+			for(int j = 0; j < colIndexes.length; j++) {
+				final int idx = colIndexes[j];
+				c[idx] = fn.execute(c[idx], 0); // an empty block only contributes zeros
+			}
+		}
+		else if(_data.isInSparseFormat()) {
+			MatrixBlock t = LibMatrixReorg.transposeInPlace(_data, 1); // NOTE(review): confirm this leaves _data's dimensions untouched
+			if(!t.isInSparseFormat()) {
+				throw new NotImplementedException();
+			}
+			SparseBlock sbt = t.getSparseBlock();
+
+			for(int i = 0; i < _data.getNumColumns(); i++) {
+				final int idx = colIndexes[i];
+				if(!sbt.isEmpty(i)) {
+					final int apos = sbt.pos(i);
+					final int alen = sbt.size(i) + apos; // end offset (exclusive), not a count
+					final double[] avals = sbt.values(i);
+					for(int j = apos; j < alen; j++)
+						c[idx] = fn.execute(c[idx], avals[j]);
+					if(sbt.size(i) < _data.getNumRows()) // compare the stored count, not the end offset, so pos(i) != 0 is handled
+						c[idx] = fn.execute(c[idx], 0);
+				}
+				else
+					c[idx] = fn.execute(c[idx], 0);
+			}
+		}
+		else {
+			double[] values = _data.getDenseBlockValues();
+			int off = 0; // running index into values, advanced once per cell read
+			for(int k = 0; k < _data.getNumRows(); k++) {
+				for(int j = 0; j < _data.getNumColumns(); j++) {
+					final int idx = colIndexes[j];
+					c[idx] = fn.execute(c[idx], values[off++]);
+				}
+			}
+		}
+	}
+
+	/**
+	 * Applies the scalar operator to the backing matrix block and wraps the freshly allocated result in a new
+	 * dictionary.
+	 * 
+	 * @param op The scalar operator to apply.
+	 * @return A new dictionary holding the result of the operation.
+	 */
+	@Override
+	public ADictionary apply(ScalarOperator op) {
+		return new MatrixBlockDictionary(_data.scalarOperations(op, new MatrixBlock()));
+	}
+
+	/**
+	 * Applies the scalar operator to the backing matrix block and returns a dictionary with one additional
+	 * trailing row in which {@code numCols} cells are set to {@code newVal}.
+	 * 
+	 * @param op      The scalar operator to apply to the existing values.
+	 * @param newVal  The value written into the additional row.
+	 * @param numCols The number of cells of the additional row to fill.
+	 * @return A new dictionary with the operator result followed by the extra row.
+	 */
+	@Override
+	public ADictionary applyScalarOp(ScalarOperator op, double newVal, int numCols) {
+		final MatrixBlock applied = _data.scalarOperations(op, new MatrixBlock());
+		// Index of the extra row equals the row count of the operator result.
+		final int extraRow = applied.getNumRows();
+		final MatrixBlock extended = new MatrixBlock(extraRow + 1, applied.getNumColumns(), true);
+		if(!applied.isEmpty())
+			applied.append(new MatrixBlock(1, numCols, newVal), extended, false);
+		else
+			// The operator result has no values, so only the extra row has to be written.
+			for(int col = 0; col < numCols; col++)
+				extended.appendValue(extraRow, col, newVal);
+		return new MatrixBlockDictionary(extended);
+	}
+
+	@Override
+	public ADictionary applyBinaryRowOpLeft(BinaryOperator op, double[] v, boolean sparseSafe, int[] colIndexes) {
+		MatrixBlock rowVector = new MatrixBlock(1, colIndexes.length, false);
+		for(int i = 0; i < colIndexes.length; i++)
+			rowVector.quickSetValue(0, i, v[colIndexes[i]]);
+		MatrixBlock res = new MatrixBlock();
+		if(sparseSafe)
+			rowVector.binaryOperations(op, _data, res);
+		else {
+			MatrixBlock tmp = new MatrixBlock();
+			tmp = _data.append(new MatrixBlock(1, _data.getNumColumns(), 0), tmp, false);
+			rowVector.binaryOperations(op, tmp, res);
+		}
+		return new MatrixBlockDictionary(res);
+	}
+
+	/**
+	 * Applies a binary operator with the dictionary as the left operand and a row vector as the right operand.
+	 * 
+	 * The row vector is gathered from {@code v} at the positions given by {@code colIndexes}. When the operation
+	 * is not sparse safe, the dictionary is first extended by appending a {@code 1 x numColumns} block; a warning
+	 * is logged if the dictionary is not in sparse format in that case.
+	 * 
+	 * @param op         The binary operator to apply.
+	 * @param v          The values to pick the row vector from.
+	 * @param sparseSafe Whether the operation is sparse safe.
+	 * @param colIndexes The positions in {@code v} that make up the row vector.
+	 * @return A new dictionary holding the result.
+	 */
+	@Override
+	public ADictionary applyBinaryRowOpRight(BinaryOperator op, double[] v, boolean sparseSafe, int[] colIndexes) {
+		final int len = colIndexes.length;
+		final MatrixBlock right = new MatrixBlock(1, len, false);
+		for(int c = 0; c < len; c++)
+			right.quickSetValue(0, c, v[colIndexes[c]]);
+
+		final MatrixBlock out = new MatrixBlock();
+		if(!sparseSafe) {
+			if(!_data.isInSparseFormat())
+				LOG.warn("Inefficient binary row op allocating Matrix multiple times");
+			final MatrixBlock padded = _data.append(new MatrixBlock(1, _data.getNumColumns(), 0), new MatrixBlock(),
+				false);
+			padded.binaryOperations(op, right, out);
+		}
+		else
+			_data.binaryOperations(op, right, out);
+		return new MatrixBlockDictionary(out);
+	}
+
+	/**
+	 * Creates a new dictionary backed by a new matrix block into which the content of this dictionary's block
+	 * has been copied.
+	 * 
+	 * @return The copied dictionary.
+	 */
+	@Override
+	public ADictionary clone() {
+		final MatrixBlock copied = new MatrixBlock();
+		copied.copy(_data);
+		return new MatrixBlockDictionary(copied);
+	}
+
+	/**
+	 * Not supported by this dictionary type.
+	 * 
+	 * @param len Unused.
+	 * @throws NotImplementedException always.
+	 */
+	@Override
+	public ADictionary cloneAndExtend(int len) {
+		throw new NotImplementedException();
+	}
+
+	/**
+	 * Reports whether this dictionary is lossy.
+	 * 
+	 * @return Always {@code false} for this dictionary type.
+	 */
+	@Override
+	public boolean isLossy() {
+		return false;
+	}
+
+	/**
+	 * Returns the number of rows of the backing matrix block.
+	 * 
+	 * @param ncol Unused by this implementation.
+	 * @return The number of rows in the backing matrix block.
+	 */
+	@Override
+	public int getNumberOfValues(int ncol) {
+		return _data.getNumRows();
+	}
+
+	/**
+	 * Computes one sum per dictionary row, optionally over the squared values.
+	 * 
+	 * @param square    If true, each value is squared before it is added.
+	 * @param nrColumns Unused by this implementation.
+	 * @return An array with one entry per row of the backing block; all zeros if the block is empty.
+	 */
+	@Override
+	public double[] sumAllRowsToDouble(boolean square, int nrColumns) {
+		final int nRow = _data.getNumRows();
+		final double[] rowSums = new double[nRow];
+		if(_data.isEmpty())
+			return rowSums;
+
+		if(_data.isInSparseFormat()) {
+			final SparseBlock sb = _data.getSparseBlock();
+			for(int r = 0; r < nRow; r++) {
+				if(sb.isEmpty(r))
+					continue; // row sum stays 0
+				final int start = sb.pos(r);
+				final int end = sb.size(r) + start;
+				final double[] vals = sb.values(r);
+				for(int p = start; p < end; p++) {
+					final double v = vals[p];
+					rowSums[r] += square ? v * v : v;
+				}
+			}
+		}
+		else {
+			final double[] vals = _data.getDenseBlockValues();
+			final int nCol = _data.getNumColumns();
+			// p walks the dense values sequentially across all rows.
+			for(int r = 0, p = 0; r < nRow; r++) {
+				for(int c = 0; c < nCol; c++, p++) {
+					final double v = vals[p];
+					rowSums[r] += square ? v * v : v;
+				}
+			}
+		}
+		return rowSums;
+	}
+
+	/**
+	 * Not supported by this dictionary type.
+	 * 
+	 * @param k         Unused.
+	 * @param square    Unused.
+	 * @param nrColumns Unused.
+	 * @throws NotImplementedException always.
+	 */
+	@Override
+	public double sumRow(int k, boolean square, int nrColumns) {
+		throw new NotImplementedException();
+	}
+
+	@Override
+	public double[] colSum(int[] counts, int nCol) {
+		if(_data.isEmpty())
+			return null;
+		double[] ret = new double[nCol];
+		if(_data.isInSparseFormat()) {
+			SparseBlock sb = _data.getSparseBlock();
+			for(int i = 0; i < _data.getNumRows(); i++) {
+				if(!sb.isEmpty(i)) {
+					// double tmpSum = 0;
+					final int count = counts[i];
+					final int apos = sb.pos(i);
+					final int alen = sb.size(i) + apos;
+					final int[] aix = sb.indexes(i);
+					final double[] avals = sb.values(i);
+					for(int j = apos; j < alen; j++) {
+						ret[aix[j]] += count * avals[j];
+					}
+				}
+			}
+		}
+		else {
+			double[] values = _data.getDenseBlockValues();
+			int off = 0;
+			for(int k = 0; k < _data.getNumRows(); k++) {
+				final int countK = counts[k];
+				for(int j = 0; j < _data.getNumColumns(); j++) {
+					final double v = values[off++];
+					ret[j] += v * countK;
+				}
+			}
+		}
+		return ret;
+	}
+
+	/**
+	 * Adds the column sums of the dictionary, weighted per row by {@code counts}, into the result array
+	 * {@code c}. Dictionary column {@code j} is accumulated into {@code c[colIndexes[j]]}.
+	 * 
+	 * @param c          The result array, updated in place; untouched if the backing block is empty.
+	 * @param counts     The weight applied to each dictionary row.
+	 * @param colIndexes The positions in {@code c} that the dictionary columns map to.
+	 * @param square     If true, each value is squared before it is weighted and added.
+	 */
+	@Override
+	public void colSum(double[] c, int[] counts, int[] colIndexes, boolean square) {
+		if(_data.isEmpty())
+			return;
+
+		final int nRow = _data.getNumRows();
+		if(!_data.isInSparseFormat()) {
+			final double[] vals = _data.getDenseBlockValues();
+			final int width = _data.getNumColumns();
+			for(int r = 0, p = 0; r < nRow; r++) {
+				final int weight = counts[r];
+				for(int col = 0; col < width; col++, p++) {
+					final double v = vals[p];
+					if(square)
+						c[colIndexes[col]] += v * v * weight;
+					else
+						c[colIndexes[col]] += v * weight;
+				}
+			}
+			return;
+		}
+
+		final SparseBlock sb = _data.getSparseBlock();
+		for(int r = 0; r < nRow; r++) {
+			if(sb.isEmpty(r))
+				continue;
+			final int weight = counts[r];
+			final int start = sb.pos(r);
+			final int end = sb.size(r) + start;
+			final int[] cols = sb.indexes(r);
+			final double[] vals = sb.values(r);
+			for(int p = start; p < end; p++) {
+				if(square)
+					c[colIndexes[cols[p]]] += weight * vals[p] * vals[p];
+				else
+					c[colIndexes[cols[p]]] += weight * vals[p];
+			}
+		}
+	}
+
+	@Override
+	public double sum(int[] counts, int ncol) {
+		double tmpSum = 0;
+		if(_data.isEmpty())
+			return tmpSum;
+		if(_data.isInSparseFormat()) {
+			SparseBlock sb = _data.getSparseBlock();
+			for(int i = 0; i < _data.getNumRows(); i++) {
+				if(!sb.isEmpty(i)) {
+					final int count = counts[i];
+					final int apos = sb.pos(i);
+					final int alen = sb.size(i) + apos;
+					final double[] avals = sb.values(i);
+					for(int j = apos; j < alen; j++) {
+						tmpSum += count * avals[j];
+					}
+				}
+			}
+		}
+		else {
+			double[] values = _data.getDenseBlockValues();
+			int off = 0;
+			for(int k = 0; k < _data.getNumRows(); k++) {
+				final int countK = counts[k];
+				for(int j = 0; j < _data.getNumColumns(); j++) {
+					final double v = values[off++];
+					tmpSum += v * countK;
+				}
+			}
+		}
+		return tmpSum;
+	}
+
+	@Override
+	public double sumsq(int[] counts, int ncol) {
+		double tmpSum = 0;
+		if(_data.isEmpty())
+			return tmpSum;
+		if(_data.isInSparseFormat()) {
+			SparseBlock sb = _data.getSparseBlock();
+			for(int i = 0; i < _data.getNumRows(); i++) {
+				if(!sb.isEmpty(i)) {
+					final int count = counts[i];
+					final int apos = sb.pos(i);
+					final int alen = sb.size(i) + apos;
+					final double[] avals = sb.values(i);
+					for(int j = apos; j < alen; j++) {
+						tmpSum += count * avals[j] * avals[j];
+					}
+				}
+			}
+		}
+		else {
+			double[] values = _data.getDenseBlockValues();
+			int off = 0;
+			for(int k = 0; k < _data.getNumRows(); k++) {
+				final int countK = counts[k];
+				for(int j = 0; j < _data.getNumColumns(); j++) {
+					final double v = values[off++];
+					tmpSum += v * v * countK;
+				}
+			}
+		}
+		return tmpSum;
+	}
+
+	/**
+	 * Returns the string representation of the backing matrix block.
+	 * 
+	 * @param colIndexes Unused by this implementation.
+	 * @return The result of {@code toString()} on the backing matrix block.
+	 */
+	@Override
+	public String getString(int colIndexes) {
+		return _data.toString();
+	}
+
+	/**
+	 * Not supported by this dictionary type.
+	 * 
+	 * @param ret        Unused.
+	 * @param colIndexes Unused.
+	 * @throws NotImplementedException always.
+	 */
+	@Override
+	public void addMaxAndMin(double[] ret, int[] colIndexes) {
+		throw new NotImplementedException();
+	}
+
+	/**
+	 * Creates a new dictionary from a column range of this dictionary, keeping all rows.
+	 * 
+	 * The slice is requested for rows {@code 0} to {@code numRows - 1} and columns {@code idxStart} to
+	 * {@code idxEnd - 1}.
+	 * 
+	 * @param idxStart                The first column passed to the slice.
+	 * @param idxEnd                  One past the last column passed to the slice.
+	 * @param previousNumberOfColumns Unused by this implementation.
+	 * @return A new dictionary wrapping the sliced matrix block.
+	 */
+	@Override
+	public ADictionary sliceOutColumnRange(int idxStart, int idxEnd, int previousNumberOfColumns) {
+		final int lastRow = _data.getNumRows() - 1;
+		final int lastCol = idxEnd - 1;
+		return new MatrixBlockDictionary(_data.slice(0, lastRow, idxStart, lastCol));
+	}
+
+	/**
+	 * Not supported by this dictionary type.
+	 * 
+	 * @param max Unused.
+	 * @throws NotImplementedException always.
+	 */
+	@Override
+	public ADictionary reExpandColumns(int max) {
+		throw new NotImplementedException();
+	}
+
+	/**
+	 * Checks for a value in the dictionary by delegating to the backing matrix block.
+	 * 
+	 * @param pattern The value to look for.
+	 * @return The result of {@code containsValue(pattern)} on the backing matrix block.
+	 */
+	@Override
+	public boolean containsValue(double pattern) {
+		return _data.containsValue(pattern);
+	}
+
+	/**
+	 * Counts the non-zero values of the dictionary, where the count of each row is multiplied by its entry in
+	 * {@code counts}.
+	 * 
+	 * The per-row products are computed in {@code long} arithmetic so that a large count multiplied by the
+	 * number of non-zeros in a row cannot overflow the {@code int} range before it is accumulated.
+	 * 
+	 * @param counts The multiplier applied to the non-zero count of each dictionary row.
+	 * @param nCol   Unused by this implementation.
+	 * @return The weighted number of non-zero values; 0 if the backing block is empty.
+	 */
+	@Override
+	public long getNumberNonZeros(int[] counts, int nCol) {
+		if(_data.isEmpty())
+			return 0;
+
+		long nnz = 0;
+		if(_data.isInSparseFormat()) {
+			SparseBlock sb = _data.getSparseBlock();
+			for(int i = 0; i < _data.getNumRows(); i++)
+				if(!sb.isEmpty(i))
+					// widen before multiplying to avoid int overflow
+					nnz += (long) sb.size(i) * counts[i];
+
+		}
+		else {
+			double[] values = _data.getDenseBlockValues();
+			int off = 0;
+			for(int i = 0; i < _data.getNumRows(); i++) {
+				int countThisTuple = 0;
+				for(int j = 0; j < _data.getNumColumns(); j++) {
+					double v = values[off++];
+					if(v != 0)
+						countThisTuple++;
+				}
+				// widen before multiplying to avoid int overflow
+				nnz += (long) countThisTuple * counts[i];
+			}
+		}
+		return nnz;
+	}
+
+	@Override
+	public void addToEntry(Dictionary d, int fr, int to, int nCol) {
+		double[] v = d.getValues();
+		if(_data.isEmpty())
+			return;
+		else if(_data.isInSparseFormat()) {
+			SparseBlock sb = _data.getSparseBlock();
+			if(sb.isEmpty(fr))
+				return;
+			final int apos = sb.pos(fr);
+			final int alen = sb.size(fr) + apos;
+			final int[] aix = sb.indexes(fr);
+			final double[] avals = sb.values(fr);
+			final int offsetTo = nCol * to;
+			for(int j = apos; j < alen; j++) {
+				v[offsetTo + aix[j]] += avals[j];
+			}
+		}
+		else {
+			final int sf = nCol * fr; // start from
+			final int ef = sf + nCol; // end from
+			final double[] thisV = _data.getDenseBlockValues();
+			for(int i = sf, j = nCol * to; i < ef; i++, j++) {
+				v[j] += thisV[i];
+			}
+		}
+	}
+
+	@Override
+	public double[] getTuple(int index, int nCol) {
+		if(_data.isEmpty() || index >= _data.getNumRows())
+			return null;
+		else if(_data.isInSparseFormat()) {
+			SparseBlock sb = _data.getSparseBlock();
+			if(sb.isEmpty(index))
+				return null;
+			double[] tuple = new double[nCol];
+			final int apos = sb.pos(index);
+			final int alen = sb.size(index) + apos;
+			final int[] aix = sb.indexes(index);
+			final double[] avals = sb.values(index);
+			for(int j = apos; j < alen; j++) {
+				tuple[aix[j]] = avals[j];
+			}
+			return tuple;
+		}
+		else {
+			double[] tuple = new double[nCol];
+			double[] values = _data.getDenseBlockValues();
+			int offset = index * nCol;
+			for(int i = 0; i < nCol; i++, offset++)
+				tuple[i] = values[offset];
+			return tuple;
+		}
+	}
+
+	/**
+	 * Subtracts the given tuple from the dictionary using a binary minus with the tuple as a
+	 * {@code 1 x tuple.length} row vector on the right-hand side.
+	 * 
+	 * The tuple array is wrapped directly in a dense block without copying.
+	 * 
+	 * @param tuple The values to subtract.
+	 * @return A new dictionary holding the result.
+	 */
+	@Override
+	public ADictionary subtractTuple(double[] tuple) {
+		DenseBlockFP64 b = new DenseBlockFP64(new int[] {1, tuple.length}, tuple);
+		MatrixBlock rowVector = new MatrixBlock(1, tuple.length, b);
+		// Result has the same shape as the dictionary: rows first, then columns.
+		MatrixBlock res = new MatrixBlock(_data.getNumRows(), _data.getNumColumns(), _data.isInSparseFormat());
+		_data.binaryOperations(new BinaryOperator(Minus.getMinusFnObject()), rowVector, res);
+		return new MatrixBlockDictionary(res);
+	}
+
+	/**
+	 * Returns this dictionary as a {@code MatrixBlockDictionary}; no conversion or copy is made.
+	 * 
+	 * @param nCol Unused by this implementation.
+	 * @return This instance.
+	 */
+	@Override
+	public MatrixBlockDictionary getAsMatrixBlockDictionary(int nCol) {
+		// Simply return this.
+		return this;
+	}
+
+	/**
+	 * Builds a string consisting of a fixed label followed by the string representation of the backing matrix
+	 * block.
+	 * 
+	 * @return The labelled string representation.
+	 */
+	@Override
+	public String toString() {
+		return "MatrixBlock Dictionary :" + _data.toString();
+	}
+
+	@Override
+	public ADictionary scaleTuples(int[] scaling, int nCol) {
+		if(_data.isEmpty()) {
+			throw new NotImplementedException("could return null here? or empty DictionaryMatrixBlock...");
+		}
+		else if(_data.isInSparseFormat()) {
+			MatrixBlock retBlock = new MatrixBlock(_data.getNumRows(), _data.getNumColumns(), true);
+			retBlock.allocateSparseRowsBlock(true);
+			SparseBlock sbRet = retBlock.getSparseBlock();
+			SparseBlock sbThis = _data.getSparseBlock();
+			for(int i = 0; i < _data.getNumRows(); i++) {
+				if(!sbThis.isEmpty(i)) {
+					sbRet.set(i, sbThis.get(i), true);
+
+					final int count = scaling[i];
+					final int apos = sbRet.pos(i);
+					final int alen = sbRet.size(i) + apos;
+					final double[] avals = sbRet.values(i);
+					for(int j = apos; j < alen; j++)
+						avals[j] = count * avals[j];
+				}
+			}
+			retBlock.setNonZeros(_data.getNonZeros());
+			return new MatrixBlockDictionary(retBlock);
+		}
... 4520 lines suppressed ...