You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by ba...@apache.org on 2021/12/13 16:30:47 UTC

[systemds] branch main updated (b9f4686 -> 64af381)

This is an automated email from the ASF dual-hosted git repository.

baunsgaard pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git.


    from b9f4686  [SYSTEMDS-3245] CSR sparse support for transformapply
     new f30dc39  [SYSTEMDS-3247] BinaryCell colVector and rowVector extensions
     new 148092c  [MINOR] Revert processAddRow to not use compressed
     new 64af381  [SYSTEMDS-3226,SYSTEMDS-3246] PFOR column group and PreAgg Cache blocking

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../runtime/compress/CompressedMatrixBlock.java    |  67 +--
 .../compress/CompressedMatrixBlockFactory.java     |   4 -
 .../sysds/runtime/compress/colgroup/AColGroup.java |  75 +--
 .../compress/colgroup/AColGroupCompressed.java     |  37 +-
 .../runtime/compress/colgroup/AColGroupOffset.java |   3 +-
 .../runtime/compress/colgroup/AColGroupValue.java  | 115 +---
 .../compress/colgroup/AMorphingMMColGroup.java     | 101 ++++
 .../sysds/runtime/compress/colgroup/APreAgg.java   | 111 ++--
 .../runtime/compress/colgroup/ColGroupConst.java   |  94 +--
 .../runtime/compress/colgroup/ColGroupDDC.java     |  52 +-
 .../runtime/compress/colgroup/ColGroupEmpty.java   |  44 +-
 .../runtime/compress/colgroup/ColGroupFactory.java | 118 ++--
 .../runtime/compress/colgroup/ColGroupIO.java      |   2 +
 .../runtime/compress/colgroup/ColGroupOLE.java     | 184 +++---
 .../runtime/compress/colgroup/ColGroupPFOR.java    | 384 ++++++++++++
 .../runtime/compress/colgroup/ColGroupRLE.java     | 260 ++++++---
 .../runtime/compress/colgroup/ColGroupSDC.java     | 331 +++++------
 .../compress/colgroup/ColGroupSDCSingle.java       | 223 +++----
 .../compress/colgroup/ColGroupSDCSingleZeros.java  | 399 ++++++++-----
 .../compress/colgroup/ColGroupSDCZeros.java        | 498 +++++++++-------
 .../runtime/compress/colgroup/ColGroupSizes.java   |   4 +-
 .../compress/colgroup/ColGroupUncompressed.java    |  71 +--
 .../runtime/compress/colgroup/ColGroupUtils.java   |  65 +++
 .../compress/colgroup/dictionary/ADictionary.java  | 210 ++++++-
 .../compress/colgroup/dictionary/Dictionary.java   | 316 ++++++++--
 .../colgroup/dictionary/DictionaryFactory.java     |  12 +-
 .../colgroup/dictionary/MatrixBlockDictionary.java | 647 ++++++++++++++++++++-
 .../compress/colgroup/dictionary/QDictionary.java  | 144 ++++-
 .../colgroup/insertionsort/AInsertionSorter.java   |   4 +-
 .../insertionsort/InsertionSorterFactory.java      |   3 +
 .../colgroup/insertionsort/MaterializeSort.java    |   7 +-
 .../compress/colgroup/mapping/AMapToData.java      |  99 +++-
 .../compress/colgroup/mapping/MapToBit.java        |  55 +-
 .../compress/colgroup/mapping/MapToByte.java       | 118 +++-
 .../compress/colgroup/mapping/MapToChar.java       |  79 ++-
 .../compress/colgroup/mapping/MapToFactory.java    |  29 +-
 .../compress/colgroup/mapping/MapToInt.java        |  53 +-
 .../compress/colgroup/offset/AIterator.java        |  52 +-
 .../runtime/compress/colgroup/offset/AOffset.java  | 329 ++++++++++-
 .../compress/colgroup/offset/OffsetByte.java       | 526 ++++++++++++++++-
 .../compress/colgroup/offset/OffsetChar.java       | 249 +++++++-
 .../compress/colgroup/offset/OffsetFactory.java    |  32 +-
 .../sysds/runtime/compress/lib/CLALibAppend.java   |   9 +-
 .../runtime/compress/lib/CLALibBinaryCellOp.java   |  78 ++-
 .../sysds/runtime/compress/lib/CLALibCompAgg.java  |  50 +-
 .../runtime/compress/lib/CLALibDecompress.java     | 209 ++++---
 .../runtime/compress/lib/CLALibLeftMultBy.java     | 272 +++++----
 .../runtime/compress/lib/CLALibRightMultBy.java    |   4 +-
 .../sysds/runtime/compress/lib/CLALibSlice.java    | 104 ++++
 .../sysds/runtime/compress/lib/CLALibUtils.java    | 112 ++--
 .../runtime/matrix/data/LibMatrixBincell.java      |  55 +-
 .../sysds/runtime/matrix/data/MatrixBlock.java     |   9 +-
 src/test/java/org/apache/sysds/test/TestUtils.java |   2 +-
 .../component/compress/CompressedMatrixTest.java   |  12 +
 .../component/compress/CompressedTestBase.java     |  95 ++-
 .../component/compress/ExtendedMatrixTests.java    |  26 +-
 .../test/component/compress/TestConstants.java     |   6 +-
 .../compress/colgroup/JolEstimateRLETest.java      |   7 +-
 .../insertionsort/TestInsertionSorters.java        |  16 +-
 .../compress/mapping/MappingPreAggregateTests.java | 293 +++++++++-
 .../component/compress/mapping/MappingTests.java   |  31 +-
 .../compress/mapping/StandAloneTests.java          |   5 +
 .../compress/offset/OffsetNegativeTests.java       |  92 ---
 .../compress/offset/OffsetSingleTests.java         |  17 +-
 .../compress/offset/OffsetTestPreAggregate.java    | 378 ++++++++++++
 .../compress/offset/OffsetTestPreAggregateBit.java | 128 ++++
 .../offset/OffsetTestPreAggregateByte.java         | 132 +++++
 .../offset/OffsetTestPreAggregateChar.java         | 150 +++++
 .../offset/OffsetTestPreAggregateSparse.java       | 169 ++++++
 ...s.java => OffsetTestPreAggregateSparseBit.java} |  33 +-
 .../offset/OffsetTestPreAggregateSparseByte.java   |  57 ++
 .../offset/OffsetTestPreAggregateSparseChar.java   |  66 +++
 .../component/compress/offset/OffsetTests.java     | 243 +++++++-
 .../offset/OffsetTestsDefaultConstructor.java      | 116 ++++
 .../test/functions/compress/LocalInstruction.java  |   5 +-
 75 files changed, 7044 insertions(+), 2143 deletions(-)
 create mode 100644 src/main/java/org/apache/sysds/runtime/compress/colgroup/AMorphingMMColGroup.java
 create mode 100644 src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPFOR.java
 create mode 100644 src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUtils.java
 create mode 100644 src/main/java/org/apache/sysds/runtime/compress/lib/CLALibSlice.java
 delete mode 100644 src/test/java/org/apache/sysds/test/component/compress/offset/OffsetNegativeTests.java
 create mode 100644 src/test/java/org/apache/sysds/test/component/compress/offset/OffsetTestPreAggregate.java
 create mode 100644 src/test/java/org/apache/sysds/test/component/compress/offset/OffsetTestPreAggregateBit.java
 create mode 100644 src/test/java/org/apache/sysds/test/component/compress/offset/OffsetTestPreAggregateByte.java
 create mode 100644 src/test/java/org/apache/sysds/test/component/compress/offset/OffsetTestPreAggregateChar.java
 create mode 100644 src/test/java/org/apache/sysds/test/component/compress/offset/OffsetTestPreAggregateSparse.java
 copy src/test/java/org/apache/sysds/test/component/compress/offset/{OffsetSingleTests.java => OffsetTestPreAggregateSparseBit.java} (54%)
 create mode 100644 src/test/java/org/apache/sysds/test/component/compress/offset/OffsetTestPreAggregateSparseByte.java
 create mode 100644 src/test/java/org/apache/sysds/test/component/compress/offset/OffsetTestPreAggregateSparseChar.java
 create mode 100644 src/test/java/org/apache/sysds/test/component/compress/offset/OffsetTestsDefaultConstructor.java

[systemds] 01/03: [SYSTEMDS-3247] BinaryCell colVector and rowVector extensions

Posted by ba...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

baunsgaard pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git

commit f30dc391ef76aae7455d24d1f5f78732a58f7fee
Author: baunsgaard <ba...@tugraz.at>
AuthorDate: Mon Dec 13 17:15:27 2021 +0100

    [SYSTEMDS-3247] BinaryCell colVector and rowVector extensions
    
    This commit adds the primitives for colVector and rowVector binary
    cell operations. This is added to support:
    
    y = v / m
    
    currently we support:
    
    y = m / v
---
 .../runtime/matrix/data/LibMatrixBincell.java      | 55 +++++++++++++++++++++-
 1 file changed, 53 insertions(+), 2 deletions(-)

diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixBincell.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixBincell.java
index f949c6c..4651165 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixBincell.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixBincell.java
@@ -78,11 +78,15 @@ public class LibMatrixBincell {
 		MATRIX_MATRIX,
 		MATRIX_COL_VECTOR,
 		MATRIX_ROW_VECTOR,
+		COL_VECTOR_MATRIX,
+		ROW_VECTOR_MATRIX,
 		OUTER_VECTOR_VECTOR,
 		INVALID;
 		public boolean isMatrixVector() {
 			return this == MATRIX_COL_VECTOR
-				|| this == MATRIX_ROW_VECTOR;
+				|| this == MATRIX_ROW_VECTOR
+				|| this == COL_VECTOR_MATRIX
+				|| this == ROW_VECTOR_MATRIX;
 		}
 	}
 	
@@ -357,6 +361,32 @@ public class LibMatrixBincell {
 			return BinaryAccessType.INVALID;
 	}
 
+	public static BinaryAccessType getBinaryAccessTypeExtended(MatrixBlock m1, MatrixBlock m2) {
+		final int rlen1 = m1.rlen;
+		final int rlen2 = m2.rlen;
+		final int clen1 = m1.clen;
+		final int clen2 = m2.clen;
+
+		if(rlen1 == rlen2) {
+			if(clen1 == clen2)
+				return BinaryAccessType.MATRIX_MATRIX;
+			else if(clen1 < clen2)
+				return BinaryAccessType.COL_VECTOR_MATRIX;
+			else
+				return BinaryAccessType.MATRIX_COL_VECTOR;
+		}
+		else if(clen1 == clen2) {
+			if(rlen1 < rlen2)
+				return BinaryAccessType.ROW_VECTOR_MATRIX;
+			else
+				return BinaryAccessType.MATRIX_ROW_VECTOR;
+		}
+		else if(clen1 == 1 && rlen2 == 1)
+			return BinaryAccessType.OUTER_VECTOR_VECTOR;
+		else
+			return BinaryAccessType.INVALID;
+	}
+
 	public static void isValidDimensionsBinary(MatrixBlock m1, MatrixBlock m2)
 	{
 		final int rlen1 = m1.rlen;
@@ -369,7 +399,7 @@ public class LibMatrixBincell {
 		//2) MV operations w/ V either being a right-hand-side column or row vector 
 		//  (where one dimension needs to match and the other dimension is 1)
 		//3) VV outer vector operations w/ a common dimension of 1 
-		boolean isValid = (   (rlen1 == rlen2 && clen1==clen2)            //MM 
+		boolean isValid = (   (rlen1 == rlen2 && clen1==clen2)        //MM 
 							|| (rlen1 == rlen2 && clen1 > 1 && clen2 == 1) //MVc
 							|| (clen1 == clen2 && rlen1 > 1 && rlen2 == 1) //MVr
 							|| (clen1 == 1 && rlen2 == 1 ) );              //VV
@@ -380,6 +410,27 @@ public class LibMatrixBincell {
 		}
 	}
 
+	public static void isValidDimensionsBinaryExtended(MatrixBlock m1, MatrixBlock m2) {
+		final int rlen1 = m1.rlen;
+		final int clen1 = m1.clen;
+		final int rlen2 = m2.rlen;
+		final int clen2 = m2.clen;
+
+		// Added extra 2 options
+		// 2a) VM operations with V either being a left-hand-side column or row vector.
+		boolean isValid = ((rlen1 == rlen2 && clen1 == clen2) // MM
+			|| (rlen1 == rlen2 && clen1 > 1 && clen2 == 1) // MVc
+			|| (rlen1 == rlen2 && clen1 == 1 && clen2 > 1) // VMc
+			|| (clen1 == clen2 && rlen1 > 1 && rlen2 == 1) // MVr
+			|| (clen1 == clen2 && rlen1 == 1 && rlen2 > 1) // VMr
+			|| (clen1 == 1 && rlen2 == 1)); // VV
+
+		if(!isValid) {
+			throw new RuntimeException("Block sizes are not matched for binary " + "cell operations: " + rlen1 + "x"
+				+ clen1 + " vs " + rlen2 + "x" + clen2);
+		}
+	}
+
 	public static boolean isSparseSafeDivide(BinaryOperator op, MatrixBlock rhs)
 	{
 		//if rhs is fully dense, there cannot be a /0 and hence DIV becomes sparse safe

[systemds] 03/03: [SYSTEMDS-3226, SYSTEMDS-3246] PFOR column group and PreAgg Cache blocking

Posted by ba...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

baunsgaard pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git

commit 64af38175350880262b54c7d35ca8bbbb5419ace
Author: baunsgaard <ba...@tugraz.at>
AuthorDate: Sun Nov 21 18:16:08 2021 +0100

    [SYSTEMDS-3226,SYSTEMDS-3246] PFOR column group and PreAgg Cache blocking
    
    This commit adds PFOR (Patched frame of reference) column group.
    This new group builds upon the SDC group with a simple reference
    tuple offset for each dictionary entry.
    
    Also contained is optimizations for Left Matrix Multiplication to allow
    further cache friendly behavior while collecting the preAggregates for
    multiplication. (L1 Cache miss down from 60% to 10-20%)
    
    This commit also introduces the concept of morphing between column groups;
    dense SDC, or PFOR groups now transform into SDCZero groups before many
    operations for efficient execution.
    
    Closes #1464
---
 .../runtime/compress/CompressedMatrixBlock.java    |  67 +--
 .../compress/CompressedMatrixBlockFactory.java     |   4 -
 .../sysds/runtime/compress/colgroup/AColGroup.java |  75 +--
 .../compress/colgroup/AColGroupCompressed.java     |  37 +-
 .../runtime/compress/colgroup/AColGroupOffset.java |   3 +-
 .../runtime/compress/colgroup/AColGroupValue.java  | 115 +---
 .../compress/colgroup/AMorphingMMColGroup.java     | 101 ++++
 .../sysds/runtime/compress/colgroup/APreAgg.java   | 111 ++--
 .../runtime/compress/colgroup/ColGroupConst.java   |  94 +--
 .../runtime/compress/colgroup/ColGroupDDC.java     |  52 +-
 .../runtime/compress/colgroup/ColGroupEmpty.java   |  44 +-
 .../runtime/compress/colgroup/ColGroupFactory.java | 118 ++--
 .../runtime/compress/colgroup/ColGroupIO.java      |   2 +
 .../runtime/compress/colgroup/ColGroupOLE.java     | 184 +++---
 .../runtime/compress/colgroup/ColGroupPFOR.java    | 384 ++++++++++++
 .../runtime/compress/colgroup/ColGroupRLE.java     | 260 ++++++---
 .../runtime/compress/colgroup/ColGroupSDC.java     | 331 +++++------
 .../compress/colgroup/ColGroupSDCSingle.java       | 223 +++----
 .../compress/colgroup/ColGroupSDCSingleZeros.java  | 399 ++++++++-----
 .../compress/colgroup/ColGroupSDCZeros.java        | 498 +++++++++-------
 .../runtime/compress/colgroup/ColGroupSizes.java   |   4 +-
 .../compress/colgroup/ColGroupUncompressed.java    |  71 +--
 .../runtime/compress/colgroup/ColGroupUtils.java   |  65 +++
 .../compress/colgroup/dictionary/ADictionary.java  | 210 ++++++-
 .../compress/colgroup/dictionary/Dictionary.java   | 316 ++++++++--
 .../colgroup/dictionary/DictionaryFactory.java     |  12 +-
 .../colgroup/dictionary/MatrixBlockDictionary.java | 647 ++++++++++++++++++++-
 .../compress/colgroup/dictionary/QDictionary.java  | 144 ++++-
 .../colgroup/insertionsort/AInsertionSorter.java   |   4 +-
 .../insertionsort/InsertionSorterFactory.java      |   3 +
 .../colgroup/insertionsort/MaterializeSort.java    |   7 +-
 .../compress/colgroup/mapping/AMapToData.java      |  99 +++-
 .../compress/colgroup/mapping/MapToBit.java        |  55 +-
 .../compress/colgroup/mapping/MapToByte.java       | 118 +++-
 .../compress/colgroup/mapping/MapToChar.java       |  79 ++-
 .../compress/colgroup/mapping/MapToFactory.java    |  29 +-
 .../compress/colgroup/mapping/MapToInt.java        |  53 +-
 .../compress/colgroup/offset/AIterator.java        |  52 +-
 .../runtime/compress/colgroup/offset/AOffset.java  | 329 ++++++++++-
 .../compress/colgroup/offset/OffsetByte.java       | 526 ++++++++++++++++-
 .../compress/colgroup/offset/OffsetChar.java       | 249 +++++++-
 .../compress/colgroup/offset/OffsetFactory.java    |  32 +-
 .../sysds/runtime/compress/lib/CLALibAppend.java   |   9 +-
 .../runtime/compress/lib/CLALibBinaryCellOp.java   |  78 ++-
 .../sysds/runtime/compress/lib/CLALibCompAgg.java  |  50 +-
 .../runtime/compress/lib/CLALibDecompress.java     | 209 ++++---
 .../runtime/compress/lib/CLALibLeftMultBy.java     | 272 +++++----
 .../runtime/compress/lib/CLALibRightMultBy.java    |   4 +-
 .../sysds/runtime/compress/lib/CLALibSlice.java    | 104 ++++
 .../sysds/runtime/compress/lib/CLALibUtils.java    | 112 ++--
 .../component/compress/CompressedMatrixTest.java   |  12 +
 .../component/compress/CompressedTestBase.java     |  95 ++-
 .../component/compress/ExtendedMatrixTests.java    |  26 +-
 .../test/component/compress/TestConstants.java     |   6 +-
 .../compress/colgroup/JolEstimateRLETest.java      |   7 +-
 .../insertionsort/TestInsertionSorters.java        |  16 +-
 .../compress/mapping/MappingPreAggregateTests.java | 293 +++++++++-
 .../component/compress/mapping/MappingTests.java   |  31 +-
 .../compress/mapping/StandAloneTests.java          |   5 +
 .../compress/offset/OffsetNegativeTests.java       |  92 ---
 .../compress/offset/OffsetSingleTests.java         |  17 +-
 .../compress/offset/OffsetTestPreAggregate.java    | 378 ++++++++++++
 .../compress/offset/OffsetTestPreAggregateBit.java | 128 ++++
 .../offset/OffsetTestPreAggregateByte.java         | 132 +++++
 .../offset/OffsetTestPreAggregateChar.java         | 150 +++++
 .../offset/OffsetTestPreAggregateSparse.java       | 169 ++++++
 ...s.java => OffsetTestPreAggregateSparseBit.java} |  33 +-
 .../offset/OffsetTestPreAggregateSparseByte.java   |  57 ++
 .../offset/OffsetTestPreAggregateSparseChar.java   |  66 +++
 .../component/compress/offset/OffsetTests.java     | 243 +++++++-
 .../offset/OffsetTestsDefaultConstructor.java      | 116 ++++
 .../test/functions/compress/LocalInstruction.java  |   5 +-
 72 files changed, 6986 insertions(+), 2135 deletions(-)

diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java
index c8bdd0a..38af860 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java
@@ -55,8 +55,10 @@ import org.apache.sysds.runtime.compress.lib.CLALibLeftMultBy;
 import org.apache.sysds.runtime.compress.lib.CLALibReExpand;
 import org.apache.sysds.runtime.compress.lib.CLALibRightMultBy;
 import org.apache.sysds.runtime.compress.lib.CLALibScalar;
+import org.apache.sysds.runtime.compress.lib.CLALibSlice;
 import org.apache.sysds.runtime.compress.lib.CLALibSquash;
 import org.apache.sysds.runtime.compress.lib.CLALibUnary;
+import org.apache.sysds.runtime.compress.lib.CLALibUtils;
 import org.apache.sysds.runtime.controlprogram.caching.CacheBlock;
 import org.apache.sysds.runtime.controlprogram.caching.MatrixObject.UpdateType;
 import org.apache.sysds.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
@@ -658,13 +660,6 @@ public class CompressedMatrixBlock extends MatrixBlock {
 		return tmp.reorgOperations(op, ret, startRow, startColumn, length);
 	}
 
-	public ColGroupUncompressed getUncompressedColGroup() {
-		for(AColGroup grp : _colGroups)
-			if(grp instanceof ColGroupUncompressed)
-				return (ColGroupUncompressed) grp;
-		return null;
-	}
-
 	@Override
 	public String toString() {
 		StringBuilder sb = new StringBuilder();
@@ -691,61 +686,14 @@ public class CompressedMatrixBlock extends MatrixBlock {
 	@Override
 	public MatrixBlock slice(int rl, int ru, int cl, int cu, boolean deep, CacheBlock ret) {
 		validateSliceArgument(rl, ru, cl, cu);
-		MatrixBlock tmp;
-		if(rl == ru && cl == cu) {
-			// get a single index, and return in a matrixBlock
-			tmp = new MatrixBlock(1, 1, 0);
-			tmp.appendValue(0, 0, getValue(rl, cl));
-			return tmp;
-		}
-		else if(rl == 0 && ru == getNumRows() - 1) {
-			tmp = sliceColumns(cl, cu);
-			tmp.recomputeNonZeros();
-			return tmp;
-		}
-		else if(cl == 0 && cu == getNumColumns() - 1) {
-			// Row Slice. Potential optimization if the slice contains enough rows.
-			// +1 since the implementation arguments for slice is inclusive values for ru
-			// and cu. It is not inclusive in decompression, and construction of MatrixBlock.
-			tmp = new MatrixBlock(ru + 1 - rl, getNumColumns(), false).allocateDenseBlock();
-			for(AColGroup g : getColGroups())
-				g.decompressToBlock(tmp, rl, ru + 1, -rl, 0);
-			tmp.recomputeNonZeros();
-			tmp.examSparsity();
-			return tmp;
-		}
-		else {
-			// In the case where an internal matrix is sliced out, then first slice out the
-			// columns to an compressed intermediate.
-			tmp = sliceColumns(cl, cu);
-			// Then call slice recursively, to do the row slice.
-			// Since we do not copy the index structure but simply maintain a pointer to the
-			// original this is fine.
-			tmp = tmp.slice(rl, ru, 0, tmp.getNumColumns() - 1, ret);
-			return tmp;
-		}
-	}
-
-	private CompressedMatrixBlock sliceColumns(int cl, int cu) {
-		CompressedMatrixBlock ret = new CompressedMatrixBlock(this.getNumRows(), cu + 1 - cl);
-		List<AColGroup> newColGroups = new ArrayList<>();
-		for(AColGroup grp : getColGroups()) {
-			AColGroup slice = grp.sliceColumns(cl, cu + 1);
-			if(slice != null)
-				newColGroups.add(slice);
-		}
-		ret.allocateColGroupList(newColGroups);
-		ret.recomputeNonZeros();
-		ret.overlappingColGroups = this.isOverlapping();
-		return ret;
+		return CLALibSlice.slice(this, rl, ru, cl, cu, deep);
 	}
 
 	@Override
 	public void slice(ArrayList<IndexedMatrixValue> outlist, IndexRange range, int rowCut, int colCut, int blen,
 		int boundaryRlen, int boundaryClen) {
-		printDecompressWarning(
+		MatrixBlock tmp = getUncompressed(
 			"slice for distribution to spark. (Could be implemented such that it does not decompress)");
-		MatrixBlock tmp = getUncompressed();
 		tmp.slice(outlist, range, rowCut, colCut, blen, boundaryRlen, boundaryClen);
 	}
 
@@ -1359,7 +1307,12 @@ public class CompressedMatrixBlock extends MatrixBlock {
 
 	@Override
 	public void compactEmptyBlock() {
-		// do nothing
+		if(isEmptyBlock(false)) {
+			cleanupBlock(true, true);
+			CLALibUtils.combineConstColumns(this);
+			overlappingColGroups = false;
+			decompressedVersion = null;
+		}
 	}
 
 	@Override
diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java
index 42ea6a7..97f6f09 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java
@@ -250,10 +250,6 @@ public class CompressedMatrixBlockFactory {
 		if(res == null)
 			return abortCompression();
 
-		if(compSettings.isInSparkInstruction) {
-			// clear soft reference to uncompressed block in case of spark.
-			res.clearSoftReferenceToDecompressed();
-		}
 		return new ImmutablePair<>(res, _stats);
 	}
 
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java
index 27a29cb..af4d757 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java
@@ -48,7 +48,7 @@ public abstract class AColGroup implements Serializable {
 
 	/** Public super types of compression ColGroups supported */
 	public enum CompressionType {
-		UNCOMPRESSED, RLE, OLE, DDC, CONST, EMPTY, SDC
+		UNCOMPRESSED, RLE, OLE, DDC, CONST, EMPTY, SDC, PFOR,
 	}
 
 	/**
@@ -57,7 +57,7 @@ public abstract class AColGroup implements Serializable {
 	 * Protected such that outside the ColGroup package it should be unknown which specific subtype is used.
 	 */
 	protected enum ColGroupType {
-		UNCOMPRESSED, RLE, OLE, DDC, CONST, EMPTY, SDC, SDCSingle, SDCSingleZeros, SDCZeros;
+		UNCOMPRESSED, RLE, OLE, DDC, CONST, EMPTY, SDC, SDCSingle, SDCSingleZeros, SDCZeros, PFOR;
 	}
 
 	/** The ColGroup Indexes contained in the ColGroup */
@@ -132,14 +132,27 @@ public abstract class AColGroup implements Serializable {
 	}
 
 	/**
-	 * Decompress the contents of the column group into the target matrix,.
+	 * Decompress a range of rows into a sparse block
 	 * 
-	 * @param target A matrix block where the columns covered by this column group have not yet been filled in.
-	 * @param rl     Row to start decompression from
-	 * @param ru     Row to end decompression at (not inclusive)
+	 * Note that this is using append, so the sparse column indexes need to be sorted afterwards.
+	 * 
+	 * @param sb Sparse Target block
+	 * @param rl Row to start at
+	 * @param ru Row to end at
+	 */
+	public final void decompressToSparseBlock(SparseBlock sb, int rl, int ru) {
+		decompressToSparseBlock(sb, rl, ru, 0, 0);
+	}
+
+	/**
+	 * Decompress a range of rows into a dense block
+	 * 
+	 * @param db Sparse Target block
+	 * @param rl Row to start at
+	 * @param ru Row to end at
 	 */
-	public final void decompressToBlock(MatrixBlock target, int rl, int ru) {
-		decompressToBlock(target, rl, ru, 0, 0);
+	public final void decompressToDenseBlock(DenseBlock db, int rl, int ru) {
+		decompressToDenseBlock(db, rl, ru, 0, 0);
 	}
 
 	/**
@@ -326,33 +339,29 @@ public abstract class AColGroup implements Serializable {
 	protected abstract ColGroupType getColGroupType();
 
 	/**
-	 * Decompress the contents of the column group without counting non zeros
+	 * Decompress into the DenseBlock. (no NNZ handling)
 	 * 
-	 * The offsets helps us decompress into specific target areas of the output matrix.
-	 * 
-	 * If OffR and OffC is 0, then decompression output starts at row offset equal to rl,
+	 * @param db   Target DenseBlock
+	 * @param rl   Row to start decompression from
+	 * @param ru   Row to end decompression at
+	 * @param offR Row offset into the target to decompress
+	 * @param offC Column offset into the target to decompress
+	 */
+	public abstract void decompressToDenseBlock(DenseBlock db, int rl, int ru, int offR, int offC);
+
+	/**
+	 * Decompress into the SparseBlock. (no NNZ handling)
 	 * 
-	 * If for instance a MiniBatch of rows 10 to 15, then target would be 5 rows high and arguments would look like:
-	 *
-	 * cg.decompressToBlock(target, 10, 15, -10, 0)
+	 * Note this method is allowing to calls to append since it is assumed that the sparse column indexes are sorted
+	 * afterwards
 	 * 
-	 * @param target a matrix block where the columns covered by this column group have not yet been filled in.
-	 * @param rl     Row to start decompression at.
-	 * @param ru     Row to end decompression at (not inclusive).
-	 * @param offR   RowOffset into target to assign from.
-	 * @param offC   ColumnOffset into the target matrix to assign from.
+	 * @param sb   Target SparseBlock
+	 * @param rl   Row to start decompression from
+	 * @param ru   Row to end decompression at
+	 * @param offR Row offset into the target to decompress
+	 * @param offC Column offset into the target to decompress
 	 */
-	public final void decompressToBlock(MatrixBlock target, int rl, int ru, int offR, int offC){
-		if(target.isInSparseFormat())
-			decompressToSparseBlock(target.getSparseBlock(), rl, ru, offR, offC);
-		else
-			decompressToDenseBlock(target.getDenseBlock(), rl, ru, offR, offC);
-	}
-
-
-	protected abstract void decompressToDenseBlock(DenseBlock db, int rl, int ru,int offR, int offC);
-
-	protected abstract void decompressToSparseBlock(SparseBlock sb, int rl, int ru, int offR, int offC);
+	public abstract void decompressToSparseBlock(SparseBlock sb, int rl, int ru, int offR, int offC);
 
 	/**
 	 * Right matrix multiplication with this column group.
@@ -536,9 +545,9 @@ public abstract class AColGroup implements Serializable {
 	@Override
 	public String toString() {
 		StringBuilder sb = new StringBuilder();
-		sb.append(" ColGroupType: ");
+		sb.append(String.format("\n\n%15s", "ColGroupType: "));
 		sb.append(this.getClass().getSimpleName());
-		sb.append(String.format("\n%15s%5d ", "Columns:", _colIndexes.length));
+		sb.append(String.format("\n%15s", "Columns: "));
 		sb.append(Arrays.toString(_colIndexes));
 
 		return sb.toString();
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroupCompressed.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroupCompressed.java
index 106a2df..90cd5c9 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroupCompressed.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroupCompressed.java
@@ -55,11 +55,15 @@ public abstract class AColGroupCompressed extends AColGroup {
 
 	protected abstract void computeColMxx(double[] c, Builtin builtin);
 
-	protected abstract void computeSum(double[] c, int nRows, boolean square);
+	protected abstract void computeSum(double[] c, int nRows);
 
-	protected abstract void computeRowSums(double[] c, boolean square, int rl, int ru);
+	protected abstract void computeRowSums(double[] c, int rl, int ru);
 
-	protected abstract void computeColSums(double[] c, int nRows, boolean square);
+	protected abstract void computeSumSq(double[] c, int nRows);
+
+	protected abstract void computeRowSumsSq(double[] c, int rl, int ru);
+
+	protected abstract void computeColSumsSq(double[] c, int nRows);
 
 	protected abstract void computeRowMxx(double[] c, Builtin builtin, int rl, int ru);
 
@@ -80,21 +84,26 @@ public abstract class AColGroupCompressed extends AColGroup {
 	}
 
 	@Override
-	public void computeColSums(double[] c, int nRows) {
-		computeColSums(c, nRows, false);
-	}
-
-	@Override
 	public final void unaryAggregateOperations(AggregateUnaryOperator op, double[] c, int nRows, int rl, int ru) {
 		final ValueFunction fn = op.aggOp.increOp.fn;
 		if(fn instanceof Plus || fn instanceof KahanPlus || fn instanceof KahanPlusSq) {
 			boolean square = fn instanceof KahanPlusSq;
-			if(op.indexFn instanceof ReduceAll)
-				computeSum(c, nRows, square);
-			else if(op.indexFn instanceof ReduceCol)
-				computeRowSums(c, square, rl, ru);
-			else if(op.indexFn instanceof ReduceRow)
-				computeColSums(c, nRows, square);
+			if(square){
+				if(op.indexFn instanceof ReduceAll)
+					computeSumSq(c, nRows);
+				else if(op.indexFn instanceof ReduceCol)
+					computeRowSumsSq(c, rl, ru);
+				else if(op.indexFn instanceof ReduceRow)
+					computeColSumsSq(c, nRows);
+			}
+			else{
+				if(op.indexFn instanceof ReduceAll)
+					computeSum(c, nRows);
+				else if(op.indexFn instanceof ReduceCol)
+					computeRowSums(c, rl, ru);
+				else if(op.indexFn instanceof ReduceRow)
+					computeColSums(c, nRows);
+			}
 		}
 		else if(fn instanceof Multiply) {
 			if(op.indexFn instanceof ReduceAll)
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroupOffset.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroupOffset.java
index 25b5839..687dbad 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroupOffset.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroupOffset.java
@@ -177,13 +177,14 @@ public abstract class AColGroupOffset extends AColGroupValue {
 	}
 
 	protected abstract boolean[] computeZeroIndicatorVector();
+
 	public abstract void countNonZerosPerRow(int[] rnnz, int rl, int ru);
 
 	@Override
 	public String toString() {
 		StringBuilder sb = new StringBuilder();
 		sb.append(super.toString());
-		sb.append(String.format("\n%15s%5d ", "Pointers:", this._ptr.length));
+		sb.append(String.format("\n%15s%5d", "Pointers:", this._ptr.length));
 		sb.append(Arrays.toString(this._ptr));
 		return sb.toString();
 	}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroupValue.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroupValue.java
index 067fa6f..b38bae1 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroupValue.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroupValue.java
@@ -36,9 +36,7 @@ import org.apache.sysds.runtime.compress.colgroup.dictionary.MatrixBlockDictiona
 import org.apache.sysds.runtime.data.DenseBlock;
 import org.apache.sysds.runtime.data.SparseBlock;
 import org.apache.sysds.runtime.functionobjects.Builtin;
-import org.apache.sysds.runtime.matrix.data.LibMatrixMult;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
-import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
 
 /**
  * Base class for column groups encoded with value dictionary. This include column groups such as DDC OLE and RLE.
@@ -171,7 +169,7 @@ public abstract class AColGroupValue extends AColGroupCompressed implements Clon
 		double[] values);
 
 	@Override
-	public final int getNumValues() {
+	public int getNumValues() {
 		return _dict.getNumberOfValues(_colIndexes.length);
 	}
 
@@ -286,15 +284,14 @@ public abstract class AColGroupValue extends AColGroupCompressed implements Clon
 	}
 
 	@Override
-	protected final double computeMxx(double c, Builtin builtin) {
+	protected double computeMxx(double c, Builtin builtin) {
 		if(_zeros)
 			c = builtin.execute(c, 0);
 		return _dict.aggregate(c, builtin);
-
 	}
 
 	@Override
-	protected final void computeColMxx(double[] c, Builtin builtin) {
+	protected void computeColMxx(double[] c, Builtin builtin) {
 		if(_zeros)
 			for(int x = 0; x < _colIndexes.length; x++)
 				c[_colIndexes[x]] = builtin.execute(c[_colIndexes[x]], 0);
@@ -302,40 +299,6 @@ public abstract class AColGroupValue extends AColGroupCompressed implements Clon
 		_dict.aggregateCols(c, builtin, _colIndexes);
 	}
 
-	/**
-	 * Method for use by subclasses. Applies a scalar operation to the value metadata stored in the dictionary.
-	 * 
-	 * @param op scalar operation to perform
-	 * @return transformed copy of value metadata for this column group
-	 */
-	protected final ADictionary applyScalarOp(ScalarOperator op) {
-		return _dict.clone().inplaceScalarOp(op);
-	}
-
-	/**
-	 * Method for use by subclasses. Applies a scalar operation to the value metadata stored in the dictionary. This
-	 * specific method is used in cases where an new entry is to be added in the dictionary.
-	 * 
-	 * Method should only be called if the newVal is not 0! Also the newVal should already have the operator applied.
-	 * 
-	 * @param op      The Operator to apply to the underlying data.
-	 * @param newVal  The new Value to append to the underlying data.
-	 * @param numCols The number of columns in the ColGroup, to specify how many copies of the newVal should be appended.
-	 * @return The new Dictionary containing the values.
-	 */
-	protected final ADictionary applyScalarOp(ScalarOperator op, double newVal, int numCols) {
-		return _dict.applyScalarOp(op, newVal, numCols);
-	}
-
-	protected static double[] allocDVector(int len, boolean reset) {
-		return new double[len];
-	}
-
-	protected static int[] allocIVector(int len, boolean reset) {
-		LOG.error("deprecated allocIVector");
-		return new int[len + 1];
-	}
-
 	@Override
 	public void readFields(DataInput in) throws IOException {
 		super.readFields(in);
@@ -362,16 +325,23 @@ public abstract class AColGroupValue extends AColGroupCompressed implements Clon
 	public abstract int[] getCounts(int[] out);
 
 	@Override
-	protected final void computeSum(double[] c, int nRows, boolean square) {
-		if(square)
-			c[0] += _dict.sumsq(getCounts(), _colIndexes.length);
-		else
-			c[0] += _dict.sum(getCounts(), _colIndexes.length);
+	protected void computeSum(double[] c, int nRows) {
+		c[0] += _dict.sum(getCounts(), _colIndexes.length);
+	}
+
+	@Override
+	public void computeColSums(double[] c, int nRows) {
+		_dict.colSum(c, getCounts(), _colIndexes);
+	}
+
+	@Override
+	protected void computeSumSq(double[] c, int nRows) {
+		c[0] += _dict.sumSq(getCounts(), _colIndexes.length);
 	}
 
 	@Override
-	protected final void computeColSums(double[] c, int nRows, boolean square) {
-		_dict.colSum(c, getCounts(), _colIndexes, square);
+	protected void computeColSumsSq(double[] c, int nRows) {
+		_dict.colSumSq(c, getCounts(), _colIndexes);
 	}
 
 	@Override
@@ -425,7 +395,7 @@ public abstract class AColGroupValue extends AColGroupCompressed implements Clon
 	}
 
 	@Override
-	protected final AColGroup sliceSingleColumn(int idx) {
+	protected AColGroup sliceSingleColumn(int idx) {
 		final AColGroupValue ret = (AColGroupValue) copy();
 		ret._colIndexes = new int[] {0};
 		if(_colIndexes.length == 1)
@@ -437,7 +407,7 @@ public abstract class AColGroupValue extends AColGroupCompressed implements Clon
 	}
 
 	@Override
-	protected final AColGroup sliceMultiColumns(int idStart, int idEnd, int[] outputCols) {
+	protected AColGroup sliceMultiColumns(int idStart, int idEnd, int[] outputCols) {
 		final AColGroupValue ret = (AColGroupValue) copy();
 		ret._dict = ret._dict.sliceOutColumnRange(idStart, idEnd, _colIndexes.length);
 		ret._colIndexes = outputCols;
@@ -445,63 +415,27 @@ public abstract class AColGroupValue extends AColGroupCompressed implements Clon
 	}
 
 	@Override
-	protected final void tsmm(double[] result, int numColumns, int nRows) {
+	protected void tsmm(double[] result, int numColumns, int nRows) {
 		final int[] counts = getCounts();
 		tsmm(result, numColumns, counts, _dict, _colIndexes);
 	}
 
 	@Override
-	public final boolean containsValue(double pattern) {
+	public boolean containsValue(double pattern) {
 		if(pattern == 0 && _zeros)
 			return true;
 		return _dict.containsValue(pattern);
 	}
 
 	@Override
-	public final long getNumberNonZeros(int nRows) {
+	public long getNumberNonZeros(int nRows) {
 		int[] counts = getCounts();
 		return _dict.getNumberNonZeros(counts, _colIndexes.length);
 	}
 
-	public final MatrixBlock leftMultByPreAggregateMatrix(MatrixBlock preAgg, MatrixBlock tmpRes) {
-		// Get dictionary.
-		MatrixBlock dictM = forceMatrixBlockDictionary().getMatrixBlock();
-		LibMatrixMult.matrixMult(preAgg, dictM, tmpRes);
-		return tmpRes;
-	}
-
-	private MatrixBlockDictionary forceMatrixBlockDictionary() {
+	public synchronized void forceMatrixBlockDictionary() {
 		if(!(_dict instanceof MatrixBlockDictionary))
 			_dict = _dict.getMBDict(_colIndexes.length);
-		return((MatrixBlockDictionary) _dict);
-	}
-
-	public final void addMatrixToResult(MatrixBlock tmp, MatrixBlock result, int rl, int ru) {
-		if(tmp.isEmpty())
-			return;
-		final double[] retV = result.getDenseBlockValues();
-		final int nColRet = result.getNumColumns();
-		if(tmp.isInSparseFormat()) {
-			final SparseBlock sb = tmp.getSparseBlock();
-			for(int row = rl, offT = 0; row < ru; row++, offT++) {
-				final int apos = sb.pos(offT);
-				final int alen = sb.size(offT);
-				final int[] aix = sb.indexes(offT);
-				final double[] avals = sb.values(offT);
-				final int offR = row * nColRet;
-				for(int i = apos; i < apos + alen; i++)
-					retV[offR + _colIndexes[aix[i]]] += avals[i];
-			}
-		}
-		else {
-			final double[] tmpV = tmp.getDenseBlockValues();
-			final int nCol = _colIndexes.length;
-			for(int row = rl, offT = 0; row < ru; row++, offT += nCol) {
-				final int offR = row * nColRet;
-				for(int col = 0; col < nCol; col++)
-					retV[offR + _colIndexes[col]] += tmpV[offT + col];
-			}
-		}
 	}
 
 	@Override
@@ -553,9 +487,8 @@ public abstract class AColGroupValue extends AColGroupCompressed implements Clon
 	@Override
 	public String toString() {
 		StringBuilder sb = new StringBuilder();
-		sb.append(" Is Lossy: " + _dict.isLossy() + " num Rows: " + _numRows + " contain zero row:" + _zeros);
 		sb.append(super.toString());
-		sb.append(String.format("\n%15s ", "Values: " + _dict.getClass().getSimpleName()));
+		sb.append(String.format("\n%15s%s", "Values: " , _dict.getClass().getSimpleName()));
 		sb.append(_dict.getString(_colIndexes.length));
 		return sb.toString();
 	}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AMorphingMMColGroup.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AMorphingMMColGroup.java
new file mode 100644
index 0000000..26c055d
--- /dev/null
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AMorphingMMColGroup.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.compress.colgroup;
+
+import org.apache.sysds.runtime.compress.DMLCompressionException;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary;
+import org.apache.sysds.runtime.data.DenseBlock;
+import org.apache.sysds.runtime.data.SparseBlock;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+
+/**
+ * Abstract class for column group types that do not perform matrix Multiplication, and decompression for performance
+ * reasons but instead transforms into another type of column group type to perform that operation.
+ */
+public abstract class AMorphingMMColGroup extends AColGroupValue {
+
+	/**
+	 * Constructor for serialization
+	 * 
+	 * @param numRows Number of rows contained
+	 */
+	protected AMorphingMMColGroup(int numRows) {
+		super(numRows);
+	}
+
+	/**
+	 * A Abstract class for column groups that contain ADictionary for values.
+	 * 
+	 * @param colIndices   The Column indexes
+	 * @param numRows      The number of rows contained in this group
+	 * @param dict         The dictionary to contain the distinct tuples
+	 * @param cachedCounts The cached counts of the distinct tuples (can be null since it should be possible to
+	 *                     reconstruct the counts on demand)
+	 */
+	protected AMorphingMMColGroup(int[] colIndices, int numRows, ADictionary dict, int[] cachedCounts) {
+		super(colIndices, numRows, dict, cachedCounts);
+	}
+
+	@Override
+	protected final void decompressToDenseBlockSparseDictionary(DenseBlock db, int rl, int ru, int offR, int offC,
+		SparseBlock sb) {
+		throw new DMLCompressionException("This method should never be called");
+	}
+
+	@Override
+	protected final void decompressToDenseBlockDenseDictionary(DenseBlock db, int rl, int ru, int offR, int offC,
+		double[] values) {
+		throw new DMLCompressionException("This method should never be called");
+	}
+
+	@Override
+	protected final void decompressToSparseBlockSparseDictionary(SparseBlock ret, int rl, int ru, int offR, int offC,
+		SparseBlock sb) {
+		throw new DMLCompressionException("This method should never be called");
+	}
+
+	@Override
+	protected final void decompressToSparseBlockDenseDictionary(SparseBlock ret, int rl, int ru, int offR, int offC,
+		double[] values) {
+		throw new DMLCompressionException("This method should never be called");
+	}
+
+	@Override
+	public final void leftMultByMatrix(MatrixBlock matrix, MatrixBlock result, int rl, int ru) {
+		throw new DMLCompressionException("This method should never be called");
+	}
+
+	@Override
+	public final void leftMultByAColGroup(AColGroup lhs, MatrixBlock result) {
+		throw new DMLCompressionException("This method should never be called");
+	}
+
+	@Override
+	public final void tsmmAColGroup(AColGroup other, MatrixBlock result) {
+		throw new DMLCompressionException("This method should never be called");
+	}
+
+	@Override
+	protected final void tsmm(double[] result, int numColumns, int nRows) {
+		throw new DMLCompressionException("This method should never be called");
+	}
+
+	public abstract AColGroup extractCommon(double[] constV);
+}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/APreAgg.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/APreAgg.java
index 2a15a21..b65eaad 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/APreAgg.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/APreAgg.java
@@ -24,10 +24,10 @@ import org.apache.sysds.runtime.compress.DMLCompressionException;
 import org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary;
 import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary;
 import org.apache.sysds.runtime.compress.colgroup.dictionary.MatrixBlockDictionary;
+import org.apache.sysds.runtime.compress.lib.CLALibLeftMultBy;
 import org.apache.sysds.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
-import org.apache.sysds.runtime.data.DenseBlock;
-import org.apache.sysds.runtime.data.DenseBlockFP64;
 import org.apache.sysds.runtime.data.SparseBlock;
+import org.apache.sysds.runtime.matrix.data.LibMatrixMult;
 import org.apache.sysds.runtime.matrix.data.LibMatrixReorg;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 
@@ -38,13 +38,6 @@ public abstract class APreAgg extends AColGroupValue {
 
 	private static final long serialVersionUID = 3250955207277128281L;
 
-	private static ThreadLocal<double[]> tmpLeftMultDoubleArray = new ThreadLocal<double[]>() {
-		@Override
-		protected double[] initialValue() {
-			return null;
-		}
-	};
-
 	/**
 	 * Constructor for serialization
 	 * 
@@ -87,6 +80,7 @@ public abstract class APreAgg extends AColGroupValue {
 		else if(lhs instanceof APreAgg)
 			leftMultByColGroupValue((APreAgg) lhs, result);
 		else if(lhs instanceof ColGroupUncompressed)
+			// throw new NotImplementedException();
 			leftMultByUncompressedColGroup((ColGroupUncompressed) lhs, result);
 		else
 			throw new DMLCompressionException(
@@ -103,15 +97,21 @@ public abstract class APreAgg extends AColGroupValue {
 	 */
 	@Override
 	public final void leftMultByMatrix(MatrixBlock matrix, MatrixBlock result, int rl, int ru) {
+		// throw new NotImplementedException();
 		if(matrix.isEmpty())
 			return;
+		final int nCol = _colIndexes.length;
 		final int numVals = getNumValues();
 		// Pre aggregate the matrix into same size as dictionary
-		MatrixBlock preAgg = allocatePreAggregate(matrix, numVals, rl, ru);
+		final MatrixBlock preAgg = new MatrixBlock(ru - rl, numVals, false);
+		preAgg.allocateDenseBlock();
 		preAggregate(matrix, preAgg, rl, ru);
 		preAgg.recomputeNonZeros();
-		MatrixBlock tmpRes = leftMultByPreAggregateMatrix(preAgg);
-		addMatrixToResult(tmpRes, result, rl, ru);
+		final MatrixBlock tmpRes = new MatrixBlock(preAgg.getNumRows(), nCol, false);
+		forceMatrixBlockDictionary();
+		final MatrixBlock dictM = _dict.getMBDict(nCol).getMatrixBlock();
+		LibMatrixMult.matrixMult(preAgg, dictM, tmpRes);
+		CLALibLeftMultBy.addMatrixToResult(tmpRes, result, _colIndexes, rl, ru);
 	}
 
 	/**
@@ -150,6 +150,11 @@ public abstract class APreAgg extends AColGroupValue {
 
 	protected abstract boolean sameIndexStructure(AColGroupCompressed that);
 
+	public int getPreAggregateSize(){
+		return getNumValues();
+	}
+
+
 	private final ADictionary preAggLeft(APreAgg lhs) {
 		return lhs.preAggregateThatIndexStructure(this);
 	}
@@ -209,12 +214,15 @@ public abstract class APreAgg extends AColGroupValue {
 		if(lhs.getData().isEmpty())
 			return;
 		LOG.warn("Transpose of uncompressed to fit to template need t(a) %*% b support");
-		MatrixBlock tmp = LibMatrixReorg.transpose(lhs.getData(), InfrastructureAnalyzer.getLocalParallelism());
+		final MatrixBlock tmp = LibMatrixReorg.transpose(lhs.getData(), InfrastructureAnalyzer.getLocalParallelism());
 		final int numVals = getNumValues();
-		MatrixBlock preAgg = allocatePreAggregate(tmp, numVals, 0, tmp.getNumRows());
+		final MatrixBlock preAgg = new MatrixBlock(tmp.getNumRows(), numVals, false);
+		preAgg.allocateDenseBlock();
 		preAggregate(tmp, preAgg, 0, tmp.getNumRows());
 		preAgg.recomputeNonZeros();
-		MatrixBlock tmpRes = leftMultByPreAggregateMatrix(preAgg);
+		final MatrixBlock tmpRes = new MatrixBlock(preAgg.getNumRows(), _colIndexes.length, false);
+		final MatrixBlock dictM = _dict.getMBDict(getNumCols()).getMatrixBlock();
+		LibMatrixMult.matrixMult(preAgg, dictM, tmpRes);
 		addMatrixToResult(tmpRes, result, lhs._colIndexes);
 	}
 
@@ -267,24 +275,6 @@ public abstract class APreAgg extends AColGroupValue {
 		}
 	}
 
-	private final MatrixBlock leftMultByPreAggregateMatrix(MatrixBlock preAgg) {
-
-		// Allocate temporary matrix to multiply into.
-		final int tmpCol = _colIndexes.length;
-		final int tmpRow = preAgg.getNumRows();
-		double[] tmpLeftMultRes = tmpLeftMultDoubleArray.get();
-
-		MatrixBlock tmpRes = null;
-		if(tmpLeftMultRes != null && tmpLeftMultRes.length >= tmpCol * tmpRow) {
-			tmpRes = new MatrixBlock(tmpRow, tmpCol, new DenseBlockFP64(new int[] {tmpRow, tmpCol}, tmpLeftMultRes));
-			tmpRes.reset();
-		}
-		else
-			tmpRes = new MatrixBlock(tmpRow, tmpCol, false);
-
-		return leftMultByPreAggregateMatrix(preAgg, tmpRes);
-	}
-
 	private boolean shouldPreAggregateLeft(APreAgg lhs) {
 		final int nvL = lhs.getNumValues();
 		final int nvR = this.getNumValues();
@@ -295,13 +285,6 @@ public abstract class APreAgg extends AColGroupValue {
 		return costRightDense < costLeftDense;
 	}
 
-	private static MatrixBlock allocatePreAggregate(MatrixBlock m, int numVals, int rl, int ru) {
-		final int lhsRows = ru - rl;
-		final double[] vals = allocDVector(lhsRows * numVals, true);
-		final DenseBlock retB = new DenseBlockFP64(new int[] {lhsRows, numVals}, vals);
-		return new MatrixBlock(lhsRows, numVals, retB);
-	}
-
 	private static void MMDictsWithScaling(final ADictionary left, final ADictionary right, final int[] leftRows,
 		final int[] rightColumns, final MatrixBlock result, final int[] counts) {
 		LOG.warn("Inefficient double allocation of dictionary");
@@ -318,16 +301,12 @@ public abstract class APreAgg extends AColGroupValue {
 			if(mb.isEmpty())
 				return;
 			else if(mb.isInSparseFormat())
-				throw new NotImplementedException();
-			else {
-				final double[] values = mb.getDenseBlockValues();
-				MMDictsDenseDenseWithScaling(values, values, rows, cols, counts, ret);
-			}
-		}
-		else {
-			final double[] values = dict.getValues();
-			MMDictsDenseDenseWithScaling(values, values, rows, cols, counts, ret);
+				TSMMDictsSparseWithScaling(mb.getSparseBlock(), rows, cols, counts, ret);
+			else
+				TSMMDictsDenseWithScaling(mb.getDenseBlockValues(), rows, cols, counts, ret);
 		}
+		else
+			TSMMDictsDenseWithScaling(dict.getValues(), rows, cols, counts, ret);
 	}
 
 	/**
@@ -416,9 +395,9 @@ public abstract class APreAgg extends AColGroupValue {
 		}
 	}
 
-	private static void MMDictsDenseDenseWithScaling(double[] left, double[] right, int[] rowsLeft, int[] colsRight,
-		int[] scaling, MatrixBlock result) {
-		final int commonDim = Math.min(left.length / rowsLeft.length, right.length / colsRight.length);
+	private static void TSMMDictsDenseWithScaling(double[] dv, int[] rowsLeft, int[] colsRight, int[] scaling,
+		MatrixBlock result) {
+		final int commonDim = Math.min(dv.length / rowsLeft.length, dv.length / colsRight.length);
 		final int resCols = result.getNumColumns();
 		final double[] resV = result.getDenseBlockValues();
 		for(int k = 0; k < commonDim; k++) {
@@ -427,10 +406,34 @@ public abstract class APreAgg extends AColGroupValue {
 			final int scale = scaling[k];
 			for(int i = 0; i < rowsLeft.length; i++) {
 				final int offOut = rowsLeft[i] * resCols;
-				final double vl = left[offL + i] * scale;
+				final double vl = dv[offL + i] * scale;
 				if(vl != 0)
 					for(int j = 0; j < colsRight.length; j++)
-						resV[offOut + colsRight[j]] += vl * right[offR + j];
+						resV[offOut + colsRight[j]] += vl * dv[offR + j];
+			}
+		}
+	}
+
+	private static void TSMMDictsSparseWithScaling(SparseBlock sb, int[] rowsLeft, int[] colsRight, int[] scaling,
+		MatrixBlock result) {
+
+		final int commonDim = sb.numRows();
+		final int resCols = result.getNumColumns();
+		final double[] resV = result.getDenseBlockValues();
+
+		for(int k = 0; k < commonDim; k++) {
+			if(sb.isEmpty(k))
+				continue;
+			final int apos = sb.pos(k);
+			final int alen = sb.size(k) + apos;
+			final int[] aix = sb.indexes(k);
+			final double[] avals = sb.values(k);
+			final int scale = scaling[k];
+			for(int i = apos; i < alen; i++) {
+				final double v = avals[i] * scale;
+				final int offOut = rowsLeft[aix[i]] * resCols;
+				for(int j = 0; j < alen; j++)
+					resV[offOut + colsRight[aix[j]]] += v * avals[j];
 			}
 		}
 	}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConst.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConst.java
index 86335b9..fbc510c 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConst.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConst.java
@@ -54,21 +54,29 @@ public class ColGroupConst extends AColGroupCompressed {
 	 * @param colIndices The Colum indexes for the column group.
 	 * @param dict       The dictionary containing one tuple for the entire compression.
 	 */
-	protected ColGroupConst(int[] colIndices, ADictionary dict) {
+	private ColGroupConst(int[] colIndices, ADictionary dict) {
 		super(colIndices);
 		this._dict = dict;
 	}
 
-	@Override
-	protected void computeRowSums(double[] c, boolean square, int rl, int ru) {
-		double vals = _dict.sumAllRowsToDouble(square, _colIndexes.length)[0];
-		for(int rix = rl; rix < ru; rix++)
-			c[rix] += vals;
+	/**
+	 * Create constructor for a ColGroup Const this constructor ensures that if the dictionary input is empty an Empty
+	 * column group is constructed.
+	 * 
+	 * @param colIndices The column indexes in the column group
+	 * @param dict       The dictionary to use
+	 * @return A Colgroup either const or empty.
+	 */
+	protected static AColGroup create(int[] colIndices, ADictionary dict) {
+		if(dict == null)
+			return new ColGroupEmpty(colIndices);
+		else
+			return new ColGroupConst(colIndices, dict);
 	}
 
 	@Override
 	protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru) {
-		double value = _dict.aggregateTuples(builtin, _colIndexes.length)[0];
+		double value = _dict.aggregateRows(builtin, _colIndexes.length)[0];
 		for(int i = rl; i < ru; i++)
 			c[i] = builtin.execute(c[i], value);
 	}
@@ -108,19 +116,17 @@ public class ColGroupConst extends AColGroupCompressed {
 
 	@Override
 	public AColGroup scalarOperation(ScalarOperator op) {
-		return new ColGroupConst(_colIndexes, _dict.clone().inplaceScalarOp(op));
+		return create(_colIndexes, _dict.applyScalarOp(op));
 	}
 
 	@Override
 	public AColGroup binaryRowOpLeft(BinaryOperator op, double[] v, boolean isRowSafe) {
-		ADictionary ret = _dict.binOpLeft(op, v, _colIndexes);
-		return new ColGroupConst(_colIndexes, ret);
+		return create(_colIndexes, _dict.binOpLeft(op, v, _colIndexes));
 	}
 
 	@Override
 	public AColGroup binaryRowOpRight(BinaryOperator op, double[] v, boolean isRowSafe) {
-		ADictionary ret = _dict.binOpRight(op, v, _colIndexes);
-		return new ColGroupConst(_colIndexes, ret);
+		return create(_colIndexes, _dict.binOpRight(op, v, _colIndexes));
 	}
 
 	/**
@@ -131,13 +137,12 @@ public class ColGroupConst extends AColGroupCompressed {
 	 */
 	public void addToCommon(double[] constV) {
 		final double[] values = _dict.getValues();
-		if(values != null && constV != null)
-			for(int i = 0; i < _colIndexes.length; i++)
-				constV[_colIndexes[i]] += values[i];
+		for(int i = 0; i < _colIndexes.length; i++)
+			constV[_colIndexes[i]] += values[i];
 	}
 
 	public double[] getValues() {
-		return _dict != null ? _dict.getValues() : null;
+		return _dict.getValues();
 	}
 
 	@Override
@@ -151,17 +156,38 @@ public class ColGroupConst extends AColGroupCompressed {
 	}
 
 	@Override
-	protected void computeSum(double[] c, int nRows, boolean square) {
-		if(_dict != null)
-			if(square)
-				c[0] += _dict.sumsq(new int[] {nRows}, _colIndexes.length);
-			else
-				c[0] += _dict.sum(new int[] {nRows}, _colIndexes.length);
+	protected void computeSum(double[] c, int nRows) {
+		c[0] += _dict.sum(new int[] {nRows}, _colIndexes.length);
+	}
+
+	@Override
+	public void computeColSums(double[] c, int nRows) {
+		_dict.colSum(c, new int[] {nRows}, _colIndexes);
+	}
+
+	@Override
+	protected void computeSumSq(double[] c, int nRows) {
+
+		c[0] += _dict.sumSq(new int[] {nRows}, _colIndexes.length);
+	}
+
+	@Override
+	protected void computeColSumsSq(double[] c, int nRows) {
+		_dict.colSumSq(c, new int[] {nRows}, _colIndexes);
 	}
 
 	@Override
-	protected void computeColSums(double[] c, int nRows, boolean square) {
-		_dict.colSum(c, new int[] {nRows}, _colIndexes, square);
+	protected void computeRowSums(double[] c, int rl, int ru) {
+		double vals = _dict.sumAllRowsToDouble(_colIndexes.length)[0];
+		for(int rix = rl; rix < ru; rix++)
+			c[rix] += vals;
+	}
+
+	@Override
+	protected void computeRowSumsSq(double[] c, int rl, int ru) {
+		double vals = _dict.sumAllRowsToDoubleSq(_colIndexes.length)[0];
+		for(int rix = rl; rix < ru; rix++)
+			c[rix] += vals;
 	}
 
 	@Override
@@ -183,11 +209,13 @@ public class ColGroupConst extends AColGroupCompressed {
 		final int cr = right.getNumColumns();
 		if(_colIndexes.length == rr) {
 			MatrixBlock left = forceValuesToMatrixBlock();
+			if(left.isEmpty())
+				return null;
 			MatrixBlock ret = new MatrixBlock(1, cr, false);
 			LibMatrixMult.matrixMult(left, right, ret);
-			ADictionary d = new MatrixBlockDictionary(ret);
 			if(ret.isEmpty())
 				return null;
+			ADictionary d = new MatrixBlockDictionary(ret);
 			return ColGroupFactory.genColGroupConst(cr, d);
 		}
 		else {
@@ -202,7 +230,7 @@ public class ColGroupConst extends AColGroupCompressed {
 
 	@Override
 	public void leftMultByMatrix(MatrixBlock matrix, MatrixBlock result, int rl, int ru) {
-		throw new NotImplementedException();
+		throw new DMLCompressionException("Should not be called");
 	}
 
 	@Override
@@ -223,19 +251,19 @@ public class ColGroupConst extends AColGroupCompressed {
 			return new ColGroupEmpty(colIndexes);
 		else {
 			ADictionary retD = new Dictionary(new double[] {_dict.getValue(idx)});
-			return new ColGroupConst(colIndexes, retD);
+			return create(colIndexes, retD);
 		}
 	}
 
 	@Override
 	protected AColGroup sliceMultiColumns(int idStart, int idEnd, int[] outputCols) {
 		ADictionary retD = _dict.sliceOutColumnRange(idStart, idEnd, _colIndexes.length);
-		return new ColGroupConst(outputCols, retD);
+		return create(outputCols, retD);
 	}
 
 	@Override
 	public AColGroup copy() {
-		return new ColGroupConst(_colIndexes, _dict.clone());
+		return create(_colIndexes, _dict.clone());
 	}
 
 	@Override
@@ -251,7 +279,7 @@ public class ColGroupConst extends AColGroupCompressed {
 	@Override
 	public AColGroup replace(double pattern, double replace) {
 		ADictionary replaced = _dict.replace(pattern, replace, _colIndexes.length);
-		return new ColGroupConst(_colIndexes, replaced);
+		return create(_colIndexes, replaced);
 	}
 
 	@Override
@@ -269,9 +297,7 @@ public class ColGroupConst extends AColGroupCompressed {
 	@Override
 	public long getExactSizeOnDisk() {
 		long ret = super.getExactSizeOnDisk();
-		if(_dict != null)
-			ret += _dict.getExactSizeOnDisk();
-
+		ret += _dict.getExactSizeOnDisk();
 		return ret;
 	}
 
@@ -279,7 +305,7 @@ public class ColGroupConst extends AColGroupCompressed {
 	public String toString() {
 		StringBuilder sb = new StringBuilder();
 		sb.append(super.toString());
-		sb.append(String.format("\n%15s ", "Values: " + _dict.getClass().getSimpleName()));
+		sb.append(String.format("\n%15s", "Values: " + _dict.getClass().getSimpleName()));
 		sb.append(_dict.getString(_colIndexes.length));
 		return sb.toString();
 	}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java
index b6d4231..1651a8b 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java
@@ -67,26 +67,13 @@ public class ColGroupDDC extends APreAgg {
 	protected void decompressToDenseBlockSparseDictionary(DenseBlock db, int rl, int ru, int offR, int offC,
 		SparseBlock sb) {
 		throw new NotImplementedException();
-		// for(int i = rl; i < ru; i++, offT++) {
-		// final int rowIndex = _data.getIndex(i);
-		// if(sb.isEmpty(rowIndex))
-		// continue;
-		// final double[] c = db.values(offT);
-		// final int off = db.pos(offT);
-		// final int apos = sb.pos(rowIndex);
-		// final int alen = sb.size(rowIndex) + apos;
-		// final double[] avals = sb.values(rowIndex);
-		// final int[] aix = sb.indexes(rowIndex);
-		// for(int j = apos; j < alen; j++)
-		// c[off + _colIndexes[aix[j]]] += avals[j];
-		// }
 	}
 
 	@Override
 	protected void decompressToDenseBlockDenseDictionary(DenseBlock db, int rl, int ru, int offR, int offC,
 		double[] values) {
 		final int nCol = _colIndexes.length;
-		for(int i = rl,offT = rl + offR; i < ru; i++, offT++) {
+		for(int i = rl, offT = rl + offR; i < ru; i++, offT++) {
 			final double[] c = db.values(offT);
 			final int off = db.pos(offT) + offC;
 			final int rowIndex = _data.getIndex(i) * nCol;
@@ -118,8 +105,15 @@ public class ColGroupDDC extends APreAgg {
 	}
 
 	@Override
-	protected void computeRowSums(double[] c, boolean square, int rl, int ru) {
-		double[] vals = _dict.sumAllRowsToDouble(square, _colIndexes.length);
+	protected void computeRowSums(double[] c, int rl, int ru) {
+		double[] vals = _dict.sumAllRowsToDouble(_colIndexes.length);
+		for(int rix = rl; rix < ru; rix++)
+			c[rix] += vals[_data.getIndex(rix)];
+	}
+
+	@Override
+	protected void computeRowSumsSq(double[] c, int rl, int ru) {
+		double[] vals = _dict.sumAllRowsToDoubleSq(_colIndexes.length);
 		for(int rix = rl; rix < ru; rix++)
 			c[rix] += vals[_data.getIndex(rix)];
 	}
@@ -127,17 +121,15 @@ public class ColGroupDDC extends APreAgg {
 	@Override
 	protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru) {
 		final int nCol = getNumCols();
-		double[] preAggregatedRows = _dict.aggregateTuples(builtin, nCol);
+		double[] preAggregatedRows = _dict.aggregateRows(builtin, nCol);
 		for(int i = rl; i < ru; i++)
 			c[i] = builtin.execute(c[i], preAggregatedRows[_data.getIndex(i)]);
 	}
 
 	@Override
 	public int[] getCounts(int[] counts) {
-		for(int i = 0; i < _numRows; i++) {
-			int index = _data.getIndex(i);
-			counts[index]++;
-		}
+		for(int i = 0; i < _numRows; i++)
+			counts[_data.getIndex(i)]++;
 		return counts;
 	}
 
@@ -151,7 +143,7 @@ public class ColGroupDDC extends APreAgg {
 
 	@Override
 	public void preAggregateDense(MatrixBlock m, MatrixBlock preAgg, int rl, int ru, int cl, int cu) {
-		_data.preAggregateDense(m, preAgg, rl, ru, cl, cu);
+		_data.preAggregateDense(m, preAgg.getDenseBlockValues(), rl, ru, cl, cu);
 	}
 
 	private void preAggregateSparse(SparseBlock sb, MatrixBlock preAgg, int rl, int ru) {
@@ -181,11 +173,14 @@ public class ColGroupDDC extends APreAgg {
 	public void preAggregateThatSDCZerosStructure(ColGroupSDCZeros that, Dictionary ret) {
 		final AIterator itThat = that._indexes.getIterator();
 		final int nCol = that._colIndexes.length;
-
-		while(itThat.hasNext()) {
+		final int finalOff = that._indexes.getOffsetToLast();
+		while(true) {
 			final int to = _data.getIndex(itThat.value());
-			final int fr = that._data.getIndex(itThat.getDataIndexAndIncrement());
+			final int fr = that._data.getIndex(itThat.getDataIndex());
 			that._dict.addToEntry(ret, fr, to, nCol);
+			if(itThat.value() == finalOff)
+				break;
+			itThat.next();
 		}
 	}
 
@@ -193,9 +188,12 @@ public class ColGroupDDC extends APreAgg {
 	public void preAggregateThatSDCSingleZerosStructure(ColGroupSDCSingleZeros that, Dictionary ret) {
 		final AIterator itThat = that._indexes.getIterator();
 		final int nCol = that._colIndexes.length;
-		while(itThat.hasNext()) {
+		final int finalOff = that._indexes.getOffsetToLast();
+		while(true) {
 			final int to = _data.getIndex(itThat.value());
 			that._dict.addToEntry(ret, 0, to, nCol);
+			if(itThat.value() == finalOff)
+				break;
 			itThat.next();
 		}
 	}
@@ -219,7 +217,7 @@ public class ColGroupDDC extends APreAgg {
 
 	@Override
 	public AColGroup scalarOperation(ScalarOperator op) {
-		return new ColGroupDDC(_colIndexes, _numRows, applyScalarOp(op), _data, getCachedCounts());
+		return new ColGroupDDC(_colIndexes, _numRows, _dict.applyScalarOp(op), _data, getCachedCounts());
 	}
 
 	@Override
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupEmpty.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupEmpty.java
index ec20674..a75f046 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupEmpty.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupEmpty.java
@@ -19,6 +19,8 @@
 
 package org.apache.sysds.runtime.compress.colgroup;
 
+import java.util.Arrays;
+
 import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary;
 import org.apache.sysds.runtime.data.DenseBlock;
 import org.apache.sysds.runtime.data.SparseBlock;
@@ -69,7 +71,7 @@ public class ColGroupEmpty extends AColGroupCompressed {
 	}
 
 	@Override
-	public void decompressToSparseBlock(SparseBlock sb, int rl, int ru, int offR, int offC){
+	public void decompressToSparseBlock(SparseBlock sb, int rl, int ru, int offR, int offC) {
 		// do nothing.
 	}
 
@@ -80,10 +82,12 @@ public class ColGroupEmpty extends AColGroupCompressed {
 
 	@Override
 	public AColGroup scalarOperation(ScalarOperator op) {
-		double val0 = op.executeScalar(0);
-		if(val0 == 0)
+		final double v = op.executeScalar(0);
+		if(v == 0)
 			return this;
-		return new ColGroupConst(_colIndexes, new Dictionary(new double[_colIndexes.length]).inplaceScalarOp(op));
+		double[] retV = new double[_colIndexes.length];
+		Arrays.fill(retV, v);
+		return ColGroupConst.create(_colIndexes, new Dictionary(retV));
 	}
 
 	@Override
@@ -99,7 +103,7 @@ public class ColGroupEmpty extends AColGroupCompressed {
 
 		if(allZero)
 			return this;
-		return new ColGroupConst(_colIndexes, new Dictionary(retVals));
+		return ColGroupConst.create(_colIndexes, new Dictionary(retVals));
 	}
 
 	@Override
@@ -111,10 +115,10 @@ public class ColGroupEmpty extends AColGroupCompressed {
 		final int lenV = _colIndexes.length;
 		boolean allZero = true;
 		for(int i = 0; i < lenV; i++)
-			allZero = 0 == (retVals[i] = fn.execute(0, v[_colIndexes[i]])) && allZero ;
+			allZero = 0 == (retVals[i] = fn.execute(0, v[_colIndexes[i]])) && allZero;
 		if(allZero)
 			return this;
-		return new ColGroupConst(_colIndexes, new Dictionary(retVals));
+		return ColGroupConst.create(_colIndexes, new Dictionary(retVals));
 	}
 
 	@Override
@@ -186,11 +190,6 @@ public class ColGroupEmpty extends AColGroupCompressed {
 	}
 
 	@Override
-	public void computeColSums(double[] c, int nRows) {
-		// do nothing
-	}
-
-	@Override
 	protected double computeMxx(double c, Builtin builtin) {
 		return builtin.execute(c, 0);
 	}
@@ -202,17 +201,32 @@ public class ColGroupEmpty extends AColGroupCompressed {
 	}
 
 	@Override
-	protected void computeSum(double[] c, int nRows, boolean square) {
+	protected void computeSum(double[] c, int nRows) {
+		// do nothing
+	}
+
+	@Override
+	protected void computeRowSums(double[] c, int rl, int ru) {
+		// do nothing
+	}
+
+	@Override
+	public void computeColSums(double[] c, int nRows) {
+		// do nothing
+	}
+
+	@Override
+	protected void computeSumSq(double[] c, int nRows) {
 		// do nothing
 	}
 
 	@Override
-	protected void computeRowSums(double[] c, boolean square, int rl, int ru) {
+	protected void computeRowSumsSq(double[] c, int rl, int ru) {
 		// do nothing
 	}
 
 	@Override
-	protected void computeColSums(double[] c, int nRows, boolean square) {
+	protected void computeColSumsSq(double[] c, int nRows) {
 		// do nothing
 	}
 
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java
index fc0edf6..74b5c1e 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java
@@ -147,7 +147,7 @@ public class ColGroupFactory {
 		if(cols.length != values.length)
 			throw new DMLCompressionException("Invalid size of values compared to columns");
 		ADictionary dict = new Dictionary(values);
-		return new ColGroupConst(cols, dict);
+		return ColGroupConst.create(cols, dict);
 	}
 
 	/**
@@ -162,7 +162,7 @@ public class ColGroupFactory {
 			throw new DMLCompressionException(
 				"Invalid construction of const column group with different number of columns in arguments");
 		final int[] colIndices = Util.genColsIndices(numCols);
-		return new ColGroupConst(colIndices, dict);
+		return ColGroupConst.create(colIndices, dict);
 	}
 
 	private static List<AColGroup> genEmpty(MatrixBlock in, CompressionSettings compSettings) {
@@ -194,7 +194,7 @@ public class ColGroupFactory {
 				if(!tg.isEmpty())
 					tasks.add(new CompressTask(in, tg, compSettings, Math.max(1, k / 2)));
 
-			List<AColGroup> ret = new ArrayList<>(csi.getNumberColGroups());
+			List<AColGroup> ret = new ArrayList<>();
 			for(Future<Collection<AColGroup>> t : pool.invokeAll(tasks))
 				ret.addAll(t.get());
 			pool.shutdown();
@@ -234,11 +234,17 @@ public class ColGroupFactory {
 
 		@Override
 		public Collection<AColGroup> call() {
-			ArrayList<AColGroup> res = new ArrayList<>();
-			Tmp tmpMap = new Tmp();
-			for(CompressedSizeInfoColGroup g : _groups)
-				res.addAll(compressColGroup(_in, _compSettings, tmpMap, g, _k));
-			return res;
+			try {
+				ArrayList<AColGroup> res = new ArrayList<>();
+				Tmp tmpMap = new Tmp();
+				for(CompressedSizeInfoColGroup g : _groups)
+					res.addAll(compressColGroup(_in, _compSettings, tmpMap, g, _k));
+				return res;
+			}
+			catch(Exception e) {
+				e.printStackTrace();
+				throw e;
+			}
 		}
 	}
 
@@ -347,7 +353,7 @@ public class ColGroupFactory {
 
 		final IntArrayList[] of = ubm.getOffsetList();
 		if(of.length == 1 && of[0].size() == rlen) // If this always constant
-			return new ColGroupConst(colIndexes, DictionaryFactory.create(ubm));
+			return ColGroupConst.create(colIndexes, DictionaryFactory.create(ubm));
 
 		switch(compType) {
 			case DDC:
@@ -369,7 +375,7 @@ public class ColGroupFactory {
 		CompressedSizeInfoColGroup cg, int k) {
 		final int rlen = cs.transposed ? raw.getNumColumns() : raw.getNumRows();
 		// use a Map that is at least char size.
-		final int nVal = Math.max(cg.getNumVals(), 257);
+		final int nVal = cg.getNumVals() < 16 ? 16 : Math.max(cg.getNumVals(), 257);
 		return directCompressDDC(colIndexes, raw, cs, cg, MapToFactory.create(rlen, nVal), rlen, k);
 	}
 
@@ -379,70 +385,82 @@ public class ColGroupFactory {
 		data.fill(fill);
 
 		DblArrayCountHashMap map = new DblArrayCountHashMap(cg.getNumVals());
-
+		boolean extra;
 		if(rlen < CompressionSettings.PAR_DDC_THRESHOLD || k == 1)
-			readToMapDDC(colIndexes, raw, map, cs, data, 0, rlen);
+			extra = readToMapDDC(colIndexes, raw, map, cs, data, 0, rlen, fill);
 		else
-			parallelReadToMapDDC(colIndexes, raw, map, cs, data, rlen, k);
-
-		boolean extra = false;
-		for(int i = 0; i < rlen; i++)
-			if(data.getIndex(i) == fill) {
-				extra = true;
-				break;
-			}
+			extra = parallelReadToMapDDC(colIndexes, raw, map, cs, data, rlen, fill, k);
 
 		if(map.size() == 0)
 			// If the column was empty.
 			// This is highly unlikely but could happen if forced compression of
 			// not transposed column and the estimator says use DDC.
 			return new ColGroupEmpty(colIndexes);
-
 		ADictionary dict = DictionaryFactory.create(map, colIndexes.length, extra);
-		if(extra)
+		if(extra) {
 			data.replace(fill, map.size());
+			data.setUnique(map.size() + 1);
+		}
+		else
+			data.setUnique(map.size());
 
 		AMapToData resData = MapToFactory.resize(data, map.size() + (extra ? 1 : 0));
 		ColGroupDDC res = new ColGroupDDC(colIndexes, rlen, dict, resData, null);
 		return res;
 	}
 
-	private static void readToMapDDC(final int[] colIndexes, final MatrixBlock raw, final DblArrayCountHashMap map,
-		final CompressionSettings cs, final AMapToData data, final int rl, final int ru) {
+	private static boolean readToMapDDC(final int[] colIndexes, final MatrixBlock raw, final DblArrayCountHashMap map,
+		final CompressionSettings cs, final AMapToData data, final int rl, final int ru, final int fill) {
 		ReaderColumnSelection reader = ReaderColumnSelection.createReader(raw, colIndexes, cs.transposed, rl, ru);
-		DblArray cellVals = null;
-		while((cellVals = reader.nextRow()) != null) {
-			final int id = map.increment(cellVals);
+		DblArray cellVals = reader.nextRow();
+		boolean extra = false;
+		int r = rl;
+		while(r < ru && cellVals != null) {
 			final int row = reader.getCurrentRowIndex();
-			data.set(row, id);
+			if(row == r) {
+				final int id = map.increment(cellVals);
+				data.set(row, id);
+				cellVals = reader.nextRow();
+				r++;
+			}
+			else {
+				r = row;
+				extra = true;
+			}
 		}
+
+		if(r < ru)
+			extra = true;
+
+		return extra;
 	}
 
-	private static void parallelReadToMapDDC(final int[] colIndexes, final MatrixBlock raw,
+	private static boolean parallelReadToMapDDC(final int[] colIndexes, final MatrixBlock raw,
 		final DblArrayCountHashMap map, final CompressionSettings cs, final AMapToData data, final int rlen,
-		final int k) {
+		final int fill, final int k) {
 
 		try {
-			final int blk = Math.max(rlen / colIndexes.length / k, 128000 / colIndexes.length);
+			final int blk = Math.max(rlen / colIndexes.length / k, 64000 / colIndexes.length);
 			ExecutorService pool = CommonThreadPool.get(Math.min(Math.max(rlen / blk, 1), k));
 			List<readToMapDDCTask> tasks = new ArrayList<>();
 
 			for(int i = 0; i < rlen; i += blk) {
 				int end = Math.min(rlen, i + blk);
-				tasks.add(new readToMapDDCTask(colIndexes, raw, map, cs, data, i, end));
+				tasks.add(new readToMapDDCTask(colIndexes, raw, map, cs, data, i, end, fill));
 			}
-
-			for(Future<Object> t : pool.invokeAll(tasks))
-				t.get();
+			boolean extra = false;
+			for(Future<Boolean> t : pool.invokeAll(tasks))
+				extra |= t.get();
 
 			pool.shutdown();
+			return extra;
 		}
 		catch(Exception e) {
 			throw new DMLRuntimeException("Failed to parallelize DDC compression");
 		}
 	}
 
-	static class readToMapDDCTask implements Callable<Object> {
+	static class readToMapDDCTask implements Callable<Boolean> {
 		private final int[] _colIndexes;
 		private final MatrixBlock _raw;
 		private final DblArrayCountHashMap _map;
@@ -450,9 +468,10 @@ public class ColGroupFactory {
 		private final AMapToData _data;
 		private final int _rl;
 		private final int _ru;
+		private final int _fill;
 
 		protected readToMapDDCTask(int[] colIndexes, MatrixBlock raw, DblArrayCountHashMap map, CompressionSettings cs,
-			AMapToData data, int rl, int ru) {
+			AMapToData data, int rl, int ru, int fill) {
 			_colIndexes = colIndexes;
 			_raw = raw;
 			_map = map;
@@ -460,12 +479,12 @@ public class ColGroupFactory {
 			_data = data;
 			_rl = rl;
 			_ru = ru;
+			_fill = fill;
 		}
 
 		@Override
-		public Collection<AColGroup> call() {
-			readToMapDDC(_colIndexes, _raw, _map, _cs, _data, _rl, _ru);
-			return null;
+		public Boolean call() {
+			return new Boolean(readToMapDDC(_colIndexes, _raw, _map, _cs, _data, _rl, _ru, _fill));
 		}
 	}
 
@@ -490,7 +509,7 @@ public class ColGroupFactory {
 		ADictionary dict = DictionaryFactory.create(ubm, tupleSparsity);
 		if(ubm.getNumValues() == 1) {
 			if(numZeros >= largestOffset) {
-				final AOffset off = OffsetFactory.create(ubm.getOffsetList()[0].extractValues(true));
+				final AOffset off = OffsetFactory.createOffset(ubm.getOffsetList()[0].extractValues(true));
 				return new ColGroupSDCSingleZeros(colIndexes, rlen, dict, off, null);
 			}
 			else {
@@ -510,7 +529,7 @@ public class ColGroupFactory {
 		CompressionSettings cs) {
 		IntArrayList[] offsets = ubm.getOffsetList();
 		AInsertionSorter s = InsertionSorterFactory.create(rlen, offsets, cs.sdcSortType);
-		AOffset indexes = OffsetFactory.create(s.getIndexes());
+		AOffset indexes = OffsetFactory.createOffset(s.getIndexes());
 		AMapToData data = s.getData();
 		int[] counts = new int[offsets.length + 1];
 		int sum = 0;
@@ -519,18 +538,17 @@ public class ColGroupFactory {
 			sum += counts[i];
 		}
 		counts[offsets.length] = rlen - sum;
-		AColGroupValue ret = new ColGroupSDCZeros(colIndexes, rlen, dict, indexes, data, counts);
-		return ret;
+		return ColGroupSDCZeros.create(colIndexes, rlen, dict, indexes, data, counts);
 	}
 
 	private static AColGroup setupMultiValueColGroup(int[] colIndexes, int numZeros, int rlen, ABitmap ubm,
 		int largestIndex, ADictionary dict, CompressionSettings cs) {
 		IntArrayList[] offsets = ubm.getOffsetList();
 		AInsertionSorter s = InsertionSorterFactory.createNegative(rlen, offsets, largestIndex, cs.sdcSortType);
-		AOffset indexes = OffsetFactory.create(s.getIndexes());
+		AOffset indexes = OffsetFactory.createOffset(s.getIndexes());
 		AMapToData _data = s.getData();
-		AColGroupValue ret = new ColGroupSDC(colIndexes, rlen, dict, indexes, _data, null);
-		return ret;
+		_data = MapToFactory.resize(_data, _data.getUnique() - 1);
+		return ColGroupSDC.create(colIndexes, rlen, dict, indexes, _data, null);
 	}
 
 	private static AColGroup setupSingleValueSDCColGroup(int[] colIndexes, int rlen, ABitmap ubm, ADictionary dict) {
@@ -548,7 +566,7 @@ public class ColGroupFactory {
 
 		while(v < rlen)
 			indexes[p++] = v++;
-		AOffset off = OffsetFactory.create(indexes);
+		AOffset off = OffsetFactory.createOffset(indexes);
 
 		return new ColGroupSDCSingle(colIndexes, rlen, dict, off, null);
 	}
@@ -635,14 +653,14 @@ public class ColGroupFactory {
 			}
 
 			counts[entries.size()] = rlen - sum;
-			final AOffset offsets = OffsetFactory.create(sb.indexes(sbRow), apos, alen);
+			final AOffset offsets = OffsetFactory.createOffset(sb.indexes(sbRow), apos, alen);
 			if(entries.size() <= 1)
 				return new ColGroupSDCSingleZeros(cols, rlen, new Dictionary(dict), offsets, counts);
 			else {
 				final AMapToData mapToData = MapToFactory.create((alen - apos), entries.size());
 				for(int j = apos; j < alen; j++)
 					mapToData.set(j - apos, map.get(vals[j]));
-				return new ColGroupSDCZeros(cols, rlen, new Dictionary(dict), offsets, mapToData, counts);
+				return ColGroupSDCZeros.create(cols, rlen, new Dictionary(dict), offsets, mapToData, counts);
 			}
 		}
 		else {
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupIO.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupIO.java
index f8edbcb..184ca1a 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupIO.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupIO.java
@@ -118,6 +118,8 @@ public class ColGroupIO {
 				return new ColGroupSDCSingleZeros(nRows);
 			case SDCZeros:
 				return new ColGroupSDCZeros(nRows);
+			case PFOR:
+				return new ColGroupPFOR(nRows);
 			default:
 				throw new DMLRuntimeException("Unsupported ColGroup Type used:  " + ctype);
 		}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOLE.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOLE.java
index a303d98..285c710 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOLE.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOLE.java
@@ -23,7 +23,6 @@ import java.util.Arrays;
 
 import org.apache.commons.lang.NotImplementedException;
 import org.apache.sysds.runtime.compress.CompressionSettings;
-import org.apache.sysds.runtime.compress.DMLCompressionException;
 import org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary;
 import org.apache.sysds.runtime.data.DenseBlock;
 import org.apache.sysds.runtime.data.SparseBlock;
@@ -66,7 +65,8 @@ public class ColGroupOLE extends AColGroupOffset {
 	}
 
 	@Override
-	protected void decompressToDenseBlockDenseDictionary(DenseBlock db, int rl, int ru, int offR, int offC, double[] values) {
+	protected void decompressToDenseBlockDenseDictionary(DenseBlock db, int rl, int ru, int offR, int offC,
+		double[] values) {
 		throw new NotImplementedException();
 		// final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
 		// final int numCols = getNumCols();
@@ -79,33 +79,34 @@ public class ColGroupOLE extends AColGroupOffset {
 		// double[] c = target.getDenseBlockValues();
 		// // cache conscious append via horizontal scans
 		// for(int bi = (rl / blksz) * blksz; bi < ru; bi += blksz) {
-		// 	for(int k = 0, off = 0; k < numVals; k++, off += numCols) {
-		// 		int boff = _ptr[k];
-		// 		int blen = len(k);
-		// 		int bix = apos[k];
-
-		// 		if(bix >= blen)
-		// 			continue;
-		// 		int pos = boff + bix;
-		// 		int len = _data[pos];
-		// 		int i = 1;
-		// 		int row = bi + _data[pos + 1];
-		// 		while(i <= len && row < rl)
-		// 			row = bi + _data[pos + i++];
-
-		// 		for(; i <= len && row < ru; i++) {
-		// 			row = bi + _data[pos + i];
-		// 			int rc = (row - offOut) * targetCols;
-		// 			for(int j = 0; j < numCols; j++)
-		// 				c[rc + _colIndexes[j]] += values[off + j];
-		// 		}
-		// 		apos[k] += len + 1;
-		// 	}
+		// for(int k = 0, off = 0; k < numVals; k++, off += numCols) {
+		// int boff = _ptr[k];
+		// int blen = len(k);
+		// int bix = apos[k];
+
+		// if(bix >= blen)
+		// continue;
+		// int pos = boff + bix;
+		// int len = _data[pos];
+		// int i = 1;
+		// int row = bi + _data[pos + 1];
+		// while(i <= len && row < rl)
+		// row = bi + _data[pos + i++];
+
+		// for(; i <= len && row < ru; i++) {
+		// row = bi + _data[pos + i];
+		// int rc = (row - offOut) * targetCols;
+		// for(int j = 0; j < numCols; j++)
+		// c[rc + _colIndexes[j]] += values[off + j];
+		// }
+		// apos[k] += len + 1;
+		// }
 		// }
 	}
 
 	@Override
-	protected void decompressToDenseBlockSparseDictionary(DenseBlock db, int rl, int ru, int offR, int offC, SparseBlock values) {
+	protected void decompressToDenseBlockSparseDictionary(DenseBlock db, int rl, int ru, int offR, int offC,
+		SparseBlock values) {
 		throw new NotImplementedException();
 	}
 
@@ -148,7 +149,7 @@ public class ColGroupOLE extends AColGroupOffset {
 		// fast path: sparse-safe operations
 		// Note that bitmaps don't change and are shallow-copied
 		if(op.sparseSafe || val0 == 0 || !_zeros) {
-			return new ColGroupOLE(_colIndexes, _numRows, _zeros, applyScalarOp(op), _data, _ptr, getCachedCounts());
+			return new ColGroupOLE(_colIndexes, _numRows, _zeros, _dict.applyScalarOp(op), _data, _ptr, getCachedCounts());
 		}
 		// slow path: sparse-unsafe operations (potentially create new bitmap)
 		// note: for efficiency, we currently don't drop values that become 0
@@ -156,10 +157,10 @@ public class ColGroupOLE extends AColGroupOffset {
 		int[] loff = computeOffsets(lind);
 
 		if(loff.length == 0) { // empty offset list: go back to fast path
-			return new ColGroupOLE(_colIndexes, _numRows, false, applyScalarOp(op), _data, _ptr, getCachedCounts());
+			return new ColGroupOLE(_colIndexes, _numRows, false, _dict.applyScalarOp(op), _data, _ptr, getCachedCounts());
 		}
 
-		ADictionary rvalues = applyScalarOp(op, val0, getNumCols());
+		ADictionary rvalues = _dict.applyScalarOp(op, val0, getNumCols());
 		char[] lbitmap = genOffsetBitmap(loff, loff.length);
 		char[] rbitmaps = Arrays.copyOf(_data, _data.length + lbitmap.length);
 		System.arraycopy(lbitmap, 0, rbitmaps, _data.length, lbitmap.length);
@@ -216,69 +217,74 @@ public class ColGroupOLE extends AColGroupOffset {
 	// }
 
 	@Override
-	protected void computeRowSums(double[] c, boolean square, int rl, int ru) {
+	protected void computeRowSums(double[] c, int rl, int ru) {
+		throw new NotImplementedException();
+		// final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
+		// final int numVals = getNumValues();
 
-		final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
-		final int numVals = getNumValues();
+		// if(numVals > 1 && _numRows > blksz) {
+		// final int blksz2 = CompressionSettings.BITMAP_BLOCK_SZ;
 
-		if(numVals > 1 && _numRows > blksz) {
-			final int blksz2 = CompressionSettings.BITMAP_BLOCK_SZ;
-
-			// step 1: prepare position and value arrays
-			int[] apos = skipScan(numVals, rl);
-			double[] aval = _dict.sumAllRowsToDouble(square, _colIndexes.length);
-
-			// step 2: cache conscious row sums via horizontal scans
-			for(int bi = (rl / blksz) * blksz; bi < ru; bi += blksz2) {
-				int bimax = Math.min(bi + blksz2, ru);
-
-				// horizontal segment scan, incl pos maintenance
-				for(int k = 0; k < numVals; k++) {
-					int boff = _ptr[k];
-					int blen = len(k);
-					double val = aval[k];
-					int bix = apos[k];
-
-					for(int ii = bi; ii < bimax && bix < blen; ii += blksz) {
-						// prepare length, start, and end pos
-						int len = _data[boff + bix];
-
-						// compute partial results
-						for(int i = 1; i <= len; i++) {
-							int rix = ii + _data[boff + bix + i];
-							if(rix >= _numRows)
-								throw new DMLCompressionException("Invalid row " + rix);
-							c[rix] += val;
-						}
-						bix += len + 1;
-					}
+		// // step 1: prepare position and value arrays
+		// int[] apos = skipScan(numVals, rl);
+		// double[] aval = _dict.sumAllRowsToDouble(square, _colIndexes.length);
+
+		// // step 2: cache conscious row sums via horizontal scans
+		// for(int bi = (rl / blksz) * blksz; bi < ru; bi += blksz2) {
+		// int bimax = Math.min(bi + blksz2, ru);
+
+		// // horizontal segment scan, incl pos maintenance
+		// for(int k = 0; k < numVals; k++) {
+		// int boff = _ptr[k];
+		// int blen = len(k);
+		// double val = aval[k];
+		// int bix = apos[k];
+
+		// for(int ii = bi; ii < bimax && bix < blen; ii += blksz) {
+		// // prepare length, start, and end pos
+		// int len = _data[boff + bix];
+
+		// // compute partial results
+		// for(int i = 1; i <= len; i++) {
+		// int rix = ii + _data[boff + bix + i];
+		// if(rix >= _numRows)
+		// throw new DMLCompressionException("Invalid row " + rix);
+		// c[rix] += val;
+		// }
+		// bix += len + 1;
+		// }
 
-					apos[k] = bix;
-				}
-			}
-		}
-		else {
-			// iterate over all values and their bitmaps
-			for(int k = 0; k < numVals; k++) {
-				// prepare value-to-add for entire value bitmap
-				int boff = _ptr[k];
-				int blen = len(k);
-				double val = _dict.sumRow(k, square, _colIndexes.length);
+		// apos[k] = bix;
+		// }
+		// }
+		// }
+		// else {
+		// // iterate over all values and their bitmaps
+		// for(int k = 0; k < numVals; k++) {
+		// // prepare value-to-add for entire value bitmap
+		// int boff = _ptr[k];
+		// int blen = len(k);
+		// double val = _dict.sumRow(k, square, _colIndexes.length);
+
+		// // iterate over bitmap blocks and add values
+		// if(val != 0) {
+		// int slen;
+		// int bix = skipScanVal(k, rl);
+		// for(int off = ((rl + 1) / blksz) * blksz; bix < blen && off < ru; bix += slen + 1, off += blksz) {
+		// slen = _data[boff + bix];
+		// for(int i = 1; i <= slen; i++) {
+		// int rix = off + _data[boff + bix + i];
+		// c[rix] += val;
+		// }
+		// }
+		// }
+		// }
+		// }
+	}
 
-				// iterate over bitmap blocks and add values
-				if(val != 0) {
-					int slen;
-					int bix = skipScanVal(k, rl);
-					for(int off = ((rl + 1) / blksz) * blksz; bix < blen && off < ru; bix += slen + 1, off += blksz) {
-						slen = _data[boff + bix];
-						for(int i = 1; i <= slen; i++) {
-							int rix = off + _data[boff + bix + i];
-							c[rix] += val;
-						}
-					}
-				}
-			}
-		}
+	@Override
+	protected void computeRowSumsSq(double[] c, int rl, int ru) {
+		throw new NotImplementedException();
 	}
 
 	@Override
@@ -413,7 +419,7 @@ public class ColGroupOLE extends AColGroupOffset {
 	private int[] skipScan(int numVals, int rl) {
 		final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
 		rl = (rl / blksz) * blksz;
-		int[] ret = allocIVector(numVals, rl == 0);
+		int[] ret = new int[numVals];
 
 		if(rl > 0) { // rl aligned with blksz
 			for(int k = 0; k < numVals; k++) {
@@ -467,7 +473,7 @@ public class ColGroupOLE extends AColGroupOffset {
 	public String toString() {
 		StringBuilder sb = new StringBuilder();
 		sb.append(super.toString());
-		sb.append(String.format("\n%15s%5d ", "Data:", this._data.length));
+		sb.append(String.format("\n%15s%5d", "Data:", this._data.length));
 		sb.append(charsToString(_data));
 		return sb.toString();
 	}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPFOR.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPFOR.java
new file mode 100644
index 0000000..99fa68e
--- /dev/null
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupPFOR.java
@@ -0,0 +1,384 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.compress.colgroup;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.commons.lang.NotImplementedException;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary;
+import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
+import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory;
+import org.apache.sysds.runtime.compress.colgroup.offset.AIterator;
+import org.apache.sysds.runtime.compress.colgroup.offset.AOffset;
+import org.apache.sysds.runtime.compress.colgroup.offset.OffsetFactory;
+import org.apache.sysds.runtime.functionobjects.Builtin;
+import org.apache.sysds.runtime.functionobjects.Divide;
+import org.apache.sysds.runtime.functionobjects.Minus;
+import org.apache.sysds.runtime.functionobjects.Multiply;
+import org.apache.sysds.runtime.functionobjects.Plus;
+import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
+import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
+
+/**
+ * ColGroup for Patched Frame Of Reference.
+ * 
+ * This column group fits perfectly into the collection of compression groups
+ * 
+ * It can be constructed when a SDCZeros group get a non zero default value. Then a natural extension is to transform
+ * the group into a PFOR group, since the default value is then treated as an offset, and the dictionary can be copied
+ * with no modifications.
+ * 
+ */
+public class ColGroupPFOR extends AMorphingMMColGroup {
+
+	private static final long serialVersionUID = 3883228464052204203L;
+
+	/** Sparse row indexes for the data that is nonZero */
+	protected AOffset _indexes;
+
+	/** Pointers to row indexes in the dictionary. */
+	protected transient AMapToData _data;
+
+	/** Reference values in this column group */
+	protected double[] _reference;
+
+	/**
+	 * Constructor for serialization
+	 * 
+	 * @param numRows Number of rows contained
+	 */
+	protected ColGroupPFOR(int numRows) {
+		super(numRows);
+	}
+
+	private ColGroupPFOR(int[] colIndices, int numRows, ADictionary dict, AOffset indexes, AMapToData data,
+		int[] cachedCounts, double[] reference) {
+		super(colIndices, numRows, dict, cachedCounts);
+		_data = data;
+		_indexes = indexes;
+		_zeros = allZero(reference);
+		_reference = reference;
+	}
+
+	protected static AColGroup create(int[] colIndices, int numRows, ADictionary dict, AOffset indexes, AMapToData data,
+		int[] cachedCounts, double[] reference) {
+		if(dict == null) {
+			// either ColGroupEmpty or const
+			boolean allZero = true;
+			for(double d : reference)
+				if(d != 0) {
+					allZero = false;
+					break;
+				}
+
+			if(allZero)
+				return new ColGroupEmpty(colIndices);
+			else
+				return ColGroupFactory.genColGroupConst(colIndices, reference);
+		}
+		return new ColGroupPFOR(colIndices, numRows, dict, indexes, data, cachedCounts, reference);
+	}
+
+	private final static boolean allZero(double[] in) {
+		for(double v : in)
+			if(v != 0)
+				return false;
+		return true;
+	}
+
+	@Override
+	public CompressionType getCompType() {
+		return CompressionType.PFOR;
+	}
+
+	@Override
+	public ColGroupType getColGroupType() {
+		return ColGroupType.PFOR;
+	}
+
+	@Override
+	public int[] getCounts(int[] counts) {
+		return _data.getCounts(counts, _numRows);
+	}
+
+	@Override
+	protected void computeRowSums(double[] c, int rl, int ru) {
+		// Add reference value sum.
+		final double refSum = refSum();
+		for(int rix = rl; rix < ru; rix++)
+			c[rix] += refSum;
+
+		final double[] vals = _dict.sumAllRowsToDouble(_colIndexes.length);
+		ColGroupSDCZeros.computeRowSums(c, rl, ru, vals, _data, _indexes, _numRows);
+	}
+
+	private final double refSum() {
+		double ret = 0;
+		for(double d : _reference)
+			ret += d;
+		return ret;
+	}
+
+	@Override
+	protected void computeRowSumsSq(double[] c, int rl, int ru) {
+		final double[] vals = _dict.sumAllRowsToDoubleSq(_reference);
+		ColGroupSDC.computeRowSumsSq(c, rl, ru, vals, _data, _indexes, _numRows);
+	}
+
+	@Override
+	protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru) {
+		final double[] vals = _dict.aggregateRows(builtin, _reference);
+		ColGroupSDC.computeRowMxx(c, builtin, rl, ru, vals, _data, _indexes, _numRows, vals[vals.length - 1]);
+	}
+
+	@Override
+	public double getIdx(int r, int colIdx) {
+		final AIterator it = _indexes.getIterator(r);
+		final int nCol = _colIndexes.length;
+		if(it == null || it.value() != r)
+			return _reference[colIdx];
+		final int rowOff = _data.getIndex(it.getDataIndex()) * nCol;
+		return _dict.getValue(rowOff + colIdx) + _reference[colIdx];
+	}
+
+	@Override
+	public AColGroup scalarOperation(ScalarOperator op) {
+		final double[] newRef = new double[_reference.length];
+		for(int i = 0; i < _reference.length; i++)
+			newRef[i] = op.executeScalar(_reference[i]);
+		if(op.fn instanceof Plus || op.fn instanceof Minus) {
+			return create(_colIndexes, _numRows, _dict, _indexes, _data, getCachedCounts(), newRef);
+		}
+		else if(op.fn instanceof Multiply || op.fn instanceof Divide) {
+			final ADictionary newDict = _dict.applyScalarOp(op);
+			return create(_colIndexes, _numRows, newDict, _indexes, _data, getCachedCounts(), newRef);
+		}
+		else {
+			final ADictionary newDict = _dict.applyScalarOp(op, _reference, newRef);
+			return create(_colIndexes, _numRows, newDict, _indexes, _data, getCachedCounts(), newRef);
+		}
+	}
+
+	@Override
+	public AColGroup binaryRowOpLeft(BinaryOperator op, double[] v, boolean isRowSafe) {
+		final double[] newRef = new double[_reference.length];
+		for(int i = 0; i < _reference.length; i++)
+			newRef[i] = op.fn.execute(v[_colIndexes[i]], _reference[i]);
+
+		if(op.fn instanceof Plus || op.fn instanceof Minus)
+			return create(_colIndexes, _numRows, _dict, _indexes, _data, getCachedCounts(), newRef);
+		else if(op.fn instanceof Multiply || op.fn instanceof Divide) {
+			final ADictionary newDict = _dict.binOpLeft(op, v, _colIndexes);
+			return create(_colIndexes, _numRows, newDict, _indexes, _data, getCachedCounts(), newRef);
+		}
+		else {
+			final ADictionary newDict = _dict.binOpLeft(op, v, _colIndexes, _reference, newRef);
+			return create(_colIndexes, _numRows, newDict, _indexes, _data, getCachedCounts(), newRef);
+		}
+	}
+
+	@Override
+	public AColGroup binaryRowOpRight(BinaryOperator op, double[] v, boolean isRowSafe) {
+		final double[] newRef = new double[_reference.length];
+		for(int i = 0; i < _reference.length; i++)
+			newRef[i] = op.fn.execute(_reference[i], v[_colIndexes[i]]);
+		if(op.fn instanceof Plus || op.fn instanceof Minus)
+			return new ColGroupPFOR(_colIndexes, _numRows, _dict, _indexes, _data, getCachedCounts(), newRef);
+		else if(op.fn instanceof Multiply || op.fn instanceof Divide) {
+			final ADictionary newDict = _dict.binOpRight(op, v, _colIndexes);
+			return new ColGroupPFOR(_colIndexes, _numRows, newDict, _indexes, _data, getCachedCounts(), newRef);
+		}
+		else {
+			final ADictionary newDict = _dict.binOpRight(op, v, _colIndexes, _reference, newRef);
+			return new ColGroupPFOR(_colIndexes, _numRows, newDict, _indexes, _data, getCachedCounts(), newRef);
+		}
+	}
+
+	@Override
+	public void write(DataOutput out) throws IOException {
+		super.write(out);
+		_indexes.write(out);
+		_data.write(out);
+		for(double d : _reference)
+			out.writeDouble(d);
+	}
+
+	@Override
+	public void readFields(DataInput in) throws IOException {
+		super.readFields(in);
+		_indexes = OffsetFactory.readIn(in);
+		_data = MapToFactory.readIn(in);
+		_reference = new double[_colIndexes.length];
+		for(int i = 0; i < _colIndexes.length; i++)
+			_reference[i] = in.readDouble();
+	}
+
+	@Override
+	public long getExactSizeOnDisk() {
+		long ret = super.getExactSizeOnDisk();
+		ret += _data.getExactSizeOnDisk();
+		ret += _indexes.getExactSizeOnDisk();
+		ret += 8 * _colIndexes.length; // reference values.
+		return ret;
+	}
+
+	@Override
+	public AColGroup replace(double pattern, double replace) {
+		boolean patternInReference = false;
+		for(double d : _reference)
+			if(pattern == d) {
+				patternInReference = true;
+				break;
+			}
+
+		if(patternInReference) {
+			throw new NotImplementedException("Not Implemented replace where a value in reference should be replaced");
+			// _dict.replace(pattern, replace, _reference, _newReplace);
+		}
+		else {
+			final ADictionary newDict = _dict.replace(pattern, replace, _reference);
+			return create(_colIndexes, _numRows, newDict, _indexes, _data, getCachedCounts(), _reference);
+		}
+
+	}
+
+	@Override
+	public String toString() {
+		StringBuilder sb = new StringBuilder();
+		sb.append(super.toString());
+		sb.append(String.format("\n%15s", "Indexes: "));
+		sb.append(_indexes.toString());
+		sb.append(String.format("\n%15s", "Data: "));
+		sb.append(_data);
+		sb.append(String.format("\n%15s", "Reference:"));
+		sb.append(Arrays.toString(_reference));
+		return sb.toString();
+	}
+
+	@Override
+	protected double computeMxx(double c, Builtin builtin) {
+		return _dict.aggregate(c, builtin, _reference);
+	}
+
+	@Override
+	protected void computeColMxx(double[] c, Builtin builtin) {
+		_dict.aggregateCols(c, builtin, _colIndexes, _reference);
+	}
+
+	@Override
+	protected void computeSum(double[] c, int nRows) {
+		super.computeSum(c, nRows);
+		final double refSum = refSum();
+		c[0] += refSum * nRows;
+	}
+
+	@Override
+	public void computeColSums(double[] c, int nRows) {
+		super.computeColSums(c, nRows);
+		for(int i = 0; i < _colIndexes.length; i++)
+			c[_colIndexes[i]] += _reference[i] * nRows;
+	}
+
+	@Override
+	protected void computeSumSq(double[] c, int nRows) {
+		c[0] += _dict.sumSq(getCounts(), _reference);
+	}
+
+	@Override
+	protected void computeColSumsSq(double[] c, int nRows) {
+		_dict.colSumSq(c, getCounts(), _colIndexes, _reference);
+	}
+
+	@Override
+	protected void computeProduct(double[] c, int nRows) {
+		throw new NotImplementedException("Not Implemented PFOR");
+	}
+
+	@Override
+	protected void computeRowProduct(double[] c, int rl, int ru) {
+		throw new NotImplementedException("Not Implemented PFOR");
+	}
+
+	@Override
+	protected void computeColProduct(double[] c, int nRows) {
+		throw new NotImplementedException("Not Implemented PFOR");
+	}
+
+	@Override
+	protected AColGroup sliceSingleColumn(int idx) {
+		ColGroupPFOR ret = (ColGroupPFOR) super.sliceSingleColumn(idx);
+		// select values from double array.
+		ret._reference = new double[1];
+		ret._reference[0] = _reference[idx];
+		return ret;
+	}
+
+	@Override
+	protected AColGroup sliceMultiColumns(int idStart, int idEnd, int[] outputCols) {
+		ColGroupPFOR ret = (ColGroupPFOR) super.sliceMultiColumns(idStart, idEnd, outputCols);
+		final int len = idEnd - idStart;
+		ret._reference = new double[len];
+		for(int i = 0, ii = idStart; i < len; i++, ii++)
+			ret._reference[i] = _reference[ii];
+
+		return ret;
+	}
+
+	@Override
+	public boolean containsValue(double pattern) {
+		if(pattern == 0 && _zeros)
+			return true;
+		else if(Double.isNaN(pattern) || Double.isInfinite(pattern))
+			return containsInfOrNan(pattern) || _dict.containsValue(pattern);
+		else
+			return _dict.containsValue(pattern, _reference);
+	}
+
+	private boolean containsInfOrNan(double pattern) {
+		if(Double.isNaN(pattern)) {
+			for(double d : _reference)
+				if(Double.isNaN(d))
+					return true;
+			return false;
+		}
+		else {
+			for(double d : _reference)
+				if(Double.isInfinite(d))
+					return true;
+			return false;
+		}
+	}
+
+	@Override
+	public long getNumberNonZeros(int nRows) {
+		int[] counts = getCounts();
+		return (long) _dict.getNumberNonZeros(counts, _reference, nRows);
+	}
+
+	@Override
+	public AColGroup extractCommon(double[] constV) {
+		for(int i = 0; i < _colIndexes.length; i++)
+			constV[_colIndexes[i]] += _reference[i];
+		return ColGroupSDCZeros.create(_colIndexes, _numRows, _dict, _indexes, _data, getCounts());
+	}
+}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupRLE.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupRLE.java
index 3d69b96..4a24a07 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupRLE.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupRLE.java
@@ -24,7 +24,6 @@ import java.util.Arrays;
 import java.util.List;
 
 import org.apache.commons.lang.NotImplementedException;
-import org.apache.sysds.runtime.compress.CompressionSettings;
 import org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary;
 import org.apache.sysds.runtime.data.DenseBlock;
 import org.apache.sysds.runtime.data.SparseBlock;
@@ -65,7 +64,8 @@ public class ColGroupRLE extends AColGroupOffset {
 	}
 
 	@Override
-	protected void decompressToDenseBlockDenseDictionary(DenseBlock target, int rl, int ru, int offR, int offC, double[] values) {
+	protected void decompressToDenseBlockDenseDictionary(DenseBlock target, int rl, int ru, int offR, int offC,
+		double[] values) {
 		throw new NotImplementedException();
 		// final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
 		// final int numCols = getNumCols();
@@ -78,36 +78,36 @@ public class ColGroupRLE extends AColGroupOffset {
 		// double[] c = target.getDenseBlockValues();
 		// // cache conscious append via horizontal scans
 		// for(int bi = rl; bi < ru; bi += blksz) {
-		// 	int bimax = Math.min(bi + blksz, ru);
-		// 	for(int k = 0, off = 0; k < numVals; k++, off += numCols) {
-		// 		int boff = _ptr[k];
-		// 		int blen = len(k);
-		// 		int bix = apos[k];
-		// 		int start = astart[k];
-		// 		for(; bix < blen & start < bimax; bix += 2) {
-		// 			start += _data[boff + bix];
-		// 			int len = _data[boff + bix + 1];
-		// 			for(int i = Math.max(rl, start) - (rl - offT); i < Math.min(start + len, ru) - (rl - offT); i++) {
-
-		// 				int rc = i * target.getNumColumns();
-		// 				for(int j = 0; j < numCols; j++)
-		// 					c[rc + _colIndexes[j]] += values[off + j];
-
-		// 			}
-		// 			start += len;
-		// 		}
-		// 		apos[k] = bix;
-		// 		astart[k] = start;
-		// 	}
+		// int bimax = Math.min(bi + blksz, ru);
+		// for(int k = 0, off = 0; k < numVals; k++, off += numCols) {
+		// int boff = _ptr[k];
+		// int blen = len(k);
+		// int bix = apos[k];
+		// int start = astart[k];
+		// for(; bix < blen & start < bimax; bix += 2) {
+		// start += _data[boff + bix];
+		// int len = _data[boff + bix + 1];
+		// for(int i = Math.max(rl, start) - (rl - offT); i < Math.min(start + len, ru) - (rl - offT); i++) {
+
+		// int rc = i * target.getNumColumns();
+		// for(int j = 0; j < numCols; j++)
+		// c[rc + _colIndexes[j]] += values[off + j];
+
+		// }
+		// start += len;
+		// }
+		// apos[k] = bix;
+		// astart[k] = start;
+		// }
 		// }
 	}
 
 	@Override
-	protected void decompressToDenseBlockSparseDictionary(DenseBlock target, int rl, int ru, int offR, int offC, SparseBlock values) {
+	protected void decompressToDenseBlockSparseDictionary(DenseBlock target, int rl, int ru, int offR, int offC,
+		SparseBlock values) {
 		throw new NotImplementedException();
 	}
 
-
 	@Override
 	protected void decompressToSparseBlockSparseDictionary(SparseBlock ret, int rl, int ru, int offR, int offC,
 		SparseBlock sb) {
@@ -146,7 +146,7 @@ public class ColGroupRLE extends AColGroupOffset {
 		// fast path: sparse-safe operations
 		// Note that bitmaps don't change and are shallow-copied
 		if(op.sparseSafe || val0 == 0 || !_zeros) {
-			return new ColGroupRLE(_colIndexes, _numRows, _zeros, applyScalarOp(op), _data, _ptr, getCachedCounts());
+			return new ColGroupRLE(_colIndexes, _numRows, _zeros, _dict.applyScalarOp(op), _data, _ptr, getCachedCounts());
 		}
 
 		// slow path: sparse-unsafe operations (potentially create new bitmap)
@@ -154,10 +154,10 @@ public class ColGroupRLE extends AColGroupOffset {
 		boolean[] lind = computeZeroIndicatorVector();
 		int[] loff = computeOffsets(lind);
 		if(loff.length == 0) { // empty offset list: go back to fast path
-			return new ColGroupRLE(_colIndexes, _numRows, false, applyScalarOp(op), _data, _ptr, getCachedCounts());
+			return new ColGroupRLE(_colIndexes, _numRows, false, _dict.applyScalarOp(op), _data, _ptr, getCachedCounts());
 		}
 
-		ADictionary rvalues = applyScalarOp(op, val0, getNumCols());
+		ADictionary rvalues = _dict.applyScalarOp(op, val0, getNumCols());
 		char[] lbitmap = genRLEBitmap(loff, loff.length);
 
 		char[] rbitmaps = Arrays.copyOf(_data, _data.length + lbitmap.length);
@@ -217,73 +217,143 @@ public class ColGroupRLE extends AColGroupOffset {
 	// }
 
 	@Override
-	protected void computeRowSums(double[] c, boolean square, int rl, int ru) {
+	protected void computeRowSums(double[] c, int rl, int ru) {
+		throw new NotImplementedException();
+		// final int numVals = getNumValues();
 
-		final int numVals = getNumValues();
+		// if(numVals > 1 && _numRows > CompressionSettings.BITMAP_BLOCK_SZ) {
+		// final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
 
-		if(numVals > 1 && _numRows > CompressionSettings.BITMAP_BLOCK_SZ) {
-			final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
-
-			// step 1: prepare position and value arrays
-
-			// current pos / values per RLE list
-			int[] astart = new int[numVals];
-			int[] apos = skipScan(numVals, rl, astart);
-			double[] aval = _dict.sumAllRowsToDouble(square, _colIndexes.length);
-
-			// step 2: cache conscious matrix-vector via horizontal scans
-			for(int bi = rl; bi < ru; bi += blksz) {
-				int bimax = Math.min(bi + blksz, ru);
-
-				// horizontal segment scan, incl pos maintenance
-				for(int k = 0; k < numVals; k++) {
-					int boff = _ptr[k];
-					int blen = len(k);
-					double val = aval[k];
-					int bix = apos[k];
-					int start = astart[k];
-
-					// compute partial results, not aligned
-					while(bix < blen) {
-						int lstart = _data[boff + bix];
-						int llen = _data[boff + bix + 1];
-						int from = Math.max(bi, start + lstart);
-						int to = Math.min(start + lstart + llen, bimax);
-						for(int rix = from; rix < to; rix++)
-							c[rix] += val;
-
-						if(start + lstart + llen >= bimax)
-							break;
-						start += lstart + llen;
-						bix += 2;
-					}
-
-					apos[k] = bix;
-					astart[k] = start;
-				}
-			}
-		}
-		else {
-			for(int k = 0; k < numVals; k++) {
-				int boff = _ptr[k];
-				int blen = len(k);
-				double val = _dict.sumRow(k, square, _colIndexes.length);
-
-				if(val != 0.0) {
-					Pair<Integer, Integer> tmp = skipScanVal(k, rl);
-					int bix = tmp.getKey();
-					int curRunStartOff = tmp.getValue();
-					int curRunEnd = tmp.getValue();
-					for(; bix < blen && curRunEnd < ru; bix += 2) {
-						curRunStartOff = curRunEnd + _data[boff + bix];
-						curRunEnd = curRunStartOff + _data[boff + bix + 1];
-						for(int rix = curRunStartOff; rix < curRunEnd && rix < ru; rix++)
-							c[rix] += val;
-
-					}
-				}
-			}
-		}
+		// // step 1: prepare position and value arrays
+
+		// // current pos / values per RLE list
+		// int[] astart = new int[numVals];
+		// int[] apos = skipScan(numVals, rl, astart);
+		// double[] aval = _dict.sumAllRowsToDouble(square, _colIndexes.length);
+
+		// // step 2: cache conscious matrix-vector via horizontal scans
+		// for(int bi = rl; bi < ru; bi += blksz) {
+		// int bimax = Math.min(bi + blksz, ru);
+
+		// // horizontal segment scan, incl pos maintenance
+		// for(int k = 0; k < numVals; k++) {
+		// int boff = _ptr[k];
+		// int blen = len(k);
+		// double val = aval[k];
+		// int bix = apos[k];
+		// int start = astart[k];
+
+		// // compute partial results, not aligned
+		// while(bix < blen) {
+		// int lstart = _data[boff + bix];
+		// int llen = _data[boff + bix + 1];
+		// int from = Math.max(bi, start + lstart);
+		// int to = Math.min(start + lstart + llen, bimax);
+		// for(int rix = from; rix < to; rix++)
+		// c[rix] += val;
+
+		// if(start + lstart + llen >= bimax)
+		// break;
+		// start += lstart + llen;
+		// bix += 2;
+		// }
+
+		// apos[k] = bix;
+		// astart[k] = start;
+		// }
+		// }
+		// }
+		// else {
+		// for(int k = 0; k < numVals; k++) {
+		// int boff = _ptr[k];
+		// int blen = len(k);
+		// double val = _dict.sumRow(k, square, _colIndexes.length);
+
+		// if(val != 0.0) {
+		// Pair<Integer, Integer> tmp = skipScanVal(k, rl);
+		// int bix = tmp.getKey();
+		// int curRunStartOff = tmp.getValue();
+		// int curRunEnd = tmp.getValue();
+		// for(; bix < blen && curRunEnd < ru; bix += 2) {
+		// curRunStartOff = curRunEnd + _data[boff + bix];
+		// curRunEnd = curRunStartOff + _data[boff + bix + 1];
+		// for(int rix = curRunStartOff; rix < curRunEnd && rix < ru; rix++)
+		// c[rix] += val;
+
+		// }
+		// }
+		// }
+		// }
+	}
+
+	@Override
+	protected void computeRowSumsSq(double[] c, int rl, int ru) {
+		throw new NotImplementedException();
+		// final int numVals = getNumValues();
+
+		// if(numVals > 1 && _numRows > CompressionSettings.BITMAP_BLOCK_SZ) {
+		// final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
+
+		// // step 1: prepare position and value arrays
+
+		// // current pos / values per RLE list
+		// int[] astart = new int[numVals];
+		// int[] apos = skipScan(numVals, rl, astart);
+		// double[] aval = _dict.sumAllRowsToDouble(square, _colIndexes.length);
+
+		// // step 2: cache conscious matrix-vector via horizontal scans
+		// for(int bi = rl; bi < ru; bi += blksz) {
+		// int bimax = Math.min(bi + blksz, ru);
+
+		// // horizontal segment scan, incl pos maintenance
+		// for(int k = 0; k < numVals; k++) {
+		// int boff = _ptr[k];
+		// int blen = len(k);
+		// double val = aval[k];
+		// int bix = apos[k];
+		// int start = astart[k];
+
+		// // compute partial results, not aligned
+		// while(bix < blen) {
+		// int lstart = _data[boff + bix];
+		// int llen = _data[boff + bix + 1];
+		// int from = Math.max(bi, start + lstart);
+		// int to = Math.min(start + lstart + llen, bimax);
+		// for(int rix = from; rix < to; rix++)
+		// c[rix] += val;
+
+		// if(start + lstart + llen >= bimax)
+		// break;
+		// start += lstart + llen;
+		// bix += 2;
+		// }
+
+		// apos[k] = bix;
+		// astart[k] = start;
+		// }
+		// }
+		// }
+		// else {
+		// for(int k = 0; k < numVals; k++) {
+		// int boff = _ptr[k];
+		// int blen = len(k);
+		// double val = _dict.sumRow(k, square, _colIndexes.length);
+
+		// if(val != 0.0) {
+		// Pair<Integer, Integer> tmp = skipScanVal(k, rl);
+		// int bix = tmp.getKey();
+		// int curRunStartOff = tmp.getValue();
+		// int curRunEnd = tmp.getValue();
+		// for(; bix < blen && curRunEnd < ru; bix += 2) {
+		// curRunStartOff = curRunEnd + _data[boff + bix];
+		// curRunEnd = curRunStartOff + _data[boff + bix + 1];
+		// for(int rix = curRunStartOff; rix < curRunEnd && rix < ru; rix++)
+		// c[rix] += val;
+
+		// }
+		// }
+		// }
+		// }
 	}
 
 	@Override
@@ -395,7 +465,7 @@ public class ColGroupRLE extends AColGroupOffset {
 	 * @return array of positions for all values
 	 */
 	private int[] skipScan(int numVals, int rl, int[] astart) {
-		int[] apos = allocIVector(numVals, rl == 0);
+		int[] apos = new int[numVals];
 
 		if(rl > 0) { // rl aligned with blksz
 			for(int k = 0; k < numVals; k++) {
@@ -461,7 +531,7 @@ public class ColGroupRLE extends AColGroupOffset {
 	public String toString() {
 		StringBuilder sb = new StringBuilder();
 		sb.append(super.toString());
-		sb.append(String.format("\n%15s%5d ", "Data:", this._data.length));
+		sb.append(String.format("\n%15s%5d", "Data:", this._data.length));
 		sb.append("{");
 		sb.append(((int) _data[0]) + "-" + ((int) _data[1]));
 		for(int i = 2; i < _data.length; i += 2) {
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java
index fc011e0..fd94d0a 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java
@@ -23,18 +23,13 @@ import java.io.DataInput;
 import java.io.DataOutput;
 import java.io.IOException;
 
-import org.apache.commons.lang.NotImplementedException;
-import org.apache.sysds.runtime.compress.DMLCompressionException;
 import org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary;
 import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
 import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory;
 import org.apache.sysds.runtime.compress.colgroup.offset.AIterator;
 import org.apache.sysds.runtime.compress.colgroup.offset.AOffset;
 import org.apache.sysds.runtime.compress.colgroup.offset.OffsetFactory;
-import org.apache.sysds.runtime.data.DenseBlock;
-import org.apache.sysds.runtime.data.SparseBlock;
 import org.apache.sysds.runtime.functionobjects.Builtin;
-import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
 import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
 
@@ -46,15 +41,12 @@ import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
  * This column group is handy in cases where sparse unsafe operations is executed on very sparse columns. Then the zeros
  * would be materialized in the group without any overhead.
  */
-public class ColGroupSDC extends AColGroupValue {
+public class ColGroupSDC extends AMorphingMMColGroup {
 	private static final long serialVersionUID = 769993538831949086L;
-	/**
-	 * Sparse row indexes for the data
-	 */
+	
+	/** Sparse row indexes for the data */
 	protected transient AOffset _indexes;
-	/**
-	 * Pointers to row indexes in the dictionary. Note the dictionary has one extra entry.
-	 */
+	/** Pointers to row indexes in the dictionary. Note the dictionary has one extra entry. */
 	protected transient AMapToData _data;
 
 	/**
@@ -66,7 +58,7 @@ public class ColGroupSDC extends AColGroupValue {
 		super(numRows);
 	}
 
-	protected ColGroupSDC(int[] colIndices, int numRows, ADictionary dict, AOffset offsets, AMapToData data,
+	private ColGroupSDC(int[] colIndices, int numRows, ADictionary dict, AOffset offsets, AMapToData data,
 		int[] cachedCounts) {
 		super(colIndices, numRows, dict, cachedCounts);
 		_indexes = offsets;
@@ -74,6 +66,14 @@ public class ColGroupSDC extends AColGroupValue {
 		_zeros = false;
 	}
 
+	protected static AColGroup create(int[] colIndices, int numRows, ADictionary dict, AOffset offsets, AMapToData data,
+		int[] cachedCounts) {
+		if(dict == null)
+			return new ColGroupEmpty(colIndices);
+		else
+			return new ColGroupSDC(colIndices, numRows, dict, offsets, data, cachedCounts);
+	}
+
 	@Override
 	public CompressionType getCompType() {
 		return CompressionType.SDC;
@@ -85,183 +85,153 @@ public class ColGroupSDC extends AColGroupValue {
 	}
 
 	@Override
-	protected void decompressToDenseBlockDenseDictionary(DenseBlock db, int rl, int ru, int offR, int offC,
-		double[] values) {
+	public double getIdx(int r, int colIdx) {
+		final AIterator it = _indexes.getIterator(r);
+		final int rowOff = it == null || it.value() != r ? getNumValues() - 1 : _data.getIndex(it.getDataIndex());
 		final int nCol = _colIndexes.length;
-		final int offsetToDefault = values.length - nCol;
-		final AIterator it = _indexes.getIterator(rl);
-
-		int offT = rl + offR;
-		int i = rl;
-		for(; i < ru && it.hasNext(); i++, offT++) {
-			final double[] c = db.values(offT);
-			final int off = db.pos(offT) + offC;
-			if(it.value() == i) {
-				int offset = _data.getIndex(it.getDataIndexAndIncrement()) * nCol;
-				for(int j = 0; j < nCol; j++)
-					c[off + _colIndexes[j]] += values[offset + j];
-			}
-			else
-				for(int j = 0; j < nCol; j++)
-					c[off + _colIndexes[j]] += values[offsetToDefault + j];
-		}
-
-		for(; i < ru; i++, offT++) {
-			final double[] c = db.values(offT);
-			final int off = db.pos(offT) + offC;
-			for(int j = 0; j < nCol; j++)
-				c[off + _colIndexes[j]] += values[offsetToDefault + j];
-		}
-
-		_indexes.cacheIterator(it, ru);
-	}
-
-	@Override
-	protected void decompressToDenseBlockSparseDictionary(DenseBlock db, int rl, int ru, int offR, int offC,
-		SparseBlock sb) {
-		throw new NotImplementedException();
-		// final int offsetToDefault = sb.numRows() - 1;
-		// final int defApos = sb.pos(offsetToDefault);
-		// final int defAlen = sb.size(offsetToDefault) + defApos;
-		// final double[] defAvals = sb.values(offsetToDefault);
-		// final int[] defAix = sb.indexes(offsetToDefault);
-		// final DenseBlock db = target.getDenseBlock();
-
-		// int i = rl;
-		// AIterator it = _indexes.getIterator(rl);
-		// for(; i < ru && it.hasNext(); i++, offT++) {
-		// final double[] c = db.values(offT);
-		// final int off = db.pos(offT);
-		// if(it.value() == i) {
-		// int dictIndex = _data.getIndex(it.getDataIndexAndIncrement());
-		// if(sb.isEmpty(dictIndex))
-		// continue;
-		// final int apos = sb.pos(dictIndex);
-		// final int alen = sb.size(dictIndex) + apos;
-		// final double[] avals = sb.values(dictIndex);
-		// final int[] aix = sb.indexes(dictIndex);
-		// for(int j = apos; j < alen; j++)
-		// c[off + _colIndexes[aix[j]]] += avals[j];
-		// }
-		// else
-		// for(int j = defApos; j < defAlen; j++)
-		// c[off + _colIndexes[defAix[j]]] += defAvals[j];
-		// }
-
-		// for(; i < ru; i++, offT++) {
-		// final double[] c = db.values(offT);
-		// final int off = db.pos(offT);
-		// for(int j = defApos; j < defAlen; j++)
-		// c[off + _colIndexes[defAix[j]]] += defAvals[j];
-		// }
-
-		// _indexes.cacheIterator(it, ru);
+		return _dict.getValue(rowOff * nCol + colIdx);
 	}
 
 	@Override
-	protected void decompressToSparseBlockSparseDictionary(SparseBlock ret, int rl, int ru, int offR, int offC,
-		SparseBlock sb) {
-		throw new NotImplementedException();
-	}
+	protected void computeRowSums(double[] c, int rl, int ru) {
 
-	@Override
-	protected void decompressToSparseBlockDenseDictionary(SparseBlock ret, int rl, int ru, int offR, int offC,
-		double[] values) {
-		final int nCol = _colIndexes.length;
-		final int offsetToDefault = values.length - nCol;
 		final AIterator it = _indexes.getIterator(rl);
-
-		int offT = rl + offR;
-		int i = rl;
-		for(; i < ru && it.hasNext(); i++, offT++) {
-			// final double[] c = db.values(offT);
-			// final int off = db.pos(offT) + offC;
-			if(it.value() == i) {
-				int offset = _data.getIndex(it.getDataIndexAndIncrement()) * nCol;
-				for(int j = 0; j < nCol; j++)
-					ret.append(offT, _colIndexes[j] + offC, values[offset + j]);
-				// c[off + _colIndexes[j]] += values[offset + j];
+		final int numVals = getNumValues();
+		int r = rl;
+		final double[] vals = _dict.sumAllRowsToDouble(_colIndexes.length);
+		final double def = vals[numVals - 1];
+		if(it != null && it.value() > ru)
+			_indexes.cacheIterator(it, ru);
+		else if(it != null && ru >= _indexes.getOffsetToLast()) {
+			final int maxId = _data.size() - 1;
+			while(true) {
+				if(it.value() == r) {
+					c[r] += vals[_data.getIndex(it.getDataIndex())];
+					if(it.getDataIndex() < maxId)
+						it.next();
+					else {
+						r++;
+						break;
+					}
+				}
+				else
+					c[r] += def;
+				r++;
 			}
-			else
-				for(int j = 0; j < nCol; j++)
-					ret.append(offT, _colIndexes[j] + offC, values[offsetToDefault + j]);
-			// c[off + _colIndexes[j]] += values[offsetToDefault + j];
 		}
-
-		for(; i < ru; i++, offT++) {
-			// final double[] c = db.values(offT);
-			// final int off = db.pos(offT) + offC;
-			for(int j = 0; j < nCol; j++)
-				ret.append(offT, _colIndexes[j] + offC, values[offsetToDefault + j]);
-			// c[off + _colIndexes[j]] += values[offsetToDefault + j];
+		else if(it != null) {
+			while(it.isNotOver(ru)) {
+				if(it.value() == r)
+					c[r] += vals[_data.getIndex(it.getDataIndexAndIncrement())];
+				else
+					c[r] += def;
+				r++;
+			}
+			_indexes.cacheIterator(it, ru);
 		}
 
-		_indexes.cacheIterator(it, ru);
+		while(r < ru) {
+			c[r] += def;
+			r++;
+		}
 	}
 
 	@Override
-	public double getIdx(int r, int colIdx) {
-		final AIterator it = _indexes.getIterator(r);
-		final int nCol = _colIndexes.length;
-		final int rowOff = it.value() == r ? getIndex(it.getDataIndex()) * nCol : getNumValues() * nCol - nCol;
-		return _dict.getValue(rowOff + colIdx);
+	protected void computeRowSumsSq(double[] c, int rl, int ru) {
+		final double[] vals = _dict.sumAllRowsToDoubleSq(_colIndexes.length);
+		computeRowSumsSq(c, rl, ru, vals, _data, _indexes, _numRows);
 	}
 
-	@Override
-	protected void computeRowSums(double[] c, boolean square, int rl, int ru) {
-		final int numVals = getNumValues();
-		// // pre-aggregate nnz per value tuple
-		double[] vals = _dict.sumAllRowsToDouble(square, _colIndexes.length);
-
-		int rix = rl;
-		AIterator it = _indexes.getIterator(rl);
-		for(; rix < ru && it.hasNext(); rix++) {
-			if(it.value() != rix)
-				c[rix] += vals[numVals - 1];
-			else {
-				c[rix] += vals[_data.getIndex(it.getDataIndexAndIncrement())];
+	protected static final void computeRowSumsSq(double[] c, int rl, int ru, double[] vals, AMapToData data,
+		AOffset indexes, int nRows) {
+		int r = rl;
+		final AIterator it = indexes.getIterator(rl);
+		final double def = vals[vals.length - 1];
+		if(it != null && it.value() > ru)
+			indexes.cacheIterator(it, ru);
+		else if(it != null && ru >= indexes.getOffsetToLast()) {
+			final int maxId = data.size() - 1;
+			while(true) {
+				if(it.value() == r) {
+					c[r] += vals[data.getIndex(it.getDataIndex())];
+					if(it.getDataIndex() < maxId)
+						it.next();
+					else {
+						r++;
+						break;
+					}
+				}
+				else
+					c[r] += def;
+				r++;
 			}
 		}
-		for(; rix < ru; rix++) {
-			c[rix] += vals[numVals - 1];
+		else if(it != null) {
+			while(r < ru) {
+				if(it.value() == r)
+					c[r] += vals[data.getIndex(it.getDataIndexAndIncrement())];
+				else
+					c[r] += def;
+				r++;
+			}
+			indexes.cacheIterator(it, ru);
 		}
 
+		while(r < ru) {
+			c[r] += def;
+			r++;
+		}
 	}
 
 	@Override
 	protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru) {
-		final int numVals = getNumValues();
-		final double[] vals = _dict.aggregateTuples(builtin, _colIndexes.length);
-		final AIterator it = _indexes.getIterator(rl);
-		int rix = rl;
+		final double[] vals = _dict.aggregateRows(builtin, _colIndexes.length);
+		computeRowMxx(c, builtin, rl, ru, vals, _data, _indexes, _numRows, vals[vals.length - 1]);
+	}
 
-		for(; rix < ru && it.hasNext(); rix++) {
-			if(it.value() != rix)
-				c[rix] = builtin.execute(c[rix], vals[numVals - 1]);
-			else
-				c[rix] = builtin.execute(c[rix], vals[_data.getIndex(it.getDataIndexAndIncrement())]);
+	protected static final void computeRowMxx(double[] c, Builtin builtin, int rl, int ru, double[] vals,
+		AMapToData data, AOffset indexes, int nRows, double def) {
+		int r = rl;
+		final AIterator it = indexes.getIterator(rl);
+		if(it != null && it.value() > ru)
+			indexes.cacheIterator(it, ru);
+		else if(it != null && ru >= indexes.getOffsetToLast()) {
+			final int maxId = data.size() - 1;
+			while(true) {
+				if(it.value() == r) {
+					c[r] = builtin.execute(c[r], vals[data.getIndex(it.getDataIndex())]);
+					if(it.getDataIndex() < maxId)
+						it.next();
+					else {
+						r++;
+						break;
+					}
+				}
+				else
+					c[r] = builtin.execute(c[r], def);
+				r++;
+			}
+		}
+		else if(it != null) {
+			while(r < ru) {
+				if(it.value() == r)
+					c[r] = builtin.execute(c[r], vals[data.getIndex(it.getDataIndexAndIncrement())]);
+				else
+					c[r] = builtin.execute(c[r], def);
+				r++;
+			}
+			indexes.cacheIterator(it, ru);
 		}
 
-		// cover remaining rows with default value
-		for(; rix < ru; rix++)
-			c[rix] = builtin.execute(c[rix], vals[numVals - 1]);
+		while(r < ru) {
+			c[r] = builtin.execute(c[r], def);
+			r++;
+		}
 	}
 
 	@Override
 	public int[] getCounts(int[] counts) {
-		final int nonDefaultLength = _data.size();
-		// final AIterator it = _indexes.getIterator();
-		final int defaults = _numRows - nonDefaultLength;
-		for(int i = 0; i < nonDefaultLength; i++)
-			counts[_data.getIndex(i)]++;
-
-		counts[counts.length - 1] += defaults;
-
-		return counts;
-	}
-
-	public int getIndex(int r) {
-		return _data.getIndex(r);
+		return _data.getCounts(counts, _numRows);
 	}
 
 	@Override
@@ -274,19 +244,19 @@ public class ColGroupSDC extends AColGroupValue {
 
 	@Override
 	public AColGroup scalarOperation(ScalarOperator op) {
-		return new ColGroupSDC(_colIndexes, _numRows, applyScalarOp(op), _indexes, _data, getCachedCounts());
+		return create(_colIndexes, _numRows, _dict.applyScalarOp(op), _indexes, _data, getCachedCounts());
 	}
 
 	@Override
 	public AColGroup binaryRowOpLeft(BinaryOperator op, double[] v, boolean isRowSafe) {
 		ADictionary ret = _dict.binOpLeft(op, v, _colIndexes);
-		return new ColGroupSDC(_colIndexes, _numRows, ret, _indexes, _data, getCachedCounts());
+		return create(_colIndexes, _numRows, ret, _indexes, _data, getCachedCounts());
 	}
 
 	@Override
 	public AColGroup binaryRowOpRight(BinaryOperator op, double[] v, boolean isRowSafe) {
 		ADictionary ret = _dict.binOpRight(op, v, _colIndexes);
-		return new ColGroupSDC(_colIndexes, _numRows, ret, _indexes, _data, getCachedCounts());
+		return create(_colIndexes, _numRows, ret, _indexes, _data, getCachedCounts());
 	}
 
 	@Override
@@ -311,49 +281,26 @@ public class ColGroupSDC extends AColGroupValue {
 		return ret;
 	}
 
-	public ColGroupSDCZeros extractCommon(double[] constV) {
+	@Override
+	public AColGroup extractCommon(double[] constV) {
 		double[] commonV = _dict.getTuple(getNumValues() - 1, _colIndexes.length);
 		if(commonV == null) // The common tuple was all zero. Therefore this column group should never have been SDC.
-			return new ColGroupSDCZeros(_colIndexes, _numRows, _dict, _indexes, _data, getCounts());
+			return ColGroupSDCZeros.create(_colIndexes, _numRows, _dict, _indexes, _data, getCounts());
 
 		for(int i = 0; i < _colIndexes.length; i++)
 			constV[_colIndexes[i]] += commonV[i];
 
 		ADictionary subtractedDict = _dict.subtractTuple(commonV);
-		return new ColGroupSDCZeros(_colIndexes, _numRows, subtractedDict, _indexes, _data, getCounts());
-	}
-
-	@Override
-	public void leftMultByMatrix(MatrixBlock matrix, MatrixBlock result, int rl, int ru) {
-		// This method should not be called since if there is a matrix multiplication
-		// the default value is transformed to be zero, and this column group would be allocated as a
-		// SDC Zeros version
-		throw new DMLCompressionException("This method should never be called");
-	}
-
-	@Override
-	public void leftMultByAColGroup(AColGroup lhs, MatrixBlock result) {
-		// This method should not be called since if there is a matrix multiplication
-		// the default value is transformed to be zero, and this column group would be allocated as a
-		// SDC Zeros version
-		throw new DMLCompressionException("This method should never be called");
-	}
-
-	@Override
-	public void tsmmAColGroup(AColGroup other, MatrixBlock result) {
-		// This method should not be called since if there is a matrix multiplication
-		// the default value is transformed to be zero, and this column group would be allocated as a
-		// SDC Zeros version
-		throw new DMLCompressionException("This method should never be called");
+		return ColGroupSDCZeros.create(_colIndexes, _numRows, subtractedDict, _indexes, _data, getCounts());
 	}
 
 	@Override
 	public String toString() {
 		StringBuilder sb = new StringBuilder();
 		sb.append(super.toString());
-		sb.append(String.format("\n%15s ", "Indexes: "));
+		sb.append(String.format("\n%15s", "Indexes: "));
 		sb.append(_indexes.toString());
-		sb.append(String.format("\n%15s ", "Data: "));
+		sb.append(String.format("\n%15s", "Data: "));
 		sb.append(_data.toString());
 		return sb.toString();
 	}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java
index cb123ec..a41198d 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java
@@ -23,16 +23,11 @@ import java.io.DataInput;
 import java.io.DataOutput;
 import java.io.IOException;
 
-import org.apache.commons.lang.NotImplementedException;
-import org.apache.sysds.runtime.compress.DMLCompressionException;
 import org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary;
 import org.apache.sysds.runtime.compress.colgroup.offset.AIterator;
 import org.apache.sysds.runtime.compress.colgroup.offset.AOffset;
 import org.apache.sysds.runtime.compress.colgroup.offset.OffsetFactory;
-import org.apache.sysds.runtime.data.DenseBlock;
-import org.apache.sysds.runtime.data.SparseBlock;
 import org.apache.sysds.runtime.functionobjects.Builtin;
-import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
 import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
 
@@ -44,11 +39,9 @@ import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
  * This column group is handy in cases where sparse unsafe operations is executed on very sparse columns. Then the zeros
  * would be materialized in the group without any overhead.
  */
-public class ColGroupSDCSingle extends AColGroupValue {
+public class ColGroupSDCSingle extends AMorphingMMColGroup {
 	private static final long serialVersionUID = 3883228464052204200L;
-	/**
-	 * Sparse row indexes for the data
-	 */
+	/** Sparse row indexes for the data */
 	protected transient AOffset _indexes;
 
 	/**
@@ -77,125 +70,112 @@ public class ColGroupSDCSingle extends AColGroupValue {
 	}
 
 	@Override
-	protected void decompressToDenseBlockDenseDictionary(DenseBlock db, int rl, int ru, int offR, int offC,
-		double[] values) {
-		final int nCol = _colIndexes.length;
-		final int offsetToDefault = values.length - nCol;
-		final AIterator it = _indexes.getIterator(rl);
-
-		int offT = rl + offR;
-		int i = rl;
-		for(; i < ru && it.hasNext(); i++, offT++) {
-			final double[] c = db.values(offT);
-			final int off = db.pos(offT) + offC;
-			if(it.value() == i) {
-				for(int j = 0; j < nCol; j++)
-					c[off + _colIndexes[j]] += values[j];
-				it.next();
-			}
-			else
-				for(int j = 0; j < nCol; j++)
-					c[off + _colIndexes[j]] += values[offsetToDefault + j];
-		}
-
-		for(; i < ru; i++, offT++) {
-			final double[] c = db.values(offT);
-			final int off = db.pos(offT) + offC;
-			for(int j = 0; j < nCol; j++)
-				c[off + _colIndexes[j]] += values[offsetToDefault + j];
-		}
-
-		_indexes.cacheIterator(it, ru);
+	public double getIdx(int r, int colIdx) {
+		final AIterator it = _indexes.getIterator(r);
+		if(it == null || it.value() != r)
+			return _dict.getValue(_colIndexes.length + colIdx);
+		return _dict.getValue(colIdx);
 	}
 
 	@Override
-	protected void decompressToDenseBlockSparseDictionary(DenseBlock db, int rl, int ru, int offR, int offC,
-		SparseBlock values) {
-		throw new NotImplementedException();
+	protected void computeRowSums(double[] c, int rl, int ru) {
+		final double[] vals = _dict.sumAllRowsToDouble(_colIndexes.length);
+		computeRowSums(c, rl, ru, vals);
 	}
 
 	@Override
-	protected void decompressToSparseBlockSparseDictionary(SparseBlock ret, int rl, int ru, int offR, int offC,
-		SparseBlock sb) {
-		throw new NotImplementedException();
+	protected void computeRowSumsSq(double[] c, int rl, int ru) {
+		final double[] vals = _dict.sumAllRowsToDoubleSq(_colIndexes.length);
+		computeRowSums(c, rl, ru, vals);
 	}
 
-	@Override
-	protected void decompressToSparseBlockDenseDictionary(SparseBlock ret, int rl, int ru, int offR, int offC,
-		double[] values) {
-		final int nCol = _colIndexes.length;
-		final int offsetToDefault = values.length - nCol;
+	protected void computeRowSums(double[] c, int rl, int ru, double[] vals) {
+		int r = rl;
 		final AIterator it = _indexes.getIterator(rl);
-
-		int offT = rl + offR;
-		int i = rl;
-		for(; i < ru && it.hasNext(); i++, offT++) {
-			if(it.value() == i) {
-				for(int j = 0; j < nCol; j++)
-					ret.append(offT, _colIndexes[j] + offC, values[j]);
-				it.next();
+		final double def = vals[1];
+		final double norm = vals[0];
+		if(it != null && it.value() > ru)
+			_indexes.cacheIterator(it, ru);
+		else if(it != null && ru >= _indexes.getOffsetToLast()) {
+			final int maxOff = _indexes.getOffsetToLast();
+			while(true) {
+				if(it.value() == r) {
+					c[r] += norm;
+					if(it.value() < maxOff)
+						it.next();
+					else {
+						r++;
+						break;
+					}
+				}
+				else
+					c[r] += def;
+				r++;
 			}
-			else
-				for(int j = 0; j < nCol; j++)
-					ret.append(offT, _colIndexes[j] + offC, values[offsetToDefault + j]);
 		}
-
-		for(; i < ru; i++, offT++)
-			for(int j = 0; j < nCol; j++)
-				ret.append(offT, _colIndexes[j] + offC, values[offsetToDefault + j]);
-
-		_indexes.cacheIterator(it, ru);
-	}
-
-	@Override
-	public double getIdx(int r, int colIdx) {
-		AIterator it = _indexes.getIterator(r);
-		if(it.value() == r)
-			return _dict.getValue(colIdx);
-		else
-			return _dict.getValue(_colIndexes.length + colIdx);
-	}
-
-	@Override
-	protected void computeRowSums(double[] c, boolean square, int rl, int ru) {
-
-		// // pre-aggregate nnz per value tuple
-		final double[] vals = _dict.sumAllRowsToDouble(square, _colIndexes.length);
-		final AIterator it = _indexes.getIterator();
-
-		int rix = rl;
-		it.skipTo(rl);
-		for(; rix < ru && it.hasNext(); rix++) {
-			if(it.value() != rix)
-				c[rix] += vals[1];
-			else {
-				c[rix] += vals[0];
-				it.next();
+		else if(it != null) {
+			while(r < ru) {
+				if(it.value() == r)
+					c[r] += norm;
+				else
+					c[r] += def;
+				r++;
 			}
+			_indexes.cacheIterator(it, ru);
 		}
-		for(; rix < ru; rix++) {
-			c[rix] += vals[1];
+
+		while(r < ru) {
+			c[r] += def;
+			r++;
 		}
 	}
 
 	@Override
 	protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru) {
-		final double[] vals = _dict.aggregateTuples(builtin, _colIndexes.length);
-		final AIterator it = _indexes.getIterator(rl);
-		int rix = rl;
-
-		for(; rix < ru && it.hasNext(); rix++) {
-			if(it.value() != rix)
-				c[rix] = builtin.execute(c[rix], vals[1]);
-			else {
-				c[rix] = builtin.execute(c[rix], vals[0]);
-				it.next();
+		final double[] vals = _dict.aggregateRows(builtin, _colIndexes.length);
+		computeRowMxx(c, builtin, rl, ru, _indexes, _numRows, vals[1], vals[0]);
+	}
+
+	protected static final void computeRowMxx(double[] c, Builtin builtin, int rl, int ru, AOffset indexes, int nRows,
+		double def, double norm) {
+		int r = rl;
+		final AIterator it = indexes.getIterator(rl);
+		if(it != null && it.value() > ru)
+			indexes.cacheIterator(it, ru);
+		else if(it != null && ru >= indexes.getOffsetToLast()) {
+			final int maxOff = indexes.getOffsetToLast();
+			while(true) {
+				if(it.value() == r) {
+					c[r] = builtin.execute(c[r], norm);
+					if(it.value() < maxOff)
+						it.next();
+					else {
+						r++;
+						break;
+					}
+				}
+				else
+					c[r] = builtin.execute(c[r], def);
+				r++;
 			}
 		}
+		else if(it != null) {
+			while(r < ru) {
+				if(it.value() == r) {
+					c[r] = builtin.execute(c[r], norm);
+					it.next();
+				}
+				else
+					c[r] = builtin.execute(c[r], def);
+				r++;
+			}
+			indexes.cacheIterator(it, ru);
+		}
 
-		// cover remaining rows with default value
-		for(; rix < ru; rix++)
-			c[rix] = builtin.execute(c[rix], vals[1]);
+		while(r < ru) {
+			c[r] = builtin.execute(c[r], def);
+			r++;
+		}
 	}
 
 	@Override
@@ -214,7 +194,7 @@ public class ColGroupSDCSingle extends AColGroupValue {
 
 	@Override
 	public AColGroup scalarOperation(ScalarOperator op) {
-		return new ColGroupSDCSingle(_colIndexes, _numRows, applyScalarOp(op), _indexes, getCachedCounts());
+		return new ColGroupSDCSingle(_colIndexes, _numRows, _dict.applyScalarOp(op), _indexes, getCachedCounts());
 	}
 
 	@Override
@@ -248,6 +228,7 @@ public class ColGroupSDCSingle extends AColGroupValue {
 		return ret;
 	}
 
+	@Override
 	public ColGroupSDCSingleZeros extractCommon(double[] constV) {
 		double[] commonV = _dict.getTuple(getNumValues() - 1, _colIndexes.length);
 
@@ -262,34 +243,10 @@ public class ColGroupSDCSingle extends AColGroupValue {
 	}
 
 	@Override
-	public void leftMultByMatrix(MatrixBlock matrix, MatrixBlock result, int rl, int ru) {
-		// This method should not be called since if there is a matrix multiplication
-		// the default value is transformed to be zero, and this column group would be allocated as a
-		// SDC Zeros version
-		throw new DMLCompressionException("This method should never be called");
-	}
-
-	@Override
-	public void leftMultByAColGroup(AColGroup lhs, MatrixBlock result) {
-		// This method should not be called since if there is a matrix multiplication
-		// the default value is transformed to be zero, and this column group would be allocated as a
-		// SDC Zeros version
-		throw new DMLCompressionException("This method should never be called");
-	}
-
-	@Override
-	public void tsmmAColGroup(AColGroup other, MatrixBlock result) {
-		// This method should not be called since if there is a matrix multiplication
-		// the default value is transformed to be zero, and this column group would be allocated as a
-		// SDC Zeros version
-		throw new DMLCompressionException("This method should never be called");
-	}
-
-	@Override
 	public String toString() {
 		StringBuilder sb = new StringBuilder();
 		sb.append(super.toString());
-		sb.append(String.format("\n%15s ", "Indexes: "));
+		sb.append(String.format("\n%15s", "Indexes: "));
 		sb.append(_indexes.toString());
 		return sb.toString();
 	}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingleZeros.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingleZeros.java
index d8edd0d..ca2415c 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingleZeros.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingleZeros.java
@@ -46,9 +46,8 @@ import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
  */
 public class ColGroupSDCSingleZeros extends APreAgg {
 	private static final long serialVersionUID = 8033235615964315078L;
-	/**
-	 * Sparse row indexes for the data
-	 */
+	
+	/** Sparse row indexes for the data */
 	protected transient AOffset _indexes;
 
 	/**
@@ -80,104 +79,179 @@ public class ColGroupSDCSingleZeros extends APreAgg {
 	@Override
 	protected void decompressToDenseBlockDenseDictionary(DenseBlock db, int rl, int ru, int offR, int offC,
 		double[] values) {
-		final int nCol = _colIndexes.length;
+
 		final AIterator it = _indexes.getIterator(rl);
-		while(it.hasNext() && it.value() < ru) {
-			final int row = offR + it.value();
-			final double[] c = db.values(row);
-			final int off = db.pos(row) + offC;
-			for(int j = 0; j < nCol; j++)
-				c[off + _colIndexes[j]] += values[j];
-
-			it.next();
+		if(it == null)
+			return;
+		else if(it.value() >= ru)
+			_indexes.cacheIterator(it, ru);
+		else if(ru > _indexes.getOffsetToLast()) {
+			final int maxOff = _indexes.getOffsetToLast();
+			final int nCol = _colIndexes.length;
+			while(true) {
+				final int row = offR + it.value();
+				final double[] c = db.values(row);
+				final int off = db.pos(row);
+				for(int j = 0; j < nCol; j++)
+					c[off + _colIndexes[j] + offC] += values[j];
+				if(it.value() < maxOff)
+					it.next();
+				else
+					break;
+			}
 		}
-		_indexes.cacheIterator(it, ru);
+		else {
+			final int nCol = _colIndexes.length;
+			while(it.isNotOver(ru)) {
+				final int row = offR + it.value();
+				final double[] c = db.values(row);
+				final int off = db.pos(row);
+				for(int j = 0; j < nCol; j++)
+					c[off + _colIndexes[j] + offC] += values[j];
+
+				it.next();
+			}
+			_indexes.cacheIterator(it, ru);
+		}
+
 	}
 
 	@Override
 	protected void decompressToDenseBlockSparseDictionary(DenseBlock db, int rl, int ru, int offR, int offC,
-		SparseBlock values) {
-		throw new NotImplementedException();
-		// final int offTCorr = offT - rl;
-		// final DenseBlock db = target.getDenseBlock();
-		// final int apos = values.pos(0);
-		// final int alen = values.size(0) + apos;
-		// final int[] aix = values.indexes(0);
-		// final double[] avals = values.values(0);
-
-		// AIterator it = _indexes.getIterator(rl);
-		// while(it.hasNext() && it.value() < ru) {
-		// final int idx = offTCorr + it.value();
-		// final double[] c = db.values(idx);
-		// final int off = db.pos(idx);
-
-		// for(int j = apos; j < alen; j++)
-		// c[off + _colIndexes[aix[j]]] += avals[j];
-
-		// it.next();
-		// }
-
-		// _indexes.cacheIterator(it, ru);
+		SparseBlock sb) {
+		final AIterator it = _indexes.getIterator(rl);
+		if(it == null)
+			return;
+		else if(it.value() >= ru)
+			_indexes.cacheIterator(it, ru);
+		else if(ru > _indexes.getOffsetToLast()) {
+			throw new NotImplementedException();
+		}
+		else {
+			final int apos = sb.pos(0);
+			final int alen = sb.size(0) + apos;
+			final int[] aix = sb.indexes(0);
+			final double[] avals = sb.values(0);
+			while(it.isNotOver(ru)) {
+				final int row = offR + it.value();
+				final double[] c = db.values(row);
+				final int off = db.pos(row);
+				for(int j = apos; j < alen; j++)
+					c[off + _colIndexes[aix[j]] + offC] += avals[j];
+				it.next();
+			}
+			_indexes.cacheIterator(it, ru);
+		}
 	}
 
 	@Override
 	protected void decompressToSparseBlockSparseDictionary(SparseBlock ret, int rl, int ru, int offR, int offC,
 		SparseBlock sb) {
-		throw new NotImplementedException();
+		final AIterator it = _indexes.getIterator(rl);
+		if(it == null)
+			return;
+		else if(it.value() >= ru)
+			_indexes.cacheIterator(it, ru);
+		else if(ru > _indexes.getOffsetToLast()) {
+			throw new NotImplementedException();
+		}
+		else {
+			final int apos = sb.pos(0);
+			final int alen = sb.size(0) + apos;
+			final int[] aix = sb.indexes(0);
+			final double[] avals = sb.values(0);
+			while(it.isNotOver(ru)) {
+				final int row = offR + it.value();
+				for(int j = apos; j < alen; j++)
+					ret.append(row, _colIndexes[aix[j]] + offC, avals[j]);
+
+				it.next();
+			}
+			_indexes.cacheIterator(it, ru);
+		}
 	}
 
 	@Override
 	protected void decompressToSparseBlockDenseDictionary(SparseBlock ret, int rl, int ru, int offR, int offC,
 		double[] values) {
-		final int nCol = _colIndexes.length;
 		final AIterator it = _indexes.getIterator(rl);
-		while(it.hasNext() && it.value() < ru) {
-			final int row = offR + it.value();
-			for(int j = 0; j < nCol; j++)
-				ret.append(row, _colIndexes[j] + offC, values[j]);
-			it.next();
+		if(it == null)
+			return;
+		else if(it.value() >= ru)
+			_indexes.cacheIterator(it, ru);
+		else if(ru > _indexes.getOffsetToLast()) {
+			final int nCol = _colIndexes.length;
+			final int lastOff = _indexes.getOffsetToLast();
+			while(true) {
+				final int row = offR + it.value();
+				for(int j = 0; j < nCol; j++)
+					ret.append(row, _colIndexes[j] + offC, values[j]);
+				if(it.value() == lastOff)
+					return;
+				it.next();
+			}
+		}
+		else {
+			final int nCol = _colIndexes.length;
+			while(it.isNotOver(ru)) {
+				final int row = offR + it.value();
+				for(int j = 0; j < nCol; j++)
+					ret.append(row, _colIndexes[j] + offC, values[j]);
+
+				it.next();
+			}
+			_indexes.cacheIterator(it, ru);
 		}
-		_indexes.cacheIterator(it, ru);
 	}
 
 	@Override
 	public double getIdx(int r, int colIdx) {
 		final AIterator it = _indexes.getIterator(r);
-		if(it.value() == r)
-			return _dict.getValue(colIdx);
-		else
-			return 0.0;
+		if(it == null || it.value() != r)
+			return 0;
+		return _dict.getValue(colIdx);
 	}
 
 	@Override
-	protected void computeRowSums(double[] c, boolean square, int rl, int ru) {
-		final double vals = _dict.sumAllRowsToDouble(square, _colIndexes.length)[0];
-		final AIterator it = _indexes.getIterator(rl);
-		while(it.hasNext() && it.value() < ru) {
-			c[it.value()] += vals;
-			it.next();
-		}
-
+	protected void computeRowSums(double[] c, int rl, int ru) {
+		final double def = _dict.sumAllRowsToDouble(_colIndexes.length)[0];
+		computeRowSum(c, rl, ru, def);
 	}
 
 	@Override
-	protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru) {
-		final double vals = _dict.aggregateTuples(builtin, _colIndexes.length)[0];
-		final AIterator it = _indexes.getIterator(rl);
-		int rix = rl;
+	protected void computeRowSumsSq(double[] c, int rl, int ru) {
+		final double def = _dict.sumAllRowsToDoubleSq(_colIndexes.length)[0];
+		computeRowSum(c, rl, ru, def);
+	}
 
-		for(; rix < ru && it.hasNext(); rix++) {
-			if(it.value() != rix)
-				c[rix] = builtin.execute(c[rix], 0);
-			else {
-				c[rix] = builtin.execute(c[rix], vals);
+	protected void computeRowSum(double[] c, int rl, int ru, double def) {
+		final AIterator it = _indexes.getIterator(rl);
+		if(it == null)
+			return;
+		else if(it.value() > ru)
+			_indexes.cacheIterator(it, ru);
+		else if(ru > _indexes.getOffsetToLast()) {
+			final int maxOff = _indexes.getOffsetToLast();
+			while(true) {
+				c[it.value()] += def;
+				if(it.value() == maxOff)
+					break;
+				it.next();
+			}
+		}
+		else {
+			while(it.isNotOver(ru)) {
+				c[it.value()] += def;
 				it.next();
 			}
+			_indexes.cacheIterator(it, ru);
 		}
+	}
 
-		// cover remaining rows
-		for(; rix < ru; rix++)
-			c[rix] = builtin.execute(c[rix], 0);
+	@Override
+	protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru) {
+		final double[] vals = _dict.aggregateRows(builtin, _colIndexes.length);
+		ColGroupSDCSingle.computeRowMxx(c, builtin, rl, ru, _indexes, _numRows, 0, vals[0]);
 	}
 
 	@Override
@@ -192,71 +266,83 @@ public class ColGroupSDCSingleZeros extends APreAgg {
 		if(m.isInSparseFormat())
 			preAggregateSparse(m.getSparseBlock(), preAgg, rl, ru);
 		else
-			preAggregateDense(m, preAgg, rl, ru);
+			preAggregateDense(m, preAgg, rl, ru, 0, _numRows);
 	}
 
 	@Override
 	public void preAggregateDense(MatrixBlock m, MatrixBlock preAgg, int rl, int ru, int cl, int cu) {
-		final double[] mV = m.getDenseBlockValues();
+		final AIterator it = _indexes.getIterator(cl);
 		final double[] preAV = preAgg.getDenseBlockValues();
-		final int numVals = getNumValues();
-		final int blockSize = 2000;
-		for(int block = cl; block < cu; block += blockSize) {
-			final int blockEnd = Math.min(block + blockSize, cu);
-			final AIterator itStart = _indexes.getIterator(block);
-			AIterator it;
-			for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += numVals) {
-				final int offLeft = rowLeft * _numRows;
-				it = itStart.clone();
-				while(it.value() < blockEnd && it.hasNext()) {
-					final int i = it.value();
-					preAV[offOut] += mV[offLeft + i];
-					it.next();
-				}
+		final double[] vals = m.getDenseBlockValues();
+		final int nCol = m.getNumColumns();
+		if(it == null)
+			return;
+		else if(it.value() > cu)
+			_indexes.cacheIterator(it, cu);
+		else if(cu < _indexes.getOffsetToLast() + 1) {
+			while(it.value() < cu) {
+				final int start = it.value() + nCol * rl;
+				final int end = it.value() + nCol * ru;
+				for(int offOut = 0, off = start; off < end; offOut ++, off += nCol)
+					preAV[offOut] += vals[off];
+				it.next();
 			}
+			_indexes.cacheIterator(it, cu);
 		}
-	}
-
-	private void preAggregateDense(MatrixBlock m, MatrixBlock preAgg, int rl, int ru) {
-		final double[] preAV = preAgg.getDenseBlockValues();
-		final double[] mV = m.getDenseBlockValues();
-		final int numVals = getNumValues();
-		for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += numVals) {
-			final AIterator it = _indexes.getIterator();
-			final int offLeft = rowLeft * _numRows;
-			while(it.hasNext()) {
-				final int i = it.value();
-				preAV[offOut] += mV[offLeft + i];
+		else {
+			int of = it.value();
+			int start = of + nCol * rl;
+			int end = of + nCol * ru;
+			for(int offOut = 0, off = start; off < end; offOut ++, off += nCol)
+				preAV[offOut] += vals[off];
+			while(of < _indexes.getOffsetToLast()) {
 				it.next();
+				of = it.value();
+				start = of + nCol * rl;
+				end = of + nCol * ru;
+				for(int offOut = 0, off = start; off < end; offOut ++, off += nCol)
+					preAV[offOut] += vals[off];
 			}
 		}
 	}
 
 	private void preAggregateSparse(SparseBlock sb, MatrixBlock preAgg, int rl, int ru) {
-		final double[] preAV = preAgg.getDenseBlockValues();
-		final int numVals = getNumValues();
-		for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += numVals) {
-			if(sb.isEmpty(rowLeft))
-				continue;
-			final AIterator it = _indexes.getIterator();
-			final int apos = sb.pos(rowLeft);
-			final int alen = sb.size(rowLeft) + apos;
-			final int[] aix = sb.indexes(rowLeft);
-			final double[] avals = sb.values(rowLeft);
+		final AIterator it = _indexes.getIterator();
+		if(rl == ru - 1) {
+			final int apos = sb.pos(rl);
+			final int alen = sb.size(rl) + apos;
+			final int[] aix = sb.indexes(rl);
+			final double[] avals = sb.values(rl);
+			final int offsetToLast = _indexes.getOffsetToLast();
+
+			double ret = 0;
 			int j = apos;
-			while(it.hasNext() && j < alen) {
-				final int index = aix[j];
-				final int v = it.value();
-				if(index < v)
-					j++;
-				else if(index == v) {
-					preAV[offOut] += avals[j++];
+
+			while(true) {
+				final int idx = aix[j];
+
+				if(idx == it.value()) {
+					ret += avals[j++];
+					if(j >= alen || it.value() >= offsetToLast)
+						break;
 					it.next();
 				}
-				else
+				else if(idx < it.value()) {
+					j++;
+					if(j >= alen)
+						break;
+				}
+				else {
+					if(it.value() >= offsetToLast)
+						break;
 					it.next();
+				}
 			}
+
+			preAgg.setValue(0, 0, ret);
 		}
+		else
+			throw new NotImplementedException();
 	}
 
 	@Override
@@ -271,9 +357,9 @@ public class ColGroupSDCSingleZeros extends APreAgg {
 		double val0 = op.executeScalar(0);
 		boolean isSparseSafeOp = op.sparseSafe || val0 == 0;
 		if(isSparseSafeOp)
-			return new ColGroupSDCSingleZeros(_colIndexes, _numRows, applyScalarOp(op), _indexes, getCachedCounts());
+			return new ColGroupSDCSingleZeros(_colIndexes, _numRows, _dict.applyScalarOp(op), _indexes, getCachedCounts());
 		else {
-			ADictionary aDictionary = applyScalarOp(op, val0, getNumCols());// swapEntries();
+			ADictionary aDictionary = _dict.applyScalarOp(op, val0, getNumCols());// swapEntries();
 			// ADictionary aDictionary = applyScalarOp(op, val0, getNumCols());
 			return new ColGroupSDCSingle(_colIndexes, _numRows, aDictionary, _indexes, null);
 		}
@@ -336,10 +422,15 @@ public class ColGroupSDCSingleZeros extends APreAgg {
 	public void preAggregateThatDDCStructure(ColGroupDDC that, Dictionary ret) {
 		final AIterator itThis = _indexes.getIterator();
 		final int nCol = that._colIndexes.length;
-		while(itThis.hasNext()) {
+		final int finalOffThis = _indexes.getOffsetToLast();
+
+		while(true) {
 			final int fr = that._data.getIndex(itThis.value());
 			that._dict.addToEntry(ret, fr, 0, nCol);
-			itThis.next();
+			if(itThis.value() >= finalOffThis)
+				break;
+			else
+				itThis.next();
 		}
 	}
 
@@ -348,30 +439,78 @@ public class ColGroupSDCSingleZeros extends APreAgg {
 		final AIterator itThat = that._indexes.getIterator();
 		final AIterator itThis = _indexes.getIterator();
 		final int nCol = that._colIndexes.length;
-
-		while(itThat.hasNext() && itThis.hasNext()) {
-			final int v = itThat.value();
-			if(v == itThis.skipTo(v))
-				that._dict.addToEntry(ret, that.getIndex(itThat.getDataIndex()), 0, nCol);
-
-			itThat.next();
+		final int finalOffThis = _indexes.getOffsetToLast();
+		final int finalOffThat = that._indexes.getOffsetToLast();
+
+		while(true) {
+			if(itThat.value() == itThis.value()) {
+				that._dict.addToEntry(ret, that._data.getIndex(itThat.getDataIndex()), 0, nCol);
+				if(itThat.value() >= finalOffThat)
+					break;
+				else
+					itThat.next();
+				if(itThis.value() >= finalOffThis)
+					break;
+				else
+					itThis.next();
+			}
+			else if(itThat.value() < itThis.value()) {
+				if(itThat.value() >= finalOffThat)
+					break;
+				else
+					itThat.next();
+			}
+			else {
+				if(itThis.value() >= finalOffThis)
+					break;
+				else
+					itThis.next();
+			}
 		}
 	}
 
 	@Override
 	public void preAggregateThatSDCSingleZerosStructure(ColGroupSDCSingleZeros that, Dictionary ret) {
+		final int nCol = that._colIndexes.length;
 		final AIterator itThat = that._indexes.getIterator();
 		final AIterator itThis = _indexes.getIterator();
-		final int nCol = that._colIndexes.length;
-		while(itThat.hasNext()) {
-			final int v = itThat.value();
-			if(v == itThis.skipTo(v))
+		final int finalOffThis = _indexes.getOffsetToLast();
+		final int finalOffThat = that._indexes.getOffsetToLast();
+
+		while(true) {
+			if(itThat.value() == itThis.value()) {
 				that._dict.addToEntry(ret, 0, 0, nCol);
-			itThat.next();
+				if(itThat.value() >= finalOffThat)
+					break;
+				else
+					itThat.next();
+				if(itThis.value() >= finalOffThis)
+					break;
+				else
+					itThis.next();
+			}
+			else if(itThat.value() < itThis.value()) {
+				if(itThat.value() >= finalOffThat)
+					break;
+				else
+					itThat.next();
+			}
+			else {
+				if(itThis.value() >= finalOffThis)
+					break;
+				else
+					itThis.next();
+			}
+
 		}
 	}
 
 	@Override
+	public int getPreAggregateSize(){
+		return 1;
+	}
+
+	@Override
 	public AColGroup replace(double pattern, double replace) {
 		if(pattern == 0)
 			return replaceZero(replace);
@@ -388,7 +527,7 @@ public class ColGroupSDCSingleZeros extends APreAgg {
 	public String toString() {
 		StringBuilder sb = new StringBuilder();
 		sb.append(super.toString());
-		sb.append(String.format("\n%15s ", "Indexes: "));
+		sb.append(String.format("\n%15s", "Indexes: "));
 		sb.append(_indexes.toString());
 		return sb.toString();
 	}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCZeros.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCZeros.java
index a7632dd..ee3bad4 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCZeros.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCZeros.java
@@ -23,13 +23,9 @@ import java.io.DataInput;
 import java.io.DataOutput;
 import java.io.IOException;
 
-import org.apache.commons.lang.NotImplementedException;
-import org.apache.sysds.runtime.compress.DMLCompressionException;
 import org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary;
 import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary;
 import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
-import org.apache.sysds.runtime.compress.colgroup.mapping.MapToByte;
-import org.apache.sysds.runtime.compress.colgroup.mapping.MapToChar;
 import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory;
 import org.apache.sysds.runtime.compress.colgroup.offset.AIterator;
 import org.apache.sysds.runtime.compress.colgroup.offset.AOffset;
@@ -37,6 +33,7 @@ import org.apache.sysds.runtime.compress.colgroup.offset.OffsetFactory;
 import org.apache.sysds.runtime.data.DenseBlock;
 import org.apache.sysds.runtime.data.SparseBlock;
 import org.apache.sysds.runtime.functionobjects.Builtin;
+import org.apache.sysds.runtime.functionobjects.Plus;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
 import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
@@ -53,14 +50,10 @@ import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
 public class ColGroupSDCZeros extends APreAgg {
 	private static final long serialVersionUID = -3703199743391937991L;
 
-	/**
-	 * Sparse row indexes for the data
-	 */
+	/** Sparse row indexes for the data */
 	protected transient AOffset _indexes;
 
-	/**
-	 * Pointers to row indexes in the dictionary. Note the dictionary has one extra entry.
-	 */
+	/** Pointers to row indexes in the dictionary. Note the dictionary has one extra entry. */
 	protected transient AMapToData _data;
 
 	/**
@@ -72,19 +65,20 @@ public class ColGroupSDCZeros extends APreAgg {
 		super(numRows);
 	}
 
-	protected ColGroupSDCZeros(int[] colIndices, int numRows, ADictionary dict, AOffset offsets, AMapToData data) {
-		super(colIndices, numRows, dict, null);
+	private ColGroupSDCZeros(int[] colIndices, int numRows, ADictionary dict, AOffset offsets, AMapToData data,
+		int[] cachedCounts) {
+		super(colIndices, numRows, dict, cachedCounts);
 		_indexes = offsets;
 		_data = data;
 		_zeros = true;
 	}
 
-	protected ColGroupSDCZeros(int[] colIndices, int numRows, ADictionary dict, AOffset offsets, AMapToData data,
+	protected static AColGroup create(int[] colIndices, int numRows, ADictionary dict, AOffset offsets, AMapToData data,
 		int[] cachedCounts) {
-		super(colIndices, numRows, dict, cachedCounts);
-		_indexes = offsets;
-		_data = data;
-		_zeros = true;
+		if(dict == null)
+			return new ColGroupEmpty(colIndices);
+		else
+			return new ColGroupSDCZeros(colIndices, numRows, dict, offsets, data, cachedCounts);
 	}
 
 	@Override
@@ -100,129 +94,255 @@ public class ColGroupSDCZeros extends APreAgg {
 	@Override
 	protected void decompressToDenseBlockDenseDictionary(DenseBlock db, int rl, int ru, int offR, int offC,
 		double[] values) {
-		final int nCol = _colIndexes.length;
 
-		AIterator it = _indexes.getIterator(rl);
-		while(it.hasNext() && it.value() < ru) {
-			final int idx = offR + it.value();
-			final double[] c = db.values(idx);
-			final int off = db.pos(idx) + offC;
-			final int offDict = getIndex(it.getDataIndexAndIncrement()) * nCol;
-			for(int j = 0; j < nCol; j++)
-				c[off + _colIndexes[j]] += values[offDict + j];
+		final AIterator it = _indexes.getIterator(rl);
+		if(it == null)
+			return;
+		if(it.value() >= ru)
+			_indexes.cacheIterator(it, ru);
+		else if(ru > _indexes.getOffsetToLast()) {
+			final int lastOff = _indexes.getOffsetToLast();
+			final int nCol = _colIndexes.length;
+			while(true) {
+				final int idx = offR + it.value();
+				final double[] c = db.values(idx);
+				final int off = db.pos(idx) + offC;
+				final int offDict = _data.getIndex(it.getDataIndex()) * nCol;
+				for(int j = 0; j < nCol; j++)
+					c[off + _colIndexes[j]] += values[offDict + j];
+				if(it.value() == lastOff)
+					return;
+				it.next();
+			}
+		}
+		else {
+
+			final int nCol = _colIndexes.length;
+			while(it.isNotOver(ru)) {
+				final int idx = offR + it.value();
+				final double[] c = db.values(idx);
+				final int off = db.pos(idx) + offC;
+				final int offDict = _data.getIndex(it.getDataIndex()) * nCol;
+				for(int j = 0; j < nCol; j++)
+					c[off + _colIndexes[j]] += values[offDict + j];
 
+				it.next();
+			}
+			_indexes.cacheIterator(it, ru);
 		}
-		_indexes.cacheIterator(it, ru);
+
 	}
 
 	@Override
 	protected void decompressToDenseBlockSparseDictionary(DenseBlock db, int rl, int ru, int offR, int offC,
 		SparseBlock sb) {
-		AIterator it = _indexes.getIterator(rl);
-		while(it.hasNext() && it.value() < ru) {
-			final int idx = offR + it.value();
-			final int dictIndex = getIndex(it.getDataIndexAndIncrement());
-			if(sb.isEmpty(dictIndex))
-				continue;
-
-			final double[] c = db.values(idx);
-			final int off = db.pos(idx) + offC;
-			final int apos = sb.pos(dictIndex);
-			final int alen = sb.size(dictIndex) + apos;
-			final double[] avals = sb.values(dictIndex);
-			final int[] aix = sb.indexes(dictIndex);
-			for(int j = apos; j < alen; j++)
-				c[off + _colIndexes[aix[j]]] += avals[j];
+		final AIterator it = _indexes.getIterator(rl);
+		if(it == null)
+			return;
+		else if(it.value() >= ru)
+			_indexes.cacheIterator(it, ru);
+		else if(ru > _indexes.getOffsetToLast()) {
+			final int lastOff = _indexes.getOffsetToLast();
+			while(true) {
+				final int idx = offR + it.value();
+				final double[] c = db.values(idx);
+				final int dx = it.getDataIndex();
+				final int dictIndex = _data.getIndex(dx);
+				if(sb.isEmpty(dictIndex)) {
+					if(it.value() == lastOff)
+						return;
+					it.next();
+					continue;
+				}
+
+				final int off = db.pos(idx) + offC;
+				final int apos = sb.pos(dictIndex);
+				final int alen = sb.size(dictIndex) + apos;
+				final double[] avals = sb.values(dictIndex);
+				final int[] aix = sb.indexes(dictIndex);
+				for(int j = apos; j < alen; j++)
+					c[off + _colIndexes[aix[j]]] += avals[j];
+				if(it.value() == lastOff)
+					return;
+				it.next();
+			}
+		}
+		else {
+			while(it.isNotOver(ru)) {
+				final int idx = offR + it.value();
+				final int dx = it.getDataIndex();
+				final int dictIndex = _data.getIndex(dx);
+				if(sb.isEmpty(dictIndex)) {
+					it.next();
+					continue;
+				}
+
+				final double[] c = db.values(idx);
+				final int off = db.pos(idx) + offC;
+				final int apos = sb.pos(dictIndex);
+				final int alen = sb.size(dictIndex) + apos;
+				final double[] avals = sb.values(dictIndex);
+				final int[] aix = sb.indexes(dictIndex);
+				for(int j = apos; j < alen; j++)
+					c[off + _colIndexes[aix[j]]] += avals[j];
+
+				it.next();
+			}
+			_indexes.cacheIterator(it, ru);
 		}
-		_indexes.cacheIterator(it, ru);
 	}
 
 	@Override
 	protected void decompressToSparseBlockSparseDictionary(SparseBlock ret, int rl, int ru, int offR, int offC,
 		SparseBlock sb) {
-		AIterator it = _indexes.getIterator(rl);
-		while(it.hasNext() && it.value() < ru) {
-			final int row = offR + it.value();
-			final int dictIndex = getIndex(it.getDataIndexAndIncrement());
-			if(sb.isEmpty(dictIndex))
-				continue;
-
-			final int apos = sb.pos(dictIndex);
-			final int alen = sb.size(dictIndex) + apos;
-			final double[] avals = sb.values(dictIndex);
-			final int[] aix = sb.indexes(dictIndex);
-			for(int j = apos; j < alen; j++)
-				ret.append(row, _colIndexes[aix[j]] + offC, avals[j] );
+		final AIterator it = _indexes.getIterator(rl);
+		if(it == null)
+			return;
+		else if(it.value() >= ru)
+			_indexes.cacheIterator(it, ru);
+		else if(ru > _indexes.getOffsetToLast()) {
+			final int lastOff = _indexes.getOffsetToLast();
+			while(true) {
+				final int row = offR + it.value();
+				final int dx = it.getDataIndex();
+				final int dictIndex = _data.getIndex(dx);
+				if(sb.isEmpty(dictIndex)) {
+					if(it.value() == lastOff)
+						return;
+					it.next();
+					continue;
+				}
+
+				final int apos = sb.pos(dictIndex);
+				final int alen = sb.size(dictIndex) + apos;
+				final double[] avals = sb.values(dictIndex);
+				final int[] aix = sb.indexes(dictIndex);
+				for(int j = apos; j < alen; j++)
+					ret.append(row, _colIndexes[aix[j]] + offC, avals[j]);
+				if(it.value() == lastOff)
+					return;
+				it.next();
+			}
+		}
+		else {
+			while(it.isNotOver(ru)) {
+				final int row = offR + it.value();
+				final int dx = it.getDataIndex();
+				final int dictIndex = _data.getIndex(dx);
+				if(sb.isEmpty(dictIndex)) {
+					it.next();
+					continue;
+				}
+
+				final int apos = sb.pos(dictIndex);
+				final int alen = sb.size(dictIndex) + apos;
+				final double[] avals = sb.values(dictIndex);
+				final int[] aix = sb.indexes(dictIndex);
+				for(int j = apos; j < alen; j++)
+					ret.append(row, _colIndexes[aix[j]] + offC, avals[j]);
+				it.next();
+			}
+			_indexes.cacheIterator(it, ru);
 		}
-		_indexes.cacheIterator(it, ru);
 	}
 
 	@Override
 	protected void decompressToSparseBlockDenseDictionary(SparseBlock ret, int rl, int ru, int offR, int offC,
 		double[] values) {
-		final int nCol = _colIndexes.length;
+		final AIterator it = _indexes.getIterator(rl);
+		if(it == null)
+			return;
+		else if(it.value() >= ru)
+			_indexes.cacheIterator(it, ru);
+		else if(ru > _indexes.getOffsetToLast()) {
+			final int lastOff = _indexes.getOffsetToLast();
+			final int nCol = _colIndexes.length;
+			while(true) {
+				final int row = offR + it.value();
+				final int dx = it.getDataIndex();
+				final int offDict = _data.getIndex(dx) * nCol;
+				for(int j = 0; j < nCol; j++)
+					ret.append(row, _colIndexes[j] + offC, values[offDict + j]);
+				if(it.value() == lastOff)
+					return;
+				it.next();
+			}
+		}
+		else {
 
-		AIterator it = _indexes.getIterator(rl);
-		while(it.hasNext() && it.value() < ru) {
-			final int row = offR + it.value();
-			final int offDict = getIndex(it.getDataIndexAndIncrement()) * nCol;
-			for(int j = 0; j < nCol; j++)
-				ret.append(row, _colIndexes[j] + offC, values[offDict + j]);
+			final int nCol = _colIndexes.length;
+			while(it.isNotOver(ru)) {
+				final int row = offR + it.value();
+				final int dx = it.getDataIndex();
+				final int offDict = _data.getIndex(dx) * nCol;
+				for(int j = 0; j < nCol; j++)
+					ret.append(row, _colIndexes[j] + offC, values[offDict + j]);
+
+				it.next();
+			}
+			_indexes.cacheIterator(it, ru);
 		}
-		_indexes.cacheIterator(it, ru);
+
 	}
 
 	@Override
 	public double getIdx(int r, int colIdx) {
 		final AIterator it = _indexes.getIterator(r);
+		if(it == null || it.value() != r)
+			return 0;
 		final int nCol = _colIndexes.length;
-		if(it.value() == r)
-			return _dict.getValue(getIndex(it.getDataIndex()) * nCol + colIdx);
-		else
-			return 0.0;
+		return _dict.getValue(_data.getIndex(it.getDataIndex()) * nCol + colIdx);
 	}
 
 	@Override
-	protected void computeRowSums(double[] c, boolean square, int rl, int ru) {
-		final double[] vals = _dict.sumAllRowsToDouble(square, _colIndexes.length);
-		final AIterator it = _indexes.getIterator(rl);
-		while(it.hasNext() && it.value() < ru)
-			c[it.value()] += vals[getIndex(it.getDataIndexAndIncrement())];
+	protected void computeRowSums(double[] c, int rl, int ru) {
+		final double[] vals = _dict.sumAllRowsToDouble(_colIndexes.length);
+		computeRowSums(c, rl, ru, vals);
 	}
 
 	@Override
-	protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru) {
-		final double[] vals = _dict.aggregateTuples(builtin, _colIndexes.length);
-		final AIterator it = _indexes.getIterator(rl);
-		int rix = rl;
-
-		for(; rix < ru && it.hasNext(); rix++) {
-			if(it.value() != rix)
-				c[rix] = builtin.execute(c[rix], 0);
-			else
-				c[rix] = builtin.execute(c[rix], vals[_data.getIndex(it.getDataIndexAndIncrement())]);
+	protected void computeRowSumsSq(double[] c, int rl, int ru) {
+		final double[] vals = _dict.sumAllRowsToDoubleSq(_colIndexes.length);
+		computeRowSums(c, rl, ru, vals);
+	}
+
+	protected void computeRowSums(double[] c, int rl, int ru, double[] vals) {
+		computeRowSums(c, rl, ru, vals, _data, _indexes, _numRows);
+	}
+
+	protected static final void computeRowSums(double[] c, int rl, int ru, double[] vals, AMapToData data,
+		AOffset indexes, int nRows) {
+		final AIterator it = indexes.getIterator(rl);
+		if(it == null)
+			return;
+		else if(it.value() > ru)
+			indexes.cacheIterator(it, ru);
+		else if(ru >= indexes.getOffsetToLast()) {
+			final int maxId = data.size() - 1;
+			c[it.value()] += vals[data.getIndex(it.getDataIndex())];
+			while(it.getDataIndex() < maxId) {
+				it.next();
+				c[it.value()] += vals[data.getIndex(it.getDataIndex())];
+			}
+		}
+		else {
+			while(it.isNotOver(ru)) {
+				c[it.value()] += vals[data.getIndex(it.getDataIndex())];
+				it.next();
+			}
+			indexes.cacheIterator(it, ru);
 		}
-
-		// cover remaining rows with default value
-		for(; rix < ru; rix++)
-			c[rix] = builtin.execute(c[rix], 0);
 	}
 
 	@Override
-	public int[] getCounts(int[] counts) {
-		final int nonDefaultLength = _data.size();
-		// final AIterator it = _indexes.getIterator();
-		final int zeros = _numRows - nonDefaultLength;
-		for(int i = 0; i < nonDefaultLength; i++)
-			counts[_data.getIndex(i)]++;
-
-		counts[counts.length - 1] += zeros;
-
-		return counts;
+	protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru) {
+		final double[] vals = _dict.aggregateRows(builtin, _colIndexes.length);
+		ColGroupSDC.computeRowMxx(c, builtin, rl, ru, vals, _data, _indexes, _numRows, 0);
 	}
 
-	public int getIndex(int r) {
-		return _data.getIndex(r);
+	@Override
+	public int[] getCounts(int[] counts) {
+		return _data.getCounts(counts, _numRows);
 	}
 
 	@Override
@@ -235,82 +355,11 @@ public class ColGroupSDCZeros extends APreAgg {
 
 	@Override
 	public void preAggregateDense(MatrixBlock m, MatrixBlock preAgg, int rl, int ru, int cl, int cu) {
-
-		final int numVals = getNumValues();
-		if(cl != 0 && cu != preAgg.getNumColumns())
-			throw new NotImplementedException("Not implemented preAggregate of sub number of columns");
-		if(_data instanceof MapToByte)
-			preAggregateDenseByte(m, preAgg, ((MapToByte) _data).getBytes(), rl, ru, cl, cu, _numRows, numVals, _indexes);
-		else if(_data instanceof MapToChar)
-			preAggregateDenseChar(m, preAgg, ((MapToChar) _data).getChars(), rl, ru, cl, cu, _numRows, numVals, _indexes);
-		else
-			throw new DMLCompressionException("Unsupported map type:" + _data);
-
-	}
-
-	private static void preAggregateDenseByte(final MatrixBlock m, final MatrixBlock preAgg, final byte[] d,
-		final int rl, final int ru, final int cl, final int cu, final int nRow, final int nVal, AOffset indexes) {
-		final double[] preAV = preAgg.getDenseBlockValues();
-		final double[] mV = m.getDenseBlockValues();
-		// multi row iterator.
-		final AIterator itStart = indexes.getIterator(cl);
-		AIterator it = null;
-		for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += nVal) {
-			final int offLeft = rowLeft * nRow;
-			it = itStart.clone();
-			while(it.value() < cu && it.hasNext()) {
-				int i = it.value();
-				int index = d[it.getDataIndexAndIncrement()] & 0xFF;
-				preAV[offOut + index] += mV[offLeft + i];
-			}
-		}
-		if(it != null && cu < m.getNumColumns())
-			indexes.cacheIterator(it, cu);
-	}
-
-	private static void preAggregateDenseChar(final MatrixBlock m, final MatrixBlock preAgg, final char[] d,
-		final int rl, final int ru, final int cl, final int cu, final int nRow, final int nVal, AOffset indexes) {
-		final double[] preAV = preAgg.getDenseBlockValues();
-		final double[] mV = m.getDenseBlockValues();
-		// multi row iterator.
-		final AIterator itStart = indexes.getIterator(cl);
-		AIterator it = null;
-		for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += nVal) {
-			final int offLeft = rowLeft * nRow;
-			it = itStart.clone();
-			while(it.value() < cu && it.hasNext()) {
-				int i = it.value();
-				int index = d[it.getDataIndexAndIncrement()];
-				preAV[offOut + index] += mV[offLeft + i];
-			}
-		}
-		if(it != null && cu < m.getNumColumns())
-			indexes.cacheIterator(it, cu);
+		_data.preAggregateDense(m, preAgg.getDenseBlockValues(), rl, ru, cl, cu, _indexes);
 	}
 
 	private void preAggregateSparse(SparseBlock sb, MatrixBlock preAgg, int rl, int ru) {
-		final double[] preAV = preAgg.getDenseBlockValues();
-		final int numVals = getNumValues();
-		for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += numVals) {
-			if(sb.isEmpty(rowLeft))
-				continue;
-			final AIterator it = _indexes.getIterator();
-			final int apos = sb.pos(rowLeft);
-			final int alen = sb.size(rowLeft) + apos;
-			final int[] aix = sb.indexes(rowLeft);
-			final double[] avals = sb.values(rowLeft);
-			int j = apos;
-			while(it.hasNext() && j < alen) {
-				final int index = aix[j];
-				final int val = it.value();
-				if(index < val)
-					j++;
-				else if(index == val)
-					preAV[offOut + _data.getIndex(it.getDataIndexAndIncrement())] += avals[j++];
-				else
-					it.next();
-			}
-		}
+		_data.preAggregateSparse(sb, preAgg.getDenseBlockValues(), rl, ru, _indexes);
 	}
 
 	@Override
@@ -326,10 +375,10 @@ public class ColGroupSDCZeros extends APreAgg {
 		double val0 = op.executeScalar(0);
 		boolean isSparseSafeOp = op.sparseSafe || val0 == 0;
 		if(isSparseSafeOp)
-			return new ColGroupSDCZeros(_colIndexes, _numRows, applyScalarOp(op), _indexes, _data, getCachedCounts());
+			return create(_colIndexes, _numRows, _dict.applyScalarOp(op), _indexes, _data, getCachedCounts());
 		else {
-			ADictionary rValues = applyScalarOp(op, val0, getNumCols());
-			return new ColGroupSDC(_colIndexes, _numRows, rValues, _indexes, _data, getCachedCounts());
+			ADictionary rValues = _dict.applyScalarOp(op, val0, getNumCols());
+			return ColGroupSDC.create(_colIndexes, _numRows, rValues, _indexes, _data, getCachedCounts());
 		}
 	}
 
@@ -337,11 +386,15 @@ public class ColGroupSDCZeros extends APreAgg {
 	public AColGroup binaryRowOpLeft(BinaryOperator op, double[] v, boolean isRowSafe) {
 		if(isRowSafe) {
 			ADictionary ret = _dict.binOpLeft(op, v, _colIndexes);
-			return new ColGroupSDCZeros(_colIndexes, _numRows, ret, _indexes, _data, getCachedCounts());
+			return create(_colIndexes, _numRows, ret, _indexes, _data, getCachedCounts());
+		}
+		else if(op.fn instanceof Plus) {
+			double[] def = ColGroupUtils.binaryDefRowLeft(op, v, _colIndexes);
+			return ColGroupPFOR.create(_colIndexes, _numRows, _dict, _indexes, _data, getCachedCounts(), def);
 		}
 		else {
 			ADictionary ret = _dict.applyBinaryRowOpLeftAppendNewEntry(op, v, _colIndexes);
-			return new ColGroupSDC(_colIndexes, _numRows, ret, _indexes, _data, getCachedCounts());
+			return ColGroupSDC.create(_colIndexes, _numRows, ret, _indexes, _data, getCachedCounts());
 		}
 	}
 
@@ -349,11 +402,15 @@ public class ColGroupSDCZeros extends APreAgg {
 	public AColGroup binaryRowOpRight(BinaryOperator op, double[] v, boolean isRowSafe) {
 		if(isRowSafe) {
 			ADictionary ret = _dict.binOpRight(op, v, _colIndexes);
-			return new ColGroupSDCZeros(_colIndexes, _numRows, ret, _indexes, _data, getCachedCounts());
+			return create(_colIndexes, _numRows, ret, _indexes, _data, getCachedCounts());
+		}
+		else if(op.fn instanceof Plus) {
+			double[] def = ColGroupUtils.binaryDefRowRight(op, v, _colIndexes);
+			return ColGroupPFOR.create(_colIndexes, _numRows, _dict, _indexes, _data, getCachedCounts(), def);
 		}
 		else {
 			ADictionary ret = _dict.applyBinaryRowOpRightAppendNewEntry(op, v, _colIndexes);
-			return new ColGroupSDC(_colIndexes, _numRows, ret, _indexes, _data, getCachedCounts());
+			return ColGroupSDC.create(_colIndexes, _numRows, ret, _indexes, _data, getCachedCounts());
 		}
 	}
 
@@ -394,10 +451,15 @@ public class ColGroupSDCZeros extends APreAgg {
 		final AIterator itThis = _indexes.getIterator();
 		final int nCol = that._colIndexes.length;
 
-		while(itThis.hasNext()) {
+		final int finalOffThis = _indexes.getOffsetToLast();
+		while(true) {
 			final int fr = that._data.getIndex(itThis.value());
-			final int to = getIndex(itThis.getDataIndexAndIncrement());
+			final int to = _data.getIndex(itThis.getDataIndex());
 			that._dict.addToEntry(ret, fr, to, nCol);
+			if(itThis.value() >= finalOffThis)
+				break;
+			else
+				itThis.next();
 		}
 	}
 
@@ -405,17 +467,37 @@ public class ColGroupSDCZeros extends APreAgg {
 	public void preAggregateThatSDCZerosStructure(ColGroupSDCZeros that, Dictionary ret) {
 		final AIterator itThat = that._indexes.getIterator();
 		final AIterator itThis = _indexes.getIterator();
+
+		final int finalOffThis = _indexes.getOffsetToLast();
+		final int finalOffThat = that._indexes.getOffsetToLast();
+
 		final int nCol = that._colIndexes.length;
-		while(itThat.hasNext() && itThis.hasNext()) {
+		while(true) {
 			if(itThat.value() == itThis.value()) {
-				final int fr = that.getIndex(itThat.getDataIndexAndIncrement());
-				final int to = getIndex(itThis.getDataIndexAndIncrement());
+				final int fr = that._data.getIndex(itThat.getDataIndex());
+				final int to = _data.getIndex(itThis.getDataIndex());
 				that._dict.addToEntry(ret, fr, to, nCol);
+				if(itThat.value() >= finalOffThat)
+					break;
+				else
+					itThat.next();
+				if(itThis.value() >= finalOffThis)
+					break;
+				else
+					itThis.next();
+			}
+			else if(itThat.value() < itThis.value()) {
+				if(itThat.value() >= finalOffThat)
+					break;
+				else
+					itThat.next();
+			}
+			else {
+				if(itThis.value() >= finalOffThis)
+					break;
+				else
+					itThis.next();
 			}
-			else if(itThat.value() < itThis.value())
-				itThat.next();
-			else
-				itThis.next();
 		}
 	}
 
@@ -425,16 +507,34 @@ public class ColGroupSDCZeros extends APreAgg {
 		final AIterator itThis = _indexes.getIterator();
 		final int nCol = that._colIndexes.length;
 
-		while(itThat.hasNext() && itThis.hasNext()) {
+		final int finalOffThis = _indexes.getOffsetToLast();
+		final int finalOffThat = that._indexes.getOffsetToLast();
+
+		while(true) {
 			if(itThat.value() == itThis.value()) {
-				final int to = getIndex(itThis.getDataIndexAndIncrement());
+				final int to = _data.getIndex(itThis.getDataIndex());
 				that._dict.addToEntry(ret, 0, to, nCol);
-				itThat.next();
+				if(itThat.value() >= finalOffThat)
+					break;
+				else
+					itThat.next();
+				if(itThis.value() >= finalOffThis)
+					break;
+				else
+					itThis.next();
+			}
+			else if(itThat.value() < itThis.value()) {
+				if(itThat.value() >= finalOffThat)
+					break;
+				else
+					itThat.next();
+			}
+			else {
+				if(itThis.value() >= finalOffThis)
+					break;
+				else
+					itThis.next();
 			}
-			else if(itThat.value() < itThis.value())
-				itThat.next();
-			else
-				itThis.next();
 		}
 	}
 
@@ -448,16 +548,16 @@ public class ColGroupSDCZeros extends APreAgg {
 
 	private AColGroup replaceZero(double replace) {
 		ADictionary replaced = _dict.replaceZeroAndExtend(replace, _colIndexes.length);
-		return new ColGroupSDC(_colIndexes, _numRows, replaced, _indexes, _data, getCachedCounts());
+		return ColGroupSDC.create(_colIndexes, _numRows, replaced, _indexes, _data, getCachedCounts());
 	}
 
 	@Override
 	public String toString() {
 		StringBuilder sb = new StringBuilder();
 		sb.append(super.toString());
-		sb.append(String.format("\n%15s ", "Indexes: "));
+		sb.append(String.format("\n%15s", "Indexes: "));
 		sb.append(_indexes.toString());
-		sb.append(String.format("\n%15s ", "Data: "));
+		sb.append(String.format("\n%15s", "Data: "));
 		sb.append(_data);
 		return sb.toString();
 	}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSizes.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSizes.java
index 38b5998..49d3197 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSizes.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSizes.java
@@ -81,7 +81,7 @@ public final class ColGroupSizes {
 
 	public static long estimateInMemorySizeSDC(int nrColumns, int nrValues, int nrRows, int largestOff,
 		boolean largestOffIsZero, boolean containNoZeroValues, double tupleSparsity, boolean lossy) {
-		final int nVals = nrValues + (largestOffIsZero || containNoZeroValues ? 0 : 1);
+		final int nVals = nrValues ;
 		long size = estimateInMemorySizeGroupValue(nrColumns, nVals, tupleSparsity, lossy);
 		size += OffsetFactory.estimateInMemorySize(nrRows - largestOff, nrRows);
 		if(nrValues > 1)
@@ -91,7 +91,7 @@ public final class ColGroupSizes {
 
 	public static long estimateInMemorySizeSDCSingle(int nrColumns, int nrValues, int nrRows, int largestOff,
 		boolean largestOffIsZero, boolean containNoZeroValues, double tupleSparsity, boolean lossy) {
-		final int nVals = nrValues + (largestOffIsZero || containNoZeroValues ? 0 : 1);
+		final int nVals = nrValues ;
 		long size = estimateInMemorySizeGroupValue(nrColumns, nVals, tupleSparsity, lossy);
 		size += OffsetFactory.estimateInMemorySize(nrRows - largestOff, nrRows);
 		return size;
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUncompressed.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUncompressed.java
index 07b3d88..ad5bdbe 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUncompressed.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUncompressed.java
@@ -298,8 +298,8 @@ public class ColGroupUncompressed extends AColGroup {
 
 	@Override
 	public AColGroup binaryRowOpLeft(BinaryOperator op, double[] v, boolean isRowSafe) {
-		MatrixBlock rowVector = Util.extractValues(v, _colIndexes);
-		return new ColGroupUncompressed(_colIndexes, rowVector.binaryOperations(op, _data, null));
+		throw new NotImplementedException("Binary row op left is not supported for Uncompressed Matrix, "
+			+ "Implement support for VMr in MatrixBLock Binary Cell operations");
 	}
 
 	@Override
@@ -451,9 +451,9 @@ public class ColGroupUncompressed extends AColGroup {
 
 	@Override
 	public void leftMultByAColGroup(AColGroup lhs, MatrixBlock result) {
-		if(lhs instanceof ColGroupEmpty)
+		if(lhs instanceof ColGroupEmpty || getData().isEmpty())
 			return;
-		if(lhs instanceof ColGroupUncompressed) {
+		else if(lhs instanceof ColGroupUncompressed) {
 			ColGroupUncompressed lhsUC = (ColGroupUncompressed) lhs;
 			MatrixBlock tmpRet = new MatrixBlock(lhs.getNumCols(), _colIndexes.length, 0);
 
@@ -499,43 +499,46 @@ public class ColGroupUncompressed extends AColGroup {
 				}
 			}
 		}
-		else {
+		else if(lhs instanceof APreAgg) {
+			// throw new NotImplementedException();
 			LOG.warn("\nInefficient transpose of uncompressed to fit to"
 				+ " t(AColGroup) %*% UncompressedColGroup mult by colGroup uncompressed column"
 				+ "\nCurrently solved by t(t(Uncompressed) %*% AColGroup)");
-			MatrixBlock ucCG = getData();
-			// make a function that allows the result of the mult to be directly output to a temporary matrix.
-			MatrixBlock tmpTransposedResult = new MatrixBlock(ucCG.getNumColumns(), result.getNumColumns(), false);
-			tmpTransposedResult.allocateDenseBlock();
-
-			MatrixBlock tmp = LibMatrixReorg.transpose(ucCG, InfrastructureAnalyzer.getLocalParallelism());
-			lhs.leftMultByMatrix(tmp, tmpTransposedResult, 0, tmp.getNumRows());
-			tmpTransposedResult.setNonZeros(ucCG.getNumColumns() * result.getNumColumns());
-
-			final double[] resV = result.getDenseBlockValues();
-			final int[] lhsC = lhs._colIndexes;
-			final int[] rhsC = _colIndexes;
-
-			// allocate the resulting matrix into the correct result indexes.
-			// Note that the intermediate matrix is transposed, therefore the indexes are different than a normal
-			// allocation.
 
-			if(tmpTransposedResult.isEmpty())
-				return;
-			else if(tmpTransposedResult.isInSparseFormat())
-				throw new NotImplementedException();
-			else {
-				final double[] tmpV = tmpTransposedResult.getDenseBlockValues();
-				final int nCol = result.getNumColumns();
-
-				for(int row = 0; row < rhsC.length; row++) {
-					final int offR = rhsC[row];
-					final int offT = row * nCol;
-					for(int col = 0; col < lhsC.length; col++)
-						resV[offR + lhsC[col] * nCol] += tmpV[offT + lhsC[col]];
+			final MatrixBlock ucCGT = LibMatrixReorg.transpose(getData(), InfrastructureAnalyzer.getLocalParallelism());
+			
+			final APreAgg paCG = (APreAgg) lhs;
+			final MatrixBlock preAgg = new MatrixBlock(1, lhs.getNumValues(), false);
+			final MatrixBlock tmpRes = new MatrixBlock(1, this.getNumCols(), false);
+			final MatrixBlock dictM =  paCG._dict.getMBDict(paCG.getNumCols()).getMatrixBlock();
+			preAgg.allocateDenseBlock();
+			tmpRes.allocateDenseBlock();
+			final int nRows = ucCGT.getNumRows();
+			final int nCols = lhs.getNumCols();
+			final double[] retV = result.getDenseBlockValues();
+			final double[] tmpV = tmpRes.getDenseBlockValues();
+			final int retCols = result.getNumColumns();
+			for(int i = 0; i < nRows; i++) {
+				if(ucCGT.isInSparseFormat() && ucCGT.getSparseBlock().isEmpty(i))
+					continue;
+				paCG.preAggregate(ucCGT, preAgg, i, i + 1);
+				preAgg.recomputeNonZeros();
+				LibMatrixMult.matrixMult(preAgg, dictM, tmpRes, true);
+
+				final int rowOut = _colIndexes[i];
+				for(int j = 0; j < nCols; j++) {
+					final int colOut = lhs._colIndexes[j] * retCols;
+					retV[rowOut + colOut] += tmpV[j];
+				}
+				if(i < nRows - 1) {
+					preAgg.reset(1, lhs.getNumValues());
+					tmpRes.reset(1, this.getNumCols());
 				}
 			}
 		}
+		else {
+			throw new NotImplementedException();
+		}
 	}
 
 	@Override
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUtils.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUtils.java
new file mode 100644
index 0000000..f33d2de
--- /dev/null
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUtils.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.compress.colgroup;
+
+import org.apache.sysds.runtime.functionobjects.ValueFunction;
+import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
+
+public class ColGroupUtils {
+
+	/**
+	 * Calculate the result of performing the binary operation on an empty row to the left
+	 * 
+	 * v op empty
+	 * 
+	 * @param op         The operator
+	 * @param v          The values to use on the left side of the operator
+	 * @param colIndexes The column indexes to extract
+	 * @return The result as a double array.
+	 */
+	protected final static double[] binaryDefRowLeft(BinaryOperator op, double[] v, int[] colIndexes) {
+		final ValueFunction fn = op.fn;
+		final int len = colIndexes.length;
+		final double[] ret = new double[len];
+		for(int i = 0; i < len; i++)
+			ret[i] = fn.execute(v[colIndexes[i]], 0);
+		return ret;
+	}
+
+	/**
+	 * Calculate the result of performing the binary operation on an empty row to the right
+	 * 
+	 * empty op v
+	 * 
+	 * @param op         The operator
+	 * @param v          The values to use on the right side of the operator
+	 * @param colIndexes The column indexes to extract
+	 * @return The result as a double array.
+	 */
+	protected final static double[] binaryDefRowRight(BinaryOperator op, double[] v, int[] colIndexes) {
+		final ValueFunction fn = op.fn;
+		final int len = colIndexes.length;
+		final double[] ret = new double[len];
+		for(int i = 0; i < len; i++)
+			ret[i] = fn.execute(0, v[colIndexes[i]]);
+		return ret;
+	}
+
+}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/ADictionary.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/ADictionary.java
index 79be408..7ee7ed3 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/ADictionary.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/ADictionary.java
@@ -71,13 +71,73 @@ public abstract class ADictionary implements Serializable {
 	public abstract double aggregate(double init, Builtin fn);
 
 	/**
+	 * Aggregate all the contained values, with a reference offset.
+	 * 
+	 * @param init      The initial value, in cases such as Max value this could be -infinity.
+	 * @param fn        The function to apply to the values
+	 * @param reference The reference offset to each value in the dictionary
+	 * @return The aggregated value as a double.
+	 */
+	public abstract double aggregate(double init, Builtin fn, double[] reference);
+
+	/**
 	 * Aggregate all entries in the rows.
 	 * 
 	 * @param fn   The aggregate function
 	 * @param nCol The number of columns contained in the dictionary.
 	 * @return Aggregates for this dictionary tuples.
 	 */
-	public abstract double[] aggregateTuples(Builtin fn, int nCol);
+	public abstract double[] aggregateRows(Builtin fn, int nCol);
+
+	/**
+	 * Aggregate all entries in the rows with an offset value reference added.
+	 * 
+	 * @param fn        The aggregate function
+	 * @param reference The reference offset to each value in the dictionary
+	 * @return Aggregates for this dictionary tuples.
+	 */
+	public abstract double[] aggregateRows(Builtin fn, double[] reference);
+
+	/**
+	 * Aggregates the columns into the target double array provided.
+	 * 
+	 * @param c          The target double array, this contains the full number of columns, therefore the colIndexes for
+	 *                   this specific dictionary is needed.
+	 * @param fn         The function to apply to individual columns
+	 * @param colIndexes The mapping to the target columns from the individual columns
+	 */
+	public abstract void aggregateCols(double[] c, Builtin fn, int[] colIndexes);
+
+	/**
+	 * Aggregates the columns into the target double array provided.
+	 * 
+	 * @param c          The target double array, this contains the full number of columns, therefore the colIndexes for
+	 *                   this specific dictionary is needed.
+	 * @param fn         The function to apply to individual columns
+	 * @param reference  The reference offset values to add to each cell.
+	 * @param colIndexes The mapping to the target columns from the individual columns
+	 */
+	public abstract void aggregateCols(double[] c, Builtin fn, int[] colIndexes, double[] reference);
+
+	/**
+	 * Allocate a new dictionary and apply the scalar operation on each cell of this dictionary to then return the new dictionary.
+	 * 
+	 * @param op The operator.
+	 * @return The new dictionary to return.
+	 */
+	public abstract ADictionary applyScalarOp(ScalarOperator op);
+
+	/**
+	 * Allocate a new dictionary and apply the scalar operation on each cell to then return a new dictionary.
+	 * 
+	 * outValues[j] = op(this.values[j] + reference[i]) - newReference[i]
+	 * 
+	 * @param op           The operator to apply to each cell.
+	 * @param reference    The reference value to add before the operator.
+	 * @param newReference The reference value to subtract after the operator.
+	 * @return A New Dictionary.
+	 */
+	public abstract ADictionary applyScalarOp(ScalarOperator op, double[] reference, double[] newReference);
 
 	/**
 	 * Applies the scalar operation on the dictionary. Note that this operation modifies the underlying data, and
@@ -110,6 +170,23 @@ public abstract class ADictionary implements Serializable {
 	public abstract ADictionary binOpLeft(BinaryOperator op, double[] v, int[] colIndexes);
 
 	/**
+	 * Apply the binary operator such that each value is offset by the reference before application. Then put the result
+	 * into the new dictionary, but offset it by the new reference.
+	 * 
+	 * outValues[j] = op(v[colIndexes[i]], this.values[j] + reference[i]) - newReference[i]
+	 * 
+	 * 
+	 * @param op           The operation to apply on the dictionary values.
+	 * @param v            The values to use on the left side of the operator.
+	 * @param colIndexes   The column indexes to use.
+	 * @param reference    The reference value to add before operator.
+	 * @param newReference The reference value to subtract after operator.
+	 * @return A new dictionary.
+	 */
+	public abstract ADictionary binOpLeft(BinaryOperator op, double[] v, int[] colIndexes, double[] reference,
+		double[] newReference);
+
+	/**
 	 * Apply binary row operation on the right side.
 	 * 
 	 * @param op         The operation to this dictionary
@@ -120,6 +197,22 @@ public abstract class ADictionary implements Serializable {
 	public abstract ADictionary binOpRight(BinaryOperator op, double[] v, int[] colIndexes);
 
 	/**
+	 * Apply the binary operator such that each value is offset by the reference before application. Then put the result
+	 * into the new dictionary, but offset it by the new reference.
+	 * 
+	 * outValues[j] = op(this.values[j] + reference[i], v[colIndexes[i]]) - newReference[i]
+	 * 
+	 * @param op           The operation to apply on the dictionary values.
+	 * @param v            The values to use on the right side of the operator.
+	 * @param colIndexes   The column indexes to use.
+	 * @param reference    The reference value to add before operator.
+	 * @param newReference The reference value to subtract after operator.
+	 * @return A new dictionary.
+	 */
+	public abstract ADictionary binOpRight(BinaryOperator op, double[] v, int[] colIndexes, double[] reference,
+		double[] newReference);
+
+	/**
 	 * Apply binary row operation on the left side and allocate a new dictionary.
 	 * 
 	 * While adding a new tuple, where the operation is applied with zero values.
@@ -131,7 +224,6 @@ public abstract class ADictionary implements Serializable {
 	 */
 	public abstract ADictionary applyBinaryRowOpLeftAppendNewEntry(BinaryOperator op, double[] v, int[] colIndexes);
 
-
 	/**
 	 * Apply binary row operation on this dictionary on the right side.
 	 * 
@@ -156,16 +248,6 @@ public abstract class ADictionary implements Serializable {
 	public abstract ADictionary cloneAndExtend(int len);
 
 	/**
-	 * Aggregates the columns into the target double array provided.
-	 * 
-	 * @param c          The target double array, this contains the full number of columns, therefore the colIndexes for
-	 *                   this specific dictionary is needed.
-	 * @param fn         The function to apply to individual columns
-	 * @param colIndexes The mapping to the target columns from the individual columns
-	 */
-	public abstract void aggregateCols(double[] c, Builtin fn, int[] colIndexes);
-
-	/**
 	 * Write the dictionary to a DataOutput.
 	 * 
 	 * @param out the output sink to write the dictionary to.
@@ -200,21 +282,57 @@ public abstract class ADictionary implements Serializable {
 	 * 
 	 * Note if the number of columns is one the actual dictionaries values are simply returned.
 	 * 
-	 * @param square    If each entry should be squared.
+	 * 
+	 * @param nrColumns The number of columns in the ColGroup to know how to get the values from the dictionary.
+	 * @return a double array containing the row sums from this dictionary.
+	 */
+	public abstract double[] sumAllRowsToDouble(int nrColumns);
+
+	/**
+	 * Method used as a pre-aggregate of each tuple in the dictionary, to single double values.
+	 * 
+	 * Note each cell value is squared before the row summation.
+	 * 
 	 * @param nrColumns The number of columns in the ColGroup to know how to get the values from the dictionary.
 	 * @return a double array containing the row sums from this dictionary.
 	 */
-	public abstract double[] sumAllRowsToDouble(boolean square, int nrColumns);
+	public abstract double[] sumAllRowsToDoubleSq(int nrColumns);
+
+	/**
+	 * Method used as a pre-aggregate of each tuple in the dictionary, to single double values.
+	 * 
+	 * @param reference The reference values to add to each cell.
+	 * @return a double array containing the row sums from this dictionary.
+	 */
+	public abstract double[] sumAllRowsToDoubleSq(double[] reference);
 
 	/**
 	 * Sum the values at a specific row.
 	 * 
 	 * @param k         The row index to sum
-	 * @param square    If each entry should be squared.
 	 * @param nrColumns The number of columns
 	 * @return The sum of the row.
 	 */
-	public abstract double sumRow(int k, boolean square, int nrColumns);
+	public abstract double sumRow(int k, int nrColumns);
+
+	/**
+	 * Sum the values at a specific row.
+	 * 
+	 * @param k         The row index to sum
+	 * @param nrColumns The number of columns
+	 * @return The sum of the row.
+	 */
+	public abstract double sumRowSq(int k, int nrColumns);
+
+	/**
+	 * Sum the squared values at a specific row, with a reference array added to each cell before squaring.
+	 * 
+	 * @param k         The row index to sum
+	 * @param nrColumns The number of columns
+	 * @param reference The reference vector to add to each cell processed.
+	 * @return The sum of the row.
+	 */
+	public abstract double sumRowSq(int k, int nrColumns, double[] reference);
 
 	/**
 	 * get the column sum of this dictionary only.
@@ -232,9 +350,29 @@ public abstract class ADictionary implements Serializable {
 	 * @param counts     The counts of the individual tuples.
 	 * @param colIndexes The columns indexes of the parent column group, this indicate where to put the column sum into
 	 *                   the c output.
-	 * @param square     Specify if the values should be squared
 	 */
-	public abstract void colSum(double[] c, int[] counts, int[] colIndexes, boolean square);
+	public abstract void colSum(double[] c, int[] counts, int[] colIndexes);
+
+	/**
+	 * Get the column sum of the values contained in the dictionary
+	 * 
+	 * @param c          The output array allocated to contain all column groups output.
+	 * @param counts     The counts of the individual tuples.
+	 * @param colIndexes The columns indexes of the parent column group, this indicate where to put the column sum into
+	 *                   the c output.
+	 */
+	public abstract void colSumSq(double[] c, int[] counts, int[] colIndexes);
+
+	/**
+	 * Get the column sum of the values contained in the dictionary with an offset reference value added to each cell.
+	 * 
+	 * @param c          The output array allocated to contain all column groups output.
+	 * @param counts     The counts of the individual tuples.
+	 * @param colIndexes The columns indexes of the parent column group, this indicate where to put the column sum into
+	 *                   the c output.
+	 * @param reference  The reference values to add to each cell.
+	 */
+	public abstract void colSumSq(double[] c, int[] counts, int[] colIndexes, double[] reference);
 
 	/**
 	 * Get the sum of the values contained in the dictionary
@@ -252,7 +390,16 @@ public abstract class ADictionary implements Serializable {
 	 * @param nCol   The number of columns contained
 	 * @return The square sum scaled by the counts provided.
 	 */
-	public abstract double sumsq(int[] counts, int nCol);
+	public abstract double sumSq(int[] counts, int nCol);
+
+	/**
+	 * Get the square sum of the values contained in the dictionary with a reference offset on each value.
+	 * 
+	 * @param counts    The counts of the individual tuples
+	 * @param reference The reference value
+	 * @return The square sum scaled by the counts and reference.
+	 */
+	public abstract double sumSq(int[] counts, double[] reference);
 
 	/**
 	 * Get a string representation of the dictionary, that considers the layout of the data.
@@ -299,6 +446,15 @@ public abstract class ADictionary implements Serializable {
 	public abstract boolean containsValue(double pattern);
 
 	/**
+	 * Detect if the dictionary contains a specific value with reference offset.
+	 * 
+	 * @param pattern   The pattern/ value to search for
+	 * @param reference The reference double array.
+	 * @return true if the value is contained else false.
+	 */
+	public abstract boolean containsValue(double pattern, double[] reference);
+
+	/**
 	 * Calculate the number of non zeros in the dictionary. The number of non zeros should be scaled with the counts
 	 * given. This gives the exact number of non zero values in the parent column group.
 	 * 
@@ -309,6 +465,20 @@ public abstract class ADictionary implements Serializable {
 	public abstract long getNumberNonZeros(int[] counts, int nCol);
 
 	/**
+	 * Calculate the number of non zeros in the dictionary.
+	 * 
+	 * Each value in the dictionary should be added to the reference value.
+	 * 
+	 * The number of non zeros should be scaled with the given counts.
+	 * 
+	 * @param counts    The Counts of each dict entry.
+	 * @param reference The reference vector.
+	 * @param nRows     The number of rows in the input.
+	 * @return The NonZero Count.
+	 */
+	public abstract long getNumberNonZeros(int[] counts, double[] reference, int nRows);
+
+	/**
 	 * Copies and adds the dictionary entry from this dictionary to the d dictionary
 	 * 
 	 * @param d    the target dictionary
@@ -380,6 +550,8 @@ public abstract class ADictionary implements Serializable {
 	 */
 	public abstract ADictionary replace(double pattern, double replace, int nCol);
 
+	public abstract ADictionary replace(double pattern, double replace, double[] reference);
+
 	public abstract ADictionary replaceZeroAndExtend(double replace, int nCol);
 
 	public abstract double product(int[] counts, int nCol);
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/Dictionary.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/Dictionary.java
index 3707de7..f378756 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/Dictionary.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/Dictionary.java
@@ -80,7 +80,19 @@ public class Dictionary extends ADictionary {
 	}
 
 	@Override
-	public double[] aggregateTuples(Builtin fn, final int nCol) {
+	public double aggregate(double init, Builtin fn, double[] reference) {
+		final int nCol = reference.length;
+		double ret = init;
+		for(int i = 0; i < _values.length; i++)
+			ret = fn.execute(ret, _values[i] + reference[i % nCol]);
+
+		for(int i = 0; i < nCol; i++)
+			ret = fn.execute(ret, reference[i]);
+		return ret;
+	}
+
+	@Override
+	public double[] aggregateRows(Builtin fn, int nCol) {
 		if(nCol == 1)
 			return _values;
 		final int nRows = _values.length / nCol;
@@ -95,8 +107,47 @@ public class Dictionary extends ADictionary {
 	}
 
 	@Override
+	public double[] aggregateRows(Builtin fn, double[] reference) {
+		final int nCol = reference.length;
+		final int nRows = _values.length / nCol;
+		double[] res = new double[nRows + 1];
+		int off = 0;
+		for(int i = 0; i < nRows; i++) {
+			res[i] = _values[off++] + reference[0];
+			for(int j = 1; j < nCol; j++)
+				res[i] = fn.execute(res[i], _values[off++] + reference[j]);
+		}
+		res[nRows] = reference[0];
+		for(int i = 0; i < nCol; i++)
+			res[nRows] = fn.execute(res[nRows], reference[i]);
+		return res;
+	}
+
+	@Override
+	public Dictionary applyScalarOp(ScalarOperator op) {
+		final double[] retV = new double[_values.length];
+		for(int i = 0; i < _values.length; i++)
+			retV[i] = op.executeScalar(_values[i]);
+		return new Dictionary(retV);
+	}
+
+	@Override
+	public Dictionary applyScalarOp(ScalarOperator op, double[] reference, double[] newReference) {
+		final double[] retV = new double[_values.length];
+		final int nCol = reference.length;
+		final int nRow = _values.length / nCol;
+		int off = 0;
+		for(int i = 0; i < nRow; i++) {
+			for(int j = 0; j < nCol; j++) {
+				retV[off] = op.executeScalar(_values[off] + reference[j]) - newReference[j];
+				off++;
+			}
+		}
+		return new Dictionary(retV);
+	}
+
+	@Override
 	public Dictionary inplaceScalarOp(ScalarOperator op) {
-		// in-place modification of the dictionary
 		int len = size();
 		for(int i = 0; i < len; i++)
 			_values[i] = op.executeScalar(_values[i]);
@@ -126,6 +177,23 @@ public class Dictionary extends ADictionary {
 	}
 
 	@Override
+	public Dictionary binOpRight(BinaryOperator op, double[] v, int[] colIndexes, double[] reference,
+		double[] newReference) {
+		final ValueFunction fn = op.fn;
+		final double[] retV = new double[_values.length];
+		final int nCol = reference.length;
+		final int nRow = _values.length / nCol;
+		int off = 0;
+		for(int i = 0; i < nRow; i++) {
+			for(int j = 0; j < nCol; j++) {
+				retV[off] = fn.execute(_values[off] + reference[j], v[colIndexes[j]]) - newReference[j];
+				off++;
+			}
+		}
+		return new Dictionary(retV);
+	}
+
+	@Override
 	public final Dictionary binOpLeft(BinaryOperator op, double[] v, int[] colIndexes) {
 		final ValueFunction fn = op.fn;
 		final double[] retVals = new double[_values.length];
@@ -137,8 +205,25 @@ public class Dictionary extends ADictionary {
 	}
 
 	@Override
+	public Dictionary binOpLeft(BinaryOperator op, double[] v, int[] colIndexes, double[] reference,
+		double[] newReference) {
+		final ValueFunction fn = op.fn;
+		final double[] retV = new double[_values.length];
+		final int nCol = reference.length;
+		final int nRow = _values.length / nCol;
+		int off = 0;
+		for(int i = 0; i < nRow; i++) {
+			for(int j = 0; j < nCol; j++) {
+				retV[off] = fn.execute(v[colIndexes[j]], _values[off] + reference[j]) - newReference[j];
+				off++;
+			}
+		}
+		return new Dictionary(retV);
+	}
+
+	@Override
 	public Dictionary applyBinaryRowOpRightAppendNewEntry(BinaryOperator op, double[] v, int[] colIndexes) {
-		ValueFunction fn = op.fn;
+		final ValueFunction fn = op.fn;
 		final int len = size();
 		final int lenV = colIndexes.length;
 		final double[] values = new double[len + lenV];
@@ -152,7 +237,7 @@ public class Dictionary extends ADictionary {
 
 	@Override
 	public final Dictionary applyBinaryRowOpLeftAppendNewEntry(BinaryOperator op, double[] v, int[] colIndexes) {
-		ValueFunction fn = op.fn;
+		final ValueFunction fn = op.fn;
 		final int len = size();
 		final int lenV = colIndexes.length;
 		final double[] values = new double[len + lenV];
@@ -207,34 +292,67 @@ public class Dictionary extends ADictionary {
 	}
 
 	@Override
-	public double[] sumAllRowsToDouble(boolean square, int nrColumns) {
-		if(nrColumns == 1 && !square)
+	public double[] sumAllRowsToDouble(int nrColumns) {
+		if(nrColumns == 1)
 			return getValues(); // shallow copy of values
 
 		// pre-aggregate value tuple
 		final int numVals = getNumberOfValues(nrColumns);
 		double[] ret = new double[numVals];
-		for(int k = 0; k < numVals; k++) {
-			ret[k] = sumRow(k, square, nrColumns);
-		}
+		for(int k = 0; k < numVals; k++)
+			ret[k] = sumRow(k, nrColumns);
 
 		return ret;
 	}
 
 	@Override
-	public double sumRow(int k, boolean square, int nrColumns) {
+	public double[] sumAllRowsToDoubleSq(int nrColumns) {
+		// pre-aggregate value tuple
+		final int numVals = getNumberOfValues(nrColumns);
+		double[] ret = new double[numVals];
+		for(int k = 0; k < numVals; k++)
+			ret[k] = sumRowSq(k, nrColumns);
+
+		return ret;
+	}
+
+	@Override
+	public double[] sumAllRowsToDoubleSq(double[] reference) {
+		final int nCol = reference.length;
+		final int numVals = getNumberOfValues(nCol);
+		double[] ret = new double[numVals + 1];
+		for(int k = 0; k < numVals; k++)
+			ret[k] = sumRowSq(k, nCol, reference);
+		for(int i = 0; i < nCol; i++)
+			ret[numVals] += reference[i] * reference[i];
+		return ret;
+	}
 
-		int valOff = k * nrColumns;
+	@Override
+	public double sumRow(int k, int nrColumns) {
+		final int valOff = k * nrColumns;
 		double res = 0.0;
-		if(!square) {
-			for(int i = 0; i < nrColumns; i++) {
-				res += _values[valOff + i];
-			}
-		}
-		else {
-			// kSquare
-			for(int i = 0; i < nrColumns; i++)
-				res += _values[valOff + i] * _values[valOff + i];
+		for(int i = 0; i < nrColumns; i++)
+			res += _values[valOff + i];
+		return res;
+	}
+
+	@Override
+	public double sumRowSq(int k, int nrColumns) {
+		final int valOff = k * nrColumns;
+		double res = 0.0;
+		for(int i = 0; i < nrColumns; i++)
+			res += _values[valOff + i] * _values[valOff + i];
+		return res;
+	}
+
+	@Override
+	public double sumRowSq(int k, int nrColumns, double[] reference) {
+		final int valOff = k * nrColumns;
+		double res = 0.0;
+		for(int i = 0; i < nrColumns; i++) {
+			final double v = _values[valOff + i] + reference[i];
+			res += v * v;
 		}
 		return res;
 	}
@@ -252,44 +370,89 @@ public class Dictionary extends ADictionary {
 	}
 
 	@Override
-	public void colSum(double[] c, int[] counts, int[] colIndexes, boolean square) {
-		for(int k = 0; k < _values.length / colIndexes.length; k++) {
+	public void colSum(double[] c, int[] counts, int[] colIndexes) {
+		final int nCol = colIndexes.length;
+		for(int k = 0; k < _values.length / nCol; k++) {
 			final int cntk = counts[k];
-			for(int j = 0; j < colIndexes.length; j++) {
-				double v = _values[k * colIndexes.length + j];
-				if(square)
-					c[colIndexes[j]] += v * v * cntk;
-				else
-					c[colIndexes[j]] += v * cntk;
+			final int off = k * nCol;
+			for(int j = 0; j < nCol; j++)
+				c[colIndexes[j]] += _values[off + j] * cntk;
+		}
+	}
+
+	@Override
+	public void colSumSq(double[] c, int[] counts, int[] colIndexes) {
+		final int nCol = colIndexes.length;
+		final int nRow = _values.length / nCol;
+		int off = 0;
+		for(int k = 0; k < nRow; k++) {
+			final int cntk = counts[k];
+			for(int j = 0; j < nCol; j++) {
+				final double v = _values[off++];
+				c[colIndexes[j]] += v * v * cntk;
 			}
 		}
+	}
 
+	@Override
+	public void colSumSq(double[] c, int[] counts, int[] colIndexes, double[] reference) {
+		final int nCol = colIndexes.length;
+		final int nRow = _values.length / nCol;
+		int off = 0;
+		for(int k = 0; k < nRow; k++) {
+			final int cntk = counts[k];
+			for(int j = 0; j < nCol; j++) {
+				final double v = _values[off++] + reference[j];
+				c[colIndexes[j]] += v * v * cntk;
+			}
+		}
+		for(int i = 0; i < nCol; i++)
+			c[colIndexes[i]] += reference[i] * reference[i] * counts[nRow];
 	}
 
 	@Override
-	public double sum(int[] counts, int ncol) {
+	public double sum(int[] counts, int nCol) {
 		double out = 0;
 		int valOff = 0;
-		for(int k = 0; k < _values.length / ncol; k++) {
+		for(int k = 0; k < _values.length / nCol; k++) {
 			int countK = counts[k];
-			for(int j = 0; j < ncol; j++) {
-				out += getValue(valOff++) * countK;
+			for(int j = 0; j < nCol; j++) {
+				out += _values[valOff++] * countK;
 			}
 		}
 		return out;
 	}
 
 	@Override
-	public double sumsq(int[] counts, int ncol) {
+	public double sumSq(int[] counts, int nCol) {
 		double out = 0;
 		int valOff = 0;
-		for(int k = 0; k < _values.length / ncol; k++) {
-			int countK = counts[k];
-			for(int j = 0; j < ncol; j++) {
-				double val = getValue(valOff++);
+		for(int k = 0; k < _values.length / nCol; k++) {
+			final int countK = counts[k];
+			for(int j = 0; j < nCol; j++) {
+				final double val = _values[valOff++];
+				out += val * val * countK;
+			}
+		}
+		return out;
+	}
+
+	@Override
+	public double sumSq(int[] counts, double[] reference) {
+		final int nCol = reference.length;
+		final int nRow = _values.length / nCol;
+		double out = 0;
+		int valOff = 0;
+		for(int k = 0; k < nRow; k++) {
+			final int countK = counts[k];
+			for(int j = 0; j < nCol; j++) {
+				final double val = _values[valOff++] + reference[j];
 				out += val * val * countK;
 			}
 		}
+		for(int i = 0; i < nCol; i++)
+			out += reference[i] * reference[i] * counts[nRow];
+
 		return out;
 	}
 
@@ -297,7 +460,7 @@ public class Dictionary extends ADictionary {
 	public String toString() {
 		StringBuilder sb = new StringBuilder();
 
-		sb.append("Dictionary:");
+		sb.append("Dictionary : ");
 		sb.append(Arrays.toString(_values));
 		return sb.toString();
 	}
@@ -384,6 +547,15 @@ public class Dictionary extends ADictionary {
 	}
 
 	@Override
+	public boolean containsValue(double pattern, double[] reference) {
+		final int nCol = reference.length;
+		for(int i = 0; i < _values.length; i++)
+			if(_values[i] + reference[i % nCol] == pattern)
+				return true;
+		return false;
+	}
+
+	@Override
 	public long getNumberNonZeros(int[] counts, int nCol) {
 		long nnz = 0;
 		final int nRow = _values.length / nCol;
@@ -400,6 +572,27 @@ public class Dictionary extends ADictionary {
 	}
 
 	@Override
+	public long getNumberNonZeros(int[] counts, double[] reference, int nRows) {
+		long nnz = 0;
+		final int nCol = reference.length;
+		final int nRow = _values.length / nCol;
+		for(int i = 0; i < nRow; i++) {
+			long rowCount = 0;
+			final int off = i * nCol;
+			for(int j = off, jj = 0; j < off + nCol; j++, jj++) {
+				if(_values[j] + reference[jj] != 0)
+					rowCount++;
+			}
+			nnz += rowCount * counts[i];
+		}
+		for(int i = 0; i < nCol; i++)
+			if(reference[i] != 0)
+				nnz += counts[nRow];
+
+		return nnz;
+	}
+
+	@Override
 	public void addToEntry(Dictionary d, int fr, int to, int nCol) {
 		final int sf = nCol * fr; // start from
 		final int ef = sf + nCol; // end from
@@ -432,10 +625,10 @@ public class Dictionary extends ADictionary {
 
 	@Override
 	public ADictionary subtractTuple(double[] tuple) {
-		double[] newValues = new double[_values.length];
-		for(int i = 0; i < _values.length; i++) {
+		double[] newValues = new double[_values.length - tuple.length];
+		for(int i = 0; i < _values.length- tuple.length; i++)
 			newValues[i] = _values[i] - tuple[i % tuple.length];
-		}
+		
 		return new Dictionary(newValues);
 	}
 
@@ -446,12 +639,22 @@ public class Dictionary extends ADictionary {
 
 	@Override
 	public void aggregateCols(double[] c, Builtin fn, int[] colIndexes) {
-		int ncol = colIndexes.length;
-		int vlen = size() / ncol;
-		for(int k = 0; k < vlen; k++)
-			for(int j = 0, valOff = k * ncol; j < ncol; j++)
-				c[colIndexes[j]] = fn.execute(c[colIndexes[j]], getValue(valOff + j));
+		final int nCol = colIndexes.length;
+		final int rlen = _values.length / nCol;
+		for(int k = 0; k < rlen; k++)
+			for(int j = 0, valOff = k * nCol; j < nCol; j++)
+				c[colIndexes[j]] = fn.execute(c[colIndexes[j]], _values[valOff + j]);
+	}
 
+	@Override
+	public void aggregateCols(double[] c, Builtin fn, int[] colIndexes, double[] reference) {
+		final int nCol = reference.length;
+		final int rlen = _values.length / nCol;
+		for(int k = 0; k < rlen; k++)
+			for(int j = 0, valOff = k * nCol; j < nCol; j++)
+				c[colIndexes[j]] = fn.execute(c[colIndexes[j]], _values[valOff + j] + reference[j]);
+		for(int i = 0; i < nCol; i++)
+			c[colIndexes[i]] = fn.execute(c[colIndexes[i]], reference[i]);
 	}
 
 	@Override
@@ -488,10 +691,23 @@ public class Dictionary extends ADictionary {
 		double[] retV = new double[_values.length];
 		for(int i = 0; i < _values.length; i++) {
 			final double v = _values[i];
-			if(v == pattern)
-				retV[i] = replace;
-			else
-				retV[i] = v;
+			retV[i] = v == pattern ? replace : v;
+		}
+		return new Dictionary(retV);
+	}
+
+	@Override
+	public ADictionary replace(double pattern, double replace, double[] reference) {
+		final double[] retV = new double[_values.length];
+		final int nCol = reference.length;
+		final int nRow = _values.length / nCol;
+		int off = 0;
+		for(int i = 0; i < nRow; i++) {
+			for(int j = 0; j < nCol; j++) {
+				final double v = _values[off];
+				retV[off++] = v + reference[j] == pattern ? replace - reference[j] : v;
+
+			}
 		}
 		return new Dictionary(retV);
 	}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/DictionaryFactory.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/DictionaryFactory.java
index 1db433c..236c0f4 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/DictionaryFactory.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/DictionaryFactory.java
@@ -24,6 +24,8 @@ import java.io.IOException;
 import java.util.ArrayList;
 
 import org.apache.commons.lang.NotImplementedException;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.sysds.runtime.compress.DMLCompressionException;
 import org.apache.sysds.runtime.compress.bitmap.ABitmap;
 import org.apache.sysds.runtime.compress.bitmap.Bitmap;
@@ -35,8 +37,7 @@ import org.apache.sysds.runtime.data.SparseRow;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 
 public class DictionaryFactory {
-
-	// protected static final Log LOG = LogFactory.getLog(DictionaryFactory.class.getName());
+	protected static final Log LOG = LogFactory.getLog(DictionaryFactory.class.getName());
 
 	public enum Type {
 		FP64_DICT, MATRIX_BLOCK_DICT, INT8_DICT
@@ -74,7 +75,6 @@ public class DictionaryFactory {
 			final DArrCounts dac = vals.get(i);
 			System.arraycopy(dac.key.getData(), 0, resValues, dac.id * nCols, nCols);
 		}
-
 		return new Dictionary(resValues);
 	}
 
@@ -171,6 +171,8 @@ public class DictionaryFactory {
 			else if(mb.isInSparseFormat()) {
 				MatrixBlockDictionary mbdn = moveToLastDictionaryEntrySparse(mb.getSparseBlock(), largestIndex, zeros, nCol,
 					largestIndexSize);
+				if(mbdn == null)
+					return null;
 				MatrixBlock mbn = mbdn.getMatrixBlock();
 				mbn.setNonZeros(mb.getNonZeros());
 				if(mbn.getNonZeros() == 0)
@@ -196,6 +198,8 @@ public class DictionaryFactory {
 			for(int i = indexToMove + 1; i < sb.numRows(); i++)
 				sb.set(i - 1, sb.get(i), false);
 			sb.set(sb.numRows() - 1, swap, false);
+			if(ret.isEmpty())
+				return null;
 			return new MatrixBlockDictionary(ret);
 		}
 
@@ -214,6 +218,8 @@ public class DictionaryFactory {
 			for(int i = indexToMove + 1; i < sb.numRows(); i++)
 				retB.set(i - 1, sb.get(i), false);
 		}
+		if(ret.isEmpty())
+			return null;
 		return new MatrixBlockDictionary(ret);
 	}
 
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java
index b3fa6f7..b5c826a 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java
@@ -25,13 +25,13 @@ import java.io.IOException;
 import java.util.Arrays;
 
 import org.apache.commons.lang.NotImplementedException;
+import org.apache.sysds.runtime.compress.DMLCompressionException;
 import org.apache.sysds.runtime.compress.utils.Util;
 import org.apache.sysds.runtime.data.DenseBlock;
 import org.apache.sysds.runtime.data.DenseBlockFP64;
 import org.apache.sysds.runtime.data.SparseBlock;
 import org.apache.sysds.runtime.functionobjects.Builtin;
 import org.apache.sysds.runtime.functionobjects.Builtin.BuiltinCode;
-import org.apache.sysds.runtime.functionobjects.Minus;
 import org.apache.sysds.runtime.matrix.data.LibMatrixReorg;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
@@ -45,10 +45,14 @@ public class MatrixBlockDictionary extends ADictionary {
 
 	public MatrixBlockDictionary(double[] values, int nCol) {
 		_data = Util.matrixBlockFromDenseArray(values, nCol);
+		if(_data.isEmpty())
+			throw new DMLCompressionException("Invalid construction of empty dictionary");
 	}
 
 	public MatrixBlockDictionary(MatrixBlock data) {
 		_data = data;
+		if(_data.isEmpty())
+			throw new DMLCompressionException("Invalid construction of empty dictionary");
 	}
 
 	public MatrixBlock getMatrixBlock() {
@@ -93,7 +97,45 @@ public class MatrixBlockDictionary extends ADictionary {
 	}
 
 	@Override
-	public double[] aggregateTuples(Builtin fn, int nCol) {
+	public double aggregate(double init, Builtin fn, double[] reference) {
+		final int nCol = reference.length;
+		final int nRows = _data.getNumRows();
+		double ret = init;
+
+		for(int i = 0; i < nCol; i++)
+			ret = fn.execute(ret, reference[i]);
+
+		if(!_data.isEmpty() && _data.isInSparseFormat()) {
+			final SparseBlock sb = _data.getSparseBlock();
+			for(int i = 0; i < nRows; i++) {
+				if(sb.isEmpty(i))
+					continue;
+				final int apos = sb.pos(i);
+				final int alen = sb.size(i) + apos;
+				final int[] aix = sb.indexes(i);
+				final double[] avals = sb.values(i);
+				for(int k = apos; k < alen; k++) {
+					final double v = avals[k] + reference[aix[k]];
+					ret = fn.execute(ret, v);
+				}
+			}
+		}
+		else if(!_data.isEmpty()) {
+			final double[] values = _data.getDenseBlockValues();
+			int off = 0;
+			for(int k = 0; k < nRows; k++) {
+				for(int j = 0; j < _data.getNumColumns(); j++) {
+					final double v = values[off++] + reference[j];
+					ret = fn.execute(ret, v);
+				}
+			}
+		}
+
+		return ret;
+	}
+
+	@Override
+	public double[] aggregateRows(Builtin fn, int nCol) {
 		double[] ret = new double[_data.getNumRows()];
 		if(_data.isEmpty())
 			return ret;
@@ -130,6 +172,53 @@ public class MatrixBlockDictionary extends ADictionary {
 	}
 
 	@Override
+	public double[] aggregateRows(Builtin fn, double[] reference) {
+		final int nCol = reference.length;
+		final int nRows = _data.getNumRows();
+		final double[] ret = new double[nRows + 1];
+
+		ret[nRows] = reference[0];
+		for(int i = 1; i < nCol; i++)
+			ret[nRows] = fn.execute(ret[nRows], reference[i]);
+
+		if(!_data.isEmpty() && _data.isInSparseFormat()) {
+			final SparseBlock sb = _data.getSparseBlock();
+			for(int i = 0; i < nRows; i++) {
+				if(sb.isEmpty(i))
+					ret[i] = ret[nRows];
+				else {
+					final int apos = sb.pos(i);
+					final int alen = sb.size(i) + apos;
+					final int[] aix = sb.indexes(i);
+					final double[] avals = sb.values(i);
+					int k = apos;
+					int j = 1;
+					ret[i] = (aix[k] == 0) ? avals[k++] + reference[0] : reference[0];
+					for(; j < _data.getNumColumns() && k < alen; j++) {
+						final double v = aix[k] == j ? avals[k++] + reference[j] : reference[j];
+						ret[i] = fn.execute(ret[i], v);
+					}
+					for(; j < _data.getNumColumns(); j++)
+						ret[i] = fn.execute(ret[i], reference[j]);
+				}
+			}
+		}
+		else if(!_data.isEmpty()) {
+			final double[] values = _data.getDenseBlockValues();
+			int off = 0;
+			for(int k = 0; k < nRows; k++) {
+				ret[k] = values[off++] + reference[0];
+				for(int j = 1; j < _data.getNumColumns(); j++) {
+					final double v = values[off++] + reference[j];
+					ret[k] = fn.execute(ret[k], v);
+				}
+			}
+		}
+
+		return ret;
+	}
+
+	@Override
 	public void aggregateCols(double[] c, Builtin fn, int[] colIndexes) {
 		if(_data.isEmpty()) {
 			for(int j = 0; j < colIndexes.length; j++) {
@@ -172,9 +261,102 @@ public class MatrixBlockDictionary extends ADictionary {
 	}
 
 	@Override
-	public ADictionary inplaceScalarOp(ScalarOperator op) {
+	public void aggregateCols(double[] c, Builtin fn, int[] colIndexes, double[] reference) {
+		final int nCol = _data.getNumColumns();
+		final int nRow = _data.getNumRows();
+
+		for(int j = 0; j < colIndexes.length; j++) {
+			final int idx = colIndexes[j];
+			c[idx] = fn.execute(c[idx], reference[j]);
+		}
+		if(!_data.isEmpty() && _data.isInSparseFormat()) {
+			final SparseBlock sb = _data.getSparseBlock();
+			for(int i = 0; i < nRow; i++) {
+				if(sb.isEmpty(i))
+					continue;
+				final int apos = sb.pos(i);
+				final int alen = sb.size(i) + apos;
+				final double[] avals = sb.values(i);
+				final int[] aix = sb.indexes(i);
+				// Shortcut: only stored (non-zero) entries are visited, skipping implicit zeros.
+				for(int k = apos; k < alen; k++) {
+					final int idx = colIndexes[aix[k]];
+					c[idx] = fn.execute(c[idx], avals[k] + reference[aix[k]]);
+				}
+			}
+		}
+		else if(!_data.isEmpty()) {
+			final double[] values = _data.getDenseBlockValues();
+			int off = 0;
+			for(int k = 0; k < nRow; k++) {
+				for(int j = 0; j < nCol; j++) {
+					final int idx = colIndexes[j];
+					c[idx] = fn.execute(c[idx], values[off++] + reference[j]);
+				}
+			}
+		}
+	}
+
+	@Override
+	public ADictionary applyScalarOp(ScalarOperator op) {
 		MatrixBlock res = _data.scalarOperations(op, new MatrixBlock());
-		return new MatrixBlockDictionary(res);
+		if(res.isEmpty())
+			return null;
+		else
+			return new MatrixBlockDictionary(res);
+	}
+
+	@Override
+	public ADictionary applyScalarOp(ScalarOperator op, double[] reference, double[] newReference) {
+		final int nCol = _data.getNumColumns();
+		final int nRow = _data.getNumRows();
+		final MatrixBlock ret = new MatrixBlock(nRow, nCol, false);
+		ret.allocateDenseBlock();
+		final double[] retV = ret.getDenseBlockValues();
+		int off = 0;
+		if(_data.isInSparseFormat()) {
+			final SparseBlock sb = _data.getSparseBlock();
+			for(int i = 0; i < nRow; i++) {
+				if(sb.isEmpty(i))
+					for(int j = 0; j < nCol; j++)
+						retV[off++] = op.executeScalar(reference[j]) - newReference[j];
+				else {
+					final int apos = sb.pos(i);
+					final int alen = sb.size(i) + apos;
+					final int[] aix = sb.indexes(i);
+					final double[] avals = sb.values(i);
+					int j = 0;
+					for(int k = apos; j < nCol && k < alen; j++) {
+						final double v = aix[k] == j ? avals[k++] + reference[j] : reference[j];
+						retV[off++] = op.executeScalar(v) - newReference[j];
+					}
+					for(; j < nCol; j++)
+						retV[off++] = op.executeScalar(reference[j]) - newReference[j];
+				}
+			}
+		}
+		else {
+			final double[] values = _data.getDenseBlockValues();
+			for(int i = 0; i < nRow; i++) {
+				for(int j = 0; j < nCol; j++) {
+					retV[off] = op.executeScalar(values[off] + reference[j]) - newReference[j];
+					off++;
+				}
+			}
+		}
+
+		ret.recomputeNonZeros();
+		ret.examSparsity();
+		if(ret.isEmpty())
+			return null;
+		else
+			return new MatrixBlockDictionary(ret);
+
+	}
+
+	@Override
+	public ADictionary inplaceScalarOp(ScalarOperator op) {
+		throw new NotImplementedException();
 	}
 
 	@Override
@@ -182,21 +364,30 @@ public class MatrixBlockDictionary extends ADictionary {
 		MatrixBlock res = _data.scalarOperations(op, new MatrixBlock());
 		final int lastRow = res.getNumRows();
 		MatrixBlock res2 = new MatrixBlock(lastRow + 1, res.getNumColumns(), true);
-		if(res.isEmpty()) {
+		if(res.isEmpty())
 			for(int i = 0; i < numCols; i++)
 				res2.appendValue(lastRow, i, newVal);
-			return new MatrixBlockDictionary(res2);
-		}
-		else {
+		else
 			res.append(new MatrixBlock(1, numCols, newVal), res2, false);
+
+		if(res2.isEmpty())
+			return null;
+		else
 			return new MatrixBlockDictionary(res2);
-		}
 	}
 
 	@Override
 	public ADictionary binOpLeft(BinaryOperator op, double[] v, int[] colIndexes) {
-		MatrixBlock rowVector = Util.extractValues(v, colIndexes);
-		return new MatrixBlockDictionary(rowVector.binaryOperations(op, _data, null));
+		throw new NotImplementedException("Binary row op left is not supported for Uncompressed Matrix, "
+			+ "Implement support for VMr in MatrixBLock Binary Cell operations");
+		// MatrixBlock rowVector = Util.extractValues(v, colIndexes);
+		// return new MatrixBlockDictionary(rowVector.binaryOperations(op, _data, null));
+	}
+
+	@Override
+	public Dictionary binOpLeft(BinaryOperator op, double[] v, int[] colIndexes, double[] reference,
+		double[] newReference) {
+		throw new NotImplementedException();
 	}
 
 	@Override
@@ -213,6 +404,12 @@ public class MatrixBlockDictionary extends ADictionary {
 	}
 
 	@Override
+	public Dictionary binOpRight(BinaryOperator op, double[] v, int[] colIndexes, double[] reference,
+		double[] newReference) {
+		throw new NotImplementedException();
+	}
+
+	@Override
 	public ADictionary applyBinaryRowOpRightAppendNewEntry(BinaryOperator op, double[] v, int[] colIndexes) {
 		MatrixBlock rowVector = Util.extractValues(v, colIndexes);
 		MatrixBlock tmp = _data.append(new MatrixBlock(1, _data.getNumColumns(), 0), null, false);
@@ -242,7 +439,7 @@ public class MatrixBlockDictionary extends ADictionary {
 	}
 
 	@Override
-	public double[] sumAllRowsToDouble(boolean square, int nrColumns) {
+	public double[] sumAllRowsToDouble(int nrColumns) {
 		double[] ret = new double[_data.getNumRows()];
 
 		if(_data.isEmpty())
@@ -255,7 +452,7 @@ public class MatrixBlockDictionary extends ADictionary {
 					final int alen = sb.size(i) + apos;
 					final double[] avals = sb.values(i);
 					for(int j = apos; j < alen; j++) {
-						ret[i] += (square) ? avals[j] * avals[j] : avals[j];
+						ret[i] += avals[j];
 					}
 				}
 			}
@@ -266,7 +463,7 @@ public class MatrixBlockDictionary extends ADictionary {
 			for(int k = 0; k < _data.getNumRows(); k++) {
 				for(int j = 0; j < _data.getNumColumns(); j++) {
 					final double v = values[off++];
-					ret[k] += (square) ? v * v : v;
+					ret[k] += v;
 				}
 			}
 		}
@@ -274,7 +471,95 @@ public class MatrixBlockDictionary extends ADictionary {
 	}
 
 	@Override
-	public double sumRow(int k, boolean square, int nrColumns) {
+	public double[] sumAllRowsToDoubleSq(int nrColumns) {
+		final double[] ret = new double[_data.getNumRows()];
+
+		if(_data.isEmpty())
+			return ret;
+		else if(_data.isInSparseFormat()) {
+			SparseBlock sb = _data.getSparseBlock();
+			for(int i = 0; i < _data.getNumRows(); i++) {
+				if(!sb.isEmpty(i)) {
+					final int apos = sb.pos(i);
+					final int alen = sb.size(i) + apos;
+					final double[] avals = sb.values(i);
+					for(int j = apos; j < alen; j++) {
+						ret[i] += avals[j] * avals[j];
+					}
+				}
+			}
+		}
+		else {
+			double[] values = _data.getDenseBlockValues();
+			int off = 0;
+			for(int k = 0; k < _data.getNumRows(); k++) {
+				for(int j = 0; j < _data.getNumColumns(); j++) {
+					final double v = values[off++];
+					ret[k] += v * v;
+				}
+			}
+		}
+		return ret;
+	}
+
+	@Override
+	public double[] sumAllRowsToDoubleSq(double[] reference) {
+		final int nCol = reference.length;
+		final int numVals = _data.getNumRows();
+		final double[] ret = new double[numVals + 1];
+
+		final int finalIndex = numVals;
+		for(int i = 0; i < nCol; i++)
+			ret[finalIndex] += reference[i] * reference[i];
+
+		if(!_data.isEmpty() && _data.isInSparseFormat()) {
+			final SparseBlock sb = _data.getSparseBlock();
+			for(int i = 0; i < numVals; i++) {
+				if(sb.isEmpty(i))
+					ret[i] = ret[finalIndex];
+				else {
+					final int apos = sb.pos(i);
+					final int alen = sb.size(i) + apos;
+					final int[] aix = sb.indexes(i);
+					final double[] avals = sb.values(i);
+					int k = apos;
+					int j = 0;
+					for(; j < _data.getNumColumns() && k < alen; j++) {
+						final double v = aix[k] == j ? avals[k++] + reference[j] : reference[j];
+						ret[i] += v * v;
+					}
+					for(; j < _data.getNumColumns(); j++)
+						ret[i] += reference[j] * reference[j];
+				}
+
+			}
+		}
+		else if(!_data.isEmpty()) {
+			double[] values = _data.getDenseBlockValues();
+			int off = 0;
+			for(int k = 0; k < numVals; k++) {
+				for(int j = 0; j < _data.getNumColumns(); j++) {
+					final double v = values[off++] + reference[j];
+					ret[k] += v * v;
+				}
+			}
+		}
+
+		return ret;
+	}
+
+	@Override
+	public double sumRow(int k, int nrColumns) {
+		throw new NotImplementedException();
+	}
+
+	@Override
+	public double sumRowSq(int k, int nrColumns) {
+		throw new NotImplementedException();
+	}
+
+	@Override
+	public double sumRowSq(int k, int nrColumns, double[] reference) {
 		throw new NotImplementedException();
 	}
 
@@ -314,7 +599,40 @@ public class MatrixBlockDictionary extends ADictionary {
 	}
 
 	@Override
-	public void colSum(double[] c, int[] counts, int[] colIndexes, boolean square) {
+	public void colSum(double[] c, int[] counts, int[] colIndexes) {
+		if(_data.isEmpty())
+			return;
+		if(_data.isInSparseFormat()) {
+			SparseBlock sb = _data.getSparseBlock();
+			for(int i = 0; i < _data.getNumRows(); i++) {
+				if(!sb.isEmpty(i)) {
+					// double tmpSum = 0;
+					final int count = counts[i];
+					final int apos = sb.pos(i);
+					final int alen = sb.size(i) + apos;
+					final int[] aix = sb.indexes(i);
+					final double[] avals = sb.values(i);
+					for(int j = apos; j < alen; j++) {
+						c[colIndexes[aix[j]]] += count * avals[j];
+					}
+				}
+			}
+		}
+		else {
+			double[] values = _data.getDenseBlockValues();
+			int off = 0;
+			for(int k = 0; k < _data.getNumRows(); k++) {
+				final int countK = counts[k];
+				for(int j = 0; j < _data.getNumColumns(); j++) {
+					final double v = values[off++];
+					c[colIndexes[j]] += v * countK;
+				}
+			}
+		}
+	}
+
+	@Override
+	public void colSumSq(double[] c, int[] counts, int[] colIndexes) {
 		if(_data.isEmpty())
 			return;
 		if(_data.isInSparseFormat()) {
@@ -328,7 +646,7 @@ public class MatrixBlockDictionary extends ADictionary {
 					final int[] aix = sb.indexes(i);
 					final double[] avals = sb.values(i);
 					for(int j = apos; j < alen; j++) {
-						c[colIndexes[aix[j]]] += square ? count * avals[j] * avals[j] : count * avals[j];
+						c[colIndexes[aix[j]]] += count * avals[j] * avals[j];
 					}
 				}
 			}
@@ -340,7 +658,50 @@ public class MatrixBlockDictionary extends ADictionary {
 				final int countK = counts[k];
 				for(int j = 0; j < _data.getNumColumns(); j++) {
 					final double v = values[off++];
-					c[colIndexes[j]] += square ? v * v * countK : v * countK;
+					c[colIndexes[j]] += v * v * countK;
+				}
+			}
+		}
+	}
+
+	@Override
+	public void colSumSq(double[] c, int[] counts, int[] colIndexes, double[] reference) {
+		final int nCol = reference.length;
+		final int nRow = _data.getNumRows();
+		for(int i = 0; i < nCol; i++)
+			c[colIndexes[i]] += reference[i] * reference[i] * counts[nRow];
+
+		if(!_data.isEmpty() && _data.isInSparseFormat()) {
+			final SparseBlock sb = _data.getSparseBlock();
+			for(int i = 0; i < nRow; i++) {
+				final int countK = counts[i];
+				if(sb.isEmpty(i))
+					for(int j = 0; j < nCol; j++)
+						c[colIndexes[j]] += reference[j] * reference[j] * countK;
+				else {
+					final int apos = sb.pos(i);
+					final int alen = sb.size(i) + apos;
+					final int[] aix = sb.indexes(i);
+					final double[] avals = sb.values(i);
+					int k = apos;
+					int j = 0;
+					for(; j < _data.getNumColumns() && k < alen; j++) {
+						final double v = aix[k] == j ? avals[k++] + reference[j] : reference[j];
+						c[colIndexes[j]] += v * v * countK;
+					}
+					for(; j < _data.getNumColumns(); j++)
+						c[colIndexes[j]] += reference[j] * reference[j] * countK;
+				}
+			}
+		}
+		else if(!_data.isEmpty()) {
+			double[] values = _data.getDenseBlockValues();
+			int off = 0;
+			for(int k = 0; k < nRow; k++) {
+				final int countK = counts[k];
+				for(int j = 0; j < _data.getNumColumns(); j++) {
+					final double v = values[off++] + reference[j];
+					c[colIndexes[j]] += v * v * countK;
 				}
 			}
 		}
@@ -380,7 +741,7 @@ public class MatrixBlockDictionary extends ADictionary {
 	}
 
 	@Override
-	public double sumsq(int[] counts, int ncol) {
+	public double sumSq(int[] counts, int ncol) {
 		double tmpSum = 0;
 		if(_data.isEmpty())
 			return tmpSum;
@@ -413,8 +774,51 @@ public class MatrixBlockDictionary extends ADictionary {
 	}
 
 	@Override
-	public String getString(int colIndexes) {
-		return _data.toString();
+	public double sumSq(int[] counts, double[] reference) {
+		final int nCol = reference.length;
+		final int numVals = _data.getNumRows();
+		double ret = 0;
+		for(int i = 0; i < nCol; i++)
+			ret += reference[i] * reference[i];
+		final double ref = ret;
+		ret *= counts[numVals];
+
+		if(!_data.isEmpty() && _data.isInSparseFormat()) {
+			final SparseBlock sb = _data.getSparseBlock();
+			for(int i = 0; i < numVals; i++) {
+				final int countK = counts[i];
+				if(sb.isEmpty(i))
+					ret += ref * countK;
+				else {
+					final int apos = sb.pos(i);
+					final int alen = sb.size(i) + apos;
+					final int[] aix = sb.indexes(i);
+					final double[] avals = sb.values(i);
+					int k = apos;
+					int j = 0;
+					for(; j < _data.getNumColumns() && k < alen; j++) {
+						final double v = aix[k] == j ? avals[k++] + reference[j] : reference[j];
+						ret += v * v * countK;
+					}
+					for(; j < _data.getNumColumns(); j++)
+						ret += reference[j] * reference[j] * countK;
+				}
+
+			}
+		}
+		else if(!_data.isEmpty()) {
+			double[] values = _data.getDenseBlockValues();
+			int off = 0;
+			for(int k = 0; k < numVals; k++) {
+				final int countK = counts[k];
+				for(int j = 0; j < _data.getNumColumns(); j++) {
+					final double v = values[off++] + reference[j];
+					ret += v * v * countK;
+				}
+			}
+		}
+
+		return ret;
 	}
 
 	@Override
@@ -439,6 +843,53 @@ public class MatrixBlockDictionary extends ADictionary {
 	}
 
 	@Override
+	public boolean containsValue(double pattern, double[] reference) {
+
+		if(_data.isEmpty()) {
+			for(double d : reference)
+				if(pattern == d)
+					return true;
+			return false;
+		}
+		else if(_data.isInSparseFormat()) {
+			final SparseBlock sb = _data.getSparseBlock();
+			for(int i = 0; i < _data.getNumRows(); i++) {
+				if(sb.isEmpty(i))
+					continue;
+				final int apos = sb.pos(i);
+				final int alen = sb.size(i) + apos;
+				final int[] aix = sb.indexes(i);
+				final double[] avals = sb.values(i);
+				int k = apos;
+				int j = 0;
+				for(; j < _data.getNumColumns() && k < alen; j++) {
+					if(aix[k] == j) {
+						if(reference[j] + avals[k++] == pattern)
+							return true;
+					}
+					else {
+						if(reference[j] == pattern)
+							return true;
+					}
+				}
+				for(; j < _data.getNumColumns(); j++)
+					if(reference[j] == pattern)
+						return true;
+
+			}
+		}
+		else {
+			final double[] values = _data.getDenseBlockValues();
+			final int nCol = reference.length;
+			for(int i = 0; i < values.length; i++)
+				if(values[i] + reference[i % nCol] == pattern)
+					return true;
+
+		}
+		return false;
+	}
+
+	@Override
 	public long getNumberNonZeros(int[] counts, int nCol) {
 		if(_data.isEmpty())
 			return 0;
@@ -449,7 +900,6 @@ public class MatrixBlockDictionary extends ADictionary {
 			for(int i = 0; i < _data.getNumRows(); i++)
 				if(!sb.isEmpty(i))
 					nnz += sb.size(i) * counts[i];
-
 		}
 		else {
 			double[] values = _data.getDenseBlockValues();
@@ -468,6 +918,64 @@ public class MatrixBlockDictionary extends ADictionary {
 	}
 
 	@Override
+	public long getNumberNonZeros(int[] counts, double[] reference, int nRows) {
+		long nnz = 0;
+		for(double d : reference)
+			if(d != 0)
+				nnz++;
+		if(_data.isEmpty()) {
+			// every row equals the reference tuple, so each of the nRows rows contributes nnz non-zeros
+			return nnz * nRows;
+		}
+		else if(_data.isInSparseFormat()) {
+			SparseBlock sb = _data.getSparseBlock();
+			long emptyRowNNZ = nnz;
+			nnz *= counts[counts.length - 1]; // scale reference nnz by the count of the default (reference) tuple.
+			for(int i = 0; i < _data.getNumRows(); i++) {
+				if(sb.isEmpty(i))
+					nnz += emptyRowNNZ * counts[i];
+				else {
+					int countThis = 0;
+					final int apos = sb.pos(i);
+					final int alen = sb.size(i) + apos;
+					final int[] aix = sb.indexes(i);
+					final double[] avals = sb.values(i);
+					int k = apos;
+					int j = 0;
+					for(; j < _data.getNumColumns() && k < alen; j++) {
+						if(aix[k] == j) {
+							if(reference[j] + avals[k++] != 0)
+								countThis++;
+						}
+						else {
+							if(reference[j] != 0)
+								countThis++;
+						}
+					}
+					for(; j < _data.getNumColumns(); j++)
+						if(reference[j] != 0)
+							countThis++;
+
+					nnz += countThis * counts[i];
+				}
+			}
+		}
+		else {
+			nnz *= counts[counts.length - 1]; // scale reference nnz by the count of the default (reference) tuple.
+			final double[] values = _data.getDenseBlockValues();
+			int off = 0;
+			for(int i = 0; i < _data.getNumRows(); i++) {
+				int countThisTuple = 0;
+				for(int j = 0; j < _data.getNumColumns(); j++)
+					if(values[off++] + reference[j] != 0)
+						countThisTuple++;
+				nnz += countThisTuple * counts[i];
+			}
+		}
+		return nnz;
+	}
+
+	@Override
 	public void addToEntry(Dictionary d, int fr, int to, int nCol) {
 		double[] v = d.getValues();
 		if(_data.isEmpty())
@@ -525,11 +1033,27 @@ public class MatrixBlockDictionary extends ADictionary {
 
 	@Override
 	public ADictionary subtractTuple(double[] tuple) {
-		DenseBlockFP64 b = new DenseBlockFP64(new int[] {1, tuple.length}, tuple);
-		MatrixBlock rowVector = new MatrixBlock(1, tuple.length, b);
-		MatrixBlock res = new MatrixBlock(_data.getNumColumns(), _data.getNumRows(), _data.isInSparseFormat());
-		_data.binaryOperations(new BinaryOperator(Minus.getMinusFnObject()), rowVector, res);
-		return new MatrixBlockDictionary(res);
+		if(_data.isEmpty())
+			throw new NotImplementedException("Should not extract from empty matrix");
+		else if(_data.isInSparseFormat()) {
+			throw new NotImplementedException("Not supporting extracting from sparse matrix yet");
+		}
+		else {
+			final int nRow = _data.getNumRows() - 1;
+			final int nCol = _data.getNumColumns();
+			double[] values = _data.getDenseBlockValues();
+			MatrixBlock res = new MatrixBlock(nCol, nRow, false);
+			res.allocateBlock();
+			double[] resVals = res.getDenseBlockValues();
+			for(int i = 0, off = 0; i < nRow; i++)
+				for(int j = 0; j < nCol; j++, off++)
+					resVals[off] = values[off] - tuple[j];
+
+			res.examSparsity();
+			if(res.isEmpty())
+				return null;
+			return new MatrixBlockDictionary(res);
+		}
 	}
 
 	@Override
@@ -539,8 +1063,19 @@ public class MatrixBlockDictionary extends ADictionary {
 	}
 
 	@Override
+	public String getString(int colIndexes) {
+		if(_data.isInSparseFormat() || _data.getNumColumns() > 1)
+			return "\n" + _data.toString();
+		else
+			return Arrays.toString(_data.getDenseBlockValues());
+	}
+
+	@Override
 	public String toString() {
-		return "MatrixBlock Dictionary :" + _data.toString();
+		if(_data.isInSparseFormat() || _data.getNumColumns() > 1)
+			return "MatrixBlock Dictionary :\n" + _data.toString();
+		else
+			return "MatrixBlock Dictionary : " + Arrays.toString(_data.getDenseBlockValues());
 	}
 
 	@Override
@@ -645,7 +1180,7 @@ public class MatrixBlockDictionary extends ADictionary {
 
 		DenseBlock dictV = new DenseBlockFP64(new int[] {numVals, aggregateColumns.length}, ret);
 		MatrixBlock dictM = new MatrixBlock(numVals, aggregateColumns.length, dictV);
-		dictM.getNonZeros();
+		dictM.recomputeNonZeros();
 		dictM.examSparsity();
 		return new MatrixBlockDictionary(dictM);
 
@@ -653,16 +1188,66 @@ public class MatrixBlockDictionary extends ADictionary {
 
 	@Override
 	public ADictionary replace(double pattern, double replace, int nCol) {
-		MatrixBlock ret = _data.replaceOperations(new MatrixBlock(), pattern, replace);
+		final MatrixBlock ret = _data.replaceOperations(new MatrixBlock(), pattern, replace);
+		if(ret.isEmpty())
+			return null;
 		return new MatrixBlockDictionary(ret);
 	}
 
 	@Override
+	public ADictionary replace(double pattern, double replace, double[] reference) {
+		final int nRow = _data.getNumRows();
+		final int nCol = _data.getNumColumns();
+		final MatrixBlock ret = new MatrixBlock(nRow, nCol, false);
+		ret.allocateDenseBlock();
+		final double[] retV = ret.getDenseBlockValues();
+		int off = 0;
+		if(_data.isInSparseFormat()) {
+			final SparseBlock sb = _data.getSparseBlock();
+			for(int i = 0; i < nRow; i++) {
+				if(sb.isEmpty(i))
+					for(int j = 0; j < nCol; j++)
+						retV[off++] = pattern == reference[j] ? replace - reference[j] : 0;
+				else {
+					final int apos = sb.pos(i);
+					final int alen = sb.size(i) + apos;
+					final int[] aix = sb.indexes(i);
+					final double[] avals = sb.values(i);
+					int j = 0;
+					for(int k = apos; j < nCol && k < alen; j++) {
+						final double v = aix[k] == j ? avals[k++] + reference[j] : reference[j];
+						retV[off++] = pattern == v ? replace - reference[j] : v - reference[j];
+					}
+					for(; j < nCol; j++)
+						retV[off++] = pattern == reference[j] ? replace - reference[j] : 0;
+				}
+			}
+		}
+		else {
+			final double[] values = _data.getDenseBlockValues();
+			for(int i = 0; i < nRow; i++) {
+				for(int j = 0; j < nCol; j++) {
+					final double v = values[off];
+					retV[off++] = pattern == v + reference[j] ? replace - reference[j] : v;
+				}
+			}
+		}
+
+		ret.recomputeNonZeros();
+		ret.examSparsity();
+		if(ret.isEmpty())
+			return null;
+		else
+			return new MatrixBlockDictionary(ret);
+
+	}
+
+	@Override
 	public ADictionary replaceZeroAndExtend(double replace, int nCol) {
 		final int nRows = _data.getNumRows();
 		final int nCols = _data.getNumColumns();
 		final long nonZerosOut = (nRows + 1) * nCols;
-		final MatrixBlock ret = new MatrixBlock(_data.getNumRows() + 1, _data.getNumColumns(), false);
+		final MatrixBlock ret = new MatrixBlock(nRows + 1, nCols, false);
 		ret.allocateBlock();
 		ret.setNonZeros(nonZerosOut);
 		final double[] retValues = ret.getDenseBlockValues();
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/QDictionary.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/QDictionary.java
index bfab527..879892a 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/QDictionary.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/QDictionary.java
@@ -101,7 +101,12 @@ public class QDictionary extends ADictionary {
 	}
 
 	@Override
-	public double[] aggregateTuples(Builtin fn, final int nCol) {
+	public double aggregate(double init, Builtin fn, double[] reference) {
+		throw new NotImplementedException();
+	}
+
+	@Override
+	public double[] aggregateRows(Builtin fn, final int nCol) {
 		if(nCol == 1)
 			return getValues();
 		final int nRows = _values.length / nCol;
@@ -116,6 +121,11 @@ public class QDictionary extends ADictionary {
 	}
 
 	@Override
+	public double[] aggregateRows(Builtin fn, double[] reference) {
+		throw new NotImplementedException();
+	}
+
+	@Override
 	public QDictionary inplaceScalarOp(ScalarOperator op) {
 		if(_values == null)
 			return this;
@@ -155,6 +165,11 @@ public class QDictionary extends ADictionary {
 	}
 
 	@Override
+	public QDictionary applyScalarOp(ScalarOperator op) {
+		throw new NotImplementedException();
+	}
+
+	@Override
 	public QDictionary applyScalarOp(ScalarOperator op, double newVal, int numCols) {
 		double[] temp = getValues();
 		double max = Math.abs(newVal);
@@ -219,39 +234,60 @@ public class QDictionary extends ADictionary {
 	}
 
 	@Override
-	public double[] sumAllRowsToDouble(boolean square, int nrColumns) {
-		if(nrColumns == 1 && !square)
+	public double[] sumAllRowsToDouble(int nrColumns) {
+		if(nrColumns == 1)
 			return getValues(); // shallow copy of values
 
 		final int numVals = getNumberOfValues(nrColumns);
 		double[] ret = new double[numVals];
-		for(int k = 0; k < numVals; k++) {
-			ret[k] = sumRow(k, square, nrColumns);
-		}
+		for(int k = 0; k < numVals; k++)
+			ret[k] = sumRow(k, nrColumns);
 
 		return ret;
 	}
 
 	@Override
-	public double sumRow(int k, boolean square, int nrColumns) {
+	public double[] sumAllRowsToDoubleSq(int nrColumns) {
+		final int numVals = getNumberOfValues(nrColumns);
+		double[] ret = new double[numVals];
+		for(int k = 0; k < numVals; k++)
+			ret[k] = sumRowSq(k, nrColumns);
+		return ret;
+	}
+
+	@Override
+	public double[] sumAllRowsToDoubleSq(double[] reference) {
+		throw new NotImplementedException();
+	}
+
+	@Override
+	public double sumRow(int k, int nrColumns) {
 		if(_values == null)
 			return 0;
 		int valOff = k * nrColumns;
 
-		if(!square) {
-			int res = 0;
-			for(int i = 0; i < nrColumns; i++) {
-				res += _values[valOff + i];
-			}
-			return res * _scale;
-		}
-		else {
-			// kSquare
-			double res = 0.0;
-			for(int i = 0; i < nrColumns; i++)
-				res += (int) (_values[valOff + i] * _values[valOff + i]) * _scale * _scale;
-			return res;
+		int res = 0;
+		for(int i = 0; i < nrColumns; i++) {
+			res += _values[valOff + i];
 		}
+		return res * _scale;
+
+	}
+
+	@Override
+	public double sumRowSq(int k, int nrColumns) {
+		if(_values == null)
+			return 0;
+		int valOff = k * nrColumns;
+		double res = 0.0;
+		for(int i = 0; i < nrColumns; i++)
+			res += (int) (_values[valOff + i] * _values[valOff + i]) * _scale * _scale;
+		return res;
+	}
+
+	@Override
+	public double sumRowSq(int k, int nrColumns, double[] reference) {
+		throw new NotImplementedException();
 	}
 
 	@Override
@@ -260,17 +296,32 @@ public class QDictionary extends ADictionary {
 	}
 
 	@Override
-	public void colSum(double[] c, int[] counts, int[] colIndexes, boolean square) {
+	public void colSum(double[] c, int[] counts, int[] colIndexes) {
 		throw new NotImplementedException("Not Implemented");
 	}
 
 	@Override
+	public void colSumSq(double[] c, int[] counts, int[] colIndexes) {
+		throw new NotImplementedException("Not Implemented");
+	}
+
+	@Override
+	public void colSumSq(double[] c, int[] counts, int[] colIndexes, double[] reference) {
+		throw new NotImplementedException();
+	}
+
+	@Override
 	public double sum(int[] counts, int ncol) {
 		throw new NotImplementedException("Not Implemented");
 	}
 
 	@Override
-	public double sumsq(int[] counts, int ncol) {
+	public double sumSq(int[] counts, int ncol) {
+		throw new NotImplementedException("Not Implemented");
+	}
+
+	@Override
+	public double sumSq(int[] counts, double[] reference) {
 		throw new NotImplementedException("Not Implemented");
 	}
 
@@ -342,6 +393,11 @@ public class QDictionary extends ADictionary {
 	}
 
 	@Override
+	public boolean containsValue(double pattern, double[] reference) {
+		throw new NotImplementedException();
+	}
+
+	@Override
 	public long getNumberNonZeros(int[] counts, int nCol) {
 		long nnz = 0;
 		final int nRow = _values.length / nCol;
@@ -358,6 +414,11 @@ public class QDictionary extends ADictionary {
 	}
 
 	@Override
+	public long getNumberNonZeros(int[] counts, double[] reference, int nRows) {
+		throw new NotImplementedException("not implemented yet");
+	}
+
+	@Override
 	public void addToEntry(Dictionary d, int fr, int to, int nCol) {
 		throw new NotImplementedException("Not implemented yet");
 	}
@@ -388,6 +449,11 @@ public class QDictionary extends ADictionary {
 	}
 
 	@Override
+	public void aggregateCols(double[] c, Builtin fn, int[] colIndexes, double[] reference) {
+		throw new NotImplementedException();
+	}
+
+	@Override
 	public ADictionary scaleTuples(int[] scaling, int nCol) {
 		throw new NotImplementedException();
 	}
@@ -404,6 +470,11 @@ public class QDictionary extends ADictionary {
 	}
 
 	@Override
+	public ADictionary replace(double pattern, double replace, double[] reference) {
+		throw new NotImplementedException();
+	}
+
+	@Override
 	public ADictionary replaceZeroAndExtend(double replace, int nCol) {
 		throw new NotImplementedException();
 	}
@@ -420,25 +491,38 @@ public class QDictionary extends ADictionary {
 
 	@Override
 	public ADictionary applyBinaryRowOpLeftAppendNewEntry(BinaryOperator op, double[] v, int[] colIndexes) {
-		// TODO Auto-generated method stub
-		return null;
+		throw new NotImplementedException();
 	}
 
 	@Override
 	public ADictionary binOpLeft(BinaryOperator op, double[] v, int[] colIndexes) {
-		// TODO Auto-generated method stub
-		return null;
+		throw new NotImplementedException();
 	}
 
 	@Override
 	public ADictionary binOpRight(BinaryOperator op, double[] v, int[] colIndexes) {
-		// TODO Auto-generated method stub
-		return null;
+		throw new NotImplementedException();
 	}
 
 	@Override
 	public ADictionary applyBinaryRowOpRightAppendNewEntry(BinaryOperator op, double[] v, int[] colIndexes) {
-		// TODO Auto-generated method stub
-		return null;
+		throw new NotImplementedException();
+	}
+
+	@Override
+	public ADictionary applyScalarOp(ScalarOperator op, double[] reference, double[] newReference) {
+		throw new NotImplementedException();
+	}
+
+	@Override
+	public ADictionary binOpLeft(BinaryOperator op, double[] v, int[] colIndexes, double[] reference,
+		double[] newReference) {
+		throw new NotImplementedException();
+	}
+
+	@Override
+	public ADictionary binOpRight(BinaryOperator op, double[] v, int[] colIndexes, double[] reference,
+		double[] newReference) {
+		throw new NotImplementedException();
 	}
 }
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/insertionsort/AInsertionSorter.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/insertionsort/AInsertionSorter.java
index abd248c..b43bcfb 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/insertionsort/AInsertionSorter.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/insertionsort/AInsertionSorter.java
@@ -47,7 +47,7 @@ public abstract class AInsertionSorter {
 	public AInsertionSorter(int endLength, int numRows, IntArrayList[] offsets) {
 		_indexes = new int[endLength];
 		_numLabels = offsets.length;
-		_labels = MapToFactory.create(endLength, _numLabels );
+		_labels = MapToFactory.create(endLength, _numLabels);
 		_numRows = numRows;
 		_offsets = offsets;
 		_negativeIndex = -1;
@@ -56,7 +56,7 @@ public abstract class AInsertionSorter {
 	public AInsertionSorter(int endLength, int numRows, IntArrayList[] offsets, int negativeIndex) {
 		_indexes = new int[endLength];
 		_numLabels = offsets.length;
-		_labels = MapToFactory.create(endLength, _numLabels );
+		_labels = MapToFactory.create(endLength, _numLabels);
 		_numRows = numRows;
 		_offsets = offsets;
 		_negativeIndex = negativeIndex;
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/insertionsort/InsertionSorterFactory.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/insertionsort/InsertionSorterFactory.java
index 2452588..1cd0c6f 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/insertionsort/InsertionSorterFactory.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/insertionsort/InsertionSorterFactory.java
@@ -19,9 +19,12 @@
 
 package org.apache.sysds.runtime.compress.colgroup.insertionsort;
 
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.sysds.runtime.compress.utils.IntArrayList;
 
 public class InsertionSorterFactory {
+	protected static final Log LOG = LogFactory.getLog(InsertionSorterFactory.class.getName());
 
 	public enum SORT_TYPE {
 		MERGE, MATERIALIZE;
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/insertionsort/MaterializeSort.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/insertionsort/MaterializeSort.java
index 2d9c5b8..e3d533d 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/insertionsort/MaterializeSort.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/insertionsort/MaterializeSort.java
@@ -24,7 +24,7 @@ import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory;
 import org.apache.sysds.runtime.compress.utils.IntArrayList;
 
 public class MaterializeSort extends AInsertionSorter {
-	public static int CACHE_BLOCK = 1000;
+	public static int CACHE_BLOCK = 50000;
 
 	/** a dense mapToData, that have a value for each row in the input. */
 	private final AMapToData md;
@@ -34,9 +34,8 @@ public class MaterializeSort extends AInsertionSorter {
 	protected MaterializeSort(int endLength, int numRows, IntArrayList[] offsets) {
 		super(endLength, numRows, offsets);
 
-		md = MapToFactory.create(Math.min(_numRows, CACHE_BLOCK), _numLabels);
+		md = MapToFactory.create(Math.min(_numRows, CACHE_BLOCK), Math.max(_numLabels, 3));
 		skip = new int[offsets.length];
-
 		for(int block = 0; block < _numRows; block += CACHE_BLOCK) {
 			md.fill(_numLabels);
 			insert(block, Math.min(block + CACHE_BLOCK, _numRows));
@@ -46,7 +45,7 @@ public class MaterializeSort extends AInsertionSorter {
 	protected MaterializeSort(int endLength, int numRows, IntArrayList[] offsets, int negativeIndex) {
 		super(endLength, numRows, offsets, negativeIndex);
 
-		md = MapToFactory.create(Math.min(_numRows, CACHE_BLOCK), _numLabels);
+		md = MapToFactory.create(Math.min(_numRows, CACHE_BLOCK), Math.max(_numLabels, 3));
 		skip = new int[offsets.length];
 
 		for(int block = 0; block < _numRows; block += CACHE_BLOCK) {
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/AMapToData.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/AMapToData.java
index d3310fe..d430d48 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/AMapToData.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/AMapToData.java
@@ -25,12 +25,22 @@ import java.io.Serializable;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.sysds.runtime.compress.colgroup.offset.AOffset;
+import org.apache.sysds.runtime.data.DenseBlock;
+import org.apache.sysds.runtime.data.SparseBlock;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 
+/**
+ * This class's job is to link into the dictionary entries for column groups.
+ * 
+ * Column groups:
+ * 
+ * - DDC uses this to map directly to the dictionary
+ * 
+ * - SDC uses this in collaboration with the offsets to only point to dictionary entries for non-default values.
+ */
 public abstract class AMapToData implements Serializable {
 
-	private static final long serialVersionUID = 100512759972844714L;
-
 	protected static final Log LOG = LogFactory.getLog(AMapToData.class.getName());
 
 	/** Number of unique values inside this map. */
@@ -63,7 +73,7 @@ public abstract class AMapToData implements Serializable {
 	 * 
 	 * @param nUnique the value to set.
 	 */
-	protected final void setUnique(int nUnique) {
+	public final void setUnique(int nUnique) {
 		this.nUnique = nUnique;
 	}
 
@@ -145,14 +155,83 @@ public abstract class AMapToData implements Serializable {
 	/**
 	 * Pre aggregate a dense matrix m into pre, subject to only including a row segment and column segment.
 	 * 
-	 * @param m   The dense matrix values to preaggregate
-	 * @param pre The preAggregate to populate with the summed values of m
-	 * @param rl  The row start in m
-	 * @param ru  The row end in m
-	 * @param cl  The column start in m
-	 * @param cu  The column end in m
+	 * @param m     The dense matrix values to preaggregate
+	 * @param preAV The preAggregate double array populate with the summed values of m
+	 * @param rl    The row start in m
+	 * @param ru    The row end in m
+	 * @param cl    The column start in m
+	 * @param cu    The column end in m
+	 */
+	public final void preAggregateDense(MatrixBlock m, double[] preAV, int rl, int ru, int cl, int cu) {
+		final DenseBlock db = m.getDenseBlock();
+		if(rl == ru - 1)
+			preAggregateDenseToRow(db.values(rl), db.pos(rl), preAV, cl, cu);
+		else
+			preAggregateDenseRows(m, preAV, rl, ru, cl, cu);
+	}
+
+	/**
+	 * PreAggregate Dense on a single row.
+	 * 
+	 * @param mV    The DenseMatrix Values from the input matrix block for the specific row given
+	 * @param off   The offset into the mV that the row values start from
+	 * @param preAV The PreAggregate value target to preAggregate into
+	 * @param cl    The column index to start at
+	 * @param cu    The column index to stop at (not inclusive)
+	 */
+	protected abstract void preAggregateDenseToRow(double[] mV, int off, double[] preAV, int cl, int cu);
+
+	/**
+	 * PreAggregate from a Dense Matrix, handling multiple rows.
+	 * 
+	 * @param m     The Matrix to preAggregate.
+	 * @param preAV The target dense array to preAggregate into
+	 * @param rl    The row to start at
+	 * @param ru    The row to end at (not inclusive)
+	 * @param cl    The column to start at
+	 * @param cu    The column to end at (not inclusive)
+	 */
+	protected abstract void preAggregateDenseRows(MatrixBlock m, double[] preAV, int rl, int ru, int cl, int cu);
+
+	/**
+	 * PreAggregate a Dense Matrix at index offsets.
+	 * 
+	 * @param m       The DenseBlock to preAggregate
+	 * @param preAV   The target double array to put the preAggregate into
+	 * @param rl      The row to start at
+	 * @param ru      The row to end at (not inclusive)
+	 * @param cl      The column in m to start from
+	 * @param cu      The column in m to end at (not inclusive)
+	 * @param indexes The Offset Indexes to iterate through
+	 */
+	public abstract void preAggregateDense(MatrixBlock m, double[] preAV, int rl, int ru, int cl, int cu,
+		AOffset indexes);
+
+	/**
+	 * PreAggregate the SparseBlock in the range of rows given.
+	 * 
+	 * @param sb      The SparseBlock to preAggregate
+	 * @param preAV   The target double array to put the preAggregate into
+	 * @param rl      The row to start at
+	 * @param ru      The row to end at (not inclusive)
+	 * @param indexes The Offset Indexes to iterate through
+	 */
+	public abstract void preAggregateSparse(SparseBlock sb, double[] preAV, int rl, int ru, AOffset indexes);
+
+	/**
+	 * Get the number of counts of each unique value contained in this map.
+	 * 
+	 * @param counts The count array to populate and return.
+	 * @param nRows  The number of rows in the calling column group.
+	 * @return The populated counts array
 	 */
-	public abstract void preAggregateDense(MatrixBlock m, MatrixBlock pre, int rl, int ru, int cl, int cu);
+	public int[] getCounts(int[] counts, int nRows) {
+		final int nonDefaultLength = size();
+		for(int i = 0; i < nonDefaultLength; i++)
+			counts[getIndex(i)]++;
+		counts[counts.length - 1] += nRows - nonDefaultLength;
+		return counts;
+	}
 
 	/**
 	 * Copy the values in this map into another mapping object.
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToBit.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToBit.java
index 678ee65..baaf378 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToBit.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToBit.java
@@ -24,7 +24,11 @@ import java.io.DataOutput;
 import java.io.IOException;
 import java.util.BitSet;
 
+import org.apache.commons.lang.NotImplementedException;
 import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory.MAP_TYPE;
+import org.apache.sysds.runtime.compress.colgroup.offset.AOffset;
+import org.apache.sysds.runtime.data.DenseBlock;
+import org.apache.sysds.runtime.data.SparseBlock;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.apache.sysds.utils.MemoryEstimates;
 
@@ -36,7 +40,7 @@ public class MapToBit extends AMapToData {
 	private final int _size;
 
 	public MapToBit(int unique, int size) {
-		super(unique);
+		super(Math.min(unique, 2));
 		_data = new BitSet(size);
 		_size = size;
 	}
@@ -62,7 +66,7 @@ public class MapToBit extends AMapToData {
 		return getInMemorySize(_data.size());
 	}
 
-	public static long getInMemorySize(int dataLength) {
+	protected static long getInMemorySize(int dataLength) {
 		long size = 16 + 8 + 4; // object header + object reference + int size
 		size += MemoryEstimates.bitSetCost(dataLength);
 		return size;
@@ -107,7 +111,7 @@ public class MapToBit extends AMapToData {
 			out.writeLong(internals[i]);
 	}
 
-	public static MapToBit readFields(DataInput in) throws IOException {
+	protected static MapToBit readFields(DataInput in) throws IOException {
 		int unique = in.readInt();
 		int size = in.readInt();
 		long[] internalLong = new long[in.readInt()];
@@ -118,24 +122,45 @@ public class MapToBit extends AMapToData {
 	}
 
 	@Override
-	public void preAggregateDense(MatrixBlock m, MatrixBlock pre, int rl, int ru, int cl, int cu) {
-		final int nRow = m.getNumColumns();
-		final int nVal = pre.getNumColumns();
-		final double[] preAV = pre.getDenseBlockValues();
-		final double[] mV = m.getDenseBlockValues();
-		final int blockSize = 4000;
-		for(int block = cl; block < cu; block += blockSize) {
-			final int blockEnd = Math.min(block + blockSize, nRow);
-			for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += nVal) {
-				final int offLeft = rowLeft * nRow;
-				for(int rc = block; rc < blockEnd; rc++)
-					preAV[_data.get(rc) ? offOut + 1 : offOut] += mV[offLeft + rc];
+	protected void preAggregateDenseToRow(double[] mV, int off, double[] preAV, int cl, int cu) {
+		off += cl;
+		for(int rc = cl; rc < cu; rc++, off++)
+			preAV[_data.get(rc) ? 1 : 0] += mV[off];
+	}
+
+	@Override
+	protected void preAggregateDenseRows(MatrixBlock m, double[] preAV, int rl, int ru, int cl, int cu) {
+		final int nVal = getUnique();
+		final DenseBlock db = m.getDenseBlock();
+		if(db.isContiguous()) {
+			final double[] mV = m.getDenseBlockValues();
+			final int nCol = m.getNumColumns();
+			for(int c = cl; c < cu; c++) {
+				final int idx = getIndex(c);
+				final int start = c + nCol * rl;
+				final int end = c + nCol * ru;
+				for(int offOut = idx, off = start; off < end; offOut += nVal, off += nCol) {
+					preAV[offOut] += mV[off];
+				}
 			}
 		}
+		else
+			throw new NotImplementedException();
+	}
+
+	@Override
+	public void preAggregateDense(MatrixBlock m, double[] preAV, int rl, int ru, int cl, int cu, AOffset indexes) {
+		indexes.preAggregateDenseMap(m, preAV, rl, ru, cl, cu, getUnique(), _data);
+	}
+
+	@Override
+	public void preAggregateSparse(SparseBlock sb, double[] preAV, int rl, int ru, AOffset indexes) {
+		indexes.preAggregateSparseMap(sb, preAV, rl, ru, getUnique(), _data);
 	}
 
 	@Override
 	public int getUpperBoundValue() {
 		return 1;
 	}
+
 }
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToByte.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToByte.java
index 5bd1e64..5564cca 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToByte.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToByte.java
@@ -24,7 +24,11 @@ import java.io.DataOutput;
 import java.io.IOException;
 import java.util.Arrays;
 
+import org.apache.commons.lang.NotImplementedException;
 import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory.MAP_TYPE;
+import org.apache.sysds.runtime.compress.colgroup.offset.AOffset;
+import org.apache.sysds.runtime.data.DenseBlock;
+import org.apache.sysds.runtime.data.SparseBlock;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.apache.sysds.utils.MemoryEstimates;
 
@@ -35,7 +39,7 @@ public class MapToByte extends AMapToData {
 	private final byte[] _data;
 
 	public MapToByte(int unique, int size) {
-		super(unique);
+		super(Math.min(unique, 256));
 		_data = new byte[size];
 	}
 
@@ -59,7 +63,7 @@ public class MapToByte extends AMapToData {
 		return getInMemorySize(_data.length);
 	}
 
-	public static long getInMemorySize(int dataLength) {
+	protected static long getInMemorySize(int dataLength) {
 		long size = 16 + 8; // object header + object reference
 		size += MemoryEstimates.byteArrayCost(dataLength);
 		return size;
@@ -89,7 +93,7 @@ public class MapToByte extends AMapToData {
 			out.writeByte(_data[i]);
 	}
 
-	public static MapToByte readFields(DataInput in) throws IOException {
+	protected static MapToByte readFields(DataInput in) throws IOException {
 		int unique = in.readInt();
 		final int length = in.readInt();
 		final byte[] data = new byte[length];
@@ -98,10 +102,6 @@ public class MapToByte extends AMapToData {
 		return new MapToByte(unique, data);
 	}
 
-	public byte[] getBytes() {
-		return _data;
-	}
-
 	@Override
 	public void replace(int v, int r) {
 		byte cv = (byte) v;
@@ -124,23 +124,101 @@ public class MapToByte extends AMapToData {
 		}
 	}
 
+	private final void preAggregateDenseToRowNoFlip(double[] mV, int off, double[] preAV, int cl, int cu) {
+		off += cl;
+		for(int rc = cl; rc < cu; rc++, off++)
+			preAV[_data[rc]] += mV[off];
+	}
+
+	private final void preAggregateDenseToRowWithFlip(double[] mV, int off, double[] preAV, int cl, int cu) {
+		off += cl;
+		for(int rc = cl; rc < cu; rc++, off++)
+			preAV[_data[rc] & 0xFF] += mV[off];
+	}
+
+	private static final void preAggregateDenseToRowBy8WithFlip(final double[] mV, int off, final double[] preAV,
+		final int cl, final int cu, final byte[] data) {
+		final int h = (cu - cl) % 8;
+		off += cl;
+		for(int rc = cl; rc < cl + h; rc++, off++)
+			preAV[data[rc] & 0xFF] += mV[off];
+		for(int rc = cl + h; rc < cu; rc += 8, off += 8) {
+			int id1 = data[rc] & 0xFF, id2 = data[rc + 1] & 0xFF, id3 = data[rc + 2] & 0xFF, id4 = data[rc + 3] & 0xFF,
+				id5 = data[rc + 4] & 0xFF, id6 = data[rc + 5] & 0xFF, id7 = data[rc + 6] & 0xFF, id8 = data[rc + 7] & 0xFF;
+			preAV[id1] += mV[off];
+			preAV[id2] += mV[off + 1];
+			preAV[id3] += mV[off + 2];
+			preAV[id4] += mV[off + 3];
+			preAV[id5] += mV[off + 4];
+			preAV[id6] += mV[off + 5];
+			preAV[id7] += mV[off + 6];
+			preAV[id8] += mV[off + 7];
+		}
+	}
+
+	private static final void preAggregateDenseToRowBy8NoFlip(final double[] mV, int off, final double[] preAV,
+		final int cl, final int cu, final byte[] data) {
+		final int h = (cu - cl) % 8;
+		off += cl;
+		for(int rc = cl; rc < cl + h; rc++, off++)
+			preAV[data[rc]] += mV[off];
+		for(int rc = cl + h; rc < cu; rc += 8, off += 8) {
+			int id1 = data[rc], id2 = data[rc + 1], id3 = data[rc + 2], id4 = data[rc + 3], id5 = data[rc + 4],
+				id6 = data[rc + 5], id7 = data[rc + 6], id8 = data[rc + 7];
+			preAV[id1] += mV[off];
+			preAV[id2] += mV[off + 1];
+			preAV[id3] += mV[off + 2];
+			preAV[id4] += mV[off + 3];
+			preAV[id5] += mV[off + 4];
+			preAV[id6] += mV[off + 5];
+			preAV[id7] += mV[off + 6];
+			preAV[id8] += mV[off + 7];
+		}
+	}
+
 	@Override
-	public void preAggregateDense(MatrixBlock m, MatrixBlock pre, int rl, int ru, int cl, int cu) {
-		final int nRow = m.getNumColumns();
-		final int nVal = pre.getNumColumns();
-		final double[] preAV = pre.getDenseBlockValues();
-		final double[] mV = m.getDenseBlockValues();
-		final int blockSize = 4000;
-		for(int block = cl; block < cu; block += blockSize) {
-			final int blockEnd = Math.min(block + blockSize, nRow);
-			for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += nVal) {
-				final int offLeft = rowLeft * nRow;
-				for(int rc = block; rc < blockEnd; rc++) {
-					final int idx = _data[rc] & 0xFF;
-					preAV[offOut + idx] += mV[offLeft + rc];
+	protected void preAggregateDenseToRow(double[] mV, int off, double[] preAV, int cl, int cu) {
+		if(getUnique() < 127) {
+			if(cu - cl > 64)
+				preAggregateDenseToRowBy8NoFlip(mV, off, preAV, cl, cu, _data);
+			else
+				preAggregateDenseToRowNoFlip(mV, off, preAV, cl, cu);
+		}
+		else if(cu - cl > 64)
+			// Have tried with 4 and 16, but 8 is empirically best
+			preAggregateDenseToRowBy8WithFlip(mV, off, preAV, cl, cu, _data);
+		else
+			preAggregateDenseToRowWithFlip(mV, off, preAV, cl, cu);
+	}
+
+	@Override
+	protected void preAggregateDenseRows(MatrixBlock m, double[] preAV, int rl, int ru, int cl, int cu) {
+		final int nVal = getUnique();
+		final DenseBlock db = m.getDenseBlock();
+		if(db.isContiguous()) {
+			final double[] mV = m.getDenseBlockValues();
+			final int nCol = m.getNumColumns();
+			for(int c = cl; c < cu; c++) {
+				final int idx = getIndex(c);
+				final int start = c + nCol * rl;
+				final int end = c + nCol * ru;
+				for(int offOut = idx, off = start; off < end; offOut += nVal, off += nCol) {
+					preAV[offOut] += mV[off];
 				}
 			}
 		}
+		else
+			throw new NotImplementedException();
+	}
+
+	@Override
+	public final void preAggregateDense(MatrixBlock m, double[] preAV, int rl, int ru, int cl, int cu, AOffset indexes) {
+		indexes.preAggregateDenseMap(m, preAV, rl, ru, cl, cu, getUnique(), _data);
+	}
+
+	@Override
+	public void preAggregateSparse(SparseBlock sb, double[] preAV, int rl, int ru, AOffset indexes) {
+		indexes.preAggregateSparseMap(sb, preAV, rl, ru, getUnique(), _data);
 	}
 
 	@Override
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToChar.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToChar.java
index d1fc012..64130bd 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToChar.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToChar.java
@@ -24,7 +24,11 @@ import java.io.DataOutput;
 import java.io.IOException;
 import java.util.Arrays;
 
+import org.apache.commons.lang.NotImplementedException;
 import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory.MAP_TYPE;
+import org.apache.sysds.runtime.compress.colgroup.offset.AOffset;
+import org.apache.sysds.runtime.data.DenseBlock;
+import org.apache.sysds.runtime.data.SparseBlock;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.apache.sysds.utils.MemoryEstimates;
 
@@ -35,7 +39,7 @@ public class MapToChar extends AMapToData {
 	private final char[] _data;
 
 	public MapToChar(int unique, int size) {
-		super(unique);
+		super(Math.min(unique, Character.MAX_VALUE + 1));
 		_data = new char[size];
 	}
 
@@ -59,7 +63,7 @@ public class MapToChar extends AMapToData {
 		return getInMemorySize(_data.length);
 	}
 
-	public static long getInMemorySize(int dataLength) {
+	protected static long getInMemorySize(int dataLength) {
 		long size = 16 + 8; // object header + object reference
 		size += MemoryEstimates.charArrayCost(dataLength);
 		return size;
@@ -98,7 +102,7 @@ public class MapToChar extends AMapToData {
 			out.writeChar(_data[i]);
 	}
 
-	public static MapToChar readFields(DataInput in) throws IOException {
+	protected static MapToChar readFields(DataInput in) throws IOException {
 		int unique = in.readInt();
 		final int length = in.readInt();
 		final char[] data = new char[length];
@@ -107,27 +111,68 @@ public class MapToChar extends AMapToData {
 		return new MapToChar(unique, data);
 	}
 
-	public char[] getChars() {
+	protected char[] getChars() {
 		return _data;
 	}
 
+	private void preAggregateDenseToRowBy8(double[] mV, double[] preAV, int cl, int cu, int off) {
+		final int h = (cu - cl) % 8;
+		off += cl;
+		for(int rc = cl; rc < cl + h; rc++, off++)
+			preAV[_data[rc]] += mV[off];
+		for(int rc = cl + h; rc < cu; rc += 8, off += 8) {
+			int id1 = _data[rc], id2 = _data[rc + 1], id3 = _data[rc + 2], id4 = _data[rc + 3], id5 = _data[rc + 4],
+				id6 = _data[rc + 5], id7 = _data[rc + 6], id8 = _data[rc + 7];
+			preAV[id1] += mV[off];
+			preAV[id2] += mV[off + 1];
+			preAV[id3] += mV[off + 2];
+			preAV[id4] += mV[off + 3];
+			preAV[id5] += mV[off + 4];
+			preAV[id6] += mV[off + 5];
+			preAV[id7] += mV[off + 6];
+			preAV[id8] += mV[off + 7];
+		}
+	}
+
+	@Override
+	protected void preAggregateDenseToRow(double[] mV, int off, double[] preAV, int cl, int cu) {
+		if(cu - cl > 1000)
+			preAggregateDenseToRowBy8(mV, preAV, cl, cu, off);
+		else {
+			off += cl;
+			for(int rc = cl; rc < cu; rc++, off++)
+				preAV[_data[rc]] += mV[off];
+		}
+	}
+
 	@Override
-	public void preAggregateDense(MatrixBlock m, MatrixBlock pre, int rl, int ru, int cl, int cu) {
-		final int nRow = m.getNumColumns();
-		final int nVal = pre.getNumColumns();
-		final double[] preAV = pre.getDenseBlockValues();
-		final double[] mV = m.getDenseBlockValues();
-		final int blockSize = 4000;
-		for(int block = cl; block < cu; block += blockSize) {
-			final int blockEnd = Math.min(block + blockSize, nRow);
-			for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += nVal) {
-				final int offLeft = rowLeft * nRow;
-				for(int rc = block; rc < blockEnd; rc++) {
-					final int idx = _data[rc];
-					preAV[offOut + idx] += mV[offLeft + rc];
+	protected void preAggregateDenseRows(MatrixBlock m, double[] preAV, int rl, int ru, int cl, int cu) {
+		final int nVal = getUnique();
+		final DenseBlock db = m.getDenseBlock();
+		if(db.isContiguous()) {
+			final double[] mV = m.getDenseBlockValues();
+			final int nCol = m.getNumColumns();
+			for(int c = cl; c < cu; c++) {
+				final int idx = getIndex(c);
+				final int start = c + nCol * rl;
+				final int end = c + nCol * ru;
+				for(int offOut = idx, off = start; off < end; offOut += nVal, off += nCol) {
+					preAV[offOut] += mV[off];
 				}
 			}
 		}
+		else
+			throw new NotImplementedException();
+	}
+
+	@Override
+	public void preAggregateDense(MatrixBlock m, double[] preAV, int rl, int ru, int cl, int cu, AOffset indexes) {
+		indexes.preAggregateDenseMap(m, preAV, rl, ru, cl, cu, getUnique(), _data);
+	}
+
+	@Override
+	public void preAggregateSparse(SparseBlock sb, double[] preAV, int rl, int ru, AOffset indexes) {
+		indexes.preAggregateSparseMap(sb, preAV, rl, ru, getUnique(), _data);
 	}
 
 	@Override
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToFactory.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToFactory.java
index 8a70688..1915cb8 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToFactory.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToFactory.java
@@ -27,7 +27,7 @@ import org.apache.sysds.runtime.compress.bitmap.ABitmap;
 import org.apache.sysds.runtime.compress.utils.IntArrayList;
 
 public class MapToFactory {
-	// private static final Log LOG = LogFactory.getLog(MapToFactory.class.getName());
+	// protected static final Log LOG = LogFactory.getLog(MapToFactory.class.getName());
 
 	public enum MAP_TYPE {
 		BIT, BYTE, CHAR, INT;
@@ -41,6 +41,7 @@ public class MapToFactory {
 
 	public static AMapToData create(int size, boolean zeros, IntArrayList[] values) {
 		AMapToData _data = MapToFactory.create(size, values.length + (zeros ? 1 : 0));
+
 		if(zeros)
 			_data.fill(values.length);
 
@@ -53,12 +54,19 @@ public class MapToFactory {
 		return _data;
 	}
 
+	/**
+	 * Create and allocate a map with the given size and support for up to numTuples distinct values
+	 * 
+	 * @param size      The number of cells to allocate
+	 * @param numTuples The maximum value to be able to represent inside the map.
+	 * @return A new map
+	 */
 	public static AMapToData create(int size, int numTuples) {
-		if(numTuples <= 1)
+		if(numTuples <= 2)
 			return new MapToBit(numTuples, size);
-		else if(numTuples < 256)
+		else if(numTuples <= 256)
 			return new MapToByte(numTuples, size);
-		else if(numTuples <= (int) Character.MAX_VALUE)
+		else if(numTuples <= ((int) Character.MAX_VALUE) + 1)
 			return new MapToChar(numTuples, size);
 		else
 			return new MapToInt(numTuples, size);
@@ -79,15 +87,15 @@ public class MapToFactory {
 		AMapToData ret;
 		if(d instanceof MapToBit)
 			return d;
-		else if(numTuples <= 1)
+		else if(numTuples <= 2)
 			ret = new MapToBit(numTuples, size);
 		else if(d instanceof MapToByte)
 			return d;
-		else if(numTuples < 256)
+		else if(numTuples <= 256)
 			ret = new MapToByte(numTuples, size);
 		else if(d instanceof MapToChar)
 			return d;
-		else if(numTuples <= (int) Character.MAX_VALUE)
+		else if(numTuples <= (int) Character.MAX_VALUE + 1)
 			ret = new MapToChar(numTuples, size);
 		else // then the input was int and reshapes to int
 			return d;
@@ -123,17 +131,16 @@ public class MapToFactory {
 				ret = new MapToInt(numTuples, size);
 				break;
 		}
-
 		ret.copy(d);
 		return ret;
 	}
 
 	public static long estimateInMemorySize(int size, int numTuples) {
-		if(numTuples <= 1)
+		if(numTuples <= 2)
 			return MapToBit.getInMemorySize(size);
-		else if(numTuples < 256)
+		else if(numTuples <= 256)
 			return MapToByte.getInMemorySize(size);
-		else if(numTuples <= (int) Character.MAX_VALUE)
+		else if(numTuples <= ((int) Character.MAX_VALUE) + 1)
 			return MapToChar.getInMemorySize(size);
 		else
 			return MapToInt.getInMemorySize(size);
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToInt.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToInt.java
index b991ccb..9712d11 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToInt.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToInt.java
@@ -24,7 +24,11 @@ import java.io.DataOutput;
 import java.io.IOException;
 import java.util.Arrays;
 
+import org.apache.commons.lang.NotImplementedException;
 import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory.MAP_TYPE;
+import org.apache.sysds.runtime.compress.colgroup.offset.AOffset;
+import org.apache.sysds.runtime.data.DenseBlock;
+import org.apache.sysds.runtime.data.SparseBlock;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.apache.sysds.utils.MemoryEstimates;
 
@@ -59,7 +63,7 @@ public class MapToInt extends AMapToData {
 		return getInMemorySize(_data.length);
 	}
 
-	public static long getInMemorySize(int dataLength) {
+	protected static long getInMemorySize(int dataLength) {
 		long size = 16 + 8; // object header + object reference
 		size += MemoryEstimates.intArrayCost(dataLength);
 		return size;
@@ -96,7 +100,7 @@ public class MapToInt extends AMapToData {
 			out.writeInt(_data[i]);
 	}
 
-	public static MapToInt readFields(DataInput in) throws IOException {
+	protected static MapToInt readFields(DataInput in) throws IOException {
 		int unique = in.readInt();
 		final int length = in.readInt();
 		final int[] data = new int[length];
@@ -106,22 +110,41 @@ public class MapToInt extends AMapToData {
 	}
 
 	@Override
-	public void preAggregateDense(MatrixBlock m, MatrixBlock pre, int rl, int ru, int cl, int cu) {
-		final int nRow = m.getNumColumns();
-		final int nVal = pre.getNumColumns();
-		final double[] preAV = pre.getDenseBlockValues();
-		final double[] mV = m.getDenseBlockValues();
-		final int blockSize = 4000;
-		for(int block = cl; block < cu; block += blockSize) {
-			final int blockEnd = Math.min(block + blockSize, nRow);
-			for(int rowLeft = rl, offOut = 0; rowLeft < ru; rowLeft++, offOut += nVal) {
-				final int offLeft = rowLeft * nRow;
-				for(int rc = block; rc < blockEnd; rc++) {
-					final int idx = _data[rc];
-					preAV[offOut + idx] += mV[offLeft + rc];
+	protected void preAggregateDenseToRow(double[] mV, int off, double[] preAV, int cl, int cu) {
+		off += cl;
+		for(int rc = cl; rc < cu; rc++, off++)
+			preAV[_data[rc]] += mV[off];
+	}
+
+	@Override
+	protected void preAggregateDenseRows(MatrixBlock m, double[] preAV, int rl, int ru, int cl, int cu) {
+		final int nVal = getUnique();
+		final DenseBlock db = m.getDenseBlock();
+		if(db.isContiguous()) {
+			final double[] mV = m.getDenseBlockValues();
+			final int nCol = m.getNumColumns();
+			for(int c = cl; c < cu; c++) {
+				final int idx = getIndex(c);
+				final int start = c + nCol * rl;
+				final int end = c + nCol * ru;
+				for(int offOut = idx, off = start; off < end; offOut += nVal, off += nCol) {
+					preAV[offOut] += mV[off];
 				}
 			}
 		}
+		else
+			throw new NotImplementedException();
+
+	}
+
+	@Override
+	public void preAggregateDense(MatrixBlock m, double[] preAV, int rl, int ru, int cl, int cu, AOffset indexes) {
+		throw new NotImplementedException();
+	}
+
+	@Override
+	public void preAggregateSparse(SparseBlock sb, double[] preAV, int rl, int ru, AOffset indexes) {
+		throw new NotImplementedException();
 	}
 
 	@Override
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/AIterator.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/AIterator.java
index 17a5026..1c7e81e 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/AIterator.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/AIterator.java
@@ -50,13 +50,6 @@ public abstract class AIterator {
 	public abstract void next();
 
 	/**
-	 * Get a boolean specifying if the iterator is done
-	 * 
-	 * @return A boolean that is true if there are more values contained in the Iterator.
-	 */
-	public abstract boolean hasNext();
-
-	/**
 	 * Get the current index value, note this correspond to a row index in the original matrix.
 	 * 
 	 * @return The current value pointed at.
@@ -66,26 +59,39 @@ public abstract class AIterator {
 	}
 
 	/**
-	 * Get the current index value and increment the pointers
+	 * Find out whether the current offset is still below the given upper bound.
 	 * 
-	 * @return The current value pointed at.
+	 * @param ub The upper bound that the offset should not exceed
+	 * @return true if the current offset is below the bound, false otherwise.
 	 */
-	public int valueAndIncrement() {
-		int x = offset;
-		next();
-		return x;
+	public boolean isNotOver(int ub) {
+		return offset < ub;
 	}
 
 	/**
 	 * Get the current data index associated with the index returned from value.
 	 * 
-	 * @return The data Index.
+	 * This index points to a position in the mapToData object, which in turn can be used to look up the dictionary
+	 * entry in ADictionary.
+	 * 
+	 * @return The Data Index.
 	 */
 	public int getDataIndex() {
 		return dataIndex;
 	}
 
 	/**
+	 * Get the current offsets index, that points to the underlying offsets list.
+	 * 
+	 * This is available for debugging purposes; it is not to be used by the calling classes.
+	 * 
+	 * @return The Offsets Index.
+	 */
+	public int getOffsetsIndex() {
+		return index;
+	}
+
+	/**
 	 * Get the current data index and increment the pointers using the next operator.
 	 * 
 	 * @return The current data index.
@@ -99,17 +105,23 @@ public abstract class AIterator {
 	/**
 	 * Skip values until index is achieved.
 	 * 
-	 * @param index The index to skip to.
+	 * @param idx The index to skip to.
 	 * @return the index that follows or are equal to the skip to index.
 	 */
-	public int skipTo(int index) {
-		while(hasNext() && offset < index)
-			next();
-		return offset;
-	}
+	public abstract int skipTo(int idx);
 
 	/**
 	 * Copy the iterator with the current values.
 	 */
 	public abstract AIterator clone();
+
+	/**
+	 * Unsafe version of equals, note that it should only compare iterators stemming from the same Offset Object.
+	 * 
+	 * @param o The Iterator to compare
+	 * @return The result
+	 */
+	public boolean equals(AIterator o) {
+		return o.index == this.index;
+	}
 }
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/AOffset.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/AOffset.java
index 2781600..a884e4d 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/AOffset.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/AOffset.java
@@ -21,12 +21,14 @@ package org.apache.sysds.runtime.compress.colgroup.offset;
 import java.io.DataOutput;
 import java.io.IOException;
 import java.io.Serializable;
-import java.lang.ref.SoftReference;
-import java.util.HashMap;
-import java.util.Map;
+import java.util.BitSet;
 
+import org.apache.commons.lang.NotImplementedException;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.sysds.runtime.data.DenseBlock;
+import org.apache.sysds.runtime.data.SparseBlock;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 
 /**
  * Offset list encoder interface.
@@ -39,9 +41,14 @@ import org.apache.commons.logging.LogFactory;
  */
 public abstract class AOffset implements Serializable {
 
-	private static final long serialVersionUID = -4143271285905723425L;
 	protected static final Log LOG = LogFactory.getLog(AOffset.class.getName());
-	protected SoftReference<Map<Integer, AIterator>> skipIterators;
+
+	private ThreadLocal<OffsetCache> cacheRow = new ThreadLocal<OffsetCache>() {
+		@Override
+		protected OffsetCache initialValue() {
+			return null;
+		}
+	};
 
 	/**
 	 * Get an iterator of the offsets.
@@ -57,16 +64,30 @@ public abstract class AOffset implements Serializable {
 	 * @return AIterator that iterate through index and dictionary offset values.
 	 */
 	public AIterator getIterator(int row) {
-		if(skipIterators != null) {
-			Map<Integer, AIterator> sk = skipIterators.get();
-			AIterator it = sk.getOrDefault(row, null);
-			if(it != null)
-				return it.clone();
+		if(row <= getOffsetToFirst())
+			return getIterator();
+		else if(row > getOffsetToLast())
+			return null;
+
+		// try the cache first.
+		OffsetCache c = cacheRow.get();
+		if(c == null) {
+			AIterator it = getIterator();
+			it.skipTo(row);
+			cacheIterator(it.clone(), row);
+			return it;
+		}
+		else if(c.row == row)
+			return c.it.clone();
+		else {
+			// Use the cached iterator only if its row precedes the queried row; otherwise restart from the beginning.
+			AIterator it = c.row < row ? c.it.clone() : getIterator();
+			it.skipTo(row);
+			// cache this new iterator.
+			cacheIterator(it.clone(), row);
+			return it;
 		}
-		AIterator it = getIterator();
-		it.skipTo(row);
-		cacheIterator(it.clone(), row);
-		return it;
+
 	}
 
 	/**
@@ -76,15 +97,9 @@ public abstract class AOffset implements Serializable {
 	 * @param row The row index to cache the iterator as.
 	 */
 	public void cacheIterator(AIterator it, int row) {
-		if(skipIterators != null) {
-			Map<Integer, AIterator> sk = skipIterators.get();
-			sk.put(row, it);
-		}
-		else {
-			Map<Integer, AIterator> nsk = new HashMap<>();
-			nsk.put(row, it.clone());
-			skipIterators = new SoftReference<>(nsk);
-		}
+		if(it == null)
+			return;
+		cacheRow.set(new OffsetCache(it, row));
 	}
 
 	/**
@@ -99,6 +114,20 @@ public abstract class AOffset implements Serializable {
 	public abstract void write(DataOutput out) throws IOException;
 
 	/**
+	 * Get the offset to the first index
+	 * 
+	 * @return The first index offset
+	 */
+	public abstract int getOffsetToFirst();
+
+	/**
+	 * Get the offset to the last value
+	 * 
+	 * @return The last values offset
+	 */
+	public abstract int getOffsetToLast();
+
+	/**
 	 * Get the in memory size of the Offset object
 	 * 
 	 * @return In memory size as a long.
@@ -119,17 +148,259 @@ public abstract class AOffset implements Serializable {
 	 */
 	public abstract int getSize();
 
+	/**
+	 * Get the length of the underlying offsets array.
+	 * 
+	 * @return The length of the underlying offsets array (not the number of offsets; see getSize).
+	 */
+	public abstract int getOffsetsLength();
+
+	public final void preAggregateDenseMap(MatrixBlock m, double[] preAV, int rl, int ru, int cl, int cu, int nVal,
+		char[] data) {
+		// multi row iterator.
+		final AIterator it = getIterator(cl);
+		if(it == null)
+			return;
+		else if(it.offset > cu)
+			cacheIterator(it, cu); // cache this iterator.
+		else if(rl == ru - 1) {
+			final DenseBlock db = m.getDenseBlock();
+			final double[] mV = db.values(rl);
+			final int off = db.pos(rl);
+			preAggregateDenseMapRowChar(mV, off, preAV, cu, nVal, data, it);
+		}
+		else {
+			final DenseBlock db = m.getDenseBlock();
+			preAggregateDenseMapRowsChar(db, preAV, rl, ru, cl, cu, nVal, data, it);
+		}
+	}
+
+	public final void preAggregateDenseMap(MatrixBlock m, double[] preAV, int rl, int ru, int cl, int cu, int nVal,
+		byte[] data) {
+		// multi row iterator.
+		final AIterator it = getIterator(cl);
+		if(it == null)
+			return;
+		else if(it.offset > cu)
+			cacheIterator(it, cu); // cache this iterator.
+		else if(rl == ru - 1) {
+			final DenseBlock db = m.getDenseBlock();
+			final double[] mV = db.values(rl);
+			final int off = db.pos(rl);
+			preAggregateDenseMapRowByte(mV, off, preAV, cu, nVal, data, it);
+		}
+		else {
+			final DenseBlock db = m.getDenseBlock();
+			preAggregateDenseMapRowsByte(db, preAV, rl, ru, cl, cu, nVal, data, it);
+		}
+	}
+
+	public final void preAggregateDenseMap(MatrixBlock m, double[] preAV, int rl, int ru, int cl, int cu, int nVal,
+		BitSet data) {
+		// multi row iterator.
+		final AIterator it = getIterator(cl);
+		if(it == null)
+			return;
+		else if(it.offset > cu)
+			cacheIterator(it, cu); // cache this iterator.
+		else if(rl == ru - 1) {
+			final DenseBlock db = m.getDenseBlock();
+			final double[] mV = db.values(rl);
+			final int off = db.pos(rl);
+			preAggregateDenseMapRowBit(mV, off, preAV, cu, nVal, data, it);
+		}
+		else {
+			final DenseBlock db = m.getDenseBlock();
+			preAggregateDenseMapRowsBit(db, preAV, rl, ru, cl, cu, nVal, data, it);
+		}
+	}
+
+	protected abstract void preAggregateDenseMapRowByte(double[] mV, int off, double[] preAV, int cu, int nVal,
+		byte[] data, AIterator it);
+
+	protected abstract void preAggregateDenseMapRowChar(double[] mV, int off, double[] preAV, int cu, int nVal,
+		char[] data, AIterator it);
+
+	protected abstract void preAggregateDenseMapRowBit(double[] mV, int off, double[] preAV, int cu, int nVal,
+		BitSet data, AIterator it);
+
+	protected abstract void preAggregateDenseMapRowsChar(DenseBlock db, double[] preAV, int rl, int ru, int cl, int cu,
+		int nVal, char[] data, AIterator it);
+
+	protected abstract void preAggregateDenseMapRowsByte(DenseBlock db, double[] preAV, int rl, int ru, int cl, int cu,
+		int nVal, byte[] data, AIterator it);
+
+	protected void preAggregateDenseMapRowsBit(DenseBlock db, double[] preAV, int rl, int ru, int cl, int cu, int nVal,
+		BitSet data, AIterator it) {
+		if(cu < getOffsetToLast() + 1)
+			preAggregateDenseMapRowsBitBelowEnd(db, preAV, rl, ru, cl, cu, nVal, data, it);
+		else
+			preAggregateDenseMapRowsBitEnd(db, preAV, rl, ru, cl, cu, nVal, data, it);
+	}
+
+	protected void preAggregateDenseMapRowsBitBelowEnd(DenseBlock db, double[] preAV, int rl, int ru, int cl, int cu,
+		int nVal, BitSet data, AIterator it) {
+		final double[] vals = db.values(rl);
+		final int nCol = db.getCumODims(0);
+		while(it.offset < cu) {
+			final int dataOffset = data.get(it.dataIndex) ? 1 : 0;
+			final int start = it.offset + nCol * rl;
+			final int end = it.offset + nCol * ru;
+			for(int offOut = dataOffset, off = start; off < end; offOut += nVal, off += nCol)
+				preAV[offOut] += vals[off];
+			it.next();
+		}
+
+		cacheIterator(it, cu);
+	}
+
+	protected void preAggregateDenseMapRowsBitEnd(DenseBlock db, double[] preAV, int rl, int ru, int cl, int cu,
+		int nVal, BitSet data, AIterator it) {
+		final double[] vals = db.values(rl);
+		final int nCol = db.getCumODims(0);
+		final int last = getOffsetToLast();
+		int dataOffset = data.get(it.dataIndex) ? 1 : 0;
+		int start = it.offset + nCol * rl;
+		int end = it.offset + nCol * ru;
+		for(int offOut = dataOffset, off = start; off < end; offOut += nVal, off += nCol)
+			preAV[offOut] += vals[off];
+		while(it.offset < last) {
+			it.next();
+			dataOffset = data.get(it.dataIndex) ? 1 : 0;
+			start = it.offset + nCol * rl;
+			end = it.offset + nCol * ru;
+			for(int offOut = dataOffset, off = start; off < end; offOut += nVal, off += nCol)
+				preAV[offOut] += vals[off];
+		}
+	}
+
+	public final void preAggregateSparseMap(SparseBlock sb, double[] preAV, int rl, int ru, int nVal, char[] data) {
+		final AIterator it = getIterator();
+		if(rl == ru - 1)
+			preAggregateSparseMapRow(sb, preAV, rl, nVal, data, it);
+		else
+			throw new NotImplementedException("MultiRow Preaggregation not supported yet");
+	}
+
+	public final void preAggregateSparseMap(SparseBlock sb, double[] preAV, int rl, int ru, int nVal, byte[] data) {
+		final AIterator it = getIterator();
+		if(rl == ru - 1)
+			preAggregateSparseMapRow(sb, preAV, rl, nVal, data, it);
+		else
+			throw new NotImplementedException("MultiRow Preaggregation not supported yet");
+	}
+
+	public final void preAggregateSparseMap(SparseBlock sb, double[] preAV, int rl, int ru, int nVal, BitSet data) {
+		final AIterator it = getIterator();
+		if(rl == ru - 1)
+			preAggregateSparseMapRow(sb, preAV, rl, nVal, data, it);
+		else
+			throw new NotImplementedException("MultiRow Preaggregation not supported yet");
+	}
+
+	private void preAggregateSparseMapRow(SparseBlock sb, double[] preAV, int r, int nVal, byte[] data, AIterator it) {
+		final int apos = sb.pos(r);
+		final int alen = sb.size(r) + apos;
+		final int[] aix = sb.indexes(r);
+		final double[] avals = sb.values(r);
+
+		final int maxId = data.length - 1;
+
+		int j = apos;
+		while(j < alen) {
+			if(aix[j] == it.offset) {
+				preAV[data[it.dataIndex] & 0xFF] += avals[j++];
+				if(it.dataIndex >= maxId)
+					break;
+				it.next();
+			}
+			else if(aix[j] < it.offset) {
+				j++;
+			}
+			else {
+				if(it.dataIndex >= maxId)
+					break;
+				it.next();
+			}
+		}
+	}
+
+	private void preAggregateSparseMapRow(SparseBlock sb, double[] preAV, int r, int nVal, char[] data, AIterator it) {
+		final int apos = sb.pos(r);
+		final int alen = sb.size(r) + apos;
+		final int[] aix = sb.indexes(r);
+		final double[] avals = sb.values(r);
+
+		final int maxId = data.length - 1;
+		int j = apos;
+		while(j < alen) {
+			if(aix[j] == it.offset) {
+				preAV[data[it.dataIndex]] += avals[j++];
+				if(it.dataIndex >= maxId)
+					break;
+				it.next();
+			}
+			else if(aix[j] < it.offset) {
+				j++;
+			}
+			else {
+				if(it.dataIndex >= maxId)
+					break;
+				it.next();
+			}
+		}
+	}
+
+	private void preAggregateSparseMapRow(SparseBlock sb, double[] preAV, int r, int nVal, BitSet data, AIterator it) {
+		final int apos = sb.pos(r);
+		final int alen = sb.size(r) + apos;
+		final int[] aix = sb.indexes(r);
+		final double[] avals = sb.values(r);
+		final int last = getOffsetToLast();
+
+		int j = apos;
+		while(it.offset < last && j < alen) {
+			if(aix[j] == it.offset) {
+				preAV[data.get(it.dataIndex) ? 1 : 0] += avals[j++];
+				it.next();
+			}
+			if(j < alen)
+				while(it.offset < last && aix[j] > it.offset)
+					it.next();
+			while(j < alen && aix[j] < it.offset)
+				j++;
+		}
+		while(j < alen && aix[j] < it.offset)
+			j++;
+		if(j != alen && aix[j] == it.offset)
+			preAV[data.get(it.dataIndex) ? 1 : 0] += avals[j];
+
+	}
+
 	@Override
 	public String toString() {
 		StringBuilder sb = new StringBuilder();
-		AIterator i = getIterator();
 		sb.append(this.getClass().getSimpleName());
-		sb.append(" [");
-		sb.append(i.valueAndIncrement());
-
-		while(i.hasNext())
-			sb.append(", " + i.valueAndIncrement());
+		final AIterator it = getIterator();
+		final int last = getOffsetToLast();
+		sb.append("[");
+		while(it.offset < last) {
+			sb.append(it.offset);
+			sb.append(", ");
+			it.next();
+		}
+		sb.append(it.offset);
 		sb.append("]");
 		return sb.toString();
 	}
+
+	protected static class OffsetCache {
+		protected final AIterator it;
+		protected final int row;
+
+		protected OffsetCache(AIterator it, int row) {
+			this.it = it;
+			this.row = row;
+		}
+	}
 }
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/OffsetByte.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/OffsetByte.java
index 29133cb..e09c20b 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/OffsetByte.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/OffsetByte.java
@@ -21,18 +21,20 @@ package org.apache.sysds.runtime.compress.colgroup.offset;
 import java.io.DataInput;
 import java.io.DataOutput;
 import java.io.IOException;
-import java.util.Arrays;
+import java.util.BitSet;
 
-import org.apache.sysds.runtime.compress.DMLCompressionException;
+import org.apache.sysds.runtime.data.DenseBlock;
 import org.apache.sysds.utils.MemoryEstimates;
 
 public class OffsetByte extends AOffset {
 
 	private static final long serialVersionUID = -4716104973912491790L;
+	private static final int maxV = 255;
 
-	private final static int maxV = 255;
 	private final byte[] offsets;
 	private final int offsetToFirst;
+	private final int offsetToLast;
+	private final boolean noOverHalf;
 
 	public OffsetByte(int[] indexes) {
 		this(indexes, 0, indexes.length);
@@ -41,21 +43,22 @@ public class OffsetByte extends AOffset {
 	public OffsetByte(int[] indexes, int apos, int alen) {
 		int endSize = 0;
 		offsetToFirst = indexes[apos];
+		offsetToLast = indexes[alen - 1];
 		int ov = offsetToFirst;
+		// find the size of the array
 		for(int i = apos + 1; i < alen; i++) {
 			final int nv = indexes[i];
-			endSize += 1 + (nv - ov) / maxV;
+			endSize += 1 + (nv - ov - 1) / maxV;
 			ov = nv;
 		}
 		offsets = new byte[endSize];
 		ov = offsetToFirst;
 		int p = 0;
 
+		// populate the array
 		for(int i = apos + 1; i < alen; i++) {
 			final int nv = indexes[i];
 			final int offsetSize = nv - ov;
-			if(offsetSize == 0)
-				throw new DMLCompressionException("Invalid difference between cells :\n" + Arrays.toString(indexes));
 			final int div = offsetSize / maxV;
 			final int mod = offsetSize % maxV;
 			if(mod == 0) {
@@ -69,11 +72,30 @@ public class OffsetByte extends AOffset {
 
 			ov = nv;
 		}
+		boolean noOverHalf = true;
+		for(byte b : offsets)
+			if(b < 0) {
+				noOverHalf = false;
+				break;
+			}
+		this.noOverHalf = noOverHalf;
 	}
 
-	private OffsetByte(byte[] offsets, int offsetToFirst) {
+	protected OffsetByte(byte[] offsets, int offsetToFirst, int offsetToLast) {
 		this.offsets = offsets;
 		this.offsetToFirst = offsetToFirst;
+		this.offsetToLast = offsetToLast;
+		this.noOverHalf = getNoOverHalf();
+	}
+
+	private boolean getNoOverHalf() {
+		boolean noOverHalf = true;
+		for(byte b : offsets)
+			if(b < 0) {
+				noOverHalf = false;
+				break;
+			}
+		return noOverHalf;
 	}
 
 	@Override
@@ -92,7 +114,9 @@ public class OffsetByte extends AOffset {
 
 	@Override
 	public long getInMemorySize() {
-		return getInMemorySize(offsets.length);
+		long size = 16 + 4 + 4 + 8; // object header plus ints plus reference
+		size += MemoryEstimates.byteArrayCost(offsets.length);
+		return size;
 	}
 
 	@Override
@@ -103,27 +127,468 @@ public class OffsetByte extends AOffset {
 	@Override
 	public int getSize() {
 		int size = 1;
-		for(byte b : offsets) {
+		for(byte b : offsets)
 			if(b != 0)
 				size++;
-		}
+
 		return size;
 	}
 
-	public static long getInMemorySize(int length) {
-		long size = 16 + 4 + 8; // object header plus int plus reference
-		size += MemoryEstimates.byteArrayCost(length);
+	@Override
+	public int getOffsetToFirst() {
+		return offsetToFirst;
+	}
+
+	@Override
+	public int getOffsetToLast() {
+		return offsetToLast;
+	}
+
+	@Override
+	public int getOffsetsLength() {
+		return offsets.length;
+	}
+
+	public static long estimateInMemorySize(int nOffs, int nRows) {
+		long size = 16 + 4 + 4 + 8; // object header plus two ints plus reference
+		size += MemoryEstimates.byteArrayCost(Math.max(nOffs, nRows / maxV));
 		return size;
 	}
 
 	public static OffsetByte readFields(DataInput in) throws IOException {
-		int offsetToFirst = in.readInt();
-		int offsetsLength = in.readInt();
-		byte[] offsets = new byte[offsetsLength];
+		final int offsetToFirst = in.readInt();
+		final int offsetsLength = in.readInt();
+
+		final byte[] offsets = new byte[offsetsLength];
+		int offsetToLast = offsetToFirst;
 		for(int i = 0; i < offsetsLength; i++) {
 			offsets[i] = in.readByte();
+			offsetToLast += offsets[i] & 0xFF;
+		}
+		return new OffsetByte(offsets, offsetToFirst, offsetToLast);
+	}
+
+	@Override
+	protected final void preAggregateDenseMapRowByte(double[] mV, int off, double[] preAV, int cu, int nVal, byte[] data,
+		AIterator it) {
+		IterateByteOffset itb = (IterateByteOffset) it;
+		final boolean noZero = offsets.length == data.length - 1;
+		if(cu < offsetToLast + 1) {
+			final boolean nvalHalf = nVal < 127;
+			if(noOverHalf && noZero && nvalHalf)
+				preAggregateDenseByteMapRowBelowEndAndNoZeroNoOverHalfAlsoData(mV, off, preAV, cu, data, itb);
+			else if(noOverHalf && noZero)
+				preAggregateDenseByteMapRowBelowEndAndNoZeroNoOverHalf(mV, off, preAV, cu, data, itb);
+			else if(noZero)
+				preAggregateDenseByteMapRowBelowEndAndNoZero(mV, off, preAV, cu, data, itb);
+			else if(nvalHalf)
+				preAggregateDenseByteMapRowBelowEndDataHalf(mV, off, preAV, cu, data, itb);
+			else if(noOverHalf)
+				preAggregateDenseByteMapRowBelowEndNoOverHalf(mV, off, preAV, cu, data, itb);
+			else
+				preAggregateDenseByteMapRowBelowEnd(mV, off, preAV, cu, data, itb);
+			cacheIterator(itb, cu);
+		}
+		else if(noZero)
+			preAggregateDenseByteMapRowNoZero(mV, off, preAV, data, itb);
+		else
+			preAggregateDenseByteMapRow(mV, off, preAV, data, itb);
+
+	}
+
+	private final void preAggregateDenseByteMapRow(double[] mV, int off, double[] preAV, byte[] data,
+		IterateByteOffset it) {
+		final int maxId = data.length - 1;
+
+		int offset = it.offset + off;
+		int index = it.index;
+		int dataIndex = it.dataIndex;
+
+		preAV[data[dataIndex] & 0xFF] += mV[offset];
+		while(dataIndex < maxId) {
+			byte v = offsets[index];
+			while(v == 0) {
+				offset += maxV;
+				index++;
+				v = offsets[index];
+			}
+			offset += v & 0xFF;
+			index++;
+			dataIndex++;
+			preAV[data[dataIndex] & 0xFF] += mV[offset];
+		}
+	}
+
+	private final void preAggregateDenseByteMapRowNoZero(double[] mV, int off, double[] preAV, byte[] data,
+		IterateByteOffset it) {
+
+		int offset = it.offset + off;
+		int index = it.index;
+
+		while(index < offsets.length) {
+			preAV[data[index] & 0xFF] += mV[offset];
+			offset += offsets[index++] & 0xFF;
+		}
+		// process straggler index.
+		preAV[data[index] & 0xFF] += mV[offset];
+	}
+
+	private void preAggregateDenseByteMapRowBelowEndNoOverHalf(double[] mV, int off, double[] preAV, int cu, byte[] data,
+		IterateByteOffset it) {
+
+		cu += off;
+		it.offset += off;
+		while(it.offset < cu) {
+			preAV[data[it.dataIndex] & 0xFF] += mV[it.offset];
+			byte v = offsets[it.index];
+			while(v == 0) {
+				it.offset += maxV;
+				it.index++;
+				v = offsets[it.index];
+			}
+			it.offset += v;
+			it.index++;
+			it.dataIndex++;
+		}
+		it.offset -= off;
+	}
+
+	private void preAggregateDenseByteMapRowBelowEndDataHalf(double[] mV, int off, double[] preAV, int cu, byte[] data,
+		IterateByteOffset it) {
+
+		cu += off;
+		it.offset += off;
+		while(it.offset < cu) {
+			preAV[data[it.dataIndex]] += mV[it.offset];
+			byte v = offsets[it.index];
+			while(v == 0) {
+				it.offset += maxV;
+				it.index++;
+				v = offsets[it.index];
+			}
+			it.offset += v & 0xFF;
+			it.index++;
+			it.dataIndex++;
+		}
+		it.offset -= off;
+	}
+
+	private void preAggregateDenseByteMapRowBelowEnd(double[] mV, int off, double[] preAV, int cu, byte[] data,
+		IterateByteOffset it) {
+
+		cu += off;
+		it.offset += off;
+		while(it.offset < cu) {
+			preAV[data[it.dataIndex] & 0xFF] += mV[it.offset];
+			byte v = offsets[it.index];
+			while(v == 0) {
+				it.offset += maxV;
+				it.index++;
+				v = offsets[it.index];
+			}
+			it.offset += v & 0xFF;
+			it.index++;
+			it.dataIndex++;
+		}
+		it.offset -= off;
+	}
+
+	private void preAggregateDenseByteMapRowBelowEndAndNoZero(double[] mV, int off, double[] preAV, int cu, byte[] data,
+		IterateByteOffset it) {
+
+		int offset = it.offset + off;
+		int index = it.index;
+
+		cu += off;
+
+		while(offset < cu) {
+			preAV[data[index] & 0xFF] += mV[offset];
+			offset += offsets[index++] & 0xFF;
+		}
+
+		it.offset = offset - off;
+		it.dataIndex = index;
+		it.index = index;
+	}
+
+	private final void preAggregateDenseByteMapRowBelowEndAndNoZeroNoOverHalf(double[] mV, int off, double[] preAV,
+		int cu, byte[] data, IterateByteOffset it) {
+		int offset = it.offset + off;
+		int index = it.index;
+
+		cu += off;
+
+		while(offset < cu) {
+			preAV[data[index] & 0xFF] += mV[offset];
+			offset += offsets[index++];
+		}
+
+		it.offset = offset - off;
+		it.dataIndex = index;
+		it.index = index;
+	}
+
+	private final void preAggregateDenseByteMapRowBelowEndAndNoZeroNoOverHalfAlsoData(double[] mV, int off,
+		double[] preAV, int cu, byte[] data, IterateByteOffset it) {
+		int offset = it.offset + off;
+		int index = it.index;
+
+		cu += off;
+
+		while(offset < cu) {
+			preAV[data[index]] += mV[offset];
+			offset += offsets[index++];
+		}
+
+		it.offset = offset - off;
+		it.dataIndex = index;
+		it.index = index;
+	}
+
+	@Override
+	protected final void preAggregateDenseMapRowChar(double[] mV, int off, double[] preAV, int cu, int nVal, char[] data,
+		AIterator it) {
+		IterateByteOffset itb = (IterateByteOffset) it;
+		final boolean noZero = offsets.length == data.length - 1;
+		if(cu < offsetToLast + 1) {
+			if(noOverHalf && noZero)
+				preAggregateDenseCharMapRowBelowEndAndNoZeroNoOverHalf(mV, off, preAV, cu, data, itb);
+			else if(noZero)
+				preAggregateDenseCharMapRowBelowEndAndNoZero(mV, off, preAV, cu, data, itb);
+			else
+				preAggregateDenseCharMapRowBelowEnd(mV, off, preAV, cu, data, itb);
+			cacheIterator(itb, cu);
+		}
+		else if(noZero)
+			preAggregateDenseCharMapRowNoZero(mV, off, preAV, data, itb);
+		else
+			preAggregateDenseCharMapRow(mV, off, preAV, data, itb);
+	}
+
+	private void preAggregateDenseCharMapRow(double[] mV, int off, double[] preAV, char[] data, IterateByteOffset it) {
+		final int maxId = data.length - 1;
+		int offset = it.offset + off;
+		int index = it.index;
+		int dataIndex = it.dataIndex;
+
+		preAV[data[dataIndex]] += mV[offset];
+		while(dataIndex < maxId) {
+			byte v = offsets[index];
+			while(v == 0) {
+				offset += maxV;
+				index++;
+				v = offsets[index];
+			}
+			offset += v & 0xff;
+			index++;
+			dataIndex++;
+			preAV[data[dataIndex]] += mV[offset];
+		}
+	}
+
+	private void preAggregateDenseCharMapRowNoZero(double[] mV, int off, double[] preAV, char[] data,
+		IterateByteOffset it) {
+
+		int offset = it.offset + off;
+		int index = it.index;
+		while(index < offsets.length) {
+			preAV[data[index]] += mV[offset];
+			offset += offsets[index++] & 0xFF;
+		}
+		preAV[data[index]] += mV[offset];
+	}
+
+	private void preAggregateDenseCharMapRowBelowEnd(double[] mV, int off, double[] preAV, int cu, char[] data,
+		IterateByteOffset it) {
+
+		cu += off;
+		it.offset += off;
+		while(it.offset < cu) {
+			preAV[data[it.dataIndex]] += mV[it.offset];
+			byte v = offsets[it.index];
+			while(v == 0) {
+				it.offset += maxV;
+				it.index++;
+				v = offsets[it.index];
+			}
+			it.offset += v & 0xFF;
+			it.index++;
+			it.dataIndex++;
+		}
+		it.offset -= off;
+	}
+
+	private void preAggregateDenseCharMapRowBelowEndAndNoZero(double[] mV, int off, double[] preAV, int cu, char[] data,
+		IterateByteOffset it) {
+		int offset = it.offset + off;
+		int index = it.index;
+
+		cu += off;
+
+		while(offset < cu) {
+			preAV[data[index]] += mV[offset];
+			offset += offsets[index++] & 0xFF;
+		}
+
+		it.offset = offset - off;
+		it.dataIndex = index;
+		it.index = index;
+	}
+
+	private final void preAggregateDenseCharMapRowBelowEndAndNoZeroNoOverHalf(double[] mV, int off, double[] preAV,
+		int cu, char[] data, IterateByteOffset it) {
+		int offset = it.offset + off;
+		int index = it.index;
+
+		cu += off;
+
+		while(offset < cu) {
+			preAV[data[index]] += mV[offset];
+			offset += offsets[index++];
+		}
+
+		it.offset = offset - off;
+		it.dataIndex = index;
+		it.index = index;
+	}
+
+	@Override
+	protected final void preAggregateDenseMapRowBit(double[] mV, int off, double[] preAV, int cu, int nVal, BitSet data,
+		AIterator it) {
+		int offset = it.offset + off;
+		int index = it.index;
+		int dataIndex = it.dataIndex;
+
+		if(cu > offsetToLast) {
+			final int last = offsetToLast + off;
+
+			while(offset < last) {
+				preAV[data.get(dataIndex) ? 1 : 0] += mV[offset];
+				byte v = offsets[index];
+				while(v == 0) {
+					offset += maxV;
+					index++;
+					v = offsets[index];
+				}
+				offset += v & 0xFF;
+				index++;
+				dataIndex++;
+			}
+			preAV[data.get(dataIndex) ? 1 : 0] += mV[offset];
+		}
+		else {
+			final int last = cu + off;
+			while(offset < last) {
+				preAV[data.get(dataIndex) ? 1 : 0] += mV[offset];
+				byte v = offsets[index];
+				while(v == 0) {
+					offset += maxV;
+					index++;
+					v = offsets[index];
+				}
+				offset += v & 0xFF;
+				index++;
+				dataIndex++;
+			}
+
+		}
+		it.offset = offset - off;
+		it.dataIndex = index;
+		it.index = index;
+		cacheIterator(it, cu);
+	}
+
+	@Override
+	protected void preAggregateDenseMapRowsByte(DenseBlock db, double[] preAV, int rl, int ru, int cl, int cu, int nVal,
+		byte[] data, AIterator it) {
+		IterateByteOffset itb = (IterateByteOffset) it;
+		if(cu < getOffsetToLast() + 1)
+			preAggregateDenseMapRowsByteBelowEnd(db, preAV, rl, ru, cl, cu, nVal, data, itb);
+		else
+			preAggregateDenseMapRowsByteEnd(db, preAV, rl, ru, cl, cu, nVal, data, itb);
+	}
+
+	private void preAggregateDenseMapRowsByteBelowEnd(DenseBlock db, final double[] preAV, final int rl, final int ru,
+		final int cl, final int cu, final int nVal, byte[] data, IterateByteOffset it) {
+		final double[] vals = db.values(rl);
+		final int nCol = db.getCumODims(0);
+		while(it.offset < cu) {
+			final int dataOffset = data[it.dataIndex] & 0xFF;
+			final int start = it.offset + nCol * rl;
+			final int end = it.offset + nCol * ru;
+			for(int offOut = dataOffset, off = start; off < end; offOut += nVal, off += nCol)
+				preAV[offOut] += vals[off];
+			it.next();
+		}
+
+		cacheIterator(it, cu);
+	}
+
+	private void preAggregateDenseMapRowsByteEnd(DenseBlock db, double[] preAV, int rl, int ru, int cl, int cu, int nVal,
+		byte[] data, IterateByteOffset it) {
+		final int maxId = data.length - 1;
+		final int offsetStart = it.offset;
+		final int indexStart = it.index;
+		final int dataIndexStart = it.dataIndex;
+		// all the way to the end of offsets.
+		for(int r = rl; r < ru; r++) {
+			final int offOut = (r - rl) * nVal;
+			final int off = db.pos(r);
+			final double[] vals = db.values(r);
+			it.offset = offsetStart + off;
+			it.index = indexStart;
+			it.dataIndex = dataIndexStart;
+			preAV[offOut + data[it.dataIndex] & 0xFF] += vals[it.offset];
+			while(it.dataIndex < maxId) {
+				it.next();
+				preAV[offOut + data[it.dataIndex] & 0xFF] += vals[it.offset];
+			}
+		}
+	}
+
+	@Override
+	protected void preAggregateDenseMapRowsChar(DenseBlock db, double[] preAV, int rl, int ru, int cl, int cu, int nVal,
+		char[] data, AIterator it) {
+		IterateByteOffset itb = (IterateByteOffset) it;
+		if(cu < getOffsetToLast() + 1)
+			preAggregateDenseMapRowsCharBelowEnd(db, preAV, rl, ru, cl, cu, nVal, data, itb);
+		else
+			preAggregateDenseMapRowsCharEnd(db, preAV, rl, ru, cl, cu, nVal, data, itb);
+
+	}
+
+	private void preAggregateDenseMapRowsCharBelowEnd(DenseBlock db, double[] preAV, int rl, int ru, int cl, int cu,
+		int nVal, char[] data, IterateByteOffset it) {
+		final double[] vals = db.values(rl);
+		while(it.offset < cu) {
+			final int dataOffset = data[it.dataIndex];
+			for(int r = rl, offOut = dataOffset; r < ru; r++, offOut += nVal)
+				preAV[offOut] += vals[it.offset + db.pos(r)];
+			it.next();
+		}
+		cacheIterator(it, cu);
+	}
+
+	private void preAggregateDenseMapRowsCharEnd(DenseBlock db, double[] preAV, int rl, int ru, int cl, int cu, int nVal,
+		char[] data, IterateByteOffset it) {
+		final int maxId = data.length - 1;
+		// all the way to the end.
+		final int offsetStart = it.offset;
+		final int indexStart = it.index;
+		final int dataIndexStart = it.dataIndex;
+		for(int r = rl; r < ru; r++) {
+			final int offOut = (r - rl) * nVal;
+			final int off = db.pos(r);
+			final double[] vals = db.values(r);
+			it.offset = offsetStart + off;
+			it.index = indexStart;
+			it.dataIndex = dataIndexStart;
+			preAV[offOut + data[it.dataIndex]] += vals[it.offset];
+			while(it.dataIndex < maxId) {
+				it.next();
+				preAV[offOut + data[it.dataIndex]] += vals[it.offset];
+			}
 		}
-		return new OffsetByte(offsets, offsetToFirst);
 	}
 
 	private class IterateByteOffset extends AIterator {
@@ -138,26 +603,22 @@ public class OffsetByte extends AOffset {
 
 		@Override
 		public void next() {
-			if(index >= offsets.length) {
-				index++;
-				dataIndex++;
-				return;
-			}
-
-			final byte v = offsets[index++];
-			if(v == 0) {
+			byte v = offsets[index];
+			while(v == 0) {
 				offset += maxV;
-				next();
-			}
-			else {
-				dataIndex++;
-				offset += v & 0xFF;
+				index++;
+				v = offsets[index];
 			}
+			offset += v & 0xFF;
+			index++;
+			dataIndex++;
 		}
 
 		@Override
-		public boolean hasNext() {
-			return index <= offsets.length;
+		public int skipTo(int idx) {
+			while(offset < idx && index < offsets.length)
+				next();
+			return offset;
 		}
 
 		@Override
@@ -165,4 +626,5 @@ public class OffsetByte extends AOffset {
 			return new IterateByteOffset(index, dataIndex, offset);
 		}
 	}
+
 }
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/OffsetChar.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/OffsetChar.java
index c1c2930..695d6c5 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/OffsetChar.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/OffsetChar.java
@@ -21,19 +21,19 @@ package org.apache.sysds.runtime.compress.colgroup.offset;
 import java.io.DataInput;
 import java.io.DataOutput;
 import java.io.IOException;
-import java.util.Arrays;
+import java.util.BitSet;
 
-import org.apache.sysds.runtime.compress.DMLCompressionException;
+import org.apache.sysds.runtime.data.DenseBlock;
 import org.apache.sysds.utils.MemoryEstimates;
 
 public class OffsetChar extends AOffset {
 
 	private static final long serialVersionUID = -1192266421395964882L;
-
-	private final static int maxV = (int) Character.MAX_VALUE;
+	private static final int maxV = (int) Character.MAX_VALUE;
 
 	private final char[] offsets;
 	private final int offsetToFirst;
+	private final int offsetToLast;
 
 	public OffsetChar(int[] indexes) {
 		this(indexes, 0, indexes.length);
@@ -42,21 +42,20 @@ public class OffsetChar extends AOffset {
 	public OffsetChar(int[] indexes, int apos, int alen) {
 		int endSize = 0;
 		offsetToFirst = indexes[apos];
+		offsetToLast = indexes[alen - 1];
 		int ov = offsetToFirst;
-		for(int i = apos+1; i < alen; i++) {
+		for(int i = apos + 1; i < alen; i++) {
 			final int nv = indexes[i];
-			endSize += 1 + (nv - ov) / maxV;
+			endSize += 1 + (nv - ov - 1) / maxV;
 			ov = nv;
 		}
 		offsets = new char[endSize];
 		ov = offsetToFirst;
 		int p = 0;
 
-		for(int i =  apos+1; i < alen; i++) {
+		for(int i = apos + 1; i < alen; i++) {
 			final int nv = indexes[i];
 			final int offsetSize = (nv - ov);
-			if(offsetSize == 0)
-				throw new DMLCompressionException("Invalid difference between cells :\n" + Arrays.toString(indexes));
 			final int div = offsetSize / maxV;
 			final int mod = offsetSize % maxV;
 			if(mod == 0) {
@@ -72,9 +71,10 @@ public class OffsetChar extends AOffset {
 		}
 	}
 
-	private OffsetChar(char[] offsets, int offsetToFirst) {
+	private OffsetChar(char[] offsets, int offsetToFirst, int offsetToLast) {
 		this.offsets = offsets;
 		this.offsetToFirst = offsetToFirst;
+		this.offsetToLast = offsetToLast;
 	}
 
 	@Override
@@ -93,7 +93,9 @@ public class OffsetChar extends AOffset {
 
 	@Override
 	public long getInMemorySize() {
-		return getInMemorySize(offsets.length);
+		long size = 16 + 4 + 8; // object header plus int plus reference
+		size += MemoryEstimates.charArrayCost(offsets.length);
+		return size;
 	}
 
 	@Override
@@ -111,22 +113,204 @@ public class OffsetChar extends AOffset {
 		return size;
 	}
 
+	@Override
+	public int getOffsetToFirst() {
+		return offsetToFirst;
+	}
+
+	@Override
+	public int getOffsetToLast() {
+		return offsetToLast;
+	}
+
+	@Override
+	public int getOffsetsLength() {
+		return offsets.length;
+	}
+
 	public static OffsetChar readFields(DataInput in) throws IOException {
-		int offsetToFirst = in.readInt();
-		int offsetsLength = in.readInt();
-		char[] offsets = new char[offsetsLength];
+		final int offsetToFirst = in.readInt();
+		final int offsetsLength = in.readInt();
+		final char[] offsets = new char[offsetsLength];
+		int offsetToLast = offsetToFirst;
 		for(int i = 0; i < offsetsLength; i++) {
 			offsets[i] = in.readChar();
+			offsetToLast += offsets[i];
 		}
-		return new OffsetChar(offsets, offsetToFirst);
+		return new OffsetChar(offsets, offsetToFirst, offsetToLast);
 	}
 
-	public static long getInMemorySize(int length) {
+	public static long estimateInMemorySize(int nOffs, int nRows) {
 		long size = 16 + 4 + 8; // object header plus int plus reference
-		size += MemoryEstimates.charArrayCost(length - 1);
+		size += MemoryEstimates.charArrayCost(Math.max(nOffs, nRows / maxV));
 		return size;
 	}
 
+	@Override
+	protected final void preAggregateDenseMapRowByte(double[] mV, int off, double[] preAV, int cu, int nVal, byte[] data,
+		AIterator it) {
+		final int maxId = data.length - 1;
+		while(it.isNotOver(cu)) {
+			final int dx = it.getDataIndex();
+			preAV[data[dx] & 0xFF] += mV[off + it.value()];
+			if(dx < maxId)
+				it.next();
+			else
+				break;
+		}
+		cacheIterator(it, cu);
+	}
+
+	@Override
+	protected final void preAggregateDenseMapRowChar(double[] mV, int off, double[] preAV, int cu, int nVal, char[] data,
+		AIterator it) {
+		final int maxId = data.length - 1;
+		while(it.isNotOver(cu)) {
+			final int dx = it.getDataIndex();
+			preAV[data[dx]] += mV[off + it.value()];
+			if(dx < maxId)
+				it.next();
+			else
+				break;
+		}
+		cacheIterator(it, cu);
+	}
+
+	@Override
+	protected final void preAggregateDenseMapRowBit(double[] mV, int off, double[] preAV, int cu, int nVal, BitSet data,
+		AIterator it) {
+		int offset = it.offset + off;
+		int index = it.index;
+		int dataIndex = it.dataIndex;
+
+		if(cu > offsetToLast) {
+			final int last = offsetToLast + off;
+
+			while(offset < last) {
+				preAV[data.get(dataIndex) ? 1 : 0] += mV[offset];
+				char v = offsets[index];
+				while(v == 0) {
+					offset += maxV;
+					index++;
+					v = offsets[index];
+				}
+				offset += v;
+				index++;
+				dataIndex++;
+			}
+			preAV[data.get(dataIndex) ? 1 : 0] += mV[offset];
+		}
+		else {
+			final int last = cu + off;
+			while(offset < last) {
+				preAV[data.get(dataIndex) ? 1 : 0] += mV[offset];
+				char v = offsets[index];
+				while(v == 0) {
+					offset += maxV;
+					index++;
+					v = offsets[index];
+				}
+				offset += v;
+				index++;
+				dataIndex++;
+			}
+
+		}
+		it.offset = offset - off;
+		it.dataIndex = index;
+		it.index = index;
+		cacheIterator(it, cu);
+	}
+
+	@Override
+	protected void preAggregateDenseMapRowsByte(DenseBlock db, double[] preAV, int rl, int ru, int cl, int cu, int nVal,
+		byte[] data, AIterator it) {
+
+		final int offsetStart = it.offset;
+		final int indexStart = it.index;
+		final int dataIndexStart = it.dataIndex;
+		if(cu < getOffsetToLast() + 1) {
+			// inside offsets
+			for(int r = rl; r < ru; r++) {
+				final int offOut = (r - rl) * nVal;
+				final double[] vals = db.values(r);
+				final int off = db.pos(r);
+				final int cur = cu + off;
+				it.offset = offsetStart + off;
+				it.index = indexStart;
+				it.dataIndex = dataIndexStart;
+				while(it.offset < cur) {
+					preAV[offOut + data[it.dataIndex] & 0xFF] += vals[it.offset];
+					it.next();
+				}
+				it.offset -= off;
+			}
+			cacheIterator(it, cu);
+		}
+		else {
+			final int maxId = data.length - 1;
+			// all the way to the end of offsets.
+			for(int r = rl; r < ru; r++) {
+				final int offOut = (r - rl) * nVal;
+				final int off = db.pos(r);
+				final double[] vals = db.values(r);
+				it.offset = offsetStart + off;
+				it.index = indexStart;
+				it.dataIndex = dataIndexStart;
+				preAV[offOut + data[it.dataIndex] & 0xFF] += vals[it.offset];
+				while(it.dataIndex < maxId) {
+					it.next();
+					preAV[offOut + data[it.dataIndex] & 0xFF] += vals[it.offset];
+				}
+			}
+		}
+	}
+
+	@Override
+	protected void preAggregateDenseMapRowsChar(DenseBlock db, double[] preAV, int rl, int ru, int cl, int cu, int nVal,
+		char[] data, AIterator it) {
+
+		final int offsetStart = it.offset;
+		final int indexStart = it.index;
+		final int dataIndexStart = it.dataIndex;
+		if(cu < getOffsetToLast() + 1) {
+
+			for(int r = rl; r < ru; r++) {
+				final int offOut = (r - rl) * nVal;
+				final double[] vals = db.values(r);
+				final int off = db.pos(r);
+				final int cur = cu + off;
+				it.offset = offsetStart + off;
+				it.index = indexStart;
+				it.dataIndex = dataIndexStart;
+				while(it.offset < cur) {
+					preAV[offOut + data[it.dataIndex]] += vals[it.offset];
+					it.next();
+				}
+				it.offset -= off;
+			}
+
+			cacheIterator(it, cu);
+		}
+		else {
+			final int maxId = data.length - 1;
+			// all the way to the end.
+			for(int r = rl; r < ru; r++) {
+				final int offOut = (r - rl) * nVal;
+				final int off = db.pos(r);
+				final double[] vals = db.values(r);
+				it.offset = offsetStart + off;
+				it.index = indexStart;
+				it.dataIndex = dataIndexStart;
+				preAV[offOut + data[it.dataIndex]] += vals[it.offset];
+				while(it.dataIndex < maxId) {
+					it.next();
+					preAV[offOut + data[it.dataIndex]] += vals[it.offset];
+				}
+			}
+		}
+	}
+
 	private class IterateCharOffset extends AIterator {
 
 		private IterateCharOffset() {
@@ -139,25 +323,27 @@ public class OffsetChar extends AOffset {
 
 		@Override
 		public void next() {
-			if(index >= offsets.length) {
-				index++;
-				dataIndex++;
-				return;
-			}
-			final char v = offsets[index++];
-			if(v == 0) {
+			char v = offsets[index];
+			while(v == 0) {
 				offset += maxV;
-				next();
-			}
-			else {
-				dataIndex++;
-				offset += v;
+				index++;
+				v = offsets[index];
 			}
+			offset += v;
+			index++;
+			dataIndex++;
+		}
+
+		@Override
+		public int value() {
+			return offset;
 		}
 
 		@Override
-		public boolean hasNext() {
-			return index <= offsets.length;
+		public int skipTo(int idx) {
+			while(offset < idx && index < offsets.length)
+				next();
+			return offset;
 		}
 
 		@Override
@@ -165,5 +351,4 @@ public class OffsetChar extends AOffset {
 			return new IterateCharOffset(index, dataIndex, offset);
 		}
 	}
-
 }
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/OffsetFactory.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/OffsetFactory.java
index d54be82..9904be3 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/OffsetFactory.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/OffsetFactory.java
@@ -22,12 +22,11 @@ package org.apache.sysds.runtime.compress.colgroup.offset;
 import java.io.DataInput;
 import java.io.IOException;
 
-import org.apache.sysds.runtime.compress.DMLCompressionException;
-
 public interface OffsetFactory {
 
 	// static final Log LOG = LogFactory.getLog(OffsetFactory.class.getName());
 
+	/** The specific underlying types of offsets. */
 	public enum OFF_TYPE {
 		BYTE, CHAR
 	}
@@ -35,11 +34,14 @@ public interface OffsetFactory {
 	/**
 	 * Main factory pattern creator for Offsets.
 	 * 
+	 * Note this creator is unsafe in the sense that it assumes the input index list contains only sequential,
+	 * non-duplicate, incrementing values.
+	 * 
 	 * @param indexes List of indexes, that is assumed to be sorted and have no duplicates
 	 * @return AOffset object containing offsets to the next value.
 	 */
-	public static AOffset create(int[] indexes) {
-		return create(indexes, 0, indexes.length);
+	public static AOffset createOffset(int[] indexes) {
+		return createOffset(indexes, 0, indexes.length);
 	}
 
 	/**
@@ -48,18 +50,22 @@ public interface OffsetFactory {
 	 * This is useful if the input is created from a CSR matrix, since it allows us to not reallocate the indexes[] but
 	 * use the shared indexes from the entire CSR representation.
 	 * 
+	 * Note this creator is unsafe in the sense that it assumes the input indexes in the range from apos to alen
+	 * contain only sequential, non-duplicate, incrementing values.
+	 * 
 	 * @param indexes The indexes from which to take the offsets.
 	 * @param apos    The position to start looking from in the indexes.
 	 * @param alen    The position to end looking at in the indexes.
 	 * @return A new Offset.
 	 */
-	public static AOffset create(int[] indexes, int apos, int alen) {
+	public static AOffset createOffset(int[] indexes, int apos, int alen) {
+		final int minValue = indexes[apos];
 		final int maxValue = indexes[alen - 1];
-		if(maxValue < 0)
-			throw new DMLCompressionException("Invalid sizes given");
+		final int range = maxValue - minValue;
 		final int endLength = alen - apos;
-		final float avgDist = (float) maxValue / endLength;
-		if(avgDist < 256)
+		final long byteSize = OffsetByte.estimateInMemorySize(endLength, range);
+		final long charSize = OffsetChar.estimateInMemorySize(endLength, range);
+		if(byteSize < charSize)
 			return new OffsetByte(indexes, apos, alen);
 		else
 			return new OffsetChar(indexes, apos, alen);
@@ -96,16 +102,14 @@ public interface OffsetFactory {
 	 * @return The estimated size of an offset given the number of offsets and rows.
 	 */
 	public static long estimateInMemorySize(int size, int nRows) {
-		if(size < 0 || nRows < 0)
-			throw new DMLCompressionException("Invalid sizes given: " + size + "  " + nRows);
-		else if(size == 0)
+		if(size == 0)
 			return 8; // If this is the case, then the compression results in constant col groups
 		else {
 			final int avgDiff = nRows / size;
 			if(avgDiff < 256)
-				return OffsetByte.getInMemorySize(size - 1);
+				return OffsetByte.estimateInMemorySize(size - 1, nRows);
 			else
-				return OffsetChar.getInMemorySize(size - 1);
+				return OffsetChar.estimateInMemorySize(size - 1, nRows);
 		}
 	}
 }
diff --git a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibAppend.java b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibAppend.java
index 6ca2619..68eca80 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibAppend.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibAppend.java
@@ -70,6 +70,7 @@ public class CLALibAppend {
 
 		ret = appendColGroups(ret, left.getColGroups(), right.getColGroups(), left.getNumColumns());
 
+		ret.setOverlapping(left.isOverlapping() || right.isOverlapping());
 		double compressedSize = ret.getInMemorySize();
 		double uncompressedSize = MatrixBlock.estimateSizeInMemory(m, n, ret.getSparsity());
 
@@ -85,24 +86,20 @@ public class CLALibAppend {
 	}
 
 	private static MatrixBlock appendRightEmpty(CompressedMatrixBlock left, MatrixBlock right, int m, int n) {
-
 		CompressedMatrixBlock ret = new CompressedMatrixBlock(m, n);
-
 		List<AColGroup> newGroup = new ArrayList<>(1);
 		newGroup.add(ColGroupEmpty.generate(right.getNumColumns()));
 		ret = appendColGroups(ret, left.getColGroups(), newGroup, left.getNumColumns());
-
+		ret.setOverlapping(left.isOverlapping());
 		return ret;
 	}
 
 	private static MatrixBlock appendLeftEmpty(MatrixBlock left, CompressedMatrixBlock right, int m, int n) {
-
 		CompressedMatrixBlock ret = new CompressedMatrixBlock(m, n);
-
 		List<AColGroup> newGroup = new ArrayList<>(1);
 		newGroup.add(ColGroupEmpty.generate(left.getNumColumns()));
 		ret = appendColGroups(ret, newGroup, right.getColGroups(), left.getNumColumns());
-
+		ret.setOverlapping(right.isOverlapping());
 		return ret;
 	}
 
diff --git a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibBinaryCellOp.java b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibBinaryCellOp.java
index e4c3333..d8f582d 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibBinaryCellOp.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibBinaryCellOp.java
@@ -70,8 +70,8 @@ public class CLALibBinaryCellOp {
 		if(that.isEmpty())
 			return binaryOperationsEmpty(op, m1, that, result);
 		that = CompressedMatrixBlock.getUncompressed(that, "Decompressing right side in BinaryOps");
-		LibMatrixBincell.isValidDimensionsBinary(m1, that);
-		BinaryAccessType atype = LibMatrixBincell.getBinaryAccessType(m1, that);
+		LibMatrixBincell.isValidDimensionsBinaryExtended(m1, that);
+		BinaryAccessType atype = LibMatrixBincell.getBinaryAccessTypeExtended(m1, that);
 		return selectProcessingBasedOnAccessType(op, m1, that, result, atype, false);
 	}
 
@@ -85,8 +85,8 @@ public class CLALibBinaryCellOp {
 			throw new NotImplementedException("Not handling left empty yet");
 
 		that = CompressedMatrixBlock.getUncompressed(that, "Decompressing left side in BinaryOps");
-		LibMatrixBincell.isValidDimensionsBinary(that, m1);
-		BinaryAccessType atype = LibMatrixBincell.getBinaryAccessType(that, m1);
+		LibMatrixBincell.isValidDimensionsBinaryExtended(that, m1);
+		BinaryAccessType atype = LibMatrixBincell.getBinaryAccessTypeExtended(that, m1);
 		return selectProcessingBasedOnAccessType(op, m1, that, result, atype, true);
 
 	}
@@ -114,10 +114,13 @@ public class CLALibBinaryCellOp {
 	private static MatrixBlock selectProcessingBasedOnAccessType(BinaryOperator op, CompressedMatrixBlock m1,
 		MatrixBlock that, MatrixBlock result, BinaryAccessType atype, boolean left) {
 
-		if(atype == BinaryAccessType.MATRIX_COL_VECTOR) {
+		if(atype == BinaryAccessType.MATRIX_COL_VECTOR || atype == BinaryAccessType.COL_VECTOR_MATRIX) {
 			// Column vector access
 			MatrixBlock d_compressed = m1.getCachedDecompressed();
 			if(d_compressed != null) {
+				if(left && atype == BinaryAccessType.COL_VECTOR_MATRIX)
+					throw new NotImplementedException("Binary row op left is not supported for Uncompressed Matrix, "
+						+ "Implement support for VMr in MatrixBLock Binary Cell operations");
 				if(left)
 					return that.binaryOperations(op, d_compressed);
 				else
@@ -133,7 +136,8 @@ public class CLALibBinaryCellOp {
 			else
 				return d_compressed.binaryOperations(op, that);
 		}
-		else if(isSupportedBinaryCellOp(op.fn))
+		else if(isSupportedBinaryCellOp(op.fn) && atype == BinaryAccessType.MATRIX_ROW_VECTOR ||
+			atype == BinaryAccessType.ROW_VECTOR_MATRIX)
 			// Row matrix access.
 			return rowBinCellOp(m1, that, result, op, left);
 		else
@@ -158,8 +162,8 @@ public class CLALibBinaryCellOp {
 		return ret;
 	}
 
-	private static MatrixBlock rowBinCellOp(CompressedMatrixBlock m1, MatrixBlock m2, MatrixBlock ret,
-		BinaryOperator op, boolean left) {
+	private static MatrixBlock rowBinCellOp(CompressedMatrixBlock m1, MatrixBlock m2, MatrixBlock ret, BinaryOperator op,
+		boolean left) {
 		CompressedMatrixBlock cRet = setupCompressedReturnMatrixBlock(m1, ret);
 		if(isValidForOverlappingBinaryCellOperations(m1, op))
 			overlappingBinaryCellOp(m1, m2, cRet, op, left);
@@ -178,11 +182,6 @@ public class CLALibBinaryCellOp {
 				// Verify if it is okay to include all OuterVectorVector ops here.
 				binaryMVRow(m1, m2, ret, op, left);
 				return;
-			case OUTER_VECTOR_VECTOR:
-				if(m2.getNumRows() == 1 && m2.getNumColumns() == 1) {
-					CLALibScalar.scalarOperations(new RightScalarOperator(op.fn, m2.quickGetValue(0, 0)), m1, ret);
-				}
-				return;
 			default:
 				LOG.warn("Inefficient Decompression for " + op + "  " + atype);
 				m1.decompress().binaryOperations(op, m2, ret);
@@ -190,26 +189,19 @@ public class CLALibBinaryCellOp {
 	}
 
 	private static boolean isValidForOverlappingBinaryCellOperations(CompressedMatrixBlock m1, BinaryOperator op) {
-		return m1.isOverlapping() && !(op.fn instanceof Multiply || op.fn instanceof Divide);
+		return m1.isOverlapping() && (op.fn instanceof Plus || op.fn instanceof Minus);
 	}
 
 	private static void overlappingBinaryCellOp(CompressedMatrixBlock m1, MatrixBlock m2, CompressedMatrixBlock ret,
 		BinaryOperator op, boolean left) {
-		if(op.fn instanceof Plus || op.fn instanceof Minus)
-			binaryMVPlusStack(m1, m2, ret, op, left);
-		else
-			throw new NotImplementedException(op + " not implemented for Overlapping CLA");
-
+		binaryMVPlusStack(m1, m2, ret, op, left);
 	}
 
-	public static CompressedMatrixBlock binaryMVRow(CompressedMatrixBlock m1, double[] v, CompressedMatrixBlock ret,
+	private static CompressedMatrixBlock binaryMVRow(CompressedMatrixBlock m1, double[] v, CompressedMatrixBlock ret,
 		BinaryOperator op, boolean left) {
 
 		final List<AColGroup> oldColGroups = m1.getColGroups();
 
-		if(ret == null)
-			ret = new CompressedMatrixBlock(m1.getNumRows(), m1.getNumColumns());
-
 		final int k = op.getNumThreads();
 		final List<AColGroup> newColGroups = new ArrayList<>(oldColGroups.size());
 		final boolean isRowSafe = left ? op.isRowSafeLeft(v) : op.isRowSafeRight(v);
@@ -252,7 +244,6 @@ public class CLALibBinaryCellOp {
 			pool.shutdown();
 		}
 		catch(InterruptedException | ExecutionException e) {
-			e.printStackTrace();
 			throw new DMLRuntimeException(e);
 		}
 	}
@@ -287,8 +278,6 @@ public class CLALibBinaryCellOp {
 
 	protected static CompressedMatrixBlock binaryMVPlusStack(CompressedMatrixBlock m1, MatrixBlock m2,
 		CompressedMatrixBlock ret, BinaryOperator op, boolean left) {
-		if(m2.isEmpty())
-			return m1;
 		final List<AColGroup> oldColGroups = m1.getColGroups();
 		final int size = oldColGroups.size();
 		final List<AColGroup> newColGroups = new ArrayList<>(size);
@@ -333,39 +322,48 @@ public class CLALibBinaryCellOp {
 
 	private static MatrixBlock binaryMVCol(CompressedMatrixBlock m1, MatrixBlock m2, BinaryOperator op, boolean left) {
 
-		MatrixBlock ret = new MatrixBlock(m1.getNumRows(), m1.getNumColumns(), false, -1).allocateBlock();
+		final int nCols = m1.getNumColumns();
+		final int nRows = m1.getNumRows();
+		// Pre filter.
+		final List<AColGroup> groups = m1.getColGroups();
+		final boolean shouldFilter = CLALibUtils.shouldPreFilter(groups);
+		if(shouldFilter) {
+			CompressedMatrixBlock mf1 = new CompressedMatrixBlock(m1);
+			double[] constV = new double[nCols];
+			final List<AColGroup> filteredGroups = CLALibUtils.filterGroups(groups, constV);
+			filteredGroups.add(ColGroupFactory.genColGroupConst(constV));
+			mf1.allocateColGroupList(filteredGroups);
+			m1 = mf1;
+		}
+		MatrixBlock ret = new MatrixBlock(nRows, nCols, false, -1).allocateBlock();
 
-		final int blkz = CompressionSettings.BITMAP_BLOCK_SZ / m1.getNumColumns() * 5;
+		final int blkz = CompressionSettings.BITMAP_BLOCK_SZ / nCols * 5;
 		final int k = op.getNumThreads();
 		long nnz = 0;
 
 		if(k <= 1) {
-			for(int i = 0; i * blkz < m1.getNumRows(); i++) {
+			for(int i = 0; i < nRows; i += blkz) {
 				if(left)
-					nnz += new BinaryMVColLeftTask(m1, m2, ret, i * blkz, Math.min(m1.getNumRows(), (i + 1) * blkz), op)
-						.call();
+					nnz += new BinaryMVColLeftTask(m1, m2, ret, i, Math.min(nRows, i + blkz), op).call();
 				else
-					nnz += new BinaryMVColTask(m1, m2, ret, i * blkz, Math.min(m1.getNumRows(), (i + 1) * blkz), op).call();
+					nnz += new BinaryMVColTask(m1, m2, ret, i, Math.min(nRows, i + blkz), op).call();
 			}
 		}
 		else {
 			ExecutorService pool = CommonThreadPool.get(op.getNumThreads());
 			ArrayList<Callable<Integer>> tasks = new ArrayList<>();
 			try {
-				for(int i = 0; i * blkz < m1.getNumRows(); i++) {
+				for(int i = 0; i < nRows; i += blkz) {
 					if(left)
-						tasks.add(
-							new BinaryMVColLeftTask(m1, m2, ret, i * blkz, Math.min(m1.getNumRows(), (i + 1) * blkz), op));
+						tasks.add(new BinaryMVColLeftTask(m1, m2, ret, i, Math.min(nRows, i + blkz), op));
 					else
-						tasks.add(new BinaryMVColTask(m1, m2, ret, i * blkz, Math.min(m1.getNumRows(), (i + 1) * blkz), op));
-
+						tasks.add(new BinaryMVColTask(m1, m2, ret, i, Math.min(nRows, i + blkz), op));
 				}
 				for(Future<Integer> f : pool.invokeAll(tasks))
 					nnz += f.get();
 				pool.shutdown();
 			}
 			catch(InterruptedException | ExecutionException e) {
-				e.printStackTrace();
 				throw new DMLRuntimeException(e);
 			}
 		}
@@ -396,7 +394,7 @@ public class CLALibBinaryCellOp {
 		public Integer call() {
 			// unsafe decompress, since we count nonzeros afterwards.
 			for(AColGroup g : _m1.getColGroups())
-				g.decompressToBlock(_ret, _rl, _ru);
+				g.decompressToDenseBlock(_ret.getDenseBlock(), _rl, _ru);
 
 			if(_m2.isInSparseFormat())
 				throw new NotImplementedException("Not Implemented sparse Format execution for MM.");
@@ -440,7 +438,7 @@ public class CLALibBinaryCellOp {
 		public Integer call() {
 			// unsafe decompress, since we count nonzeros afterwards.
 			for(AColGroup g : _m1.getColGroups())
-				g.decompressToBlock(_ret, _rl, _ru);
+				g.decompressToDenseBlock(_ret.getDenseBlock(), _rl, _ru);
 
 			if(_m2.isInSparseFormat())
 				throw new NotImplementedException("Not Implemented sparse Format execution for MM.");
diff --git a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibCompAgg.java b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibCompAgg.java
index 4a39eac..373c8af 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibCompAgg.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibCompAgg.java
@@ -35,6 +35,7 @@ import org.apache.sysds.runtime.compress.CompressionSettings;
 import org.apache.sysds.runtime.compress.DMLCompressionException;
 import org.apache.sysds.runtime.compress.colgroup.AColGroup;
 import org.apache.sysds.runtime.compress.colgroup.AColGroupOffset;
+import org.apache.sysds.runtime.compress.colgroup.ColGroupFactory;
 import org.apache.sysds.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
 import org.apache.sysds.runtime.functionobjects.Builtin;
 import org.apache.sysds.runtime.functionobjects.Builtin.BuiltinCode;
@@ -407,14 +408,25 @@ public class CLALibCompAgg {
 	private static List<Future<MatrixBlock>> generateUnaryAggregateOverlappingFutures(CompressedMatrixBlock m1,
 		MatrixBlock ret, AggregateUnaryOperator op) throws InterruptedException {
 
-		ExecutorService pool = CommonThreadPool.get(op.getNumThreads());
-		ArrayList<UnaryAggregateOverlappingTask> tasks = new ArrayList<>();
-
-		final int blklen = CompressionSettings.BITMAP_BLOCK_SZ / m1.getNumColumns() * 5;
-
-		for(int i = 0; i * blklen < m1.getNumRows(); i++)
-			tasks.add(
-				new UnaryAggregateOverlappingTask(m1, ret, i * blklen, Math.min((i + 1) * blklen, m1.getNumRows()), op));
+		final ExecutorService pool = CommonThreadPool.get(op.getNumThreads());
+		final ArrayList<UAOverlappingTask> tasks = new ArrayList<>();
+		final int nCol = m1.getNumColumns();
+		final int nRow = m1.getNumRows();
+		final int blklen = CompressionSettings.BITMAP_BLOCK_SZ / nCol * 5;
+		final List<AColGroup> groups = m1.getColGroups();
+		final boolean shouldFilter = CLALibUtils.shouldPreFilter(groups);
+		if(shouldFilter) {
+			final double[] constV = new double[nCol];
+			final List<AColGroup> filteredGroups = CLALibUtils.filterGroups(groups, constV);
+			final AColGroup cRet = ColGroupFactory.genColGroupConst(constV);
+			filteredGroups.add(cRet);
+			for(int i = 0; i < nRow; i += blklen)
+				tasks.add(new UAOverlappingTask(filteredGroups, ret, i, Math.min(i + blklen, nRow), op, nCol));
+		}
+		else {
+			for(int i = 0; i < nRow; i += blklen)
+				tasks.add(new UAOverlappingTask(groups, ret, i, Math.min(i + blklen, nRow), op, nCol));
+		}
 
 		List<Future<MatrixBlock>> futures = pool.invokeAll(tasks);
 		pool.shutdown();
@@ -532,38 +544,42 @@ public class CLALibCompAgg {
 		}
 	}
 
-	private static class UnaryAggregateOverlappingTask implements Callable<MatrixBlock> {
-		private final CompressedMatrixBlock _m1;
+	private static class UAOverlappingTask implements Callable<MatrixBlock> {
+		private final List<AColGroup> _groups;
 		private final int _rl;
 		private final int _ru;
 		private final MatrixBlock _ret;
 		private final AggregateUnaryOperator _op;
+		private final int _nCol;
 
-		protected UnaryAggregateOverlappingTask(CompressedMatrixBlock m1, MatrixBlock ret, int rl, int ru,
-			AggregateUnaryOperator op) {
-			_m1 = m1;
+		protected UAOverlappingTask(List<AColGroup> filteredGroups, MatrixBlock ret, int rl, int ru,
+			AggregateUnaryOperator op, int nCol) {
+			_groups = filteredGroups;
 			_op = op;
 			_rl = rl;
 			_ru = ru;
 			_ret = ret;
+			_nCol = nCol;
 		}
 
 		private MatrixBlock getTmp() {
 			MatrixBlock tmp = memPool.get();
 			if(tmp == null) {
-				memPool.set(new MatrixBlock(_ru - _rl, _m1.getNumColumns(), false, -1).allocateBlock());
+				memPool.set(new MatrixBlock(_ru - _rl, _nCol, false, -1).allocateBlock());
 				tmp = memPool.get();
 			}
 			else
-				tmp.reset(_ru - _rl, _m1.getNumColumns(), false, -1);
+				tmp.reset(_ru - _rl, _nCol, false, -1);
 
 			return tmp;
 		}
 
 		private MatrixBlock decompressToTemp() {
 			MatrixBlock tmp = getTmp();
-			for(AColGroup g : _m1.getColGroups())
-				g.decompressToBlock(tmp, _rl, _ru, -_rl, 0);
+
+			for(AColGroup g : _groups)
+				g.decompressToDenseBlock(tmp.getDenseBlock(), _rl, _ru, -_rl, 0);
+
 			tmp.setNonZeros(_rl + _ru);
 			return tmp;
 		}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibDecompress.java b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibDecompress.java
index 558ca7b..b91b38d 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibDecompress.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibDecompress.java
@@ -20,7 +20,6 @@
 package org.apache.sysds.runtime.compress.lib;
 
 import java.util.ArrayList;
-import java.util.Comparator;
 import java.util.List;
 import java.util.concurrent.Callable;
 import java.util.concurrent.ExecutionException;
@@ -31,13 +30,13 @@ import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysds.api.DMLScript;
 import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
-import org.apache.sysds.runtime.compress.CompressionSettings;
 import org.apache.sysds.runtime.compress.DMLCompressionException;
 import org.apache.sysds.runtime.compress.colgroup.AColGroup;
 import org.apache.sysds.runtime.compress.colgroup.ColGroupFactory;
 import org.apache.sysds.runtime.compress.colgroup.ColGroupUncompressed;
 import org.apache.sysds.runtime.controlprogram.parfor.stat.Timing;
 import org.apache.sysds.runtime.data.DenseBlock;
+import org.apache.sysds.runtime.data.SparseBlock;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.apache.sysds.runtime.util.CommonThreadPool;
 import org.apache.sysds.utils.DMLCompressionStatistics;
@@ -62,14 +61,15 @@ public class CLALibDecompress {
 
 	public static void decompressTo(CompressedMatrixBlock cmb, MatrixBlock ret, int rowOffset, int colOffset, int k) {
 		Timing time = new Timing(true);
-
 		final boolean outSparse = ret.isInSparseFormat();
-		if(outSparse && cmb.isOverlapping())
-			throw new DMLCompressionException("Not supported decompression into sparse block from overlapping state");
-		else if(outSparse)
-			decompressToSparseBlock(cmb, ret, rowOffset, colOffset);
-		else
-			decompressToDenseBlock(cmb, ret, rowOffset, colOffset);
+		if(!cmb.isEmpty()) {
+			if(outSparse && cmb.isOverlapping())
+				throw new DMLCompressionException("Not supported decompression into sparse block from overlapping state");
+			else if(outSparse)
+				decompressToSparseBlock(cmb, ret, rowOffset, colOffset);
+			else
+				decompressToDenseBlock(cmb, ret.getDenseBlock(), rowOffset, colOffset);
+		}
 
 		if(DMLScript.STATISTICS) {
 			final double t = time.stop();
@@ -81,34 +81,44 @@ public class CLALibDecompress {
 
 	private static void decompressToSparseBlock(CompressedMatrixBlock cmb, MatrixBlock ret, int rowOffset,
 		int colOffset) {
-		final List<AColGroup> groups = new ArrayList<>(cmb.getColGroups());
-		final int nRows = cmb.getNumRows();
 
-		for(AColGroup g : groups)
-			g.decompressToBlock(ret, 0, nRows, rowOffset, colOffset);
+		final SparseBlock sb = ret.getSparseBlock();
+		final List<AColGroup> groups = cmb.getColGroups();
+		final int nRows = cmb.getNumRows();
+		final boolean shouldFilter = CLALibUtils.shouldPreFilter(groups);
+		if(shouldFilter) {
+			final MatrixBlock tmp = cmb.getUncompressed("Decompression to put into Sparse Block");
+			tmp.putInto(ret, rowOffset, colOffset, false);
+		}
+		else
+			for(AColGroup g : groups)
+				g.decompressToSparseBlock(sb, 0, nRows, rowOffset, colOffset);
 	}
 
-	private static void decompressToDenseBlock(CompressedMatrixBlock cmb, MatrixBlock ret, int rowOffset,
-		int colOffset) {
-		final List<AColGroup> groups = new ArrayList<>(cmb.getColGroups());
+	private static void decompressToDenseBlock(CompressedMatrixBlock cmb, DenseBlock ret, int rowOffset, int colOffset) {
+		final List<AColGroup> groups = cmb.getColGroups();
 		// final int nCols = cmb.getNumColumns();
 		final int nRows = cmb.getNumRows();
 
-		final boolean containsSDC = CLALibUtils.containsSDCOrConst(groups);
-		double[] constV = containsSDC ? new double[cmb.getNumColumns()] : null;
-		final List<AColGroup> filteredGroups = containsSDC ? CLALibUtils.filterGroups(groups, constV) : groups;
-
-		for(AColGroup g : filteredGroups)
-			g.decompressToBlock(ret, 0, nRows, rowOffset, colOffset);
-
-		if(constV != null) {
+		final boolean shouldFilter = CLALibUtils.shouldPreFilter(groups);
+		if(shouldFilter) {
+			final double[] constV = new double[cmb.getNumColumns()];
+			final List<AColGroup> filteredGroups = CLALibUtils.filterGroups(groups, constV);
+			for(AColGroup g : filteredGroups)
+				g.decompressToDenseBlock(ret, 0, nRows, rowOffset, colOffset);
 			AColGroup cRet = ColGroupFactory.genColGroupConst(constV);
-			cRet.decompressToBlock(ret, 0, nRows, rowOffset, colOffset);
+			cRet.decompressToDenseBlock(ret, 0, nRows, rowOffset, colOffset);
+		}
+		else {
+			for(AColGroup g : groups)
+				g.decompressToDenseBlock(ret, 0, nRows, rowOffset, colOffset);
 		}
 	}
 
 	private static MatrixBlock decompressExecute(CompressedMatrixBlock cmb, int k) {
 
+		if(cmb.isEmpty())
+			return new MatrixBlock(cmb.getNumRows(), cmb.getNumColumns(), true);
 		// Copy column groups to make sure we can modify the list if we want to.
 		final List<AColGroup> groups = new ArrayList<>(cmb.getColGroups());
 		final int nRows = cmb.getNumRows();
@@ -122,34 +132,49 @@ public class CLALibDecompress {
 			ret.setNonZeros(ret.recomputeNonZeros());
 			return ret; // if uncompressedColGroup is only colGroup.
 		}
-		else if(ret == null) {
-			ret = new MatrixBlock(nRows, nCols, false, -1);
-			ret.allocateDenseBlock();
-		}
 
-		final int block = (int) Math.ceil((double) (CompressionSettings.BITMAP_BLOCK_SZ) / nCols);
-		final int blklen = block > 1000 ? block + 1000 - block % 1000 : Math.max(64, block);
+		final boolean shouldFilter = CLALibUtils.shouldPreFilter(groups);
+		double[] constV = shouldFilter ? new double[nCols] : null;
+		final List<AColGroup> filteredGroups = shouldFilter ? CLALibUtils.filterGroups(groups, constV) : groups;
+
+		if(ret == null) { // There was no uncompressed group that fit the entire matrix.
+			final boolean sparse = !shouldFilter && !overlapping &&
+				MatrixBlock.evalSparseFormatInMemory(nRows, nCols, nonZeros);
+			ret = new MatrixBlock(nRows, nCols, sparse);
+			if(sparse)
+				ret.allocateSparseRowsBlock();
+			else
+				ret.allocateDenseBlock();
+		}
 
-		final boolean containsSDC = CLALibUtils.containsSDCOrConst(groups);
-		double[] constV = containsSDC ? new double[ret.getNumColumns()] : null;
-		final List<AColGroup> filteredGroups = containsSDC ? CLALibUtils.filterGroups(groups, constV) : groups;
-		if(LOG.isTraceEnabled())
-			LOG.debug("Decompressing with block size: " + blklen);
+		// final int block = (int) Math.ceil((double) (CompressionSettings.BITMAP_BLOCK_SZ) / nCols);
+		// final int blklen = Math.max(block, 64);
+		final int blklen = 32;
 
-		sortGroups(filteredGroups, overlapping);
+		// final int blklen = block > 1000 ? block + 1000 - block % 1000 : Math.max(64, block);
 
 		// check if we are using filtered groups, and if we are not force constV to null
 		if(groups == filteredGroups)
 			constV = null;
 
 		final double eps = getEps(constV);
-		if(k == 1)
-			decompressSingleThread(ret, filteredGroups, nRows, blklen, constV, eps, nonZeros, overlapping);
-		else
-			decompressMultiThread(ret, filteredGroups, nRows, blklen, constV, eps, overlapping, k);
 
-		if(overlapping)
-			ret.recomputeNonZeros();
+		if(k == 1) {
+			if(ret.isInSparseFormat()) {
+				decompressSparseSingleThread(ret, filteredGroups, nRows, blklen);
+				ret.setNonZeros(nonZeros);
+			}
+			else {
+				decompressDenseSingleThread(ret, filteredGroups, nRows, blklen, constV, eps, nonZeros, overlapping);
+				ret.setNonZeros(nonZeros == -1 || overlapping ? ret.recomputeNonZeros() : nonZeros);
+			}
+		}
+		else if(ret.isInSparseFormat()) {
+			decompressSparseMultiThread(ret, filteredGroups, nRows, blklen, k);
+			ret.setNonZeros(nonZeros);
+		}
+		else
+			decompressDenseMultiThread(ret, filteredGroups, nRows, blklen, constV, eps, overlapping, k);
 
 		ret.examSparsity();
 		return ret;
@@ -183,33 +208,46 @@ public class CLALibDecompress {
 		return ret;
 	}
 
-	private static void decompressSingleThread(MatrixBlock ret, List<AColGroup> filteredGroups, int rlen, int blklen,
-		double[] constV, double eps, long nonZeros, boolean overlapping) {
+	private static void decompressSparseSingleThread(MatrixBlock ret, List<AColGroup> filteredGroups, int rlen,
+		int blklen) {
+		final SparseBlock sb = ret.getSparseBlock();
+		for(int i = 0; i < rlen; i += blklen) {
+			final int rl = i;
+			final int ru = Math.min(i + blklen, rlen);
+			for(AColGroup grp : filteredGroups)
+				grp.decompressToSparseBlock(ret.getSparseBlock(), rl, ru);
+			for(int j = rl; j < ru; j++)
+				if(!sb.isEmpty(j))
+					sb.sort(j);
+		}
+
+	}
+
+	private static void decompressDenseSingleThread(MatrixBlock ret, List<AColGroup> filteredGroups, int rlen,
+		int blklen, double[] constV, double eps, long nonZeros, boolean overlapping) {
 		for(int i = 0; i < rlen; i += blklen) {
 			final int rl = i;
 			final int ru = Math.min(i + blklen, rlen);
 			for(AColGroup grp : filteredGroups)
-				grp.decompressToBlock(ret, rl, ru);
+				grp.decompressToDenseBlock(ret.getDenseBlock(), rl, ru);
 			if(constV != null && !ret.isInSparseFormat())
 				addVector(ret, constV, eps, rl, ru);
 		}
-		ret.setNonZeros(nonZeros == -1 || overlapping ? ret.recomputeNonZeros() : nonZeros);
 	}
 
-	private static void decompressMultiThread(MatrixBlock ret, List<AColGroup> filteredGroups, int rlen, int blklen,
+	private static void decompressDenseMultiThread(MatrixBlock ret, List<AColGroup> filteredGroups, int rlen, int blklen,
 		double[] constV, double eps, boolean overlapping, int k) {
 		try {
 			final ExecutorService pool = CommonThreadPool.get(k);
-			final ArrayList<DecompressTask> tasks = new ArrayList<>();
-			for(int i = 0; i * blklen < rlen; i++)
-				tasks.add(new DecompressTask(filteredGroups, ret, eps, i * blklen, Math.min((i + 1) * blklen, rlen),
-					overlapping, constV));
-			List<Future<Long>> rtasks = pool.invokeAll(tasks);
-			pool.shutdown();
+			final ArrayList<DecompressDenseTask> tasks = new ArrayList<>();
+			for(int i = 0; i < rlen; i += blklen)
+				tasks.add(
+					new DecompressDenseTask(filteredGroups, ret, eps, i, Math.min(i + blklen, rlen), overlapping, constV));
 
 			long nnz = 0;
-			for(Future<Long> rt : rtasks)
+			for(Future<Long> rt : pool.invokeAll(tasks))
 				nnz += rt.get();
+			pool.shutdown();
 			ret.setNonZeros(nnz);
 		}
 		catch(InterruptedException | ExecutionException ex) {
@@ -217,23 +255,21 @@ public class CLALibDecompress {
 		}
 	}
 
-	private static void sortGroups(List<AColGroup> groups, boolean overlapping) {
-		if(overlapping) {
-			// add a bit of stability in decompression
-			Comparator<AColGroup> comp = Comparator.comparing(x -> effect(x));
-			groups.sort(comp);
-		}
-	}
+	private static void decompressSparseMultiThread(MatrixBlock ret, List<AColGroup> filteredGroups, int rlen,
+		int blklen, int k) {
+		try {
+			final ExecutorService pool = CommonThreadPool.get(k);
+			final ArrayList<DecompressSparseTask> tasks = new ArrayList<>();
+			for(int i = 0; i < rlen; i += blklen)
+				tasks.add(new DecompressSparseTask(filteredGroups, ret, i, Math.min(i + blklen, rlen)));
 
-	/**
-	 * Calculate an effect value for a column group. This is used to sort the groups before decompression to decompress
-	 * the columns that have the smallest effect first.
-	 * 
-	 * @param x A Group
-	 * @return A Effect double value.
-	 */
-	private static double effect(AColGroup x) {
-		return (x instanceof ColGroupUncompressed) ? -Double.MAX_VALUE : -Math.max(x.getMax(), Math.abs(x.getMin()));
+			for(Future<Object> rt : pool.invokeAll(tasks))
+				rt.get();
+			pool.shutdown();
+		}
+		catch(InterruptedException | ExecutionException ex) {
+			throw new DMLCompressionException("Parallel decompression failed", ex);
+		}
 	}
 
 	/**
@@ -259,7 +295,7 @@ public class CLALibDecompress {
 		}
 	}
 
-	private static class DecompressTask implements Callable<Long> {
+	private static class DecompressDenseTask implements Callable<Long> {
 		private final List<AColGroup> _colGroups;
 		private final MatrixBlock _ret;
 		private final double _eps;
@@ -268,7 +304,7 @@ public class CLALibDecompress {
 		private final double[] _constV;
 		private final boolean _overlapping;
 
-		protected DecompressTask(List<AColGroup> colGroups, MatrixBlock ret, double eps, int rl, int ru,
+		protected DecompressDenseTask(List<AColGroup> colGroups, MatrixBlock ret, double eps, int rl, int ru,
 			boolean overlapping, double[] constV) {
 			_colGroups = colGroups;
 			_ret = ret;
@@ -282,7 +318,7 @@ public class CLALibDecompress {
 		@Override
 		public Long call() {
 			for(AColGroup grp : _colGroups)
-				grp.decompressToBlock(_ret, _rl, _ru);
+				grp.decompressToDenseBlock(_ret.getDenseBlock(), _rl, _ru);
 
 			if(_constV != null)
 				addVector(_ret, _constV, _eps, _rl, _ru);
@@ -291,6 +327,31 @@ public class CLALibDecompress {
 		}
 	}
 
+	private static class DecompressSparseTask implements Callable<Object> {
+		private final List<AColGroup> _colGroups;
+		private final MatrixBlock _ret;
+		private final int _rl;
+		private final int _ru;
+
+		protected DecompressSparseTask(List<AColGroup> colGroups, MatrixBlock ret, int rl, int ru) {
+			_colGroups = colGroups;
+			_ret = ret;
+			_rl = rl;
+			_ru = ru;
+		}
+
+		@Override
+		public Object call() {
+			final SparseBlock sb = _ret.getSparseBlock();
+			for(AColGroup grp : _colGroups)
+				grp.decompressToSparseBlock(_ret.getSparseBlock(), _rl, _ru);
+			for(int i = _rl; i < _ru; i++)
+				if(!sb.isEmpty(i))
+					sb.sort(i);
+			return null;
+		}
+	}
+
 	/**
 	 * Add the rowV vector to each row in ret.
 	 * 
diff --git a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibLeftMultBy.java b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibLeftMultBy.java
index 919f98a..d46e071 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibLeftMultBy.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibLeftMultBy.java
@@ -32,9 +32,11 @@ import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysds.runtime.DMLRuntimeException;
 import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
+import org.apache.sysds.runtime.compress.DMLCompressionException;
 import org.apache.sysds.runtime.compress.colgroup.AColGroup;
-import org.apache.sysds.runtime.compress.colgroup.AColGroupValue;
 import org.apache.sysds.runtime.compress.colgroup.APreAgg;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.MatrixBlockDictionary;
+import org.apache.sysds.runtime.data.DenseBlock;
 import org.apache.sysds.runtime.data.SparseBlock;
 import org.apache.sysds.runtime.functionobjects.Plus;
 import org.apache.sysds.runtime.matrix.data.LibMatrixMult;
@@ -88,11 +90,6 @@ public class CLALibLeftMultBy {
 			return prepareEmptyReturnMatrix(right, left, ret, true);
 		ret = prepareReturnMatrix(right, left, ret, true);
 		leftMultByCompressedTransposedMatrix(right, left, ret, k);
-
-		// fall back solution?
-		// MatrixBlock leftUc = left.getUncompressed();
-		// leftMultByMatrixTransposed(right, leftUc, ret, k);
-
 		return ret;
 	}
 
@@ -112,7 +109,7 @@ public class CLALibLeftMultBy {
 		if(left.isEmpty() || right.isEmpty())
 			return prepareEmptyReturnMatrix(right, left, ret, false);
 		ret = prepareReturnMatrix(right, left, ret, false);
-		ret = leftMultByMatrix(right.getColGroups(), left, ret, k, right.isOverlapping());
+		ret = LMM(right.getColGroups(), left, ret, k, right.isOverlapping());
 		return ret;
 	}
 
@@ -121,8 +118,8 @@ public class CLALibLeftMultBy {
 		final List<AColGroup> groups = cmb.getColGroups();
 		final int numColumns = cmb.getNumColumns();
 		final int numRows = cmb.getNumRows();
-		final boolean containsSDC = CLALibUtils.containsSDCOrConst(groups);
-		final double[] constV = containsSDC ? new double[numColumns] : null;
+		final boolean shouldFilter = CLALibUtils.shouldPreFilter(groups);
+		final double[] constV = shouldFilter ? new double[numColumns] : null;
 		final List<AColGroup> filteredGroups = CLALibUtils.filterGroups(groups, constV);
 
 		// TODO add parallel again
@@ -177,17 +174,21 @@ public class CLALibLeftMultBy {
 		final List<AColGroup> rightCG = right.getColGroups();
 		final List<AColGroup> leftCG = left.getColGroups();
 
-		final boolean containsRight = CLALibUtils.containsSDCOrConst(rightCG);
+		final boolean containsRight = CLALibUtils.shouldPreFilter(rightCG);
 		double[] cR = containsRight ? new double[cr] : null;
 		final List<AColGroup> fRight = CLALibUtils.filterGroups(rightCG, cR);
 
-		final boolean containsLeft = CLALibUtils.containsSDCOrConst(leftCG);
+		final boolean containsLeft = CLALibUtils.shouldPreFilter(leftCG);
 		double[] cL = containsLeft ? new double[rl] : null;
 		final List<AColGroup> fLeft = CLALibUtils.filterGroups(leftCG, cL);
 
-		for(int i = 0; i < fRight.size(); i++)
-			for(int j = 0; j < fLeft.size(); j++)
-				fRight.get(i).leftMultByAColGroup(fLeft.get(j), ret);
+		for(int j = 0; j < fLeft.size(); j++) {
+			final AColGroup lCg = fLeft.get(j);
+			for(int i = 0; i < fRight.size(); i++) {
+				final AColGroup rCg = fRight.get(i);
+				rCg.leftMultByAColGroup(lCg, ret);
+			}
+		}
 
 		double[] retV = ret.getDenseBlockValues();
 		if(containsLeft && containsRight)
@@ -237,30 +238,25 @@ public class CLALibLeftMultBy {
 		}
 	}
 
-	private static MatrixBlock leftMultByMatrix(List<AColGroup> colGroups, MatrixBlock that, MatrixBlock ret, int k,
+	private static MatrixBlock LMM(List<AColGroup> colGroups, MatrixBlock that, MatrixBlock ret, int k,
 		boolean overlapping) {
 
-		if(that.isEmpty()) {
-			ret.setNonZeros(0);
-			return ret;
-		}
-
 		final int numColumnsOut = ret.getNumColumns();
-		final boolean containsSDC = CLALibUtils.containsSDCOrConst(colGroups);
+		final boolean shouldFilter = CLALibUtils.shouldPreFilter(colGroups);
 		final int lr = that.getNumRows();
 
 		// a constant colgroup summing the default values.
-		double[] constV = containsSDC ? new double[numColumnsOut] : null;
+		double[] constV = shouldFilter ? new double[numColumnsOut] : null;
 		final List<AColGroup> filteredGroups = CLALibUtils.filterGroups(colGroups, constV);
 		if(colGroups == filteredGroups)
 			constV = null;
 		double[] rowSums;
-
 		if(!filteredGroups.isEmpty()) {
+			rowSums = shouldFilter ? new double[lr] : null;
 			if(k == 1)
-				rowSums = leftMultByMatrixPrimitive(filteredGroups, that, ret, 0, lr, containsSDC ? new double[lr] : null);
+				LMMPrimitive(filteredGroups, that, ret, 0, lr, rowSums);
 			else
-				rowSums = leftMultByMatrixParallel(filteredGroups, that, ret, containsSDC, overlapping, k);
+				LMMParallel(filteredGroups, that, ret, rowSums, overlapping, k);
 		}
 		else if(constV != null)
 			rowSums = that.rowSum(k).getDenseBlockValues();
@@ -274,25 +270,25 @@ public class CLALibLeftMultBy {
 		}
 
 		ret.recomputeNonZeros();
+		ret.examSparsity();
 		return ret;
 	}
 
-	private static double[] leftMultByMatrixParallel(List<AColGroup> filteredGroups, MatrixBlock that, MatrixBlock ret,
-		boolean calculateRowSums, boolean overlapping, int k) {
-		LOG.debug("Parallel left matrix multiplication");
+	private static void LMMParallel(List<AColGroup> filteredGroups, MatrixBlock that, MatrixBlock ret, double[] rowSums,
+		boolean overlapping, int k) {
+		LOG.debug("Parallel left matrix multiplication thatRows: " + that.getNumRows());
 		try {
 			final ExecutorService pool = CommonThreadPool.get(k);
 			final ArrayList<Callable<MatrixBlock>> tasks = new ArrayList<>();
 			final int rl = that.getNumRows();
-			final int rowBlockSize = rl <= k ? 1 : Math.min(Math.max(rl / k * 2, 1), 8);
-			final double[] rowSums = calculateRowSums ? new double[rl] : null;
-			final int numberSplits = Math.max((k / (rl / rowBlockSize)), 1);
+			final int numberSplits = Math.max((filteredGroups.size() / k), 1);
+			final int rowBlockThreads = Math.max(k / numberSplits, 1);
+			final int rowBlockSize = rl <= rowBlockThreads ? 1 : Math.min(Math.max(rl / rowBlockThreads, 1), 16);
 
 			if(numberSplits == 1) {
 				// no need to handle overlapping here, since outputs are in distinct locations
 				for(int blo = 0; blo < rl; blo += rowBlockSize)
-					tasks.add(new LeftMatrixColGroupMultTask(filteredGroups, that, ret, blo,
-						Math.min(blo + rowBlockSize, rl), rowSums));
+					tasks.add(new LMMTask(filteredGroups, that, ret, blo, Math.min(blo + rowBlockSize, rl), rowSums));
 
 				for(Future<MatrixBlock> future : pool.invokeAll(tasks))
 					future.get();
@@ -311,22 +307,25 @@ public class CLALibLeftMultBy {
 						if(tmpRet.getDenseBlock() == null)
 							tmpRet.allocateDenseBlock();
 						if(i == 0)
-							tasks.add(new LeftMatrixColGroupMultTask(gr, that, tmpRet, start, end, rowSums));
+							tasks.add(new LMMTask(gr, that, tmpRet, start, end, rowSums));
 						else
-							tasks.add(new LeftMatrixColGroupMultTask(gr, that, tmpRet, start, end, null));
+							tasks.add(new LMMTask(gr, that, tmpRet, start, end, null));
 					}
 				}
 				if(useTmp) {
+					// Add the overlapping outputs from each thread group to the output.
 					BinaryOperator op = new BinaryOperator(Plus.getPlusFnObject());
-					for(Future<MatrixBlock> future : pool.invokeAll(tasks))
-						ret.binaryOperationsInPlace(op, future.get());
+					for(Future<MatrixBlock> future : pool.invokeAll(tasks)) {
+						MatrixBlock mb = future.get();
+						mb.examSparsity();
+						ret.binaryOperationsInPlace(op, mb);
+					}
 				}
 				else
 					for(Future<MatrixBlock> future : pool.invokeAll(tasks))
 						future.get();
 			}
 			pool.shutdown();
-			return rowSums;
 		}
 		catch(InterruptedException | ExecutionException e) {
 			throw new DMLRuntimeException(e);
@@ -367,7 +366,7 @@ public class CLALibLeftMultBy {
 		}
 	}
 
-	private static class LeftMatrixColGroupMultTask implements Callable<MatrixBlock> {
+	private static class LMMTask implements Callable<MatrixBlock> {
 		private final List<AColGroup> _groups;
 		private final MatrixBlock _that;
 		private final MatrixBlock _ret;
@@ -375,8 +374,7 @@ public class CLALibLeftMultBy {
 		private final int _ru;
 		private final double[] _rowSums;
 
-		protected LeftMatrixColGroupMultTask(List<AColGroup> groups, MatrixBlock that, MatrixBlock ret, int rl, int ru,
-			double[] rowSums) {
+		protected LMMTask(List<AColGroup> groups, MatrixBlock that, MatrixBlock ret, int rl, int ru, double[] rowSums) {
 			_groups = groups;
 			_that = that;
 			_ret = ret;
@@ -388,7 +386,7 @@ public class CLALibLeftMultBy {
 		@Override
 		public MatrixBlock call() {
 			try {
-				leftMultByMatrixPrimitive(_groups, _that, _ret, _rl, _ru, _rowSums);
+				LMMPrimitive(_groups, _that, _ret, _rl, _ru, _rowSums);
 			}
 			catch(Exception e) {
 				e.printStackTrace();
@@ -398,110 +396,104 @@ public class CLALibLeftMultBy {
 		}
 	}
 
-	private static double[] leftMultByMatrixPrimitive(List<AColGroup> colGroups, MatrixBlock that, MatrixBlock ret,
-		int rl, int ru, double[] rowSums) {
+	private static void LMMPrimitive(List<AColGroup> colGroups, MatrixBlock that, MatrixBlock ret, int rl, int ru,
+		double[] rowSums) {
 		if(that.isInSparseFormat())
-			leftMultByMatrixPrimitiveSparse(colGroups, that, ret, rl, ru, rowSums);
+			LMMPrimitiveSparse(colGroups, that, ret, rl, ru, rowSums);
 		else
-			leftMultByMatrixPrimitiveDense(colGroups, that, ret, rl, ru, rowSums);
-		ret.setNonZeros(ret.getNumRows() * ret.getNumColumns()); // always assume dense, this is corrected later
-		return rowSums;
+			LMMPrimitiveDense(colGroups, that, ret, rl, ru, rowSums);
 	}
 
-	private static void leftMultByMatrixPrimitiveSparse(List<AColGroup> colGroups, MatrixBlock that, MatrixBlock ret,
-		int rl, int ru, double[] rowSum) {
+	private static void LMMPrimitiveSparse(List<AColGroup> colGroups, MatrixBlock that, MatrixBlock ret, int rl, int ru,
+		double[] rowSum) {
 
 		for(int i = rl; i < ru; i++) {
-			for(int j = 0; j < colGroups.size(); j++) {
+			final SparseBlock sb = that.getSparseBlock();
+			if(sb.isEmpty(i))
+				continue;
+			// row multiplication
+			for(int j = 0; j < colGroups.size(); j++)
 				colGroups.get(j).leftMultByMatrix(that, ret, i, i + 1);
-			}
+
 			if(rowSum != null) {
-				final SparseBlock sb = that.getSparseBlock();
-				if(!sb.isEmpty(i)) {
-					final int apos = sb.pos(i);
-					final int alen = sb.size(i) + apos;
-					final double[] aval = sb.values(i);
-					for(int j = apos; j < alen; j++)
-						rowSum[i] += aval[j];
-				}
+				final int apos = sb.pos(i);
+				final int alen = sb.size(i) + apos;
+				final double[] aval = sb.values(i);
+				for(int j = apos; j < alen; j++)
+					rowSum[i] += aval[j];
 			}
 		}
 	}
 
-	private static void leftMultByMatrixPrimitiveDense(List<AColGroup> colGroups, MatrixBlock that, MatrixBlock ret,
-		int rl, int ru, double[] rowSum) {
+	private static void LMMPrimitiveDense(List<AColGroup> colGroups, MatrixBlock that, MatrixBlock ret, int rl, int ru,
+		double[] rowSum) {
 
-		final int numColsOut = ret.getNumColumns();
 		// Allocate a ColGroupValue array for the Column Groups of Value Type and multiply out any other columns.
 		final List<APreAgg> preAggCGs = preFilterAndMultiply(colGroups, that, ret, rl, ru);
-
+		/** The column block size for preAggregating column groups */
+		final int colBZ = 128;
 		// The number of rows to process together
-		final int rowBlockSize = 1;
+		final int rowBlockSize = 16;
 		// The number of column groups to process together
 		// the value should ideally be set so that the colGroups fits into cache together with a row block.
 		// currently we only try to avoid having a dangling small number of column groups in the last block.
-		final int colGroupBlocking = preAggCGs.size() % 16 < 4 ? 20 : 16;
+		// final int colGroupBlocking = preAggCGs.size() ;// % 16 < 4 ? 20 : 16;
+		final int colGroupBlocking = 8;
+		// final int colGroupBlocking = 4;
+		final int nColGroups = preAggCGs.size();
 
 		// Allocate pre Aggregate Array List
 		final MatrixBlock[] preAgg = populatePreAggregate(colGroupBlocking);
 
-		// Allocate temporary Result matrix.
-		final MatrixBlock tmpRes = new MatrixBlock(rowBlockSize, numColsOut, false);
+		// Allocate temporary Result matrix
+		// guaranteed to be large enough for all groups
+		final MatrixBlock tmpRes = new MatrixBlock(rowBlockSize, ret.getNumColumns(), false);
 
 		final int lc = that.getNumColumns();
+		// For each row block
+		for(int rlt = rl; rlt < ru; rlt += rowBlockSize) {
+			final int rut = Math.min(rlt + rowBlockSize, ru);
+			// For each column group block
+			for(int gl = 0; gl < nColGroups; gl += colGroupBlocking) {
+				final int gu = Math.min(gl + colGroupBlocking, nColGroups);
+				// For each column group in the current block allocate the preaggregate array.
+				for(int j = gl; j < gu; j++) {
+					final int preAggNCol = preAggCGs.get(j).getPreAggregateSize();
+					preAgg[j % colGroupBlocking].reset(rut - rlt, preAggNCol, false);
+				}
 
-		// For each column group block
-		for(int g = 0; g < preAggCGs.size(); g += colGroupBlocking) {
-			final int gEnd = Math.min(g + colGroupBlocking, preAggCGs.size());
-
-			// For each column group in the current block allocate the preaggregate array.
-			for(int j = g; j < gEnd && j < preAggCGs.size(); j++) {
-				AColGroupValue cg = preAggCGs.get(j);
-				int nVals = cg.getNumValues();
-				preAgg[j % colGroupBlocking].reset(rowBlockSize, nVals, false);
-			}
-
-			int colBlockSize = 32000;
-
-			// For each row block
-			for(int h = rl; h < ru; h += rowBlockSize) {
-				// For each column block
-				final int rowUpper = Math.min(h + rowBlockSize, ru);
-				for(int i = 0; i < lc; i += colBlockSize) {
-					final int colUpper = Math.min(i + colBlockSize, lc);
-					// Pre Aggregate each column group in block
-					for(int j = g; j < gEnd && j < preAggCGs.size(); j++) {
-						preAggCGs.get(j).preAggregateDense(that, preAgg[j % colGroupBlocking], h, rowUpper, i, colUpper);
-					}
-					if(rowSum != null) {
-						final double[] thatV = that.getDenseBlockValues();
-						for(int r = h; r < rowUpper; r++) {
-							final int rowOff = r * lc;
-							for(int c = rowOff + i; c < rowOff + colUpper; c++)
-								rowSum[r] += thatV[c];
-						}
-					}
+				// PreAggregate current block of column groups
+				for(int cl = 0; cl < lc; cl += colBZ) {
+					final int cu = Math.min(cl + colBZ, lc);
+					for(int j = gl; j < gu; j++)
+						preAggCGs.get(j).preAggregateDense(that, preAgg[j % colGroupBlocking], rlt, rut, cl, cu);
+					if(gu == nColGroups)
+						rowSum(that, rowSum, rlt, rut, cl, cu);
 				}
 
-				// Multiply out the preAggregate to the output matrix.
-				for(int j = g; j < gEnd && j < preAggCGs.size(); j++) {
-					AColGroupValue vj = preAggCGs.get(j);
-					MatrixBlock preAggJ = preAgg[j % colGroupBlocking];
-					preAggJ.recomputeNonZeros();
-					tmpRes.reset(rowBlockSize, vj.getNumCols(), false);
-					MatrixBlock tmp = vj.leftMultByPreAggregateMatrix(preAggJ, tmpRes);
-					vj.addMatrixToResult(tmp, ret, h, Math.min(h + rowBlockSize, ru));
-					preAggJ.reset();
+				// Multiply out the PreAggregate to the output matrix.
+				for(int j = gl; j < gu; j++) {
+					final APreAgg cg = preAggCGs.get(j);
+					final MatrixBlock preAggThis = preAgg[j % colGroupBlocking];
+					MMPreaggregate(cg, preAggThis, tmpRes, ret, rlt, rut);
 				}
 			}
 		}
 
-		if(preAggCGs.size() == 0 && rowSum != null) {
-			final double[] thatV = that.getDenseBlockValues();
... 2969 lines suppressed ...

[systemds] 02/03: [MINOR] Revert processAddRow to not use compressed

Posted by ba...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

baunsgaard pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git

commit 148092c8f15278dd6c87fbf0321c4c3902b34a7e
Author: baunsgaard <ba...@tugraz.at>
AuthorDate: Mon Dec 13 17:20:49 2021 +0100

    [MINOR] Revert processAddRow to not use compressed
    
    Previously I added a binaryMVRow op for compressed matrices in
    processAddRow, but since then extra checks were added to the binary row
    operation, risking removal of the output in this special case. To keep
    behavior consistent for now, I revert to using uncompressed operations.
---
 .../java/org/apache/sysds/runtime/matrix/data/MatrixBlock.java   | 9 ++++-----
 src/test/java/org/apache/sysds/test/TestUtils.java               | 2 +-
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/MatrixBlock.java b/src/main/java/org/apache/sysds/runtime/matrix/data/MatrixBlock.java
index 922f7ab..65af775 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/MatrixBlock.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/MatrixBlock.java
@@ -48,7 +48,6 @@ import org.apache.sysds.lops.MapMultChain.ChainType;
 import org.apache.sysds.runtime.DMLRuntimeException;
 import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
 import org.apache.sysds.runtime.compress.DMLCompressionException;
-import org.apache.sysds.runtime.compress.lib.CLALibBinaryCellOp;
 import org.apache.sysds.runtime.controlprogram.caching.CacheBlock;
 import org.apache.sysds.runtime.controlprogram.caching.MatrixObject.UpdateType;
 import org.apache.sysds.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
@@ -3805,10 +3804,10 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab
 		for( MatrixBlock in : inputs ) {
 			if( in.isEmptyBlock(false) )
 				continue;
-			if(in instanceof CompressedMatrixBlock){
-				in = CLALibBinaryCellOp.binaryMVRow((CompressedMatrixBlock) in,c, null, new BinaryOperator(Plus.getPlusFnObject()), false);
-			}
-			else if( in.isInSparseFormat() ) {
+			if(in instanceof CompressedMatrixBlock)
+				in = CompressedMatrixBlock.getUncompressed(in, "ProcessAddRow");
+			
+			if( in.isInSparseFormat() ) {
 				SparseBlock a = in.getSparseBlock();
 				if( a.isEmpty(i) ) continue;
 				LibMatrixMult.vectAdd(a.values(i), c, a.indexes(i), a.pos(i), cix, a.size(i));
diff --git a/src/test/java/org/apache/sysds/test/TestUtils.java b/src/test/java/org/apache/sysds/test/TestUtils.java
index 125de36..a0ba5bf 100644
--- a/src/test/java/org/apache/sysds/test/TestUtils.java
+++ b/src/test/java/org/apache/sysds/test/TestUtils.java
@@ -918,7 +918,7 @@ public class TestUtils
 				continue;
 			
 			if(sba.size(i) != sbe.size(i))
-				fail(message+"\nNumber of values are not equal in row: " + i);
+				fail(message+"\nNumber of values are not equal in row: " + i +"\nactual:"+ sba.get(i) +"\nexpected:"+ sbe.get(i));
 
 			final double[] e = sbe.values(i);
 			final double[] a = sba.values(i);