You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2016/07/17 00:23:31 UTC

[6/6] incubator-systemml git commit: [SYSTEMML-810] New compressed matrix blocks and operations, tests

[SYSTEMML-810] New compressed matrix blocks and operations, tests

For details, see https://issues.apache.org/jira/browse/SYSTEMML-449.

Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/16e7b1c8
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/16e7b1c8
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/16e7b1c8

Branch: refs/heads/master
Commit: 16e7b1c88e45e007d1db229717311fcf70bc6b19
Parents: 71013e7
Author: Matthias Boehm <mb...@us.ibm.com>
Authored: Sat Jul 16 00:43:16 2016 -0700
Committer: Matthias Boehm <mb...@us.ibm.com>
Committed: Sat Jul 16 17:22:50 2016 -0700

----------------------------------------------------------------------
 .../runtime/compress/BitmapDecoderOLE.java      |  127 ++
 .../runtime/compress/BitmapDecoderRLE.java      |  117 ++
 .../sysml/runtime/compress/BitmapEncoder.java   |  392 +++++
 .../apache/sysml/runtime/compress/ColGroup.java |  259 ++++
 .../sysml/runtime/compress/ColGroupBitmap.java  |  498 +++++++
 .../sysml/runtime/compress/ColGroupOLE.java     |  634 +++++++++
 .../sysml/runtime/compress/ColGroupRLE.java     |  617 ++++++++
 .../runtime/compress/ColGroupUncompressed.java  |  360 +++++
 .../runtime/compress/CompressedMatrixBlock.java | 1342 ++++++++++++++++++
 .../runtime/compress/PlanningBinPacker.java     |  191 +++
 .../sysml/runtime/compress/PlanningCoCoder.java |  227 +++
 .../runtime/compress/PlanningCoCodingGroup.java |  104 ++
 .../compress/PlanningGroupMergeAction.java      |   73 +
 .../runtime/compress/ReaderColumnSelection.java |   64 +
 .../compress/ReaderColumnSelectionDense.java    |   68 +
 .../ReaderColumnSelectionDenseSample.java       |   86 ++
 .../compress/ReaderColumnSelectionSparse.java   |  115 ++
 .../runtime/compress/UncompressedBitmap.java    |  101 ++
 .../compress/estim/CompressedSizeEstimator.java |  145 ++
 .../estim/CompressedSizeEstimatorExact.java     |   53 +
 .../estim/CompressedSizeEstimatorSample.java    |  767 ++++++++++
 .../compress/estim/CompressedSizeInfo.java      |   69 +
 .../compress/estim/SizeEstimatorFactory.java    |   40 +
 .../runtime/compress/utils/ConverterUtils.java  |   99 ++
 .../sysml/runtime/compress/utils/DblArray.java  |   91 ++
 .../compress/utils/DblArrayIntListHashMap.java  |  179 +++
 .../compress/utils/DoubleIntListHashMap.java    |  181 +++
 .../runtime/compress/utils/IntArrayList.java    |  102 ++
 .../compress/utils/LinearAlgebraUtils.java      |  383 +++++
 .../runtime/functionobjects/KahanFunction.java  |    8 +
 .../runtime/functionobjects/KahanPlus.java      |    5 +
 .../runtime/functionobjects/KahanPlusSq.java    |   15 +-
 .../runtime/matrix/data/LibMatrixMult.java      |   27 +-
 .../sysml/runtime/matrix/data/MatrixBlock.java  |    2 +-
 .../compress/BasicCompressionTest.java          |  168 +++
 .../compress/BasicMatrixAppendTest.java         |  176 +++
 .../compress/BasicMatrixMultChainTest.java      |  245 ++++
 .../BasicMatrixTransposeSelfMultTest.java       |  172 +++
 .../compress/BasicMatrixVectorMultTest.java     |  180 +++
 .../BasicScalarOperationsSparseUnsafeTest.java  |  177 +++
 .../compress/BasicScalarOperationsTest.java     |  177 +++
 .../BasicTransposeSelfLeftMatrixMultTest.java   |  172 +++
 .../compress/BasicUnaryAggregateTest.java       |  544 +++++++
 .../compress/BasicVectorMatrixMultTest.java     |  180 +++
 .../functions/compress/CompressedLinregCG.java  |  151 ++
 .../compress/CompressedSerializationTest.java   |  185 +++
 .../compress/LargeCompressionTest.java          |  169 +++
 .../compress/LargeMatrixVectorMultTest.java     |  180 +++
 .../compress/LargeParMatrixVectorMultTest.java  |  182 +++
 .../compress/LargeVectorMatrixMultTest.java     |  180 +++
 .../compress/ParMatrixMultChainTest.java        |  247 ++++
 .../compress/ParMatrixVectorMultTest.java       |  182 +++
 .../ParTransposeSelfLeftMatrixMultTest.java     |  174 +++
 .../compress/ParUnaryAggregateTest.java         |  547 +++++++
 .../compress/ParVectorMatrixMultTest.java       |  181 +++
 src/test/scripts/functions/compress/LinregCG.R  |   57 +
 .../scripts/functions/compress/LinregCG.dml     |   56 +
 .../compress/SystemML-config-compress.xml       |   59 +
 .../functions/compress/ZPackageSuite.java       |   56 +
 59 files changed, 12322 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16e7b1c8/src/main/java/org/apache/sysml/runtime/compress/BitmapDecoderOLE.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/BitmapDecoderOLE.java b/src/main/java/org/apache/sysml/runtime/compress/BitmapDecoderOLE.java
new file mode 100644
index 0000000..fc6e861
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/compress/BitmapDecoderOLE.java
@@ -0,0 +1,127 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.compress;
+
+import java.util.Arrays;
+import java.util.Iterator;
+
+/**
+ * General-purpose iterator to decode a compressed OLE bitmap.
+ *  
+ */
+public final class BitmapDecoderOLE implements Iterator<Integer> 
+{
+	// pointer to the compressed bitmap
+	private int _bmOff;
+	private int _bmLen;
+	private char[] _bmPtr;
+
+	// The index of the current block. Block 0 covers bits 1 through 2^16
+	private int _blockIx;
+
+	// The offset where the current block starts within the bitmap.
+	private int _blockStartOffset;
+
+	// The number of offsets in the current block.
+	private int _curBlockSize;
+
+	// The offset <b>in the current block</b> the <b>next</b> element we will
+	// read from the bitmap, or bmPtr.length if we are done.
+	private int _nextBmOffset;
+
+	/**
+	 * Point this object at the beginning of a particular bitmap. After a call
+	 * to this method, the next call to {@link #nextOffset()} will return the
+	 * offset of the first bit in the specified bitmap.
+	 * 
+	 * @param bmPtr
+	 *            pointer to a compressed bitmap
+	 */
+	public BitmapDecoderOLE(char[] bmPtr, int off, int len) {
+		_bmOff = off;
+		_bmLen = len;
+		_bmPtr = bmPtr;
+		_blockIx = 0;
+		_blockStartOffset = 0;
+		_curBlockSize = _bmPtr[_bmOff+_blockStartOffset];
+		if (_curBlockSize < 0) {
+			throw new RuntimeException(String.format(
+					"Negative block size %d at position %d of %s",
+					_curBlockSize, _blockStartOffset, Arrays.toString(bmPtr)));
+		}
+		_nextBmOffset = 0;
+
+		// Advance past any zero-length blocks at the beginning of the array
+		while (_blockStartOffset < _bmLen
+				&& _nextBmOffset >= _curBlockSize) {
+			advanceToNextBlock();
+		}
+	}
+
+	@Override
+	public Integer next() {
+		if( !hasNext() )
+			throw new RuntimeException("No next offset existing.");
+		
+		// Grab the lookahead value Note the +1 in the array indexing; 
+		// the first number in a block is the block size
+		int offsetFromBlockBegin = _bmPtr[_bmOff+_blockStartOffset + 1 + _nextBmOffset];
+		int ret = (_blockIx * BitmapEncoder.BITMAP_BLOCK_SZ)
+				+ offsetFromBlockBegin;
+		_nextBmOffset++;
+
+		// Advance to next non-empty block if we reached the end of the block.
+		while (_blockStartOffset < _bmLen && _nextBmOffset >= _curBlockSize) {
+			advanceToNextBlock();
+		}
+
+		return ret;
+	}
+
+	@Override
+	public boolean hasNext() {
+		return _blockStartOffset < _bmLen;
+	}
+
+	@Override
+	public void remove() {
+		throw new RuntimeException("Not implemented for BitmapDecoderOLE.");
+	}
+
+	/**
+	 * Move forward to the next block. Does not skip empty blocks.
+	 */
+	private void advanceToNextBlock() {
+		_blockStartOffset += (1 + _curBlockSize);
+		_blockIx++;
+		if (_blockStartOffset >= _bmLen) {
+			// Read past last block
+			return;
+		}
+
+		_curBlockSize = _bmPtr[_bmOff+_blockStartOffset];
+		if (_curBlockSize < 0) {
+			throw new RuntimeException(String.format(
+					"Negative block size %d at position %d of %s",
+					_curBlockSize, _blockStartOffset, Arrays.toString(_bmPtr)));
+		}
+		_nextBmOffset = 0;
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16e7b1c8/src/main/java/org/apache/sysml/runtime/compress/BitmapDecoderRLE.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/BitmapDecoderRLE.java b/src/main/java/org/apache/sysml/runtime/compress/BitmapDecoderRLE.java
new file mode 100644
index 0000000..54f24ae
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/compress/BitmapDecoderRLE.java
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.compress;
+
+import java.util.Iterator;
+
+/**
+ * General-purpose iterator to decode a compressed OLE bitmap.
+ * 
+ */
+public final class BitmapDecoderRLE implements Iterator<Integer>
+{
+	// pointer to the compressed bitmap
+	private int _bmOff;
+	private int _bmLen;
+	private char[] _bmPtr;
+
+	// The offset of the <b>next</b> element we will read from the bitmap, or
+	// bmPtr.length if we are done.
+	private int _nextBmOffset;
+
+	// The offset in the matrix column of the beginning of the current run
+	private int _runStartOffset;
+
+	// The length of the current run
+	private int _curRunLen;
+
+	// The number of bits that we have returned from the current run.
+	private int _runBitsReturned;
+
+	/**
+	 * Point this object at the beginning of a particular bitmap. After a call
+	 * to this method, the next call to {@link #nextOffset()} will return the
+	 * offset of the first bit in the specified bitmap.
+	 * 
+	 * @param bmPtr
+	 *            pointer to a compressed bitmap
+	 */
+	public BitmapDecoderRLE(char[] bmPtr, int off, int len) {
+		_bmOff = off;
+		_bmLen = len;
+		_bmPtr = bmPtr;
+		_nextBmOffset = 0;
+		_runStartOffset = 0;
+		_curRunLen = 0;
+		_runBitsReturned = 0;
+
+		if (0 == _bmLen) {
+			return; //no runs
+		}
+
+		// Advance to the beginning of the first non-empty run.
+		advanceToNextRun();
+	}
+
+	@Override
+	public Integer next() {
+		if( !hasNext() )
+			throw new RuntimeException("No next offset existing.");
+		
+		// Grab the lookahead value
+		int ret = _runStartOffset + _runBitsReturned;
+
+		_runBitsReturned++;
+
+		// Check for end of run
+		if (_runBitsReturned == _curRunLen) {
+			advanceToNextRun();
+		}
+
+		return ret;
+	}
+
+	@Override
+	public boolean hasNext() {
+		return _runBitsReturned < _curRunLen;
+	}
+	
+	@Override
+	public void remove() {
+		throw new RuntimeException("Not implemented for BitmapDecoderRLE.");
+	}
+	
+	/** Move forward to the next non-empty run. */
+	private void advanceToNextRun() {
+		// While loop needed because some runs are of length 0
+		while (_runBitsReturned == _curRunLen && _nextBmOffset < _bmLen) {
+
+			_runBitsReturned = 0;
+
+			// Read the distance to the next run
+			char delta = _bmPtr[_bmOff + _nextBmOffset];
+
+			// Run length is stored in the next element of the array
+			_runStartOffset += delta + _curRunLen;
+			_curRunLen = _bmPtr[_bmOff + _nextBmOffset + 1];
+			_nextBmOffset += 2;
+		}
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16e7b1c8/src/main/java/org/apache/sysml/runtime/compress/BitmapEncoder.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/BitmapEncoder.java b/src/main/java/org/apache/sysml/runtime/compress/BitmapEncoder.java
new file mode 100644
index 0000000..8a535e1
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/compress/BitmapEncoder.java
@@ -0,0 +1,392 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.compress;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.compress.utils.DblArray;
+import org.apache.sysml.runtime.compress.utils.DblArrayIntListHashMap;
+import org.apache.sysml.runtime.compress.utils.DoubleIntListHashMap;
+import org.apache.sysml.runtime.compress.utils.IntArrayList;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.runtime.matrix.data.SparseBlock;
+
+
+/** 
+ * Static functions for encoding bitmaps in various ways. 
+ * 
+ */
+public class BitmapEncoder 
+{
+	/** Size of the blocks used in a blocked bitmap representation. */
+	public static final int BITMAP_BLOCK_SZ = 65536;
+	
+	/**
+	 * Generate uncompressed bitmaps for a set of columns in an uncompressed
+	 * matrix block.
+	 * 
+	 * @param colIndices
+	 *            indexes (within the block) of the columns to extract
+	 * @param rawblock
+	 *            an uncompressed matrix block; can be dense or sparse
+	 * @return uncompressed bitmap representation of the columns
+	 * @throws DMLRuntimeException 
+	 */
+	public static UncompressedBitmap extractBitmap(int[] colIndices, MatrixBlock rawblock) 
+	{
+		//note: no sparse column selection reader because low potential
+		//single column selection
+		if( colIndices.length==1 ) {
+			return extractBitmap(colIndices[0], rawblock, 
+					!CompressedMatrixBlock.MATERIALIZE_ZEROS);
+		}
+		//multiple column selection	(general case)
+		else { 
+			ReaderColumnSelection reader = null;
+			if( rawblock.isInSparseFormat() && CompressedMatrixBlock.TRANSPOSE_INPUT )
+				reader = new ReaderColumnSelectionSparse(rawblock, colIndices, 
+						!CompressedMatrixBlock.MATERIALIZE_ZEROS);
+			else
+				reader = new ReaderColumnSelectionDense(rawblock, colIndices,
+						!CompressedMatrixBlock.MATERIALIZE_ZEROS); 
+			
+			return extractBitmap(colIndices, rawblock, reader);
+		}
+	}
+
+	/**
+	 * 
+	 * @param colIndices
+	 * @param rawblock
+	 * @param sampleIndexes
+	 * @return
+	 */
+	public static UncompressedBitmap extractBitmapFromSample(int[] colIndices,
+			MatrixBlock rawblock, int[] sampleIndexes) 
+	{
+		//note: no sparse column selection reader because low potential
+		
+		//single column selection
+		if( colIndices.length==1 ) {
+			return extractBitmap(colIndices[0], rawblock, sampleIndexes,
+					!CompressedMatrixBlock.MATERIALIZE_ZEROS);
+		}
+		//multiple column selection	(general case)
+		else {			
+			return extractBitmap(colIndices, rawblock,
+					new ReaderColumnSelectionDenseSample(rawblock, colIndices,
+							sampleIndexes, !CompressedMatrixBlock.MATERIALIZE_ZEROS));	
+		}
+	}
+
+	/**
+	 * Encodes the bitmap as a series of run lengths and offsets.
+	 * <p>
+	 * <b>NOTE: This method must be kept in sync with {@link BitmapDecoderRLE}
+	 * !</b>
+	 * 
+	 * @param offsets
+	 *            uncompressed contents of the bitmap, expressed as a list of
+	 *            the offsets of different bits
+	 * @return compressed version of said bitmap
+	 */
+	public static char[] genRLEBitmap(int[] offsets) {
+		if( offsets.length == 0 )
+			return new char[0]; //empty list
+
+		// Use an ArrayList for correctness at the expense of temp space
+		ArrayList<Character> buf = new ArrayList<Character>();
+
+		// 1 + (position of last 1 in the previous run of 1's)
+		// We add 1 because runs may be of length zero.
+		int lastRunEnd = 0;
+
+		// Offset between the end of the previous run of 1's and the first 1 in
+		// the current run. Initialized below.
+		int curRunOff;
+
+		// Length of the most recent run of 1's
+		int curRunLen = 0;
+
+		// Current encoding is as follows:
+		// Negative entry: abs(Entry) encodes the offset to the next lone 1 bit.
+		// Positive entry: Entry encodes offset to next run of 1's. The next
+		// entry in the bitmap holds a run length.
+
+		// Special-case the first run to simplify the loop below.
+		int firstOff = offsets[0];
+
+		// The first run may start more than a short's worth of bits in
+		while (firstOff > Character.MAX_VALUE) {
+			buf.add(Character.MAX_VALUE);
+			buf.add((char) 0);
+			firstOff -= Character.MAX_VALUE;
+			lastRunEnd += Character.MAX_VALUE;
+		}
+
+		// Create the first run with an initial size of 1
+		curRunOff = firstOff;
+		curRunLen = 1;
+
+		// Process the remaining offsets
+		for (int i = 1; i < offsets.length; i++) {
+
+			int absOffset = offsets[i];
+
+			// 1 + (last position in run)
+			int curRunEnd = lastRunEnd + curRunOff + curRunLen;
+
+			if (absOffset > curRunEnd || curRunLen >= Character.MAX_VALUE) {
+				// End of a run, either because we hit a run of 0's or because the 
+				// number of 1's won't fit in 16 bits. Add run to bitmap and start a new one.
+				buf.add((char) curRunOff);
+				buf.add((char) curRunLen);
+
+				lastRunEnd = curRunEnd;
+				curRunOff = absOffset - lastRunEnd;
+
+				while (curRunOff > Character.MAX_VALUE) {
+					// SPECIAL CASE: Offset to next run doesn't fit into 16 bits.
+					// Add zero-length runs until the offset is small enough.
+					buf.add(Character.MAX_VALUE);
+					buf.add((char) 0);
+					lastRunEnd += Character.MAX_VALUE;
+					curRunOff -= Character.MAX_VALUE;
+				}
+				
+				curRunLen = 1;
+			} else {
+				// Middle of a run
+				curRunLen++;
+			}
+		}
+
+		// Close out the last run
+		if (curRunLen >= 1) {
+			buf.add((char) curRunOff);
+			buf.add((char) curRunLen);
+		}
+
+		// Convert wasteful ArrayList to packed array.
+		char[] ret = new char[buf.size()];
+		for (int i = 0; i < buf.size(); i++) {
+			ret[i] = buf.get(i);
+		}
+		return ret;
+	}
+
+	/**
+	 * Encodes the bitmap in blocks of offsets. Within each block, the bits are
+	 * stored as absolute offsets from the start of the block.
+	 * 
+	 * @param offsets
+	 *            uncompressed contents of the bitmap, expressed as a list of
+	 *            the offsets of different bits
+	 * @return compressed version of said bitmap
+	 */
+	public static char[] genOffsetBitmap(int[] offsets) 
+	{
+		int lastOffset = offsets[offsets.length - 1];
+
+		// Build up the blocks
+		int numBlocks = (lastOffset / BITMAP_BLOCK_SZ) + 1;
+		// To simplify the logic, we make two passes.
+		// The first pass divides the offsets by block.
+		int[] blockLengths = new int[numBlocks];
+		Arrays.fill(blockLengths, 0);
+
+		for (int ix = 0; ix < offsets.length; ix++) {
+			int val = offsets[ix];
+			int blockForVal = val / BITMAP_BLOCK_SZ;
+
+			blockLengths[blockForVal]++;
+		}
+
+		// The second pass creates the blocks.
+		int totalSize = numBlocks;
+		for (int block = 0; block < numBlocks; block++) {
+			totalSize += blockLengths[block];
+		}
+		char[] encodedBlocks = new char[totalSize];
+
+		int inputIx = 0;
+		int blockStartIx = 0;
+		for (int block = 0; block < numBlocks; block++) {
+			int blockSz = blockLengths[block];
+
+			// First entry in the block is number of bits
+			encodedBlocks[blockStartIx] = (char) blockSz;
+
+			for (int i = 0; i < blockSz; i++) {
+				encodedBlocks[blockStartIx + i + 1] = (char) 
+						(offsets[inputIx+i] % BITMAP_BLOCK_SZ);
+			}
+
+			inputIx += blockSz;
+			blockStartIx += blockSz + 1;
+		}
+
+		return encodedBlocks;
+	}
+
+	/**
+	 * 
+	 * @param colIndex
+	 * @param rawblock
+	 * @return
+	 * @throws DMLRuntimeException 
+	 */
+	private static UncompressedBitmap extractBitmap(int colIndex, MatrixBlock rawblock, boolean skipZeros) 
+	{
+		//probe map for distinct items (for value or value groups)
+		DoubleIntListHashMap distinctVals = new DoubleIntListHashMap();
+		
+		//scan rows and probe/build distinct items
+		final int m = CompressedMatrixBlock.TRANSPOSE_INPUT ?
+				rawblock.getNumColumns():rawblock.getNumRows();
+		
+		if( rawblock.isInSparseFormat() //SPARSE 
+			&& CompressedMatrixBlock.TRANSPOSE_INPUT )	
+		{
+			SparseBlock a = rawblock.getSparseBlock();
+			if( a != null && !a.isEmpty(colIndex) ) 
+			{
+				int apos = a.pos(colIndex);
+				int alen = a.size(colIndex);
+				int[] aix = a.indexes(colIndex);
+				double[] avals = a.values(colIndex);
+				
+				IntArrayList lstPtr0 = new IntArrayList(); //for 0 values
+				int last = -1;
+				//iterate over non-zero entries but fill in zeros
+				for( int j=apos; j<apos+alen; j++ ) 
+				{
+					//fill in zero values
+					if( !skipZeros )
+						for( int k=last+1; k<aix[j]; k++ ) 
+							lstPtr0.appendValue(k);
+					//handle non-zero value
+					IntArrayList lstPtr = distinctVals.get(avals[j]);	
+					if( lstPtr == null ) {
+						lstPtr = new IntArrayList();
+						distinctVals.appendValue(avals[j], lstPtr);
+					}
+					lstPtr.appendValue(aix[j]);
+					last = aix[j];
+				}
+				//fill in remaining zero values
+				if( !skipZeros ) {
+					for( int k=last+1; k<m; k++ )
+						lstPtr0.appendValue(k);
+					if( lstPtr0.size()>0 )
+						distinctVals.appendValue(0, lstPtr0);
+				}
+			}
+			else if( !skipZeros ) { //full 0 column 
+				IntArrayList lstPtr = new IntArrayList();
+				for( int i=0; i<m; i++ )
+					lstPtr.appendValue(i);
+				distinctVals.appendValue(0, lstPtr);
+			}
+		}
+		else //GENERAL CASE
+		{
+			for( int i=0; i<m; i++ ) {
+				double val = CompressedMatrixBlock.TRANSPOSE_INPUT ? 
+						rawblock.quickGetValue(colIndex, i):
+						rawblock.quickGetValue(i, colIndex);
+				if( val!=0 || !skipZeros ) {		
+					IntArrayList lstPtr = distinctVals.get(val);	
+					if( lstPtr == null ) {
+						lstPtr = new IntArrayList();
+						distinctVals.appendValue(val, lstPtr);
+					}
+					lstPtr.appendValue(i);
+				}
+			}
+		}
+		
+		return new UncompressedBitmap(distinctVals);
+	}
+	
+	/**
+	 * 
+	 * @param colIndex
+	 * @param rawblock
+	 * @param sampleIndexes
+	 * @return
+	 */
+	private static UncompressedBitmap extractBitmap(int colIndex, MatrixBlock rawblock, int[] sampleIndexes, boolean skipZeros) 
+	{
+		//note: general case only because anyway binary search for small samples
+		
+		//probe map for distinct items (for value or value groups)
+		DoubleIntListHashMap distinctVals = new DoubleIntListHashMap();
+		
+		//scan rows and probe/build distinct items
+		final int m = sampleIndexes.length;
+		for( int i=0; i<m; i++ ) {
+			int rowIndex = sampleIndexes[i]; 
+			double val = CompressedMatrixBlock.TRANSPOSE_INPUT ? 
+					rawblock.quickGetValue(colIndex, rowIndex) : 
+					rawblock.quickGetValue(rowIndex, colIndex); 
+			if( val!=0 || !skipZeros ) {					
+				IntArrayList lstPtr = distinctVals.get(val);	
+				if( lstPtr == null ) {
+					lstPtr = new IntArrayList();
+					distinctVals.appendValue(val, lstPtr);
+				}
+				lstPtr.appendValue(i);
+			}
+		}
+
+		return new UncompressedBitmap(distinctVals);
+	}
+	
+	/**
+	 * 
+	 * @param colIndices
+	 * @param rawblock
+	 * @param rowReader
+	 * @return
+	 */
+	private static UncompressedBitmap extractBitmap(int[] colIndices,
+			MatrixBlock rawblock, ReaderColumnSelection rowReader) 
+	{
+		//probe map for distinct items (for value or value groups)
+		DblArrayIntListHashMap distinctVals = new DblArrayIntListHashMap();
+		
+		//scan rows and probe/build distinct items
+		DblArray cellVals = null;
+		while ((cellVals = rowReader.nextRow()) != null) {
+			IntArrayList lstPtr = distinctVals.get(cellVals);
+			if (lstPtr == null) {
+				//create new objects only on demand
+				lstPtr = new IntArrayList();
+				distinctVals.appendValue(new DblArray(cellVals), lstPtr);
+			}
+			lstPtr.appendValue(rowReader.getCurrentRowIndex());
+		}
+		
+		return new UncompressedBitmap(distinctVals, colIndices.length);
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16e7b1c8/src/main/java/org/apache/sysml/runtime/compress/ColGroup.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/ColGroup.java b/src/main/java/org/apache/sysml/runtime/compress/ColGroup.java
new file mode 100644
index 0000000..647a701
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/compress/ColGroup.java
@@ -0,0 +1,259 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.compress;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.List;
+
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.runtime.matrix.operators.AggregateUnaryOperator;
+import org.apache.sysml.runtime.matrix.operators.ScalarOperator;
+
+/**
+ * Class that stores information about a column group within a compressed matrix
+ * block. There are subclasses specific to each compression type.
+ * 
+ */
+public abstract class ColGroup implements Serializable 
+{
+	private static final long serialVersionUID = 2439785418908671481L;
+
+	public enum CompressionType  {
+		UNCOMPRESSED,   //uncompressed sparse/dense 
+		RLE_BITMAP,     //RLE bitmap
+		OLE_BITMAP;  //OLE bitmap
+	}
+	
+	/**
+	 * Offsets of the columns that make up the column group. Zero-based, and
+	 * relative to the matrix block.
+	 */
+	protected int[] _colIndexes;
+
+	/** Number of rows in the matrix, for use by child classes. */
+	protected int _numRows;
+
+	/** How the elements of the column group are compressed. */
+	private CompressionType _compType;
+
+	
+	/**
+	 * Main constructor.
+	 * 
+	 * @param colIndices
+	 *            offsets of the columns in the matrix block that make up the
+	 *            group
+	 * @param numRows
+	 *            total number of rows in the parent block
+	 */
+	protected ColGroup(CompressionType type, int[] colIndices, int numRows) {
+		_compType = type;
+		_colIndexes = colIndices;
+		_numRows = numRows;
+	}
+
+	/** Convenience constructor for converting indices to a more compact format. */
+	protected ColGroup(CompressionType type, List<Integer> colIndicesList, int numRows) {
+		_compType = type;
+		_colIndexes = new int[colIndicesList.size()];
+		int i = 0;
+		for (Integer index : colIndicesList)
+			_colIndexes[i++] = index;
+	}
+
+	/**
+	 * @return offsets of the columns in the matrix block that make up the group
+	 */
+	public int[] getColIndices() {
+		return _colIndexes;
+	}
+
+	/**
+	 * @param col
+	 *            an index from 0 to the number of columns in this group - 1
+	 * @return offset of the specified column in the matrix block that make up
+	 *         the group
+	 */
+	public int getColIndex(int colNum) {
+		return _colIndexes[colNum];
+	}
+
+	public int getNumRows() {
+		return _numRows;
+	}
+	
+	/**
+	 * @return number of columns in this column group
+	 */
+	public int getNumCols() {
+		return _colIndexes.length;
+	}
+
+	/**
+	 * @return How the elements of the column group are compressed.
+	 */
+	public CompressionType getCompType() {
+		return _compType;
+	}
+
+	/**
+	 * 
+	 * @param offset
+	 */
+	public void shiftColIndices(int offset)  {
+		for( int i=0; i<_colIndexes.length; i++ )
+			_colIndexes[i] += offset;
+	}
+	
+	/**
+	 * Note: Must be overridden by child classes to account for additional data
+	 * and metadata
+	 * 
+	 * @return an upper bound on the number of bytes used to store this ColGroup
+	 *         in memory.
+	 */
+	public long estimateInMemorySize() {
+		// int numRows (4B) , array reference colIndices (8B) + array object
+		// overhead if exists (32B) + 4B per element, CompressionType compType
+		// (2 booleans 2B + enum overhead 32B + reference to enum 8B)
+		long size = 54;
+		if (_colIndexes == null)
+			return size;
+		else
+			return size + 32 + 4 * _colIndexes.length;
+	}
+
+	/**
+	 * Decompress the contents of this column group into the specified full
+	 * matrix block.
+	 * 
+	 * @param target
+	 *            a matrix block where the columns covered by this column group
+	 *            have not yet been filled in.
+	 */
+	public abstract void decompressToBlock(MatrixBlock target);
+
+	/**
+	 * Decompress the contents of this column group into uncompressed packed
+	 * columns
+	 * 
+	 * @param target
+	 *            a dense matrix block. The block must have enough space to hold
+	 *            the contents of this column group.
+	 * @param colIndexTargets
+	 *            array that maps column indices in the original matrix block to
+	 *            columns of target.
+	 */
+	public abstract void decompressToBlock(MatrixBlock target, int[] colIndexTargets);
+
+	/**
+	 * 
+	 * @param target  dense output vector
+	 * @param colpos  column to decompress, error if larger or equal numCols
+	 */
+	public abstract void decompressToBlock(MatrixBlock target, int colpos);
+
+
+	/**
+	 * Serializes column group to data output.
+	 * 
+	 * @param out
+	 * @throws IOException
+	 */
+	public abstract void write(DataOutput out) 
+		throws IOException;
+	
+	/**
+	 * Deserializes column group from data input.
+	 * 
+	 * @param in
+	 * @throws IOException
+	 */
+	public abstract void readFields(DataInput in) 
+		throws IOException;
+		
+	
+	/**
+	 * Returns the exact serialized size of column group.
+	 * This can be used for example for buffer preallocation.
+	 * 
+	 * @return
+	 */
+	public abstract long getExactSizeOnDisk();
+	
+	/**
+	 * Multiply the slice of the matrix that this column group represents by a
+	 * vector on the right.
+	 * 
+	 * @param vector
+	 *            vector to multiply by (tall vector)
+	 * @param result
+	 *            accumulator for holding the result
+	 * @throws DMLRuntimeException
+	 *             if the internal SystemML code that performs the
+	 *             multiplication experiences an error
+	 */
+	public abstract void rightMultByVector(MatrixBlock vector,
+			MatrixBlock result, int rl, int ru) throws DMLRuntimeException;
+
+
+	/**
+	 * Multiply the slice of the matrix that this column group represents by a
+	 * row vector on the left (the original column vector is assumed to be
+	 * transposed already i.e. its size now is 1xn).
+	 * 
+	 * @param vector
+	 * @param result
+	 * @throws DMLRuntimeException 
+	 */
+	public abstract void leftMultByRowVector(MatrixBlock vector,
+			MatrixBlock result) throws DMLRuntimeException;
+
+	/**
+	 * Perform the specified scalar operation directly on the compressed column
+	 * group, without decompressing individual cells if possible.
+	 * 
+	 * @param op
+	 *            operation to perform
+	 * @return version of this column group with the operation applied
+	 */
+	public abstract ColGroup scalarOperation(ScalarOperator op)
+			throws DMLRuntimeException;
+
+	/**
+	 * 
+	 * @param op
+	 * @param result
+	 * @throws DMLUnsupportedOperationException
+	 * @throws DMLRuntimeException
+	 */
+	public abstract void unaryAggregateOperations(AggregateUnaryOperator op, MatrixBlock result)
+		throws DMLRuntimeException;
+	
+	/**
+	 * 
+	 * @param rnnz
+	 */
+	protected abstract void countNonZerosPerRow(int[] rnnz);
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16e7b1c8/src/main/java/org/apache/sysml/runtime/compress/ColGroupBitmap.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/ColGroupBitmap.java b/src/main/java/org/apache/sysml/runtime/compress/ColGroupBitmap.java
new file mode 100644
index 0000000..5d60a49
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/compress/ColGroupBitmap.java
@@ -0,0 +1,498 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.compress;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.TreeMap;
+import java.util.Map.Entry;
+
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.compress.utils.LinearAlgebraUtils;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.runtime.matrix.operators.ScalarOperator;
+
+
+/**
+ * Base class for column groups encoded with various types of bitmap encoding.
+ * 
+ * 
+ * NOTES:
+ *  * OLE: separate storage segment length and bitmaps led to a 30% improvement
+ *    but not applied because more difficult to support both data layouts at the
+ *    same time (distributed/local as well as w/ and w/o low-level opt)
+ */
+public abstract class ColGroupBitmap extends ColGroup 
+{
+	private static final long serialVersionUID = -1635828933479403125L;
+	
+	public static final boolean LOW_LEVEL_OPT = true;	
+	//sorting of values by physical length helps by 10-20%, especially for serial, while
+	//slight performance decrease for parallel incl multi-threaded, hence not applied for
+	//distributed operations (also because compression time + garbage collection increases)
+	private static final boolean SORT_VALUES_BY_LENGTH = true; 
+	protected static final boolean CREATE_SKIPLIST = true;
+	
+	protected static final int READ_CACHE_BLKSZ = 2 * BitmapEncoder.BITMAP_BLOCK_SZ;
+	protected static final int WRITE_CACHE_BLKSZ = 2 * BitmapEncoder.BITMAP_BLOCK_SZ;
+	
+	/** Distinct values associated with individual bitmaps. */
+	protected double[] _values; //linearized <numcol vals> <numcol vals>
+
+	/** Bitmaps, one per uncompressed value in {@link #values}. */
+	protected int[] _ptr; //bitmap offsets per value
+	protected char[] _data; //linearized bitmaps (variable length)
+	
+	protected int[] _skiplist;
+	
+	public ColGroupBitmap(CompressionType type) {
+		super(type, (int[]) null, -1);
+	}
+	
+	/**
+	 * Main constructor. Stores the headers for the individual bitmaps.
+	 * 
+	 * @param colIndices
+	 *            indices (within the block) of the columns included in this
+	 *            column
+	 * @param numRows
+	 *            total number of rows in the parent block
+	 * @param ubm
+	 *            Uncompressed bitmap representation of the block
+	 */
+	public ColGroupBitmap(CompressionType type, int[] colIndices, int numRows, UncompressedBitmap ubm) 
+	{
+		super(type, colIndices, numRows);
+
+		// Extract and store just the distinct values. The bitmaps themselves go
+		// into the subclasses.
+		final int numCols = ubm.getNumColumns();
+		final int numVals = ubm.getNumValues();
+		
+		_values = new double[numVals*numCols];
+		for (int i=0; i<numVals; i++) {
+			//note: deep copied internally on getValues
+			double[] tmp = ubm.getValues(i);
+			System.arraycopy(tmp, 0, _values, i*numCols, numCols);
+		}
+	}
+
+	/**
+	 * Constructor for subclass methods that need to create shallow copies
+	 * 
+	 * @param colIndices
+	 *            raw column index information
+	 * @param numRows
+	 *            number of rows in the block
+	 * @param values
+	 *            set of distinct values for the block (associated bitmaps are
+	 *            kept in the subclass)
+	 */
+	protected ColGroupBitmap(CompressionType type, int[] colIndices, int numRows, double[] values) {
+		super(type, colIndices, numRows);
+		_values = values;
+	}
+	
+	protected final int len(int k) {
+		return _ptr[k+1] - _ptr[k];
+	}
+
+	/**
+	 * 
+	 * @param numVals
+	 * @param totalLen
+	 * @param lbitmaps
+	 */
+	protected void createCompressedBitmaps(int numVals, int totalLen, char[][] lbitmaps)
+	{
+		// compact bitmaps to linearized representation
+		if( LOW_LEVEL_OPT && SORT_VALUES_BY_LENGTH
+			&& _numRows > BitmapEncoder.BITMAP_BLOCK_SZ ) 
+		{
+			// sort value by num segments in descending order
+			TreeMap<Integer,ArrayList<Integer>> tree = new TreeMap<Integer, ArrayList<Integer>>();
+			for( int i=0; i<numVals; i++ ) {
+				int revlen = totalLen-lbitmaps[i].length;
+				if( !tree.containsKey(revlen) )
+					tree.put(revlen, new ArrayList<Integer>());
+				tree.get(revlen).add(i);
+			}
+			
+			// compact bitmaps to linearized representation
+			_ptr = new int[numVals+1];
+			_data = new char[totalLen];
+			int pos = 0, off = 0;
+			for( Entry<Integer,ArrayList<Integer>> e : tree.entrySet() ) {
+				for( Integer tmpix : e.getValue() ) {
+					int len = lbitmaps[tmpix].length;
+					_ptr[pos] = off;
+					System.arraycopy(lbitmaps[tmpix], 0, _data, off, len);
+					off += len;
+					pos++;
+				}
+			}
+			_ptr[numVals] = totalLen;
+			
+			// reorder values
+			double[] lvalues = new double[_values.length];
+			int off2 = 0; int numCols = _colIndexes.length;
+			for( Entry<Integer,ArrayList<Integer>> e : tree.entrySet() ) {
+				for( Integer tmpix : e.getValue() ) {
+					System.arraycopy(_values, tmpix*numCols, lvalues, off2, numCols);				
+					off2 += numCols;
+				}
+			}			
+			_values = lvalues;
+		}
+		else
+		{
+			// compact bitmaps to linearized representation
+			_ptr = new int[numVals+1];
+			_data = new char[totalLen];
+			for( int i=0, off=0; i<numVals; i++ ) {
+				int len = lbitmaps[i].length;
+				_ptr[i] = off;
+				System.arraycopy(lbitmaps[i], 0, _data, off, len);
+				off += len;
+			}
+			_ptr[numVals] = totalLen;
+		}
+	}
+	
+	@Override
+	public long estimateInMemorySize() {
+		long size = super.estimateInMemorySize();
+		
+		// adding the size of values
+		size += 8; //array reference
+		if (_values != null) {
+			size += 32 + _values.length * 8; //values
+		}
+		
+		// adding bitmaps size
+		size += 16; //array references
+		if (_data != null) {
+			size += 32 + _ptr.length * 4; // offsets
+			size += 32 + _data.length * 2;    // bitmaps
+		}
+	
+		return size;
+	}
+
+	//generic decompression for OLE/RLE, to be overwritten for performance
+	@Override
+	public void decompressToBlock(MatrixBlock target) 
+	{
+		final int numCols = getNumCols();
+		final int numVals = getNumValues();
+		int[] colIndices = getColIndices();
+		
+		// Run through the bitmaps for this column group
+		for (int i = 0; i < numVals; i++) {
+			Iterator<Integer> decoder = getDecodeIterator(i);
+			int valOff = i*numCols;
+
+			while (decoder.hasNext()) {
+				int row = decoder.next();
+				for (int colIx = 0; colIx < numCols; colIx++) {
+					target.appendValue(row, colIndices[colIx], _values[valOff+colIx]);
+				}
+			}
+		}
+	}
+
+	//generic decompression for OLE/RLE, to be overwritten for performance
+	@Override
+	public void decompressToBlock(MatrixBlock target, int[] colIndexTargets) 
+	{
+		final int numCols = getNumCols();
+		final int numVals = getNumValues();
+		
+		// Run through the bitmaps for this column group
+		for (int i = 0; i < numVals; i++) {
+			Iterator<Integer> decoder = getDecodeIterator(i);
+			int valOff = i*numCols;
+
+			while (decoder.hasNext()) {
+				int row = decoder.next();
+				for (int colIx = 0; colIx < numCols; colIx++) {
+					int origMatrixColIx = getColIndex(colIx);
+					int targetColIx = colIndexTargets[origMatrixColIx];
+					target.quickSetValue(row, targetColIx, _values[valOff+colIx]);
+				}
+			}
+		}
+	}
+	
+	//generic decompression for OLE/RLE, to be overwritten for performance
+	@Override
+	public void decompressToBlock(MatrixBlock target, int colpos) 
+	{
+		final int numCols = getNumCols();
+		final int numVals = getNumValues();
+		
+		// Run through the bitmaps for this column group
+		for (int i = 0; i < numVals; i++) {
+			Iterator<Integer> decoder = getDecodeIterator(i);
+			int valOff = i*numCols;
+
+			while (decoder.hasNext()) {
+				int row = decoder.next();
+				target.quickSetValue(row, 0, _values[valOff+colpos]);
+			}
+		}
+	}
+	
+	/**
+	 * 
+	 * @param bitmapIx
+	 * @return
+	 */
+	protected double sumValues(int bitmapIx)
+	{
+		final int numCols = getNumCols();
+		final int valOff = bitmapIx * numCols;
+		
+		double val = 0.0;
+		for( int i = 0; i < numCols; i++ ) {
+			val += _values[valOff+i];
+		}
+		
+		return val;
+	}
+	
+	protected double sumValues(int bitmapIx, double[] b)
+	{
+		final int numCols = getNumCols();
+		final int valOff = bitmapIx * numCols;
+		
+		double val = 0;
+		for( int i = 0; i < numCols; i++ ) {
+			val += _values[valOff+i] * b[i];
+		}
+		
+		return val;
+	}
+	
+	/**
+	 * 
+	 * @param b
+	 * @param c
+	 */
+	protected void sumAllValues(double[] b, double[] c)
+	{
+		final int numVals = getNumValues();
+		final int numCols = getNumCols();
+		
+		//vectMultiplyAdd over cols instead of dotProduct over vals because
+		//usually more values than columns
+		for( int i=0, off=0; i<numCols; i++, off+=numVals )
+			LinearAlgebraUtils.vectMultiplyAdd(b[i], _values, c, off, 0, numVals);
+	}
+
+	/**
+	 * Method for use by subclasses. Applies a scalar operation to the value
+	 * metadata stored in the superclass.
+	 * 
+	 * @param op
+	 *            scalar operation to perform
+	 * @return transformed copy of value metadata for this column group
+	 * @throws DMLRuntimeException
+	 */
+	protected double[] applyScalarOp(ScalarOperator op)
+			throws DMLRuntimeException 
+	{
+		//scan over linearized values
+		double[] ret = new double[_values.length];
+		for (int i = 0; i < _values.length; i++) {
+			ret[i] = op.executeScalar(_values[i]);
+		}
+
+		return ret;
+	}
+	
+	/**
+	 * 
+	 * @param op
+	 * @param newVal
+	 * @param numCols
+	 * @return
+	 * @throws DMLRuntimeException
+	 */
+	protected double[] applyScalarOp(ScalarOperator op, double newVal, int numCols)
+			throws DMLRuntimeException 
+	{
+		//scan over linearized values
+		double[] ret = new double[_values.length + numCols];
+		for( int i = 0; i < _values.length; i++ ) {
+			ret[i] = op.executeScalar(_values[i]);
+		}
+		
+		//add new value to the end
+		Arrays.fill(ret, _values.length, _values.length+numCols, newVal);
+		
+		return ret;
+	}
+
+	/**
+	 * @return the number of distinct sets of values associated with the bitmaps
+	 *         in this column group
+	 */
+	public int getNumValues() {
+		return _values.length / _colIndexes.length;
+	}
+
+	/**
+	 * 
+	 * @return
+	 */
+	public double[] getValues() {
+		return _values;
+	}
+	
+	/**
+	 * 
+	 * @return
+	 */
+	public char[] getBitmaps() {
+		return _data;
+	}
+	
+	public int[] getBitmapOffsets() {
+		return _ptr;
+	}
+
+	/**
+	 * @param bmpIx
+	 *            index of a specific compressed bitmap (stored in subclass,
+	 *            index same as {@link #values})
+	 * @return an object for iterating over the row offsets in this bitmap. Only
+	 *         valid until the next call to this method. May be reused across
+	 *         calls.
+	 */
+	public abstract Iterator<Integer> getDecodeIterator(int bmpIx);
+
+	/**
+	 * Utility function of sparse-unsafe operations.
+	 * 
+	 * @param ind
+	 * @return
+	 * @throws DMLRuntimeException
+	 */
+	protected int[] computeOffsets(boolean[] ind)
+		throws DMLRuntimeException 
+	{
+		//determine number of offsets
+		int numOffsets = 0;
+		for( int i=0; i<ind.length; i++ )
+			numOffsets += ind[i] ? 1 : 0;
+		
+		//create offset lists
+		int[] ret = new int[numOffsets];
+		for( int i=0, pos=0; i<ind.length; i++ )
+			if( ind[i] )
+				ret[pos++] = i;
+		
+		return ret;
+	}
+
+	@Override
+	public void readFields(DataInput in) 
+		throws IOException 
+	{
+		_numRows = in.readInt();
+		int numCols = in.readInt();
+		int numVals = in.readInt();
+		
+		//read col indices
+		_colIndexes = new int[ numCols ];
+		for( int i=0; i<numCols; i++ )
+			_colIndexes[i] = in.readInt();
+		
+		//read distinct values
+		_values = new double[numVals*numCols];
+		for( int i=0; i<numVals*numCols; i++ )
+			_values[i] = in.readDouble();
+		
+		//read bitmaps
+		int totalLen = in.readInt();
+		_ptr = new int[numVals+1];
+		_data = new char[totalLen];		
+		for( int i=0, off=0; i<numVals; i++ ) {
+			int len = in.readInt();
+			_ptr[i] = off;
+			for( int j=0; j<len; j++ )
+				_data[off+j] = in.readChar();
+			off += len;
+		}
+		_ptr[numVals] = totalLen;
+	}
+	
+	@Override
+	public void write(DataOutput out) 
+		throws IOException 
+	{
+		int numCols = getNumCols();
+		int numVals = getNumValues();
+		out.writeInt(_numRows);
+		out.writeInt(numCols);
+		out.writeInt(numVals);
+		
+		//write col indices
+		for( int i=0; i<_colIndexes.length; i++ )
+			out.writeInt( _colIndexes[i] );
+		
+		//write distinct values
+		for( int i=0; i<_values.length; i++ )
+			out.writeDouble(_values[i]);
+
+		//write bitmaps (lens and data, offset later recreated)
+		int totalLen = 0;
+		for( int i=0; i<numVals; i++ )
+			totalLen += len(i);
+		out.writeInt(totalLen);	
+		for( int i=0; i<numVals; i++ ) {
+			int len = len(i);
+			int off = _ptr[i];
+			out.writeInt(len);
+			for( int j=0; j<len; j++ )
+				out.writeChar(_data[off+j]);
+		}
+	}
+
+	@Override
+	public long getExactSizeOnDisk() {
+		long ret = 12; //header
+		//col indices
+		ret += 4 * _colIndexes.length; 
+		//distinct values (groups of values)
+		ret += 8 * _values.length;
+		//actual bitmaps
+		ret += 4; //total length
+		for( int i=0; i<getNumValues(); i++ )
+			ret += 4 + 2 * len(i);
+		
+		return ret;
+	}
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16e7b1c8/src/main/java/org/apache/sysml/runtime/compress/ColGroupOLE.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/ColGroupOLE.java b/src/main/java/org/apache/sysml/runtime/compress/ColGroupOLE.java
new file mode 100644
index 0000000..a2b493a
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/compress/ColGroupOLE.java
@@ -0,0 +1,634 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.compress;
+
+import java.util.Arrays;
+import java.util.Iterator;
+
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.compress.utils.ConverterUtils;
+import org.apache.sysml.runtime.compress.utils.LinearAlgebraUtils;
+import org.apache.sysml.runtime.functionobjects.KahanFunction;
+import org.apache.sysml.runtime.functionobjects.KahanPlus;
+import org.apache.sysml.runtime.functionobjects.KahanPlusSq;
+import org.apache.sysml.runtime.functionobjects.ReduceAll;
+import org.apache.sysml.runtime.functionobjects.ReduceCol;
+import org.apache.sysml.runtime.functionobjects.ReduceRow;
+import org.apache.sysml.runtime.instructions.cp.KahanObject;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.runtime.matrix.operators.AggregateUnaryOperator;
+import org.apache.sysml.runtime.matrix.operators.ScalarOperator;
+
+/**
+ * Class to encapsulate information about a column group that is encoded with
+ * simple lists of offsets for each set of distinct values.
+ * 
+ */
+public class ColGroupOLE extends ColGroupBitmap 
+{
+	private static final long serialVersionUID = -9157676271360528008L;
+
+	public ColGroupOLE() {
+		super(CompressionType.OLE_BITMAP);
+	}
+	
+	/**
+	 * Main constructor. Constructs and stores the necessary bitmaps.
+	 * 
+	 * @param colIndices
+	 *            indices (within the block) of the columns included in this
+	 *            column
+	 * @param numRows
+	 *            total number of rows in the parent block
+	 * @param ubm
+	 *            Uncompressed bitmap representation of the block
+	 */
+	public ColGroupOLE(int[] colIndices, int numRows, UncompressedBitmap ubm) 
+	{
+		super(CompressionType.OLE_BITMAP, colIndices, numRows, ubm);
+
+		// compress the bitmaps
+		final int numVals = ubm.getNumValues();
+		char[][] lbitmaps = new char[numVals][];
+		int totalLen = 0;
+		for( int i=0; i<numVals; i++ ) {
+			lbitmaps[i] = BitmapEncoder.genOffsetBitmap(ubm.getOffsetsList(i));
+			totalLen += lbitmaps[i].length;
+		}
+		
+		// compact bitmaps to linearized representation
+		createCompressedBitmaps(numVals, totalLen, lbitmaps);
+		
+
+		if( LOW_LEVEL_OPT && CREATE_SKIPLIST
+				&& numRows > 2*BitmapEncoder.BITMAP_BLOCK_SZ )
+		{
+			int blksz = BitmapEncoder.BITMAP_BLOCK_SZ;
+			_skiplist = new int[numVals];
+			int rl = (getNumRows()/2/blksz)*blksz;
+			for (int k = 0; k < numVals; k++) {
+				int bitmapOff = _ptr[k];
+				int bitmapLen = len(k);
+				int bufIx = 0;
+				for( int i=0; i<rl && bufIx<bitmapLen; i+=blksz ) {
+					bufIx += _data[bitmapOff+bufIx] + 1;
+				}
+				_skiplist[k] = bufIx;
+			}		
+		}
+	}
+
+	/**
+	 * Constructor for internal use.
+	 */
+	public ColGroupOLE(int[] colIndices, int numRows, double[] values, char[] bitmaps, int[] bitmapOffs) {
+		super(CompressionType.OLE_BITMAP, colIndices, numRows, values);
+		_data = bitmaps;
+		_ptr = bitmapOffs;
+	}
+
+	@Override
+	public Iterator<Integer> getDecodeIterator(int bmpIx) {
+		return new BitmapDecoderOLE(_data, _ptr[bmpIx], len(bmpIx));
+	}
+	
+	@Override
+	public void decompressToBlock(MatrixBlock target) 
+	{
+		if( LOW_LEVEL_OPT && getNumValues() > 1 )
+		{
+			final int blksz = BitmapEncoder.BITMAP_BLOCK_SZ;
+			final int numCols = getNumCols();
+			final int numVals = getNumValues();
+			final int n = getNumRows();
+			
+			//cache blocking config and position array
+			int[] apos = new int[numVals];
+					
+			//cache conscious append via horizontal scans 
+			for( int bi=0; bi<n; bi+=blksz ) {
+				for (int k = 0, off=0; k < numVals; k++, off+=numCols) {
+					int bitmapOff = _ptr[k];
+					int bitmapLen = len(k);					
+					int bufIx = apos[k];
+					if( bufIx >= bitmapLen ) 
+						continue;
+					int len = _data[bitmapOff+bufIx];
+					int pos = bitmapOff+bufIx+1;
+					for( int i=pos; i<pos+len; i++ )
+						for( int j=0, rix = bi+_data[i]; j<numCols; j++ )
+							if( _values[off+j]!=0 )
+								target.appendValue(rix, _colIndexes[j], _values[off+j]);
+					apos[k] += len + 1;
+				}
+			}		
+		}
+		else
+		{
+			//call generic decompression with decoder
+			super.decompressToBlock(target);
+		}
+	}
+
+	@Override
+	public void decompressToBlock(MatrixBlock target, int[] colixTargets) 
+	{
+		if( LOW_LEVEL_OPT && getNumValues() > 1 )
+		{
+			final int blksz = BitmapEncoder.BITMAP_BLOCK_SZ;
+			final int numCols = getNumCols();
+			final int numVals = getNumValues();
+			final int n = getNumRows();
+			
+			//cache blocking config and position array
+			int[] apos = new int[numVals];					
+			int[] cix = new int[numCols];
+			
+			//prepare target col indexes
+			for( int j=0; j<numCols; j++ )
+				cix[j] = colixTargets[_colIndexes[j]];
+			
+			//cache conscious append via horizontal scans 
+			for( int bi=0; bi<n; bi+=blksz ) {
+				for (int k = 0, off=0; k < numVals; k++, off+=numCols) {
+					int bitmapOff = _ptr[k];
+					int bitmapLen = len(k);					
+					int bufIx = apos[k];
+					if( bufIx >= bitmapLen ) 
+						continue;
+					int len = _data[bitmapOff+bufIx];
+					int pos = bitmapOff+bufIx+1;
+					for( int i=pos; i<pos+len; i++ )
+						for( int j=0, rix = bi+_data[i]; j<numCols; j++ )
+							if( _values[off+j]!=0 )
+								target.appendValue(rix, cix[j], _values[off+j]);
+					apos[k] += len + 1;
+				}
+			}		
+		}
+		else
+		{
+			//call generic decompression with decoder
+			super.decompressToBlock(target, colixTargets);
+		}
+	}
+	
+	@Override
+	public void decompressToBlock(MatrixBlock target, int colpos) 
+	{
+		if( LOW_LEVEL_OPT && getNumValues() > 1 )
+		{
+			final int blksz = BitmapEncoder.BITMAP_BLOCK_SZ;
+			final int numCols = getNumCols();
+			final int numVals = getNumValues();
+			final int n = getNumRows();
+			double[] c = target.getDenseBlock();
+			
+			//cache blocking config and position array
+			int[] apos = new int[numVals];					
+			
+			//cache conscious append via horizontal scans 
+			for( int bi=0; bi<n; bi+=blksz ) {
+				for (int k = 0, off=0; k < numVals; k++, off+=numCols) {
+					int bitmapOff = _ptr[k];
+					int bitmapLen = len(k);					
+					int bufIx = apos[k];
+					if( bufIx >= bitmapLen ) 
+						continue;
+					int len = _data[bitmapOff+bufIx];
+					int pos = bitmapOff+bufIx+1;
+					for( int i=pos; i<pos+len; i++ ) {
+						c[bi+_data[i]] = _values[off+colpos];
+					}
+					apos[k] += len + 1;
+				}
+			}	
+			
+			target.recomputeNonZeros();
+		}
+		else
+		{
+			//call generic decompression with decoder
+			super.decompressToBlock(target, colpos);
+		}
+	}
+	
+	@Override
+	public ColGroup scalarOperation(ScalarOperator op)
+		throws DMLRuntimeException 
+	{
+		double val0 = op.executeScalar(0);
+		
+		//fast path: sparse-safe operations
+		// Note that bitmaps don't change and are shallow-copied
+		if( op.sparseSafe || val0==0 ) {
+			return new ColGroupOLE(_colIndexes, _numRows, 
+					applyScalarOp(op), _data, _ptr);
+		}
+		
+		//slow path: sparse-unsafe operations (potentially create new bitmap)
+		//note: for efficiency, we currently don't drop values that become 0
+		boolean[] lind = computeZeroIndicatorVector();
+		int[] loff = computeOffsets(lind);
+		if( loff.length==0 ) { //empty offset list: go back to fast path
+			return new ColGroupOLE(_colIndexes, _numRows, 
+					applyScalarOp(op), _data, _ptr);
+		}
+		
+		double[] rvalues = applyScalarOp(op, val0, getNumCols());		
+		char[] lbitmap = BitmapEncoder.genOffsetBitmap(loff);
+		char[] rbitmaps = Arrays.copyOf(_data, _data.length+lbitmap.length);
+		System.arraycopy(lbitmap, 0, rbitmaps, _data.length, lbitmap.length);
+		int[] rbitmapOffs = Arrays.copyOf(_ptr, _ptr.length+1);
+		rbitmapOffs[rbitmapOffs.length-1] = rbitmaps.length; 
+		
+		return new ColGroupOLE(_colIndexes, _numRows, 
+				rvalues, rbitmaps, rbitmapOffs);
+	}
+
+	@Override
+	public void rightMultByVector(MatrixBlock vector, MatrixBlock result, int rl, int ru)
+			throws DMLRuntimeException 
+	{
+		double[] b = ConverterUtils.getDenseVector(vector);
+		double[] c = result.getDenseBlock();
+		final int blksz = BitmapEncoder.BITMAP_BLOCK_SZ;
+		final int numCols = getNumCols();
+		final int numVals = getNumValues();
+		
+		//prepare reduced rhs w/ relevant values
+		double[] sb = new double[numCols];
+		for (int j = 0; j < numCols; j++) {
+			sb[j] = b[_colIndexes[j]];
+		}
+		
+		if( LOW_LEVEL_OPT && numVals > 1 && _numRows > blksz )
+		{
+			//since single segment scans already exceed typical L2 cache sizes
+			//and because there is some overhead associated with blocking, the
+			//best configuration aligns with L3 cache size (x*vcores*64K*8B < L3)
+			//x=4 leads to a good yet slightly conservative compromise for single-/
+			//multi-threaded and typical number of cores and L3 cache sizes
+			final int blksz2 = ColGroupBitmap.WRITE_CACHE_BLKSZ;
+			
+			//step 1: prepare position and value arrays
+			
+			//current pos / values per OLs
+			int[] apos = new int[numVals];
+			double[] aval = new double[numVals];
+			
+			//skip-scan to beginning for all OLs 
+			if( rl > 0 ) { //rl aligned with blksz		
+				int rskip = (getNumRows()/2/blksz)*blksz;
+				
+				for (int k = 0; k < numVals; k++) {
+					int bitmapOff = _ptr[k];
+					int bitmapLen = len(k);
+					int start = (rl>=rskip)?rskip:0;
+					int bufIx = (rl>=rskip)?_skiplist[k]:0;
+					for( int i=start; i<rl && bufIx<bitmapLen; i+=blksz ) {
+						bufIx += _data[bitmapOff+bufIx] + 1;
+					}
+					apos[k] = bufIx;
+				}
+			}
+			
+			//pre-aggregate values per OLs
+			for( int k = 0; k < numVals; k++ )
+				aval[k] = sumValues(k, sb);
+					
+			//step 2: cache conscious matrix-vector via horizontal scans 
+			for( int bi=rl; bi<ru; bi+=blksz2 ) 
+			{
+				int bimax = Math.min(bi+blksz2, ru);
+				
+				//horizontal segment scan, incl pos maintenance
+				for (int k = 0; k < numVals; k++) {
+					int bitmapOff = _ptr[k];
+					int bitmapLen = len(k);
+					double val = aval[k];
+					int bufIx = apos[k];
+					
+					for( int ii=bi; ii<bimax && bufIx<bitmapLen; ii+=blksz ) {
+						//prepare length, start, and end pos
+						int len = _data[bitmapOff+bufIx];
+						int pos = bitmapOff+bufIx+1;
+						
+						//compute partial results
+						//LinearAlgebraUtils.vectAdd(val, c, bitmaps, pos, ii, len);
+						for( int i=pos; i<pos+len; i++ )
+							c[ii + _data[i]] += val;	
+						bufIx += len + 1;
+					}
+
+					apos[k] = bufIx;
+				}
+			}		
+		}
+		else
+		{	
+			//iterate over all values and their bitmaps
+			for (int k = 0; k < numVals; k++) 
+			{
+				//prepare value-to-add for entire value bitmap
+				int bitmapOff = _ptr[k];
+				int bitmapLen = len(k);
+				double val = sumValues(k, sb);
+				
+				//iterate over bitmap blocks and add values
+				if (val != 0) {
+					int offsetBase = 0;
+					int bufIx = 0;
+					int curBlckLen;
+					
+					//scan to beginning offset if necessary 
+					if( rl > 0 ){
+						for (; bufIx<bitmapLen & offsetBase<rl; bufIx += curBlckLen + 1, offsetBase += blksz) {
+							curBlckLen = _data[bitmapOff+bufIx];
+						}	
+					}
+					
+					//compute partial results
+					for (; bufIx<bitmapLen & offsetBase<ru; bufIx += curBlckLen + 1, offsetBase += blksz) {
+						curBlckLen = _data[bitmapOff+bufIx];
+						for (int blckIx = 1; blckIx <= curBlckLen; blckIx++) {
+							c[offsetBase + _data[bitmapOff+bufIx + blckIx]] += val;
+						}
+					}
+				}
+			}
+		}
+	}
+
+	@Override
+	public void leftMultByRowVector(MatrixBlock vector, MatrixBlock result)
+		throws DMLRuntimeException 
+	{
+		double[] a = ConverterUtils.getDenseVector(vector);
+		double[] c = result.getDenseBlock();
+		final int blksz = BitmapEncoder.BITMAP_BLOCK_SZ;
+		final int numCols = getNumCols();
+		final int numVals = getNumValues();
+		final int n = getNumRows();
+		
+		if( LOW_LEVEL_OPT && numVals > 1 && _numRows > blksz )
+		{
+			//cache blocking config (see matrix-vector mult for explanation)
+			final int blksz2 = ColGroupBitmap.READ_CACHE_BLKSZ;
+			
+			//step 1: prepare position and value arrays
+			
+			//current pos per OLs / output values
+			int[] apos = new int[numVals];
+			double[] cvals = new double[numVals];
+			
+			//step 2: cache conscious matrix-vector via horizontal scans 
+			for( int ai=0; ai<n; ai+=blksz2 ) 
+			{
+				int aimax = Math.min(ai+blksz2, n);
+				
+				//horizontal segment scan, incl pos maintenance
+				for (int k = 0; k < numVals; k++) {
+					int bitmapOff = _ptr[k];
+					int bitmapLen = len(k);
+					int bufIx = apos[k];
+					double vsum = 0;	
+					
+					for( int ii=ai; ii<aimax && bufIx<bitmapLen; ii+=blksz ) {
+						//prepare length, start, and end pos
+						int len = _data[bitmapOff+bufIx];
+						int pos = bitmapOff+bufIx+1;
+						
+						//iterate over bitmap blocks and compute partial results (a[i]*1)
+						vsum += LinearAlgebraUtils.vectSum(a, _data, ii, pos, len);
+						bufIx += len + 1;
+					}
+
+					apos[k] = bufIx;
+					cvals[k] += vsum;
+				}
+			}
+			
+			//step 3: scale partial results by values and write to global output
+			for (int k = 0, valOff=0; k < numVals; k++, valOff+=numCols)
+				for( int j = 0; j < numCols; j++ )
+					c[ _colIndexes[j] ] += cvals[k] * _values[valOff+j];		
+		}
+		else
+		{
+			//iterate over all values and their bitmaps
+			for (int k=0, valOff=0; k<numVals; k++, valOff+=numCols) 
+			{
+				int bitmapOff = _ptr[k];
+				int bitmapLen = len(k);
+				
+				//iterate over bitmap blocks and add partial results
+				double vsum = 0;
+				for (int bix=0, off=0; bix < bitmapLen; bix+=_data[bitmapOff+bix]+1, off+=blksz)
+					vsum += LinearAlgebraUtils.vectSum(a, _data, off, bitmapOff+bix+1, _data[bitmapOff+bix]);
+				
+				//scale partial results by values and write results
+				for( int j = 0; j < numCols; j++ )
+					c[ _colIndexes[j] ] += vsum * _values[ valOff+j ];
+			}
+		}
+	}
+
+	@Override
+	public void unaryAggregateOperations(AggregateUnaryOperator op, MatrixBlock result) 
+		throws DMLRuntimeException 
+	{
+		KahanFunction kplus = (op.aggOp.increOp.fn instanceof KahanPlus) ?
+				KahanPlus.getKahanPlusFnObject() : KahanPlusSq.getKahanPlusSqFnObject();
+		
+		if( op.indexFn instanceof ReduceAll )
+			computeSum(result, kplus);
+		else if( op.indexFn instanceof ReduceCol )
+			computeRowSums(result, kplus);
+		else if( op.indexFn instanceof ReduceRow )
+			computeColSums(result, kplus);
+	}
+	
+	/**
+	 * 
+	 * @param result
+	 */
+	private void computeSum(MatrixBlock result, KahanFunction kplus)
+	{
+		KahanObject kbuff = new KahanObject(result.quickGetValue(0, 0), result.quickGetValue(0, 1));
+		
+		//iterate over all values and their bitmaps
+		final int numVals = getNumValues();
+		final int numCols = getNumCols();
+		
+		for (int bitmapIx = 0; bitmapIx < numVals; bitmapIx++) 
+		{
+			int bitmapOff = _ptr[bitmapIx];
+			int bitmapLen = len(bitmapIx);
+			int valOff = bitmapIx * numCols;
+			
+			//iterate over bitmap blocks and count partial lengths
+			int count = 0;
+			for (int bix=0; bix < bitmapLen; bix+=_data[bitmapOff+bix]+1)
+				count += _data[bitmapOff+bix];
+			
+			//scale counts by all values
+			for( int j = 0; j < numCols; j++ )
+				kplus.execute3(kbuff, _values[ valOff+j ], count);
+		}
+		
+		result.quickSetValue(0, 0, kbuff._sum);
+		result.quickSetValue(0, 1, kbuff._correction);
+	}
+	
+	/**
+	 * 
+	 * @param result
+	 */
+	private void computeRowSums(MatrixBlock result, KahanFunction kplus)
+	{
+		KahanObject kbuff = new KahanObject(0, 0);
+	
+		final int blksz = BitmapEncoder.BITMAP_BLOCK_SZ;
+		final int numVals = getNumValues();
+		
+		//iterate over all values and their bitmaps
+		for (int bitmapIx = 0; bitmapIx < numVals; bitmapIx++) 
+		{
+			//prepare value-to-add for entire value bitmap
+			int bitmapOff = _ptr[bitmapIx];
+			int bitmapLen = len(bitmapIx);
+			double val = sumValues(bitmapIx);
+			
+			//iterate over bitmap blocks and add values
+			if (val != 0) {
+				int offsetBase = 0;
+				int bufIx = 0;
+				int curBlckLen;
+				for (; bufIx < bitmapLen; bufIx += curBlckLen + 1, offsetBase += blksz) {
+					curBlckLen = _data[bitmapOff+bufIx];
+					for (int blckIx = 1; blckIx <= curBlckLen; blckIx++) {
+						int rix = offsetBase + _data[bitmapOff+bufIx + blckIx];
+						kbuff.set(result.quickGetValue(rix, 0), result.quickGetValue(rix, 1));
+						kplus.execute2(kbuff, val);
+						result.quickSetValue(rix, 0, kbuff._sum);
+						result.quickSetValue(rix, 1, kbuff._correction);
+					}
+				}
+			}
+		}
+	}
+	
+	/**
+	 * 
+	 * @param result
+	 */
+	private void computeColSums(MatrixBlock result, KahanFunction kplus)
+	{
+		KahanObject kbuff = new KahanObject(0, 0);
+		
+		//iterate over all values and their bitmaps
+		final int numVals = getNumValues();
+		final int numCols = getNumCols();
+		for (int bitmapIx = 0; bitmapIx < numVals; bitmapIx++) 
+		{
+			int bitmapOff = _ptr[bitmapIx];
+			int bitmapLen = len(bitmapIx);
+			int valOff = bitmapIx * numCols;
+			
+			//iterate over bitmap blocks and count partial lengths
+			int count = 0;
+			for (int bix=0; bix < bitmapLen; bix+=_data[bitmapOff+bix]+1)
+				count += _data[bitmapOff+bix];
+			
+			//scale counts by all values
+			for( int j = 0; j < numCols; j++ ) {
+				kbuff.set(result.quickGetValue(0, _colIndexes[j]),result.quickGetValue(1, _colIndexes[j]));
+				kplus.execute3(kbuff, _values[ valOff+j ], count);
+				result.quickSetValue(0, _colIndexes[j], kbuff._sum);
+				result.quickSetValue(1, _colIndexes[j], kbuff._correction);
+			}
+		}
+	}
+	
+	/**
+	 * Utility function of sparse-unsafe operations.
+	 * 
+	 * @return
+	 * @throws DMLRuntimeException
+	 */
+	private boolean[] computeZeroIndicatorVector()
+		throws DMLRuntimeException 
+	{
+		boolean[] ret = new boolean[_numRows];
+		final int blksz = BitmapEncoder.BITMAP_BLOCK_SZ;
+		final int numVals = getNumValues();
+		
+		//initialize everything with zero
+		Arrays.fill(ret, true);
+		
+		//iterate over all values and their bitmaps
+		for (int bitmapIx = 0; bitmapIx < numVals; bitmapIx++) 
+		{
+			//prepare value-to-add for entire value bitmap
+			int bitmapOff = _ptr[bitmapIx];
+			int bitmapLen = len(bitmapIx);
+			
+			//iterate over bitmap blocks and add values
+			int offsetBase = 0;
+			int bufIx = 0;
+			int curBlckLen;
+			for (; bufIx < bitmapLen; bufIx += curBlckLen + 1, offsetBase += blksz) {
+				curBlckLen = _data[bitmapOff+bufIx];
+				for (int blckIx = 1; blckIx <= curBlckLen; blckIx++) {
+					ret[offsetBase + _data[bitmapOff+bufIx + blckIx]] &= false;
+				}
+			}
+		}
+		
+		return ret;
+	}
+	
+	@Override
+	protected void countNonZerosPerRow(int[] rnnz)
+	{
+		final int blksz = BitmapEncoder.BITMAP_BLOCK_SZ;
+		final int numVals = getNumValues();
+		final int numCols = getNumCols();
+		
+		//iterate over all values and their bitmaps
+		for (int k = 0; k < numVals; k++) 
+		{
+			//prepare value-to-add for entire value bitmap
+			int bitmapOff = _ptr[k];
+			int bitmapLen = len(k);
+			
+			//iterate over bitmap blocks and add values
+			int offsetBase = 0;
+			int curBlckLen;
+			for (int bufIx=0; bufIx<bitmapLen; bufIx+=curBlckLen+1, offsetBase+=blksz) {
+				curBlckLen = _data[bitmapOff+bufIx];
+				for (int blckIx = 1; blckIx <= curBlckLen; blckIx++) {
+					rnnz[offsetBase + _data[bitmapOff+bufIx + blckIx]] += numCols;
+				}
+			}
+		}
+	}
+}