You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2017/04/17 01:57:24 UTC

incubator-systemml git commit: [SYSTEMML-1530] Performance compressed tsmm (ddc decomp, alloc, result)

Repository: incubator-systemml
Updated Branches:
  refs/heads/master 499428072 -> e1ae4a9f9


[SYSTEMML-1530] Performance compressed tsmm (ddc decomp, alloc, result)

Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/e1ae4a9f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/e1ae4a9f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/e1ae4a9f

Branch: refs/heads/master
Commit: e1ae4a9f93cad3f513ce3de8597cec29b7263205
Parents: 4994280
Author: Matthias Boehm <mb...@gmail.com>
Authored: Sun Apr 16 15:54:36 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Sun Apr 16 18:59:11 2017 -0700

----------------------------------------------------------------------
 .../sysml/runtime/compress/ColGroupDDC1.java    | 51 +++++++--------
 .../sysml/runtime/compress/ColGroupDDC2.java    | 10 +++
 .../sysml/runtime/compress/ColGroupOLE.java     | 57 +++++++----------
 .../sysml/runtime/compress/ColGroupRLE.java     | 67 +++++++++-----------
 .../runtime/compress/CompressedMatrixBlock.java | 36 +++++++----
 .../compress/utils/LinearAlgebraUtils.java      | 25 +++-----
 .../runtime/matrix/data/LibMatrixMult.java      |  2 +-
 7 files changed, 118 insertions(+), 130 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e1ae4a9f/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC1.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC1.java b/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC1.java
index 82adb55..1ff07ac 100644
--- a/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC1.java
+++ b/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC1.java
@@ -169,6 +169,16 @@ public class ColGroupDDC1 extends ColGroupDDC
 	}
 	
 	@Override
+	public void decompressToBlock(MatrixBlock target, int colpos) {
+		int nrow = getNumRows();
+		int ncol = getNumCols();
+		double[] c = target.getDenseBlock();
+		for( int i = 0; i < nrow; i++ )
+			c[i] = _values[(_data[i]&0xFF)*ncol+colpos];
+		target.recomputeNonZeros();
+	}
+	
+	@Override
 	protected void countNonZerosPerRow(int[] rnnz, int rl, int ru) {
 		final int ncol = getNumCols();
 		final int numVals = getNumValues();
@@ -243,36 +253,19 @@ public class ColGroupDDC1 extends ColGroupDDC
 		final int ncol = getNumCols();
 		final int numVals = getNumValues();
 		
-		if( 8*numVals < getNumRows() ) 
-		{
-			//iterative over codes and pre-aggregate inputs per code (guaranteed <=255)
-			//temporary array also avoids false sharing in multi-threaded environments
-			double[] vals = allocDVector(numVals, true);
-			for( int i=0; i<nrow; i++ ) {
-				vals[_data[i]&0xFF] += a[i];
-			}
-			
-			//post-scaling of pre-aggregate with distinct values
-			for( int k=0, valOff=0; k<numVals; k++, valOff+=ncol ) {
-				double aval = vals[k];
-				for( int j=0; j<ncol; j++ ) {
-					int colIx = _colIndexes[j];
-					c[colIx] += aval * _values[valOff+j];
-				}	
-			}
+		//iterative over codes and pre-aggregate inputs per code (guaranteed <=255)
+		//temporary array also avoids false sharing in multi-threaded environments
+		double[] vals = allocDVector(numVals, true);
+		for( int i=0; i<nrow; i++ ) {
+			vals[_data[i]&0xFF] += a[i];
 		}
-		else //general case
-		{
-			//iterate over codes, compute all, and add to the result
-			for( int i=0; i<nrow; i++ ) {
-				double aval = a[i];
-				if( aval != 0 ) {
-					int valOff = (_data[i]&0xFF) * ncol;
-					for( int j=0; j<ncol; j++ ) {
-						int colIx = _colIndexes[j];
-						c[colIx] += aval * _values[valOff+j];
-					}
-				}
+		
+		//post-scaling of pre-aggregate with distinct values
+		for( int k=0, valOff=0; k<numVals; k++, valOff+=ncol ) {
+			double aval = vals[k];
+			for( int j=0; j<ncol; j++ ) {
+				int colIx = _colIndexes[j];
+				c[colIx] += aval * _values[valOff+j];
 			}	
 		}
 	}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e1ae4a9f/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC2.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC2.java b/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC2.java
index 66ba149..b4de6c3 100644
--- a/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC2.java
+++ b/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC2.java
@@ -171,6 +171,16 @@ public class ColGroupDDC2 extends ColGroupDDC
 	}
 	
 	@Override
+	public void decompressToBlock(MatrixBlock target, int colpos) {
+		int nrow = getNumRows();
+		int ncol = getNumCols();
+		double[] c = target.getDenseBlock();
+		for( int i = 0; i < nrow; i++ )
+			c[i] = _values[_data[i]*ncol+colpos];
+		target.recomputeNonZeros();
+	}
+	
+	@Override
 	protected void countNonZerosPerRow(int[] rnnz, int rl, int ru) {
 		final int ncol = getNumCols();
 		final int numVals = getNumValues();

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e1ae4a9f/src/main/java/org/apache/sysml/runtime/compress/ColGroupOLE.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/ColGroupOLE.java b/src/main/java/org/apache/sysml/runtime/compress/ColGroupOLE.java
index 80e53d4..bc1d421 100644
--- a/src/main/java/org/apache/sysml/runtime/compress/ColGroupOLE.java
+++ b/src/main/java/org/apache/sysml/runtime/compress/ColGroupOLE.java
@@ -201,41 +201,32 @@ public class ColGroupOLE extends ColGroupOffset
 	@Override
 	public void decompressToBlock(MatrixBlock target, int colpos) 
 	{
-		if( LOW_LEVEL_OPT && getNumValues() > 1 )
-		{
-			final int blksz = BitmapEncoder.BITMAP_BLOCK_SZ;
-			final int numCols = getNumCols();
-			final int numVals = getNumValues();
-			final int n = getNumRows();
-			double[] c = target.getDenseBlock();
-			
-			//cache blocking config and position array
-			int[] apos = new int[numVals];					
-			
-			//cache conscious append via horizontal scans 
-			for( int bi=0; bi<n; bi+=blksz ) {
-				for (int k = 0, off=0; k < numVals; k++, off+=numCols) {
-					int boff = _ptr[k];
-					int blen = len(k);					
-					int bix = apos[k];
-					if( bix >= blen ) 
-						continue;
-					int len = _data[boff+bix];
-					int pos = boff+bix+1;
-					for( int i=pos; i<pos+len; i++ ) {
-						c[bi+_data[i]] = _values[off+colpos];
-					}
-					apos[k] += len + 1;
+		final int blksz = BitmapEncoder.BITMAP_BLOCK_SZ;
+		final int numCols = getNumCols();
+		final int numVals = getNumValues();
+		final int n = getNumRows();
+		double[] c = target.getDenseBlock();
+		
+		//cache blocking config and position array
+		int[] apos = allocIVector(numVals, true);
+		
+		//cache conscious append via horizontal scans 
+		for( int bi=0; bi<n; bi+=blksz ) {
+			for (int k = 0, off=0; k < numVals; k++, off+=numCols) {
+				int boff = _ptr[k];
+				int blen = len(k);
+				int bix = apos[k];
+				if( bix >= blen )
+					continue;
+				int len = _data[boff+bix];
+				int pos = boff+bix+1;
+				for( int i=pos; i<pos+len; i++ ) {
+					c[bi+_data[i]] = _values[off+colpos];
 				}
-			}	
-			
-			target.recomputeNonZeros();
-		}
-		else
-		{
-			//call generic decompression with decoder
-			super.decompressToBlock(target, colpos);
+				apos[k] += len + 1;
+			}
 		}
+		target.recomputeNonZeros();
 	}
 	
 	@Override

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e1ae4a9f/src/main/java/org/apache/sysml/runtime/compress/ColGroupRLE.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/ColGroupRLE.java b/src/main/java/org/apache/sysml/runtime/compress/ColGroupRLE.java
index 63786d0..0e50262 100644
--- a/src/main/java/org/apache/sysml/runtime/compress/ColGroupRLE.java
+++ b/src/main/java/org/apache/sysml/runtime/compress/ColGroupRLE.java
@@ -192,47 +192,38 @@ public class ColGroupRLE extends ColGroupOffset
 	@Override
 	public void decompressToBlock(MatrixBlock target, int colpos) 
 	{
-		if( LOW_LEVEL_OPT && getNumValues() > 1 )
-		{
-			final int blksz = 128 * 1024;
-			final int numCols = getNumCols();
-			final int numVals = getNumValues();
-			final int n = getNumRows();
-			double[] c = target.getDenseBlock();
-			
-			//position and start offset arrays
-			int[] apos = new int[numVals];
-			int[] astart = new int[numVals];
-			
-			//cache conscious append via horizontal scans 
-			for( int bi=0; bi<n; bi+=blksz ) {
-				int bimax = Math.min(bi+blksz, n);					
-				for (int k=0, off=0; k < numVals; k++, off+=numCols) {
-					int boff = _ptr[k];
-					int blen = len(k);
-					int bix = apos[k];
-					if( bix >= blen ) 
-						continue;
-					int start = astart[k];
-					for( ; bix<blen & start<bimax; bix+=2) {
-						start += _data[boff + bix];
-						int len = _data[boff + bix+1];
-						for( int i=start; i<start+len; i++ )
-							c[i] = _values[off+colpos];
-						start += len;
-					}
-					apos[k] = bix;	
-					astart[k] = start;
+		final int blksz = 128 * 1024;
+		final int numCols = getNumCols();
+		final int numVals = getNumValues();
+		final int n = getNumRows();
+		double[] c = target.getDenseBlock();
+		
+		//position and start offset arrays
+		int[] astart = new int[numVals];
+		int[] apos = allocIVector(numVals, true);
+		
+		//cache conscious append via horizontal scans 
+		for( int bi=0; bi<n; bi+=blksz ) {
+			int bimax = Math.min(bi+blksz, n);
+			for (int k=0, off=0; k < numVals; k++, off+=numCols) {
+				int boff = _ptr[k];
+				int blen = len(k);
+				int bix = apos[k];
+				if( bix >= blen )
+					continue;
+				int start = astart[k];
+				for( ; bix<blen & start<bimax; bix+=2) {
+					start += _data[boff + bix];
+					int len = _data[boff + bix+1];
+					for( int i=start; i<start+len; i++ )
+						c[i] = _values[off+colpos];
+					start += len;
 				}
+				apos[k] = bix;	
+				astart[k] = start;
 			}
-			
-			target.recomputeNonZeros();
-		}
-		else
-		{
-			//call generic decompression with decoder
-			super.decompressToBlock(target, colpos);
 		}
+		target.recomputeNonZeros();
 	}
 	
 	@Override

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e1ae4a9f/src/main/java/org/apache/sysml/runtime/compress/CompressedMatrixBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/CompressedMatrixBlock.java b/src/main/java/org/apache/sysml/runtime/compress/CompressedMatrixBlock.java
index ead0b19..d7c66f1 100644
--- a/src/main/java/org/apache/sysml/runtime/compress/CompressedMatrixBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/compress/CompressedMatrixBlock.java
@@ -102,7 +102,7 @@ public class CompressedMatrixBlock extends MatrixBlock implements Externalizable
 	public static final boolean TRANSPOSE_INPUT = true;
 	public static final boolean MATERIALIZE_ZEROS = false;
 	public static final long MIN_PAR_AGG_THRESHOLD = 16*1024*1024; //16MB
-	public static final boolean INVESTIGATE_ESTIMATES = false;
+	public static boolean INVESTIGATE_ESTIMATES = false;
 	public static boolean ALLOW_DDC_ENCODING = true;
 	private static final boolean LDEBUG = true; //local debug flag
 	private static final Level LDEBUG_LEVEL = Level.DEBUG; //DEBUG/TRACE for details
@@ -912,7 +912,7 @@ public class CompressedMatrixBlock extends MatrixBlock implements Externalizable
 			BinaryOperator bop = new BinaryOperator(Multiply.getMultiplyFnObject());
 			LibMatrixBincell.bincellOpInPlace(tmp, w, bop);
 		}
-		leftMultByVectorTranspose(_colGroups, tmp, out, true);
+		leftMultByVectorTranspose(_colGroups, tmp, out, true, true);
 		
 		//System.out.println("Compressed MMChain in "+time.stop());
 		
@@ -1003,7 +1003,7 @@ public class CompressedMatrixBlock extends MatrixBlock implements Externalizable
 			if( op.getNumThreads()>1 )
 				leftMultByVectorTranspose(_colGroups, mb, ret, false, op.getNumThreads());
 			else
-				leftMultByVectorTranspose(_colGroups, mb, ret, false);
+				leftMultByVectorTranspose(_colGroups, mb, ret, false, true);
 		}
 		else {
 			//NOTE: we could decompress and invoke super.aggregateBinary but for now
@@ -1221,6 +1221,7 @@ public class CompressedMatrixBlock extends MatrixBlock implements Externalizable
 			leftMultByTransposeSelf(_colGroups, out, 0, _colGroups.size());
 			
 			// post-processing
+			LinearAlgebraUtils.copyUpperToLowerTriangle(out);
 			out.recomputeNonZeros();
 		}
 		
@@ -1278,6 +1279,7 @@ public class CompressedMatrixBlock extends MatrixBlock implements Externalizable
 			}
 			
 			// post-processing
+			LinearAlgebraUtils.copyUpperToLowerTriangle(out);
 			out.recomputeNonZeros();
 		}
 		
@@ -1402,7 +1404,7 @@ public class CompressedMatrixBlock extends MatrixBlock implements Externalizable
 	 * @param doTranspose if true, transpose vector
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
-	private static void leftMultByVectorTranspose(List<ColGroup> colGroups, MatrixBlock vector, MatrixBlock result, boolean doTranspose) 
+	private static void leftMultByVectorTranspose(List<ColGroup> colGroups, MatrixBlock vector, MatrixBlock result, boolean doTranspose, boolean allocTmp) 
 		throws DMLRuntimeException 
 	{
 		//transpose vector if required
@@ -1417,7 +1419,8 @@ public class CompressedMatrixBlock extends MatrixBlock implements Externalizable
 		result.allocateDenseBlock();
 		
 		// setup memory pool for reuse
-		ColGroupValue.setupThreadLocalMemory(getMaxNumValues(colGroups));
+		if( allocTmp )
+			ColGroupValue.setupThreadLocalMemory(getMaxNumValues(colGroups));
 		
 		// delegate matrix-vector operation to each column group
 		for (ColGroup grp : colGroups) {			
@@ -1425,7 +1428,8 @@ public class CompressedMatrixBlock extends MatrixBlock implements Externalizable
 		}
 		
 		// post-processing
-		ColGroupValue.cleanupThreadLocalMemory();
+		if( allocTmp )
+			ColGroupValue.cleanupThreadLocalMemory();
 		result.recomputeNonZeros();
 	}
 	
@@ -1488,9 +1492,14 @@ public class CompressedMatrixBlock extends MatrixBlock implements Externalizable
 		final int numRows = groups.get(0).getNumRows();
 		final int numGroups = groups.size();		
 		
-		//preallocated dense matrix block
-		MatrixBlock lhs = new MatrixBlock(numRows, 1, false);
+		//preallocated dense tmp matrix blocks
+		MatrixBlock lhs = new MatrixBlock(1, numRows, false);
+		MatrixBlock tmpret = new MatrixBlock(1, result.getNumColumns(), false);
 		lhs.allocateDenseBlock();
+		tmpret.allocateDenseBlock();
+		
+		// setup memory pool for reuse
+		ColGroupValue.setupThreadLocalMemory(getMaxNumValues(groups));
 		
 		//approach: for each colgroup, extract uncompressed columns one at-a-time
 		//vector-matrix multiplies against remaining col groups
@@ -1504,19 +1513,22 @@ public class CompressedMatrixBlock extends MatrixBlock implements Externalizable
 			//for all uncompressed lhs columns vectors
 			for( int j=0; j<ixgroup.length; j++ ) {
 				//decompress single column
-				lhs.reset(numRows, 1, false);				
+				if( !(group instanceof ColGroupDDC) )
+					lhs.reset(1, numRows, false);				
 				group.decompressToBlock(lhs, j);
 				
 				if( !lhs.isEmptyBlock(false) ) {
 					//compute vector-matrix partial result
-					MatrixBlock tmpret = new MatrixBlock(1,result.getNumColumns(),false);
-					leftMultByVectorTranspose(tmpList, lhs, tmpret, true);								
+					leftMultByVectorTranspose(tmpList, lhs, tmpret, false, false);								
 					
 					//write partial results (disjoint non-zeros)
-					LinearAlgebraUtils.copyNonZerosToRowCol(result, tmpret, ixgroup[j]);	
+					LinearAlgebraUtils.copyNonZerosToUpperTriangle(result, tmpret, ixgroup[j]);	
 				}
 			}
 		}
+		
+		//post processing
+		ColGroupValue.cleanupThreadLocalMemory();
 	}
 
 	@SuppressWarnings("unchecked")

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e1ae4a9f/src/main/java/org/apache/sysml/runtime/compress/utils/LinearAlgebraUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/utils/LinearAlgebraUtils.java b/src/main/java/org/apache/sysml/runtime/compress/utils/LinearAlgebraUtils.java
index 4cc86ed..515457b 100644
--- a/src/main/java/org/apache/sysml/runtime/compress/utils/LinearAlgebraUtils.java
+++ b/src/main/java/org/apache/sysml/runtime/compress/utils/LinearAlgebraUtils.java
@@ -147,25 +147,16 @@ public class LinearAlgebraUtils
 		return val;
 	}
 
-	public static void copyUpperToLowerTriangle( MatrixBlock ret )
-	{
-		double[] c = ret.getDenseBlock();
-		final int m = ret.getNumRows();
-		final int n = ret.getNumColumns();
-		
-		//copy symmetric values
-		for( int i=0, uix=0; i<m; i++, uix+=n )
-			for( int j=i+1, lix=j*n+i; j<n; j++, lix+=n )
-				c[ lix ] = c[ uix+j ];
+	public static void copyUpperToLowerTriangle( MatrixBlock ret ) {
+		LibMatrixMult.copyUpperToLowerTriangle(ret);
 	}
-
-	public static void copyNonZerosToRowCol( MatrixBlock ret, MatrixBlock tmp, int ix )
-	{
+	
+	public static void copyNonZerosToUpperTriangle( MatrixBlock ret, MatrixBlock tmp, int ix ) {
+		double[] a = tmp.getDenseBlock();
 		for(int i=0; i<tmp.getNumColumns(); i++) {
-			double val = tmp.quickGetValue(0, i);
-			if( val != 0 ) {
-				ret.setValueDenseUnsafe(ix, i, val);
-				ret.setValueDenseUnsafe(i, ix, val);
+			if( a[i] != 0 ) {
+				ret.setValueDenseUnsafe(
+					(ix<i)?ix:i, (ix<i)?i:ix, a[i]);
 			}
 		}
 	}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e1ae4a9f/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index 65f3be1..d746aa3 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -3414,7 +3414,7 @@ public class LibMatrixMult
 	 * 
 	 * @param ret matrix
 	 */
-	private static void copyUpperToLowerTriangle( MatrixBlock ret )
+	public static void copyUpperToLowerTriangle( MatrixBlock ret )
 	{
 		double[] c = ret.denseBlock;
 		final int m = ret.rlen;