You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2017/10/12 08:12:46 UTC
[4/5] systemml git commit: [SYSTEMML-1958] Performance sparse conv2d and dense-sparse mm (minor)

[SYSTEMML-1958] Performance sparse conv2d and dense-sparse mm (minor)

This patch makes two minor performance improvements, namely the
preallocation of matrix multiplication output blocks in sparse conv2d
(to amortize the allocation cost across images), and minor cleanups in
the core dense-sparse block matrix multiply. This dense-sparse mm
operation still requires further improvement, but initial attempts were
unsuccessful. Finally, this patch also includes a fix for the recently
modified ultra-sparse mm code path.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/a8f6a3cb
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/a8f6a3cb
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/a8f6a3cb

Branch: refs/heads/master
Commit: a8f6a3cb68af2cfae618fc0c3411997f2ab3ec83
Parents: deb4baf
Author: Matthias Boehm <mb...@gmail.com>
Authored: Wed Oct 11 23:40:39 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Thu Oct 12 01:13:08 2017 -0700

----------------------------------------------------------------------
 .../matrix/data/LibMatrixDNNConv2dHelper.java   |  46 +++---
 .../matrix/data/LibMatrixDNNIm2ColHelper.java   | 146 ++++++++-----------
 .../runtime/matrix/data/LibMatrixMult.java      |  78 +++++-----
 3 files changed, 117 insertions(+), 153 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/a8f6a3cb/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java
index 2685f52..1036af7 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java
@@ -128,8 +128,10 @@ public class LibMatrixDNNConv2dHelper {
 		@Override
 		public Long call() throws Exception {
 			int PQ = _params.P*_params.Q; int K = _params.K; int CRS = _params.C*_params.R*_params.S;
-			MatrixBlock im2ColOutBlock = new MatrixBlock(CRS, PQ, false);
-			LibMatrixDNNIm2ColHelper.Im2colWorker im2ColWorker = LibMatrixDNNIm2ColHelper.Im2colWorker.getWorker( _params.input1, im2ColOutBlock, _params, true);
+			MatrixBlock outIm2col = new MatrixBlock(CRS, PQ, false);
+			MatrixBlock outMM = new MatrixBlock(K, PQ, false);
+			LibMatrixDNNIm2ColHelper.Im2colWorker im2ColWorker = 
+					LibMatrixDNNIm2ColHelper.Im2colWorker.getWorker( _params.input1, outIm2col, _params, true);
 			long time1 = 0; long time2 = 0;
 			for(int n = _rl; n < _ru; n++)  {
 				// im2col(input) => _im2ColOutBlock
@@ -138,8 +140,8 @@ public class LibMatrixDNNConv2dHelper {
 				long t2 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0;
 				
 				// filter %*% _im2ColOutBlock => matMultOutBlock
-				MatrixBlock matMultOutBlock = new MatrixBlock(K, PQ, false);
-				LibMatrixDNNHelper.singleThreadedMatMult(_params.input2, im2ColOutBlock, matMultOutBlock, false, true, _params);
+				outMM.reset(outMM.rlen, outMM.clen, false);
+				LibMatrixDNNHelper.singleThreadedMatMult(_params.input2, outIm2col, outMM, false, true, _params);
 				long t3 = DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS ? System.nanoTime() : 0;
 				
 				if(DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS) {
@@ -148,7 +150,7 @@ public class LibMatrixDNNConv2dHelper {
 				}
 				
 				// Copy the matrix matMultOutBlock of shape [K X PQ] to params.output.denseBlock + destPos
-				partialCopy1(matMultOutBlock, _params.output.getDenseBlock(), n*K*PQ, K, PQ);
+				partialCopy1(outMM, _params.output.getDenseBlock(), n*K*PQ, K, PQ);
 			}
 			if(_params.bias != null) {
 				// bias is always converted to dense format
@@ -165,27 +167,23 @@ public class LibMatrixDNNConv2dHelper {
 		private static void partialCopy1(MatrixBlock src, double [] dest, int destPos, int K, int PQ) {
 			// Copying is required as LibMatrixMult.matrixMult (and/or Java) is not pointer aware.
 			// This is not required in Native implementation
-			if(!src.isEmptyBlock()) {
-				if(src.isInSparseFormat()) {
-					// Copy the sparse matrix matMultOutBlock of shape [K X PQ] to 
-					// params.output.denseBlock + destPos
-					for(int k = 0; k < src.getNumRows(); k++) {
-						if( !src.sparseBlock.isEmpty(k) ) {
-							int apos = src.sparseBlock.pos(k);
-							int alen = src.sparseBlock.size(k);
-							int[] aix = src.sparseBlock.indexes(k);
-							double[] avals = src.sparseBlock.values(k);
-							int desPosK = destPos + k*PQ;
-							for(int j = apos; j < apos+alen; j++) {
-								int pqIndex = aix[j];
-								dest[desPosK + pqIndex ] = avals[j];
-							}
-						}
-					}
+			if( src.isEmptyBlock() )
+				return;
+			if(src.isInSparseFormat()) {
+				SparseBlock sblock = src.sparseBlock;
+				for(int k = 0; k < src.getNumRows(); k++) {
+					if( sblock.isEmpty(k) ) continue;
+					int apos = sblock.pos(k);
+					int alen = sblock.size(k);
+					int[] aix = sblock.indexes(k);
+					double[] avals = sblock.values(k);
+					int desPosK = destPos + k*PQ;
+					for(int j = apos; j < apos+alen; j++)
+						dest[desPosK+aix[j]] = avals[j];
 				}
-				else 
-					System.arraycopy(src.denseBlock, 0, dest, destPos, K * PQ);
 			}
+			else 
+				System.arraycopy(src.denseBlock, 0, dest, destPos, K * PQ);
 		}
 	}
 	

http://git-wip-us.apache.org/repos/asf/systemml/blob/a8f6a3cb/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java
index d427a26..3296c7f 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java
@@ -31,61 +31,38 @@ public class LibMatrixDNNIm2ColHelper {
 	static interface Im2colWorker {
 		public void execute(int n);
 		public void execute(int n, int c);
-		public static Im2colWorker getWorker(MatrixBlock input, MatrixBlock im2ColOutBlock, ConvolutionParameters params, boolean allChannels) {
-			if(allChannels) {
-				if(!input.isInSparseFormat()) {
-					// Note: Only dense im2col operators require the im2ColOutBlock to be allocated in the dense format.
-					im2ColOutBlock.allocateDenseBlock();
-					if (params.stride_h == 1 && params.stride_w == 1 && params.pad_h == 0 && params.pad_w == 0)  {
-						if(LOG.isTraceEnabled()) LOG.trace("Using DenseIm2colWorkerStride1Pad0AllChannels operator to perform im2col.");
-						return new DenseIm2colWorkerStride1Pad0AllChannels(input.getDenseBlock(), im2ColOutBlock.getDenseBlock(), params);
-					}
-					else {
-						if(LOG.isTraceEnabled()) LOG.trace("Using DenseIm2colWorkerAllChannels operator to perform im2col.");
-						return new DenseIm2colWorkerAllChannels(input.getDenseBlock(), im2ColOutBlock.getDenseBlock(), params);
-					}
-				}
-				else {
-					if(LOG.isTraceEnabled()) LOG.trace("Using SparseIm2colWorkerAllChannels operator to perform im2col.");
-					double sparsity = Math.min(MatrixBlock.SPARSITY_TURN_POINT, (input.getNonZeros()*2.0) / (input.getNumRows()*input.getNumColumns()));
-					initializeSparseIm2ColBlock(im2ColOutBlock, (long)Math.ceil(params.P*params.Q*sparsity));
-					return new SparseSparseIm2colWorkerAllChannels(input, im2ColOutBlock, params);
-				}
+		public static Im2colWorker getWorker(MatrixBlock input, MatrixBlock out, ConvolutionParameters params, boolean allChannels) {
+			if(!input.isInSparseFormat()) {
+				boolean stride1Pad0 = params.stride_h == 1 && params.stride_w == 1 
+						&& params.pad_h == 0 && params.pad_w == 0;
+				// Note: Only dense im2col operators require the im2ColOutBlock to be allocated in the dense format.
+				out.reset(out.rlen, out.clen, false);
+				out.allocateDenseBlock();
+				if( LOG.isTraceEnabled() ) 
+					LOG.trace("Using DenseIm2colWorkerAllChannels operator to perform "
+						+ "im2col (stride1pad0="+stride1Pad0+", allChannels="+allChannels+").");
+				if(allChannels && stride1Pad0 )
+					return new DenseIm2colWorkerStride1Pad0AllChannels(input.getDenseBlock(), out.getDenseBlock(), params);
+				else if( allChannels )
+					return new DenseIm2colWorkerAllChannels(input.getDenseBlock(), out.getDenseBlock(), params);
+				else if( stride1Pad0 )
+					return new DenseIm2colWorkerStride1Pad0(input.getDenseBlock(), out.getDenseBlock(), params);
+				else
+					return new DenseIm2colWorker(input.getDenseBlock(), out.getDenseBlock(), params);
 			}
 			else {
-				if(!input.isInSparseFormat()) {
-					// Note: Only dense im2col operators require the im2ColOutBlock to be allocated in the dense format.
-					im2ColOutBlock.allocateDenseBlock();
-					if (params.stride_h == 1 && params.stride_w == 1 && params.pad_h == 0 && params.pad_w == 0) {
-						if(LOG.isTraceEnabled()) LOG.trace("Using DenseIm2colWorkerStride1Pad0 operator to perform im2col.");
-						return new DenseIm2colWorkerStride1Pad0(input.getDenseBlock(), im2ColOutBlock.getDenseBlock(), params);
-					}
-					else {
-						if(LOG.isTraceEnabled()) LOG.trace("Using DenseIm2colWorker operator to perform im2col.");
-						return new DenseIm2colWorker(input.getDenseBlock(), im2ColOutBlock.getDenseBlock(), params);
-					}
-				}
-				else {
-					if(LOG.isTraceEnabled()) LOG.trace("Using SparseIm2colWorker operator to perform im2col.");
-					double sparsity = Math.min(MatrixBlock.SPARSITY_TURN_POINT, (input.getNonZeros()*2.0) / (input.getNumRows()*input.getNumColumns()));
-					initializeSparseIm2ColBlock(im2ColOutBlock, (long)Math.ceil(params.P*params.Q*sparsity));
-					return new SparseSparseIm2colWorker(input, im2ColOutBlock, params);
-				}
-			}
-		}
-		
-		static void initializeSparseIm2ColBlock(MatrixBlock im2ColOutBlock, long worstCaseNNPerRow) {
-			if(worstCaseNNPerRow >= Integer.MAX_VALUE)
-				throw new RuntimeException("The dimension of intermediate im2col matrix exceeded:" + worstCaseNNPerRow);
-			// Set to sparse
-			im2ColOutBlock.sparse = true;
-			im2ColOutBlock.denseBlock = null;
-			im2ColOutBlock.allocateSparseRowsBlock();
-			
-			for(int r = 0; r < im2ColOutBlock.getNumRows(); r++) {
-				im2ColOutBlock.getSparseBlock().allocate(r, (int) worstCaseNNPerRow);
+				if(LOG.isTraceEnabled()) 
+					LOG.trace("Using SparseIm2colWorker operator to perform im2col.");
+				out.reset(out.rlen, out.clen, true);
+				out.allocateSparseRowsBlock();
+				//preallocate sparse-rows
+				double sparsity = Math.min(MatrixBlock.SPARSITY_TURN_POINT, 
+					(input.getNonZeros()*2.0) / (input.getNumRows()*input.getNumColumns()));
+				for(int r = 0; r < out.rlen; r++)
+					out.getSparseBlock().allocate(r, (int)Math.ceil(params.P*params.Q*sparsity));
+				
+				return new SparseSparseIm2colWorkerAllChannels(input, out, params);
 			}
-			im2ColOutBlock.setNonZeros(0);
 		}
 	}
 	
@@ -275,17 +252,15 @@ public class LibMatrixDNNIm2ColHelper {
 	/**
 	 * Performing sparse im2col for all channels for a given row n of the input matrix.
 	 */
-	static class SparseSparseIm2colWorkerAllChannels implements Im2colWorker {
-		MatrixBlock input;  MatrixBlock output;
-		int CRS; int S; int R; int P; int Q; int H; int W; int RS; int HW;
-		int stride_h; int stride_w; int pad_h; int pad_w;
+	private static class SparseSparseIm2colWorkerAllChannels implements Im2colWorker {
+		final MatrixBlock input, output;
+		final int S, R, P, Q, W, HW;
+		final int stride_h, stride_w, pad_h, pad_w;
 		public SparseSparseIm2colWorkerAllChannels(MatrixBlock input, MatrixBlock im2ColOutBlock, ConvolutionParameters params) {
 			this.input = input;
 			this.output = im2ColOutBlock;
-			this.CRS = params.C * params.R * params.S;
-			this.RS = params.R * params.S;
 			this.HW = params.H * params.W;
-			this.H = params.H; this.W = params.W; this.R = params.R; this.S = params.S; this.P = params.P; this.Q = params.Q;
+			this.W = params.W; this.R = params.R; this.S = params.S; this.P = params.P; this.Q = params.Q;
 			this.stride_h = params.stride_h; this.stride_w = params.stride_w;
 			this.pad_h = params.pad_h; this.pad_w = params.pad_w;
 			if(!input.isInSparseFormat()) 
@@ -299,33 +274,32 @@ public class LibMatrixDNNIm2ColHelper {
 
 		@Override
 		public void execute(int n) {
-			if( !input.sparseBlock.isEmpty(n) ) {
-				output.sparseBlock.reset();
-				output.setNonZeros(0);
-				int apos = input.sparseBlock.pos(n);
-				int alen = input.sparseBlock.size(n);
-				int[] aix = input.sparseBlock.indexes(n);
-				double[] avals = input.sparseBlock.values(n);
-				
-				// Iterate over the sparse block
-				for(int j=apos; j<apos+alen; j++) {
-					// Note: the input is of shape [N, CHW]
-					int chw = aix[j];
-					
-					// Get individual zero-based c,h,w indexes from zero-based 'chw'
-					int cInput = chw / HW;
-					int hInput = (chw - cInput*HW)/W;
-					int wInput = chw % W; 
-					
-					appendInputValueToIm2colOutput(output, cInput, hInput, wInput, avals[j], 
-							R, S, P, Q, stride_h, stride_w, pad_h, pad_w);
-				}
-				// Since the chw are appended in sorted order, no need to sort the output rows
-				// if(meta.sortRows) output.sortSparseRows();
+			output.reset();
+			
+			SparseBlock sblock = input.sparseBlock;
+			if( sblock.isEmpty(n) ) {
+				return;
 			}
-			else {
-				output.setNonZeros(0);
+			
+			int apos = input.sparseBlock.pos(n);
+			int alen = input.sparseBlock.size(n);
+			int[] aix = input.sparseBlock.indexes(n);
+			double[] avals = input.sparseBlock.values(n);
+			
+			// Iterate over the sparse block
+			for(int j=apos; j<apos+alen; j++) {
+				// Note: the input is of shape [N, CHW]
+				int chw = aix[j];
+				
+				// Get individual zero-based c,h,w indexes from zero-based 'chw'
+				int cInput = chw / HW;
+				int hInput = (chw - cInput*HW)/W;
+				int wInput = chw % W; 
+				
+				appendInputValueToIm2colOutput(output, cInput, hInput, wInput, avals[j], 
+						R, S, P, Q, stride_h, stride_w, pad_h, pad_w);
 			}
+			// Since the chw are appended in sorted order, no need to sort the output rows
 		}
 	}
 	
@@ -365,7 +339,7 @@ public class LibMatrixDNNIm2ColHelper {
 		int sMax = Math.min(S-1, wInput + pad_w);
 		// Constraint 3: (hInput - r + pad_h) % stride_h == 0
 		while((hInput - rMin + pad_h) % stride_h != 0 && rMin <= rMax) rMin++;
-		while((wInput - sMin + pad_w) % stride_w != 0 && sMin <= sMax) sMin++;	
+		while((wInput - sMin + pad_w) % stride_w != 0 && sMin <= sMax) sMin++;
 		
 		for(int r = rMin; r <= rMax; r += stride_h) {
 			// Only append value if h == hInput, where h = (r - pad_h) + p*stride_h and 0 <= p < P
@@ -378,8 +352,6 @@ public class LibMatrixDNNIm2ColHelper {
 				// chw -> [crs, pq]
 				output.appendValue(outRowIndex + s, pQ + q, value);
 				// Since the chw are appended in sorted order, no need to sort the output rows
-				// if(meta.lastIndexPerRow[outRowIndex + s] > p*Q + q) meta.sortRows = true;
-				// meta.lastIndexPerRow[outRowIndex + s] = p*Q + q;
 			}
 		}
 	}

http://git-wip-us.apache.org/repos/asf/systemml/blob/a8f6a3cb/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index aedf975..181ff98 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -126,8 +126,7 @@ public class LibMatrixMult
 		boolean tm2 = checkPrepMatrixMultRightInput(m1,m2);
 		m2 = prepMatrixMultRightInput(m1, m2);
 		ret.sparse = ultraSparse;
-		if( !ret.sparse )
-			ret.allocateDenseBlock();
+		ret.allocateBlock();
 		
 		//prepare row-upper for special cases of vector-matrix
 		boolean pm2 = checkParMatrixMultRightInputRows(m1, m2, Integer.MAX_VALUE);
@@ -1094,7 +1093,7 @@ public class LibMatrixMult
 
 	private static void matrixMultDenseSparse(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2, int rl, int ru) 
 		throws DMLRuntimeException 
-	{	
+	{
 		double[] a = m1.denseBlock;
 		double[] c = ret.denseBlock;
 		int m = m1.rlen;
@@ -1104,42 +1103,37 @@ public class LibMatrixMult
 		// MATRIX-MATRIX (VV, MV not applicable here because V always dense)
 		if( LOW_LEVEL_OPTIMIZATION )
 		{
-			final int blocksizeI = 32; //256KB c block (typical L2 size per core), 32KB a block 
-			final int blocksizeK = 32; 
-			//note: in contrast to dense-dense, no blocking over j (would require maintaining blocksizeK indexes, counter-productive on skew)
-			
 			SparseBlock b = m2.sparseBlock;
 			
 			if( pm2 && m==1 )          //VECTOR-MATRIX
 			{
 				//parallelization over rows in rhs matrix
 				for( int k=rl; k<ru; k++ )
-					if( a[k] != 0 && !b.isEmpty(k) ) {								
+					if( a[k] != 0 && !b.isEmpty(k) ) {
 						vectMultiplyAdd(a[k], b.values(k), c, b.indexes(k), b.pos(k), 0, b.size(k));
 					}
 			}
 			else                       //MATRIX-MATRIX
 			{
+				//best effort blocking, without blocking over J because it is 
+				//counter-productive, even with front of current indexes
+				final int blocksizeK = 32;
+				final int blocksizeI = 32;
+				
 				//blocked execution
 				for( int bi = rl; bi < ru; bi+=blocksizeI )
-					for( int bk = 0, bimin = Math.min(ru, bi+blocksizeI); bk < cd; bk+=blocksizeK ) 
-					{
-						int bklen = Math.min(cd, bk+blocksizeK)-bk;
-						
+					for( int bk = 0, bimin = Math.min(ru, bi+blocksizeI); bk < cd; bk+=blocksizeK ) {
+						int bkmin = Math.min(cd, bk+blocksizeK);
 						//core sub block matrix multiplication
-			    		for( int i = bi; i < bimin; i++) 
-			    		{
-			    			int aixi = i * cd + bk; //start index on a
-			    			int cixj = i * n + 0; //scan index on c
-			    			
-			    			for( int k = 0; k < bklen; k++ )
-							{
-								double val = a[aixi+k];
-								if( val != 0 && !b.isEmpty(bk+k) ) {
-									vectMultiplyAdd(val, b.values(bk+k), c, b.indexes(bk+k), b.pos(bk+k), cixj, b.size(bk+k));
-								}
+						for(int i = bi, aix=bi*cd, cix=bi*n; i < bimin; i++, aix+=cd, cix+=n) {
+							for( int k = bk; k < bkmin; k++ ) {
+								double aval = a[aix+k];
+								if( aval == 0 || b.isEmpty(k) )
+									continue;
+								vectMultiplyAdd(aval, b.values(k), c, 
+									b.indexes(k), b.pos(k), cix, b.size(k));
 							}
-			    		}
+						}
 					}
 			}
 		}
@@ -1159,10 +1153,10 @@ public class LibMatrixMult
 							int[] bix = b.indexes(k);
 							double[] bvals = b.values(k);	
 							for(int j = bpos; j < bpos+blen; j++)
-								c[cix+bix[j]] += val * bvals[j];								
+								c[cix+bix[j]] += val * bvals[j];
 						}
 					}
-				}		
+				}
 		}
 	}
 
@@ -1190,7 +1184,7 @@ public class LibMatrixMult
 			{
 				for( int i=rl; i<ru; i++ )
 					if( !a.isEmpty(i) )
-						c[i] = dotProduct(a.values(i), b, a.indexes(i), a.pos(i), 0, a.size(i));							
+						c[i] = dotProduct(a.values(i), b, a.indexes(i), a.pos(i), 0, a.size(i));
 			}
 			else if( n==1 )               //MATRIX-VECTOR (tall rhs)
 			{
@@ -1206,13 +1200,13 @@ public class LibMatrixMult
 							int apos = a.pos(i);
 							int alen = a.size(i);
 							int[] aix = a.indexes(i);
-							double[] avals = a.values(i);					
-							int k = curk[i-bi] + apos;									
+							double[] avals = a.values(i);
+							int k = curk[i-bi] + apos;
 							for( ; k<apos+alen && aix[k]<bkmin; k++ )
 								c[i] += avals[k] * b[aix[k]];
 							curk[i-bi] = k - apos;
 						}
-					}	
+					}
 				}
 			}
 			else if( pm2 && m==1 )        //VECTOR-MATRIX
@@ -1222,7 +1216,7 @@ public class LibMatrixMult
 				{
 					int alen = a.size(0);
 					int[] aix = a.indexes(0);
-					double[] avals = a.values(0);					
+					double[] avals = a.values(0);
 					int rlix = (rl==0) ? 0 : a.posFIndexGTE(0,rl);
 					rlix = (rlix>=0) ? rlix : alen;
 					
@@ -1243,7 +1237,7 @@ public class LibMatrixMult
 						int apos = a.pos(i);
 						int alen = a.size(i);
 						int[] aix = a.indexes(i);
-						double[] avals = a.values(i);					
+						double[] avals = a.values(i);
 						
 						int k1 = (rl==0) ? apos : a.posFIndexGTE(i, rl);
 						k1 = (k1>=0) ? k1 : apos+alen;
@@ -1274,7 +1268,7 @@ public class LibMatrixMult
 					int apos = a.pos(i);
 					int alen = a.size(i);
 					int[] aix = a.indexes(i);
-					double[] avals = a.values(i);					
+					double[] avals = a.values(i);
 					//rest not aligned to blocks of 4 rows
 					int bn = alen%4;
 					for( int k=apos; k<apos+bn; k++ )
@@ -1286,7 +1280,7 @@ public class LibMatrixMult
 				}	
 			}
 			else                          //MATRIX-MATRIX
-			{							
+			{
 				//blocksizes to fit blocks of B (dense) and several rows of A/C in common L2 cache size, 
 				//while blocking A/C for L1/L2 yet allowing long scans (2 pages) in the inner loop over j
 				//in case of almost ultra-sparse matrices, we cannot ensure the blocking for the rhs and
@@ -1312,9 +1306,9 @@ public class LibMatrixMult
 									int apos = a.pos(i);
 									int alen = a.size(i);
 									int[] aix = a.indexes(i);
-									double[] avals = a.values(i);					
+									double[] avals = a.values(i);
 									
-									int k = curk[i-bi] + apos;			
+									int k = curk[i-bi] + apos;
 					    			//rest not aligned to blocks of 4 rows
 									int bn = alen%4;
 									for( ; k<apos+bn && aix[k]<bkmin; k++ )
@@ -1343,13 +1337,13 @@ public class LibMatrixMult
 					int apos = a.pos(i);
 					int alen = a.size(i);
 					int[] aix = a.indexes(i);
-					double[] avals = a.values(i);					
+					double[] avals = a.values(i);
 					
 					for(int k = apos; k < apos+alen; k++) {
 						double val = avals[k];
 						for(int j = 0, bix=aix[k]*n; j < n; j++)
-							c[cix+j] += val * b[bix+j];								
-					}						
+							c[cix+j] += val * b[bix+j];
+					}
 				}
 			}
 		}
@@ -1375,7 +1369,7 @@ public class LibMatrixMult
 				{
 					int alen = a.size(0);
 					int[] aix = a.indexes(0);
-					double[] avals = a.values(0);					
+					double[] avals = a.values(0);
 					int rlix = (rl==0) ? 0 : a.posFIndexGTE(0,rl);
 					rlix = (rlix>=0) ? rlix : alen;
 					
@@ -1384,9 +1378,9 @@ public class LibMatrixMult
 							int bpos = b.pos(aix[k]);
 							int blen = b.size(aix[k]);
 							int[] bix = b.indexes(aix[k]);
-							double[] bvals = b.values(aix[k]);								
+							double[] bvals = b.values(aix[k]);
 							vectMultiplyAdd(avals[k], bvals, c, bix, bpos, 0, blen);
-						}			
+						}
 				}
 			}	
 			else                       //MATRIX-MATRIX