Posted to commits@systemml.apache.org by na...@apache.org on 2017/06/10 19:07:11 UTC

[3/3] systemml git commit: [FIX] Fixed nested parfor for GPUs

[FIX] Fixed nested parfor for GPUs

Additionally:
- Fixed the IntelliJ code style accordingly
- Fixed the formatting of some GPU-related source files

Closes #532


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/f5871756
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/f5871756
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/f5871756

Branch: refs/heads/master
Commit: f587175647a84a3825b174b4d29c0398be17331f
Parents: 0bcae49
Author: Nakul Jindal <na...@gmail.com>
Authored: Sat Jun 10 12:06:47 2017 -0700
Committer: Nakul Jindal <na...@gmail.com>
Committed: Sat Jun 10 12:06:47 2017 -0700

----------------------------------------------------------------------
 dev/code-style/systemml-style-intellij.xml      |   37 +-
 .../apache/sysml/api/ScriptExecutorUtils.java   |   17 +-
 .../controlprogram/ParForProgramBlock.java      |   19 +-
 .../context/ExecutionContext.java               |   64 +-
 .../controlprogram/parfor/LocalParWorker.java   |    2 +-
 .../cp/FunctionCallCPInstruction.java           |   12 +-
 .../gpu/AggregateBinaryGPUInstruction.java      |    4 +-
 .../gpu/AggregateUnaryGPUInstruction.java       |    2 +-
 .../gpu/ConvolutionGPUInstruction.java          |   18 +-
 .../instructions/gpu/MMTSJGPUInstruction.java   |    2 +-
 .../gpu/MatrixBuiltinGPUInstruction.java        |   30 +-
 .../MatrixMatrixArithmeticGPUInstruction.java   |    2 +-
 .../gpu/MatrixMatrixAxpyGPUInstruction.java     |    2 +-
 .../gpu/MatrixMatrixBuiltinGPUInstruction.java  |    2 +-
 .../instructions/gpu/ReorgGPUInstruction.java   |    2 +-
 .../ScalarMatrixArithmeticGPUInstruction.java   |    2 +-
 .../instructions/gpu/context/CSRPointer.java    |  922 ++++++-------
 .../gpu/context/ExecutionConfig.java            |   85 +-
 .../instructions/gpu/context/GPUContext.java    | 1257 +++++++++---------
 .../gpu/context/GPUContextPool.java             |  266 ++--
 .../instructions/gpu/context/GPUObject.java     |  454 ++++---
 .../instructions/gpu/context/JCudaKernels.java  |  141 +-
 .../runtime/matrix/data/LibMatrixCUDA.java      |   42 +-
 .../runtime/matrix/data/LibMatrixDNNHelper.java |    1 +
 .../org/apache/sysml/test/gpu/GPUTests.java     |   47 +-
 .../sysml/test/gpu/NeuralNetworkOpTests.java    |  106 +-
 26 files changed, 1917 insertions(+), 1621 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/dev/code-style/systemml-style-intellij.xml
----------------------------------------------------------------------
diff --git a/dev/code-style/systemml-style-intellij.xml b/dev/code-style/systemml-style-intellij.xml
index 1ad3209..b4a53b4 100644
--- a/dev/code-style/systemml-style-intellij.xml
+++ b/dev/code-style/systemml-style-intellij.xml
@@ -1,28 +1,27 @@
 <!--
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
 -->
-
 <code_scheme name="SystemML Format">
   <option name="CLASS_COUNT_TO_USE_IMPORT_ON_DEMAND" value="999" />
   <option name="NAMES_COUNT_TO_USE_IMPORT_ON_DEMAND" value="999" />
   <option name="IMPORT_LAYOUT_TABLE">
     <value>
-      <package name="" withSubpackages="true" static="false" />
+      <package name="" withSubpackages="true" static="true" />
       <emptyLine />
       <package name="java" withSubpackages="true" static="false" />
       <emptyLine />
@@ -32,7 +31,7 @@
       <emptyLine />
       <package name="com" withSubpackages="true" static="false" />
       <emptyLine />
-      <package name="" withSubpackages="true" static="true" />
+      <package name="" withSubpackages="true" static="false" />
     </value>
   </option>
   <codeStyleSettings language="JAVA">
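
For reference, the import-layout change above moves the static-import group to the top of the file and the catch-all package group to the bottom. Under the updated scheme a source file would order its imports roughly as follows; this is an illustrative sketch only, and not every configured group is visible in the hunk:

    // static imports now come first (package "" with static="true")
    import static jcuda.runtime.JCuda.cudaMemcpy;

    // then the explicitly configured groups, e.g. java.*
    import java.util.List;

    // ...then com.* and any other groups configured between them...
    import com.google.common.base.Preconditions; // hypothetical third-party import

    // the catch-all group (package "" with static="false") now comes last
    import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;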

http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
index 674a011..2895aa4 100644
--- a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
+++ b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
@@ -19,6 +19,8 @@
 
 package org.apache.sysml.api;
 
+import java.util.List;
+
 import org.apache.sysml.api.mlcontext.ScriptExecutor;
 import org.apache.sysml.conf.DMLConfig;
 import org.apache.sysml.hops.codegen.SpoofCompiler;
@@ -79,23 +81,22 @@ public class ScriptExecutorUtils {
 		// GPUs
 		GPUContextPool.PER_PROCESS_MAX_GPUS = dmlconf.getIntValue(DMLConfig.MAX_GPUS_PER_PROCESS);
 		Statistics.startRunTimer();
-		GPUContext gCtx = null;
 		try {
 			// run execute (w/ exception handling to ensure proper shutdown)
 			if (DMLScript.USE_ACCELERATOR && ec != null) {
-				gCtx = GPUContextPool.getFromPool();
-				if (gCtx == null) {
+				List<GPUContext> gCtxs = GPUContextPool.reserveAllGPUContexts();
+				if (gCtxs == null) {
 					throw new DMLRuntimeException(
 							"GPU : Could not create GPUContext, either no GPU or all GPUs currently in use");
 				}
-				gCtx.initializeThread();
-				ec.setGPUContext(gCtx);
+				gCtxs.get(0).initializeThread();
+				ec.setGPUContexts(gCtxs);
 			}
 			rtprog.execute(ec);
 		} finally { // ensure cleanup/shutdown
-			if (DMLScript.USE_ACCELERATOR && ec.getGPUContext() != null) {
-				ec.getGPUContext().clearTemporaryMemory();
-				GPUContextPool.returnToPool(ec.getGPUContext());
+			if (DMLScript.USE_ACCELERATOR && !ec.getGPUContexts().isEmpty()) {
+				ec.getGPUContexts().forEach(gCtx -> gCtx.clearTemporaryMemory());
+				GPUContextPool.freeAllGPUContexts();
 			}
 			if (dmlconf.getBooleanValue(DMLConfig.CODEGEN))
 				SpoofCompiler.cleanupCodeGenerator();
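
The hunk above swaps the single-context GPUContextPool.getFromPool()/returnToPool pair for reserving every available context up front and freeing them all on shutdown. A condensed sketch of the new life cycle, using only the methods shown in the diff:

    List<GPUContext> gCtxs = GPUContextPool.reserveAllGPUContexts();
    if (gCtxs == null)
        throw new DMLRuntimeException(
                "GPU : Could not create GPUContext, either no GPU or all GPUs currently in use");
    try {
        gCtxs.get(0).initializeThread();  // bind the first context to the calling thread
        ec.setGPUContexts(gCtxs);         // the ExecutionContext now owns the whole list
        rtprog.execute(ec);
    } finally {
        ec.getGPUContexts().forEach(gCtx -> gCtx.clearTemporaryMemory());
        GPUContextPool.freeAllGPUContexts();  // release every reserved device
    }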

http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java b/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java
index c9dcc22..95e28e7 100644
--- a/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java
@@ -632,9 +632,7 @@ public class ParForProgramBlock extends ForProgramBlock
 			{
 				case LOCAL: //create parworkers as local threads
 					if (DMLScript.USE_ACCELERATOR) {
-						GPUContextPool.returnToPool(ec.getGPUContext());
-						ec.setGPUContext(null);
-						setDegreeOfParallelism(GPUContextPool.getDeviceCount());
+						setDegreeOfParallelism(ec.getNumGPUContexts());
 					}
 					executeLocalParFor(ec, iterVar, from, to, incr);
 					break;
@@ -757,7 +755,7 @@ public class ParForProgramBlock extends ForProgramBlock
 			{
 				//create parallel workers as (lazy) deep copies
 				//including preparation of update-in-place variables
-				workers[i] = createParallelWorker( _pwIDs[i], queue, ec ); 
+				workers[i] = createParallelWorker( _pwIDs[i], queue, ec, i);
 				threads[i] = new Thread( workers[i] );
 				threads[i].setPriority(Thread.MAX_PRIORITY); 
 			}
@@ -833,11 +831,9 @@ public class ParForProgramBlock extends ForProgramBlock
 			// the main thread to use the GPUContext
 			if (DMLScript.USE_ACCELERATOR) {
 				for (int i = 0; i < _numThreads; i++) {
-					GPUContext gCtx = workers[i].getExecutionContext().getGPUContext();
-					GPUContextPool.returnToPool(gCtx);
+					workers[i].getExecutionContext().setGPUContexts(null);
 				}
-				ec.setGPUContext(GPUContextPool.getFromPool());
-				ec.getGPUContext().initializeThread();
+				ec.getGPUContext(0).initializeThread();
 			}
 		}
 		finally 
@@ -1386,10 +1382,11 @@ public class ParForProgramBlock extends ForProgramBlock
 	 * @param pwID parworker id
 	 * @param queue task queue
 	 * @param ec execution context
+	 * @param index the index of the worker
 	 * @return local parworker
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
-	private LocalParWorker createParallelWorker(long pwID, LocalTaskQueue<Task> queue, ExecutionContext ec) 
+	private LocalParWorker createParallelWorker(long pwID, LocalTaskQueue<Task> queue, ExecutionContext ec, int index)
 		throws DMLRuntimeException
 	{
 		LocalParWorker pw = null; 
@@ -1420,9 +1417,9 @@ public class ParForProgramBlock extends ForProgramBlock
 			ExecutionContext cpEc = ProgramConverter.createDeepCopyExecutionContext(ec);
 
 			// If GPU mode is enabled, gets a GPUContext from the pool of GPUContexts
-			// and sets it in the ExecutionContext
+			// and sets it in the ExecutionContext of the parfor
 			if (DMLScript.USE_ACCELERATOR){
-				cpEc.setGPUContext(GPUContextPool.getFromPool());
+				cpEc.setGPUContexts(Arrays.asList(ec.getGPUContext(index)));
 			}
 			
 			//prepare basic update-in-place variables (vars dropped on result merge)
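
Taken together, the parfor changes distribute the contexts that were reserved at script start instead of returning and re-acquiring them: the local degree of parallelism is set to ec.getNumGPUContexts(), worker i receives a one-element list holding ec.getGPUContext(i), and after the join the workers' lists are cleared while the main thread re-binds context 0. A condensed sketch, with error handling elided (note that Arrays.asList needs java.util.Arrays):

    int par = ec.getNumGPUContexts();  // one local worker per reserved GPU
    for (int i = 0; i < par; i++) {
        ExecutionContext cpEc = ProgramConverter.createDeepCopyExecutionContext(ec);
        cpEc.setGPUContexts(Arrays.asList(ec.getGPUContext(i)));  // worker i -> GPU i
    }
    // ... run the workers and join ...
    // workers[i].getExecutionContext().setGPUContexts(null);
    ec.getGPUContext(0).initializeThread();  // main thread takes device 0 back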

http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
index 735f394..fb179f5 100644
--- a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
+++ b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
@@ -20,7 +20,9 @@
 package org.apache.sysml.runtime.controlprogram.context;
 
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.HashMap;
+import java.util.List;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -66,7 +68,10 @@ public class ExecutionContext {
 	//debugging (optional)
 	protected DebugState _dbState = null;
 
-    protected GPUContext _gpuContext = null;
+	/**
+	 * List of {@link GPUContext}s owned by this {@link ExecutionContext}
+	 */
+    protected List<GPUContext> _gpuContexts = new ArrayList<>();
 
 	protected ExecutionContext()
 	{
@@ -99,13 +104,42 @@ public class ExecutionContext {
 		_variables = vars;
 	}
 
-    public GPUContext getGPUContext() {
-        return _gpuContext;
+	/**
+	 * Get the i-th GPUContext
+	 * @param index index of the GPUContext
+	 * @return a valid GPUContext or null if the indexed GPUContext does not exist.
+	 */
+    public GPUContext getGPUContext(int index) {
+    	try {
+			return _gpuContexts.get(index);
+		} catch (IndexOutOfBoundsException e){
+    		return null;
+		}
     }
 
-    public void setGPUContext(GPUContext _gpuContext) {
-        this._gpuContext = _gpuContext;
-    }
+	/**
+	 * Sets the list of GPUContexts
+	 * @param gpuContexts a collection of GPUContexts
+	 */
+	public void setGPUContexts(List<GPUContext> gpuContexts){
+		_gpuContexts = gpuContexts;
+	}
+
+	/**
+	 * Gets the list of GPUContexts
+	 * @return a list of GPUContexts
+	 */
+	public List<GPUContext> getGPUContexts() {
+		return _gpuContexts;
+	}
+
+	/**
+	 * Gets the number of GPUContexts
+	 * @return number of GPUContexts
+	 */
+	public int getNumGPUContexts() {
+    	return _gpuContexts.size();
+	}
 
 	/* -------------------------------------------------------
 	 * Methods to handle variables and associated data objects
@@ -238,7 +272,7 @@ public class ExecutionContext {
 		throws DMLRuntimeException 
 	{	
 		MatrixObject mo = allocateGPUMatrixObject(varName);
-		boolean allocated = mo.getGPUObject(getGPUContext()).acquireDeviceModifyDense();
+		boolean allocated = mo.getGPUObject(getGPUContext(0)).acquireDeviceModifyDense();
 		mo.getMatrixCharacteristics().setNonZeros(-1);
 		return new Pair<MatrixObject, Boolean>(mo, allocated);
 	}
@@ -257,7 +291,7 @@ public class ExecutionContext {
     {
         MatrixObject mo = allocateGPUMatrixObject(varName);
         mo.getMatrixCharacteristics().setNonZeros(nnz);
-				boolean allocated = mo.getGPUObject(getGPUContext()).acquireDeviceModifySparse();
+				boolean allocated = mo.getGPUObject(getGPUContext(0)).acquireDeviceModifySparse();
         return new Pair<MatrixObject, Boolean>(mo, allocated);
     } 
 
@@ -269,12 +303,12 @@ public class ExecutionContext {
 	 */
 	public MatrixObject allocateGPUMatrixObject(String varName) throws DMLRuntimeException {
 		MatrixObject mo = getMatrixObject(varName);
-		if( mo.getGPUObject(getGPUContext()) == null ) {
-			GPUObject newGObj = getGPUContext().createGPUObject(mo);
+		if( mo.getGPUObject(getGPUContext(0)) == null ) {
+			GPUObject newGObj = getGPUContext(0).createGPUObject(mo);
 			// The lock is added here for an output block
 			// so that any block currently in use is not deallocated by eviction on the GPU
 			newGObj.addLock();
-			mo.setGPUObject(getGPUContext(), newGObj);
+			mo.setGPUObject(getGPUContext(0), newGObj);
 		}
 		return mo;
 	}
@@ -282,7 +316,7 @@ public class ExecutionContext {
 	public Pair<MatrixObject, Boolean> getMatrixInputForGPUInstruction(String varName)
 			throws DMLRuntimeException 
 	{
-		GPUContext gCtx = getGPUContext();
+		GPUContext gCtx = getGPUContext(0);
 		boolean copied = false;
 		MatrixObject mo = getMatrixObject(varName);
 		if(mo == null) {
@@ -322,7 +356,7 @@ public class ExecutionContext {
 		throws DMLRuntimeException 
 	{
 		MatrixObject mo = getMatrixObject(varName);
-		mo.getGPUObject(getGPUContext()).releaseInput();
+		mo.getGPUObject(getGPUContext(0)).releaseInput();
 	}
 	
 	/**
@@ -374,10 +408,10 @@ public class ExecutionContext {
 	
 	public void releaseMatrixOutputForGPUInstruction(String varName) throws DMLRuntimeException {
 		MatrixObject mo = getMatrixObject(varName);
-		if(mo.getGPUObject(getGPUContext()) == null || !mo.getGPUObject(getGPUContext()).isAllocated()) {
+		if(mo.getGPUObject(getGPUContext(0)) == null || !mo.getGPUObject(getGPUContext(0)).isAllocated()) {
 			throw new DMLRuntimeException("No output is allocated on GPU");
 		}
-		mo.getGPUObject(getGPUContext()).releaseOutput();
+		mo.getGPUObject(getGPUContext(0)).releaseOutput();
 	}
 
 	public void setMatrixOutput(String varName, MatrixBlock outputData) 
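
The single getGPUContext()/setGPUContext pair becomes a small list API; note that getGPUContext(int) deliberately returns null, rather than throwing, for an out-of-range index. A minimal usage sketch, assuming gCtxs was obtained from the pool as above:

    ec.setGPUContexts(gCtxs);                // install the reserved contexts
    int n = ec.getNumGPUContexts();          // == gCtxs.size()
    GPUContext first = ec.getGPUContext(0);  // single-GPU call sites use index 0
    GPUContext none  = ec.getGPUContext(n);  // out of range -> null, no exception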

http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java
index c4684ec..636b1f8 100644
--- a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java
+++ b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java
@@ -83,7 +83,7 @@ public class LocalParWorker extends ParWorker implements Runnable
 
 		// Initialize this GPUContext to this thread
 		if (DMLScript.USE_ACCELERATOR)
-			_ec.getGPUContext().initializeThread();
+			_ec.getGPUContext(0).initializeThread();
 		
 		//setup compiler config for worker thread
 		ConfigurationManager.setLocalConfig(_cconf);

http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java
index 9cc6bb2..3cd2633 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java
@@ -168,9 +168,9 @@ public class FunctionCallCPInstruction extends CPInstruction
 		// and copy the function arguments into the created table. 
 		ExecutionContext fn_ec = ExecutionContextFactory.createContext(false, ec.getProgram());
 		if (DMLScript.USE_ACCELERATOR) {
-			fn_ec.setGPUContext(ec.getGPUContext());
-			ec.setGPUContext(null);
-			fn_ec.getGPUContext().initializeThread();
+			fn_ec.setGPUContexts(ec.getGPUContexts());
+			ec.setGPUContexts(null);
+			fn_ec.getGPUContext(0).initializeThread();
 		}
 		fn_ec.setVariables(functionVariables);
 		// execute the function block
@@ -206,9 +206,9 @@ public class FunctionCallCPInstruction extends CPInstruction
 		ec.unpinVariables(_boundInputParamNames, pinStatus);
 
 		if (DMLScript.USE_ACCELERATOR) {
-			ec.setGPUContext(fn_ec.getGPUContext());
-			fn_ec.setGPUContext(null);
-			ec.getGPUContext().initializeThread();
+			ec.setGPUContexts(fn_ec.getGPUContexts());
+			fn_ec.setGPUContexts(null);
+			ec.getGPUContext(0).initializeThread();
 		}
 		
 		// add the updated binding for each return variable to the variables in original symbol table
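
The two hunks above implement a strict hand-off: at any point exactly one ExecutionContext owns the GPUContext list, and whichever context receives it re-binds device 0 to the current thread. A helper capturing that pattern might look as follows (the method transferGPUContexts is illustrative, not part of this commit):

    private static void transferGPUContexts(ExecutionContext from, ExecutionContext to)
            throws DMLRuntimeException {
        to.setGPUContexts(from.getGPUContexts());
        from.setGPUContexts(null);               // exactly one owner at a time
        to.getGPUContext(0).initializeThread();  // re-bind device 0 to this thread
    }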

http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
index 2531c17..0c0a4b2 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
@@ -96,7 +96,7 @@ public class AggregateBinaryGPUInstruction extends GPUInstruction
 		int clen = (int) (_isRightTransposed ? m2.getNumRows() : m2.getNumColumns());
 
 		ec.setMetaData(_output.getName(), rlen, clen);
-		LibMatrixCUDA.matmult(ec, ec.getGPUContext(), getExtendedOpcode(), m1, m2, _output.getName(), _isLeftTransposed, _isRightTransposed);
+		LibMatrixCUDA.matmult(ec, ec.getGPUContext(0), getExtendedOpcode(), m1, m2, _output.getName(), _isLeftTransposed, _isRightTransposed);
         
 		//release inputs/outputs
 		ec.releaseMatrixInputForGPUInstruction(_input1.getName());
@@ -113,6 +113,6 @@ public class AggregateBinaryGPUInstruction extends GPUInstruction
 	@SuppressWarnings("unused")
 	private boolean isSparse(ExecutionContext ec, String var) throws DMLRuntimeException {
 		MatrixObject mo = ec.getMatrixObject(var);
-		return LibMatrixCUDA.isInSparseFormat(ec.getGPUContext(), mo);
+		return LibMatrixCUDA.isInSparseFormat(ec.getGPUContext(0), mo);
 	}
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java
index 8bdd397..5d01820 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java
@@ -93,7 +93,7 @@ public class AggregateUnaryGPUInstruction extends GPUInstruction {
       ec.setMetaData(_output.getName(), rlen, 1);
     }
 
-    LibMatrixCUDA.unaryAggregate(ec, ec.getGPUContext(), getExtendedOpcode(), in1, _output.getName(), (AggregateUnaryOperator)_optr);
+    LibMatrixCUDA.unaryAggregate(ec, ec.getGPUContext(0), getExtendedOpcode(), in1, _output.getName(), (AggregateUnaryOperator)_optr);
 
     //release inputs/outputs
     ec.releaseMatrixInputForGPUInstruction(_input1.getName());

http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
index 9d4cd1f..e5ea097 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
@@ -186,9 +186,9 @@ public class ConvolutionGPUInstruction extends GPUInstruction
 		ec.setMetaData(_output.getName(), input.getNumRows(), input.getNumColumns());
 		MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
 		if(instOpcode.equalsIgnoreCase("bias_add"))
-			LibMatrixCUDA.biasAdd(ec.getGPUContext(), getExtendedOpcode(), input, bias, out);
+			LibMatrixCUDA.biasAdd(ec.getGPUContext(0), getExtendedOpcode(), input, bias, out);
 		else if(instOpcode.equalsIgnoreCase("bias_multiply"))
-			LibMatrixCUDA.biasMultiply(ec.getGPUContext(), getExtendedOpcode(), input, bias, out);
+			LibMatrixCUDA.biasMultiply(ec.getGPUContext(0), getExtendedOpcode(), input, bias, out);
 		// release inputs/outputs
 		ec.releaseMatrixInputForGPUInstruction(_input1.getName());
 		ec.releaseMatrixInputForGPUInstruction(_input2.getName());
@@ -202,7 +202,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction
 		
 		MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
 		ec.setMetaData(_output.getName(), input.getNumRows(), input.getNumColumns());
-		LibMatrixCUDA.reluBackward(ec.getGPUContext(), getExtendedOpcode(), input, dout, out);
+		LibMatrixCUDA.reluBackward(ec.getGPUContext(0), getExtendedOpcode(), input, dout, out);
 		// release inputs/outputs
 		ec.releaseMatrixInputForGPUInstruction(_input1.getName());
 		ec.releaseMatrixInputForGPUInstruction(_input2.getName());
@@ -253,7 +253,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction
 			
 			ec.setMetaData(_output.getName(), N, K * P * Q);
 			MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
-			LibMatrixCUDA.conv2d(ec.getGPUContext(), getExtendedOpcode(), image, filter, out, N, C, H, W,
+			LibMatrixCUDA.conv2d(ec.getGPUContext(0), getExtendedOpcode(), image, filter, out, N, C, H, W,
 					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
 		}
 		else if (instOpcode.equalsIgnoreCase("conv2d_bias_add")) {
@@ -268,7 +268,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction
 			
 			ec.setMetaData(_output.getName(), N, K * P * Q);
 			MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
-			LibMatrixCUDA.conv2dBiasAdd(ec.getGPUContext(), getExtendedOpcode(), image, bias, filter, out, N, C, H, W,
+			LibMatrixCUDA.conv2dBiasAdd(ec.getGPUContext(0), getExtendedOpcode(), image, bias, filter, out, N, C, H, W,
 						K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
 		}
 		else if (instOpcode.equalsIgnoreCase("conv2d_backward_filter")) {
@@ -283,7 +283,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction
 			
 			ec.setMetaData(_output.getName(), K, C * R * S);
 			MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
-			LibMatrixCUDA.conv2dBackwardFilter(ec.getGPUContext(), getExtendedOpcode(), image, dout, out, N, C, H, W,
+			LibMatrixCUDA.conv2dBackwardFilter(ec.getGPUContext(0), getExtendedOpcode(), image, dout, out, N, C, H, W,
 					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
 			// TODO: For now always copy the device data to host
 			// ec.gpuCtx.copyDeviceToHost(outputBlock);
@@ -300,7 +300,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction
 			
 			ec.setMetaData(_output.getName(), N, C * H * W);
 			MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
-			LibMatrixCUDA.conv2dBackwardData(ec.getGPUContext(), getExtendedOpcode(), filter, dout, out, N, C, H, W,
+			LibMatrixCUDA.conv2dBackwardData(ec.getGPUContext(0), getExtendedOpcode(), filter, dout, out, N, C, H, W,
 					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
 		}
 		else if (instOpcode.equalsIgnoreCase("maxpooling")) {
@@ -313,7 +313,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction
 			ec.setMetaData(_output.getName(), N, C * P * Q);
 			MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
 			if(instOpcode.equalsIgnoreCase("maxpooling"))
-				LibMatrixCUDA.maxpooling(ec.getGPUContext(), getExtendedOpcode(), image, out, N, C, H, W,
+				LibMatrixCUDA.maxpooling(ec.getGPUContext(0), getExtendedOpcode(), image, out, N, C, H, W,
 					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
 		}
 		else if (instOpcode.equalsIgnoreCase("maxpooling_backward")) {
@@ -328,7 +328,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction
 			
 			ec.setMetaData(_output.getName(), N, C * H * W);
 			MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
-			LibMatrixCUDA.maxpoolingBackward(ec.getGPUContext(), getExtendedOpcode(), image, dout, out, N, C, H, W,
+			LibMatrixCUDA.maxpoolingBackward(ec.getGPUContext(0), getExtendedOpcode(), image, dout, out, N, C, H, W,
 					K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
 		}
 		else {

http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java
index c147a6f..55656f0 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java
@@ -107,7 +107,7 @@ public class MMTSJGPUInstruction extends GPUInstruction
 
                 //execute operations 
                 ec.setMetaData(_output.getName(), rlen, clen);
-                LibMatrixCUDA.matmultTSMM(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName(), isLeftTransposed);
+                LibMatrixCUDA.matmultTSMM(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName(), isLeftTransposed);
                 
                 ec.releaseMatrixInputForGPUInstruction(_input.getName());
                 ec.releaseMatrixOutputForGPUInstruction(_output.getName());

http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
index 7b50285..beeacee 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
@@ -44,35 +44,35 @@ public class MatrixBuiltinGPUInstruction extends BuiltinUnaryGPUInstruction {
 
 		switch(opcode) {
 			case "sel+":
-				LibMatrixCUDA.relu(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+				LibMatrixCUDA.relu(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
 			case "exp":
-				LibMatrixCUDA.exp(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+				LibMatrixCUDA.exp(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
 			case "sqrt":
-				LibMatrixCUDA.sqrt(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+				LibMatrixCUDA.sqrt(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
 			case "log":
-				LibMatrixCUDA.log(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+				LibMatrixCUDA.log(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
 			case "round":
-				LibMatrixCUDA.round(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+				LibMatrixCUDA.round(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
 			case "floor":
-				LibMatrixCUDA.floor(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+				LibMatrixCUDA.floor(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
 			case "ceil":
-				LibMatrixCUDA.ceil(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+				LibMatrixCUDA.ceil(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
 			case "abs":
-				LibMatrixCUDA.abs(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+				LibMatrixCUDA.abs(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
 			case "sin":
-				LibMatrixCUDA.sin(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+				LibMatrixCUDA.sin(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
 			case "cos":
-				LibMatrixCUDA.cos(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+				LibMatrixCUDA.cos(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
 			case "tan":
-				LibMatrixCUDA.tan(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+				LibMatrixCUDA.tan(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
 			case "asin":
-				LibMatrixCUDA.asin(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+				LibMatrixCUDA.asin(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
 			case "acos":
-				LibMatrixCUDA.acos(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+				LibMatrixCUDA.acos(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
 			case "atan":
-				LibMatrixCUDA.atan(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+				LibMatrixCUDA.atan(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
 			case "sign":
-				LibMatrixCUDA.sign(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+				LibMatrixCUDA.sign(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
 			default:
 				throw new DMLRuntimeException("Unsupported GPU operator:" + opcode);
 		}

http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java
index 9573a1a..a03f9b1 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java
@@ -71,7 +71,7 @@ public class MatrixMatrixArithmeticGPUInstruction extends ArithmeticBinaryGPUIns
 		ec.setMetaData(_output.getName(), (int)rlen, (int)clen);
 		
 		BinaryOperator bop = (BinaryOperator) _optr;
-		LibMatrixCUDA.matrixScalarArithmetic(ec, ec.getGPUContext(), getExtendedOpcode(), in1, in2, _output.getName(), isLeftTransposed, isRightTransposed, bop);
+		LibMatrixCUDA.matrixScalarArithmetic(ec, ec.getGPUContext(0), getExtendedOpcode(), in1, in2, _output.getName(), isLeftTransposed, isRightTransposed, bop);
 		
 		ec.releaseMatrixInputForGPUInstruction(_input1.getName());
 		ec.releaseMatrixInputForGPUInstruction(_input2.getName());

http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java
index 58905d6..e430e29 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java
@@ -103,7 +103,7 @@ public class MatrixMatrixAxpyGPUInstruction extends ArithmeticBinaryGPUInstructi
 					" and input2:" + rlen2 + " X " + clen2);
 		}
 		
-		LibMatrixCUDA.axpy(ec, ec.getGPUContext(), getExtendedOpcode(), in1, in2, _output.getName(), multiplier*scalar.getDoubleValue());
+		LibMatrixCUDA.axpy(ec, ec.getGPUContext(0), getExtendedOpcode(), in1, in2, _output.getName(), multiplier*scalar.getDoubleValue());
 		
 		ec.releaseMatrixInputForGPUInstruction(_input1.getName());
 		ec.releaseMatrixInputForGPUInstruction(_input2.getName());

http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixBuiltinGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixBuiltinGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixBuiltinGPUInstruction.java
index 8936735..e60a3d7 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixBuiltinGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixBuiltinGPUInstruction.java
@@ -46,7 +46,7 @@ public class MatrixMatrixBuiltinGPUInstruction extends BuiltinBinaryGPUInstructi
 
     if(opcode.equals("solve")) {
       ec.setMetaData(output.getName(), mat1.getNumColumns(), 1);
-      LibMatrixCUDA.solve(ec, ec.getGPUContext(), getExtendedOpcode(), mat1, mat2, output.getName());
+      LibMatrixCUDA.solve(ec, ec.getGPUContext(0), getExtendedOpcode(), mat1, mat2, output.getName());
 
     } else {
       throw new DMLRuntimeException("Unsupported GPU operator:" + opcode);

http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java
index 53d56a3..bc63d12 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java
@@ -81,7 +81,7 @@ public class ReorgGPUInstruction extends GPUInstruction
 		
 		//execute operation
 		ec.setMetaData(_output.getName(), rlen, clen);
-		LibMatrixCUDA.transpose(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName());
+		LibMatrixCUDA.transpose(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName());
 		
 		//release inputs/outputs
 		ec.releaseMatrixInputForGPUInstruction(_input.getName());

http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java
index 64cb6c4..ea4665a 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java
@@ -60,7 +60,7 @@ public class ScalarMatrixArithmeticGPUInstruction extends ArithmeticBinaryGPUIns
 		ScalarOperator sc_op = (ScalarOperator) _optr;
 		sc_op.setConstant(constant.getDoubleValue());
 		
-		LibMatrixCUDA.matrixScalarArithmetic(ec, ec.getGPUContext(), getExtendedOpcode(), in1, _output.getName(), isTransposed, sc_op);
+		LibMatrixCUDA.matrixScalarArithmetic(ec, ec.getGPUContext(0), getExtendedOpcode(), in1, _output.getName(), isTransposed, sc_op);
 		
 		ec.releaseMatrixInputForGPUInstruction(mat.getName());
 		ec.releaseMatrixOutputForGPUInstruction(_output.getName());

http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
index 0ff9d14..b15dd69 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
@@ -52,453 +52,477 @@ import jcuda.jcusparse.cusparsePointerMode;
  */
 public class CSRPointer {
 
-  private static final Log LOG = LogFactory.getLog(CSRPointer.class.getName());
-
-  private static final double ULTRA_SPARSITY_TURN_POINT = 0.0004;
-
-  /** {@link GPUContext} instance to track the GPU to do work on */
-  private final GPUContext gpuContext;
-
-  public static cusparseMatDescr matrixDescriptor;
-
-  /** Number of non zeroes */
-  public long nnz;
-
-  /** double array of non zero values */
-  public Pointer val;
-
-  /** integer array of start of all rows and end of last row + 1 */
-  public Pointer rowPtr;
-
-  /** integer array of nnz values' column indices */
-  public Pointer colInd;
-
-  /** descriptor of matrix, only CUSPARSE_MATRIX_TYPE_GENERAL supported */
-  public cusparseMatDescr descr;
-
-
-  public CSRPointer clone(int rows) throws DMLRuntimeException {
-    CSRPointer me = this;
-    CSRPointer that = new CSRPointer(me.getGPUContext());
-
-    that.allocateMatDescrPointer();
-    long totalSize = estimateSize(me.nnz, rows);
-    that.gpuContext.ensureFreeSpace(totalSize);
-
-    that.nnz = me.nnz;
-    that.val = allocate(that.nnz * Sizeof.DOUBLE);
-    that.rowPtr = allocate(rows * Sizeof.DOUBLE);
-    that.colInd = allocate(that.nnz * Sizeof.DOUBLE);
-
-    cudaMemcpy(that.val, me.val, that.nnz * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
-    cudaMemcpy(that.rowPtr, me.rowPtr, rows * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
-    cudaMemcpy(that.colInd, me.colInd, that.nnz * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
-
-    return that;
-  }
-
-  /**
-   * Default constructor to help with Factory method {@link #allocateEmpty(GPUContext, long, long)}
-   * @param gCtx   a valid {@link GPUContext}
-   */
-  private CSRPointer(GPUContext gCtx) {
-    gpuContext = gCtx;
-    val = new Pointer();
-    rowPtr = new Pointer();
-    colInd = new Pointer();
-    allocateMatDescrPointer();
-  }
-
-//  private Pointer allocate(String instName, long size) throws DMLRuntimeException {
-//    return getGPUContext().allocate(instName, size);
-//  }
-
-  private Pointer allocate(long size) throws DMLRuntimeException {
-    return getGPUContext().allocate(size);
-  }
-
-//  private void cudaFreeHelper(Pointer toFree) throws DMLRuntimeException {
-//    getGPUContext().cudaFreeHelper(toFree);
-//  }
-
-  private void cudaFreeHelper(Pointer toFree, boolean eager) throws DMLRuntimeException {
-    getGPUContext().cudaFreeHelper(toFree, eager);
-  }
-
-//  private void cudaFreeHelper(String instName, Pointer toFree, boolean eager) throws DMLRuntimeException {
-//    getGPUContext().cudaFreeHelper(instName, toFree, eager);
-//  }
-
-  private static long getDoubleSizeOf(long numElems) {
-    return numElems * ((long)jcuda.Sizeof.DOUBLE);
-  }
-
-  private static long getIntSizeOf(long numElems) {
-    return numElems * ((long)jcuda.Sizeof.INT);
-  }
-
-  private GPUContext getGPUContext() {
-    return gpuContext;
-  }
-
-  public static int toIntExact(long l) throws DMLRuntimeException {
-    if (l < Integer.MIN_VALUE || l > Integer.MAX_VALUE) {
-      throw new DMLRuntimeException("Cannot be cast to int:" + l);
-    }
-    return (int) l;
-  }
-
-  /**
-   * @return Singleton default matrix descriptor object
-   * (set with CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_INDEX_BASE_ZERO)
-   */
-  public static cusparseMatDescr getDefaultCuSparseMatrixDescriptor() {
-    if (matrixDescriptor == null) {
-      // Code from JCuda Samples - http://www.jcuda.org/samples/JCusparseSample.java
-      matrixDescriptor = new cusparseMatDescr();
-      cusparseCreateMatDescr(matrixDescriptor);
-      cusparseSetMatType(matrixDescriptor, CUSPARSE_MATRIX_TYPE_GENERAL);
-      cusparseSetMatIndexBase(matrixDescriptor, CUSPARSE_INDEX_BASE_ZERO);
-    }
-    return matrixDescriptor;
-  }
-
-  /**
-   * Estimate the size of a CSR matrix in GPU memory
-   * Size of pointers is not needed and is not added in
-   *
-   * @param nnz2 number of non zeroes
-   * @param rows number of rows
-   * @return size estimate
-   */
-  public static long estimateSize(long nnz2, long rows) {
-    long sizeofValArray = getDoubleSizeOf(nnz2);
-    long sizeofRowPtrArray = getIntSizeOf(rows + 1);
-    long sizeofColIndArray = getIntSizeOf(nnz2);
-    long sizeofDescr = getIntSizeOf(4);
-    // From the CUSPARSE documentation, the cusparseMatDescr in native code is represented as:
-    // typedef struct {
-    // 	cusparseMatrixType_t MatrixType;
-    //	cusparseFillMode_t FillMode;
-    //	cusparseDiagType_t DiagType;
-    // 	cusparseIndexBase_t IndexBase;
-    // } cusparseMatDescr_t;
-    long tot = sizeofValArray + sizeofRowPtrArray + sizeofColIndArray + sizeofDescr;
-    return tot;
-  }
-
-  /**
-   * Static method to copy a CSR sparse matrix from Host to Device
-   *
-   * @param dest   [input] destination location (on GPU)
-   * @param rows   number of rows
-   * @param nnz    number of non-zeroes
-   * @param rowPtr integer array of row pointers
-   * @param colInd integer array of column indices
-   * @param values double array of non zero values
-   */
-  public static void copyToDevice(CSRPointer dest, int rows, long nnz, int[] rowPtr, int[] colInd, double[] values) {
-    CSRPointer r = dest;
-    long t0 = 0;
-    if (DMLScript.STATISTICS) t0 = System.nanoTime();
-    r.nnz = nnz;
-    cudaMemcpy(r.rowPtr, Pointer.to(rowPtr), getIntSizeOf(rows + 1), cudaMemcpyHostToDevice);
-    cudaMemcpy(r.colInd, Pointer.to(colInd), getIntSizeOf(nnz), cudaMemcpyHostToDevice);
-    cudaMemcpy(r.val, Pointer.to(values), getDoubleSizeOf(nnz), cudaMemcpyHostToDevice);
-    if (DMLScript.STATISTICS) GPUStatistics.cudaToDevTime.addAndGet(System.nanoTime() - t0);
-    if (DMLScript.STATISTICS) GPUStatistics.cudaToDevCount.addAndGet(3);
-  }
-
-  /**
-   * Static method to copy a CSR sparse matrix from Device to host
-   *
-   * @param src    [input] source location (on GPU)
-   * @param rows   [input] number of rows
-   * @param nnz    [input] number of non-zeroes
-   * @param rowPtr [output] pre-allocated integer array of row pointers of size (rows+1)
-   * @param colInd [output] pre-allocated integer array of column indices of size nnz
-   * @param values [output] pre-allocated double array of values of size nnz
-   */
-  public static void copyToHost(CSRPointer src, int rows, long nnz, int[] rowPtr, int[] colInd, double[] values) {
-    CSRPointer r = src;
-    long t0 = 0;
-    if (DMLScript.STATISTICS) t0 = System.nanoTime();
-    cudaMemcpy(Pointer.to(rowPtr), r.rowPtr, getIntSizeOf(rows + 1), cudaMemcpyDeviceToHost);
-    cudaMemcpy(Pointer.to(colInd), r.colInd, getIntSizeOf(nnz), cudaMemcpyDeviceToHost);
-    cudaMemcpy(Pointer.to(values), r.val, getDoubleSizeOf(nnz), cudaMemcpyDeviceToHost);
-    if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - t0);
-    if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevCount.addAndGet(3);
-  }
-
-  /**
-   * Estimates the number of non zero elements from the results of a sparse cusparseDgeam operation
-   * C = a op(A) + b op(B)
-   * @param gCtx   a valid {@link GPUContext}
-   * @param handle a valid {@link cusparseHandle}
-   * @param A      Sparse Matrix A on GPU
-   * @param B      Sparse Matrix B on GPU
-   * @param m      Rows in A
-   * @param n      Columns in Bs
-   * @return CSR (compressed sparse row) pointer
-   * @throws DMLRuntimeException if DMLRuntimeException occurs
-   */
-  public static CSRPointer allocateForDgeam(GPUContext gCtx, cusparseHandle handle, CSRPointer A, CSRPointer B, int m, int n)
-          throws DMLRuntimeException {
-    if (A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE) {
-      throw new DMLRuntimeException("Number of non zeroes is larger than supported by cuSparse");
-    }
-    CSRPointer C = new CSRPointer(gCtx);
-    step1AllocateRowPointers(gCtx, handle, C, m);
-    step2GatherNNZGeam(gCtx, handle, A, B, C, m, n);
-    step3AllocateValNInd(gCtx, handle, C);
-    return C;
-  }
-
-  /**
-   * Estimates the number of non-zero elements from the result of a sparse matrix multiplication C = A * B
-   * and returns the {@link CSRPointer} to C with the appropriate GPU memory.
-   *
-   * @param gCtx ?
-   * @param handle a valid {@link cusparseHandle}
-   * @param A      Sparse Matrix A on GPU
-   * @param transA 'T' if A is to be transposed, 'N' otherwise
-   * @param B      Sparse Matrix B on GPU
-   * @param transB 'T' if B is to be transposed, 'N' otherwise
-   * @param m      Rows in A
-   * @param n      Columns in B
-   * @param k      Columns in A / Rows in B
-   * @return a {@link CSRPointer} instance that encapsulates the CSR matrix on GPU
-   * @throws DMLRuntimeException if DMLRuntimeException occurs
-   */
-  public static CSRPointer allocateForMatrixMultiply(GPUContext gCtx, cusparseHandle handle, CSRPointer A, int transA, CSRPointer B, int transB, int m, int n, int k)
-          throws DMLRuntimeException {
-    // Following the code example at http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrgemm and at
-    // https://github.com/jcuda/jcuda-matrix-utils/blob/master/JCudaMatrixUtils/src/test/java/org/jcuda/matrix/samples/JCusparseSampleDgemm.java
-    CSRPointer C = new CSRPointer(gCtx);
-    step1AllocateRowPointers(gCtx, handle, C, m);
-    step2GatherNNZGemm(gCtx, handle, A, transA, B, transB, C, m, n, k);
-    step3AllocateValNInd(gCtx, handle, C);
-    return C;
-  }
-
-  /**
-   * Check for ultra sparsity
-   *
-   * @param rows number of rows
-   * @param cols number of columns
-   * @return true if ultra sparse
-   */
-  public boolean isUltraSparse(int rows, int cols) {
-    double sp = ((double) nnz / rows / cols);
-    return sp < ULTRA_SPARSITY_TURN_POINT;
-  }
-
-// ==============================================================================================
-
-// The following methods estimate the memory needed for sparse matrices that are
-// results of operations on other sparse matrices using the cuSparse Library.
-// The operation is C = op(A) binaryOperation op(B), C is the output and A & B are the inputs
-// op = whether to transpose or not
-// binaryOperation = For cuSparse, +, - are *(matmul) are supported
-
-// From CuSparse Manual,
-// Since A and B have different sparsity patterns, cuSPARSE adopts a two-step approach
-// to complete sparse matrix C. In the first step, the user allocates csrRowPtrC of m+1
-// elements and uses function cusparseXcsrgeamNnz() to determine csrRowPtrC
-// and the total number of nonzero elements. In the second step, the user gathers nnzC
-//(number of nonzero elements of matrix C) from either (nnzC=*nnzTotalDevHostPtr)
-// or (nnzC=csrRowPtrC(m)-csrRowPtrC(0)) and allocates csrValC, csrColIndC of
-// nnzC elements respectively, then finally calls function cusparse[S|D|C|Z]csrgeam()
-// to complete matrix C.
-
-  /**
-   * Initializes {@link #descr} to CUSPARSE_MATRIX_TYPE_GENERAL,
-   * the default that works for DGEMM.
-   */
-  private void allocateMatDescrPointer() {
-    this.descr = getDefaultCuSparseMatrixDescriptor();
-  }
-
-  /**
-   * Factory method to allocate an empty CSR Sparse matrix on the GPU
-   *
-   * @param gCtx ?
-   * @param nnz2 number of non-zeroes
-   * @param rows number of rows
-   * @return a {@link CSRPointer} instance that encapsulates the CSR matrix on GPU
-   * @throws DMLRuntimeException if DMLRuntimeException occurs
-   */
-  public static CSRPointer allocateEmpty(GPUContext gCtx, long nnz2, long rows) throws DMLRuntimeException {
-    LOG.trace("GPU : allocateEmpty from CSRPointer with nnz=" + nnz2 + " and rows=" + rows + ", GPUContext=" + gCtx);
-    assert nnz2 > -1 : "Incorrect usage of internal API, number of non zeroes is less than 0 when trying to allocate sparse data on GPU";
-    CSRPointer r = new CSRPointer(gCtx);
-    r.nnz = nnz2;
-    if (nnz2 == 0) {
-      // The convention for an empty sparse matrix is to just have an instance of the CSRPointer object
-      // with no memory allocated on the GPU.
-      return r;
-    }
-    gCtx.ensureFreeSpace(getDoubleSizeOf(nnz2) + getIntSizeOf(rows + 1) + getIntSizeOf(nnz2));
-    // increment the cudaCount by 1 for the allocation of all 3 arrays
-    r.val = gCtx.allocate(null, getDoubleSizeOf(nnz2));
-    r.rowPtr = gCtx.allocate(null, getIntSizeOf(rows + 1));
-    r.colInd = gCtx.allocate(null, getIntSizeOf(nnz2));
-    return r;
-  }
-
-  /**
-   * Allocate row pointers of m+1 elements
-   * @param gCtx   a valid {@link GPUContext}
-   * @param handle a valid {@link cusparseHandle}
-   * @param C      Output matrix
-   * @param rowsC  number of rows in C
-   * @throws DMLRuntimeException ?
-   */
-  private static void step1AllocateRowPointers(GPUContext gCtx, cusparseHandle handle, CSRPointer C, int rowsC) throws DMLRuntimeException {
-    LOG.trace("GPU : step1AllocateRowPointers" + ", GPUContext=" + gCtx);
-    cusparseSetPointerMode(handle, cusparsePointerMode.CUSPARSE_POINTER_MODE_HOST);
-    //cudaDeviceSynchronize;
-    // Do not increment the cudaCount of allocations on GPU
-    C.rowPtr = gCtx.allocate(getIntSizeOf((long) rowsC + 1));
-  }
-
-  /**
-   * Determine total number of nonzero element for the cusparseDgeam  operation.
-   * This is done from either (nnzC=*nnzTotalDevHostPtr) or (nnzC=csrRowPtrC(m)-csrRowPtrC(0))
-   * @param gCtx   a valid {@link GPUContext}
-   * @param handle a valid {@link cusparseHandle}
-   * @param A      Sparse Matrix A on GPU
-   * @param B      Sparse Matrix B on GPU
-   * @param C      Output Sparse Matrix C on GPU
-   * @param m      Rows in C
-   * @param n      Columns in C
-   * @throws DMLRuntimeException ?
-   */
-  private static void step2GatherNNZGeam(GPUContext gCtx, cusparseHandle handle, CSRPointer A, CSRPointer B, CSRPointer C, int m, int n) throws DMLRuntimeException {
-    LOG.trace("GPU : step2GatherNNZGeam for DGEAM" + ", GPUContext=" + gCtx);
-    int[] CnnzArray = {-1};
-    cusparseXcsrgeamNnz(handle, m, n,
-            A.descr, toIntExact(A.nnz), A.rowPtr, A.colInd,
-            B.descr, toIntExact(B.nnz), B.rowPtr, B.colInd,
-            C.descr, C.rowPtr, Pointer.to(CnnzArray));
-    //cudaDeviceSynchronize;
-    if (CnnzArray[0] != -1) {
-      C.nnz = CnnzArray[0];
-    } else {
-      int baseArray[] = {0};
-      cudaMemcpy(Pointer.to(CnnzArray), C.rowPtr.withByteOffset(getIntSizeOf(m)), getIntSizeOf(1), cudaMemcpyDeviceToHost);
-      cudaMemcpy(Pointer.to(baseArray), C.rowPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost);
-      C.nnz = CnnzArray[0] - baseArray[0];
-    }
-  }
-
-// ==============================================================================================
-
-  /**
-   * Determine total number of nonzero element for the cusparseDgemm operation.
-   * @param gCtx   a valid {@link GPUContext}
-   * @param handle a valid {@link cusparseHandle}
-   * @param A      Sparse Matrix A on GPU
-   * @param transA op - whether A is transposed
-   * @param B      Sparse Matrix B on GPU
-   * @param transB op - whether B is transposed
-   * @param C      Output Sparse Matrix C on GPU
-   * @param m      Number of rows of sparse matrix op ( A ) and C
-   * @param n      Number of columns of sparse matrix op ( B ) and C
-   * @param k      Number of columns/rows of sparse matrix op ( A ) / op ( B )
-   * @throws DMLRuntimeException ?
-   */
-  private static void step2GatherNNZGemm(GPUContext gCtx, cusparseHandle handle, CSRPointer A, int transA, CSRPointer B, int transB, CSRPointer C, int m, int n, int k) throws DMLRuntimeException {
-    LOG.trace("GPU : step2GatherNNZGemm for DGEMM" + ", GPUContext=" + gCtx);
-    int[] CnnzArray = {-1};
-    if (A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE) {
-      throw new DMLRuntimeException("Number of non zeroes is larger than supported by cuSparse");
-    }
-    cusparseXcsrgemmNnz(handle, transA, transB, m, n, k,
-            A.descr, toIntExact(A.nnz), A.rowPtr, A.colInd,
-            B.descr, toIntExact(B.nnz), B.rowPtr, B.colInd,
-            C.descr, C.rowPtr, Pointer.to(CnnzArray));
-    //cudaDeviceSynchronize;
-    if (CnnzArray[0] != -1) {
-      C.nnz = CnnzArray[0];
-    } else {
-      int baseArray[] = {0};
-      cudaMemcpy(Pointer.to(CnnzArray), C.rowPtr.withByteOffset(getIntSizeOf(m)), getIntSizeOf(1), cudaMemcpyDeviceToHost);
-      cudaMemcpy(Pointer.to(baseArray), C.rowPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost);
-      C.nnz = CnnzArray[0] - baseArray[0];
-    }
-  }
-
-  /**
-   * Allocate val and index pointers.
-   * @param gCtx   a valid {@link GPUContext}
-   * @param handle a valid {@link cusparseHandle}
-   * @param C      Output sparse matrix on GPU
-   * @throws DMLRuntimeException ?
-   */
-  private static void step3AllocateValNInd(GPUContext gCtx, cusparseHandle handle, CSRPointer C) throws DMLRuntimeException {
-    LOG.trace("GPU : step3AllocateValNInd" + ", GPUContext=" + gCtx);
-    // Increment cudaCount by one when all three arrays of CSR sparse array are allocated
-    C.val = gCtx.allocate(null, getDoubleSizeOf(C.nnz));
-    C.colInd = gCtx.allocate(null, getIntSizeOf(C.nnz));
-  }
-
-  /**
-   * Copies this CSR matrix on the GPU to a dense column-major matrix
-   * on the GPU. This is a temporary matrix for operations such as
-   * cusparseDcsrmv.
-   * Since the allocated matrix is temporary, bookkeeping is not updated.
-   * The caller is responsible for calling "free" on the returned Pointer object
-   *
-   * @param cusparseHandle a valid {@link cusparseHandle}
-   * @param cublasHandle   a valid {@link cublasHandle}
-   * @param rows           number of rows in this CSR matrix
-   * @param cols           number of columns in this CSR matrix
-   * @throws DMLRuntimeException if DMLRuntimeException occurs
-   * @return A {@link Pointer} to the allocated dense matrix (in column-major format)
-   */
-  public Pointer toColumnMajorDenseMatrix(cusparseHandle cusparseHandle, cublasHandle cublasHandle, int rows, int cols) throws DMLRuntimeException {
-    LOG.trace("GPU : sparse -> column major dense (inside CSRPointer) on " + this + ", GPUContext=" + getGPUContext());
-    long size = ((long) rows) * getDoubleSizeOf((long) cols);
-    Pointer A = allocate(size);
-    // If this sparse block is empty, the allocated dense matrix, initialized to zeroes, will be returned.
-    if (val != null && rowPtr != null && colInd != null && nnz > 0) {
-      // Note: cusparseDcsr2dense method cannot handle empty blocks
-      cusparseDcsr2dense(cusparseHandle, rows, cols, descr, val, rowPtr, colInd, A, rows);
-      //cudaDeviceSynchronize;
-    } else {
-      LOG.debug("in CSRPointer, the values array, row pointers array or column indices array was null");
-    }
-    return A;
-  }
-
-  /**
-   * Calls cudaFree lazily on the allocated {@link Pointer} instances
-   * 
-   * @throws DMLRuntimeException ?
-   */
-  public void deallocate() throws DMLRuntimeException {
-    deallocate(false);
-  }
-
-  /**
-   * Calls cudaFree lazily or eagerly on the allocated {@link Pointer} instances
-   *
-   * @param eager whether to do eager or lazy cudaFrees
-   * @throws DMLRuntimeException ?
-   */
-  public void deallocate(boolean eager) throws DMLRuntimeException {
-    if (nnz > 0) {
-      cudaFreeHelper(val, eager);
-      cudaFreeHelper(rowPtr, eager);
-      cudaFreeHelper(colInd, eager);
-    }
-  }
-
-  @Override
-  public String toString() {
-    return "CSRPointer{" +
-            "nnz=" + nnz +
-            '}';
-  }
+	private static final Log LOG = LogFactory.getLog(CSRPointer.class.getName());
+
+	private static final double ULTRA_SPARSITY_TURN_POINT = 0.0004;
+	public static cusparseMatDescr matrixDescriptor;
+	/**
+	 * {@link GPUContext} instance that tracks the GPU on which to do work
+	 */
+	private final GPUContext gpuContext;
+	/**
+	 * Number of non zeroes
+	 */
+	public long nnz;
+
+	/**
+	 * double array of non zero values
+	 */
+	public Pointer val;
+
+	/**
+	 * integer array of row offsets: the start of each row, with a final entry marking the end of the last row + 1
+	 */
+	public Pointer rowPtr;
+
+	/**
+	 * integer array of nnz values' column indices
+	 */
+	public Pointer colInd;
+
+	/**
+	 * descriptor of matrix, only CUSPARSE_MATRIX_TYPE_GENERAL supported
+	 */
+	public cusparseMatDescr descr;
+
+	/**
+	 * Default constructor to help with Factory method {@link #allocateEmpty(GPUContext, long, long)}
+	 *
+	 * @param gCtx a valid {@link GPUContext}
+	 */
+	private CSRPointer(GPUContext gCtx) {
+		gpuContext = gCtx;
+		val = new Pointer();
+		rowPtr = new Pointer();
+		colInd = new Pointer();
+		allocateMatDescrPointer();
+	}
+
+	private static long getDoubleSizeOf(long numElems) {
+		return numElems * ((long) jcuda.Sizeof.DOUBLE);
+	}
+
+	//  private Pointer allocate(String instName, long size) throws DMLRuntimeException {
+	//    return getGPUContext().allocate(instName, size);
+	//  }
+
+	private static long getIntSizeOf(long numElems) {
+		return numElems * ((long) jcuda.Sizeof.INT);
+	}
+
+	//  private void cudaFreeHelper(Pointer toFree) throws DMLRuntimeException {
+	//    getGPUContext().cudaFreeHelper(toFree);
+	//  }
+
+	public static int toIntExact(long l) throws DMLRuntimeException {
+		if (l < Integer.MIN_VALUE || l > Integer.MAX_VALUE) {
+			throw new DMLRuntimeException("Cannot be cast to int:" + l);
+		}
+		return (int) l;
+	}
+
+	//  private void cudaFreeHelper(String instName, Pointer toFree, boolean eager) throws DMLRuntimeException {
+	//    getGPUContext().cudaFreeHelper(instName, toFree, eager);
+	//  }
+
+	/**
+	 * @return Singleton default matrix descriptor object
+	 * (set with CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_INDEX_BASE_ZERO)
+	 */
+	public static cusparseMatDescr getDefaultCuSparseMatrixDescriptor() {
+		if (matrixDescriptor == null) {
+			// Code from JCuda Samples - http://www.jcuda.org/samples/JCusparseSample.java
+			matrixDescriptor = new cusparseMatDescr();
+			cusparseCreateMatDescr(matrixDescriptor);
+			cusparseSetMatType(matrixDescriptor, CUSPARSE_MATRIX_TYPE_GENERAL);
+			cusparseSetMatIndexBase(matrixDescriptor, CUSPARSE_INDEX_BASE_ZERO);
+		}
+		return matrixDescriptor;
+	}
+
+	/**
+	 * Estimate the size of a CSR matrix in GPU memory
+	 * The size of the Pointer objects themselves is not needed and is not added in
+	 *
+	 * @param nnz2 number of non zeroes
+	 * @param rows number of rows
+	 * @return size estimate
+	 */
+	public static long estimateSize(long nnz2, long rows) {
+		long sizeofValArray = getDoubleSizeOf(nnz2);
+		long sizeofRowPtrArray = getIntSizeOf(rows + 1);
+		long sizeofColIndArray = getIntSizeOf(nnz2);
+		long sizeofDescr = getIntSizeOf(4);
+		// From the CUSPARSE documentation, the cusparseMatDescr in native code is represented as:
+		// typedef struct {
+		// 	cusparseMatrixType_t MatrixType;
+		//	cusparseFillMode_t FillMode;
+		//	cusparseDiagType_t DiagType;
+		// 	cusparseIndexBase_t IndexBase;
+		// } cusparseMatDescr_t;
+		long tot = sizeofValArray + sizeofRowPtrArray + sizeofColIndArray + sizeofDescr;
+		return tot;
+	}
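+
+	// Worked example (illustrative numbers, not from the codebase): for a
+	// 1000 x 1000 matrix with 5,000 non-zeroes,
+	// estimateSize(5000, 1000) = 8*5000 + 4*1001 + 4*5000 + 4*4 = 64,020 bytes.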
+
+	/**
+	 * Static method to copy a CSR sparse matrix from Host to Device
+	 *
+	 * @param dest   [input] destination location (on GPU)
+	 * @param rows   number of rows
+	 * @param nnz    number of non-zeroes
+	 * @param rowPtr integer array of row pointers
+	 * @param colInd integer array of column indices
+	 * @param values double array of non zero values
+	 */
+	public static void copyToDevice(CSRPointer dest, int rows, long nnz, int[] rowPtr, int[] colInd, double[] values) {
+		CSRPointer r = dest;
+		long t0 = 0;
+		if (DMLScript.STATISTICS)
+			t0 = System.nanoTime();
+		r.nnz = nnz;
+		cudaMemcpy(r.rowPtr, Pointer.to(rowPtr), getIntSizeOf(rows + 1), cudaMemcpyHostToDevice);
+		cudaMemcpy(r.colInd, Pointer.to(colInd), getIntSizeOf(nnz), cudaMemcpyHostToDevice);
+		cudaMemcpy(r.val, Pointer.to(values), getDoubleSizeOf(nnz), cudaMemcpyHostToDevice);
+		if (DMLScript.STATISTICS)
+			GPUStatistics.cudaToDevTime.addAndGet(System.nanoTime() - t0);
+		if (DMLScript.STATISTICS)
+			GPUStatistics.cudaToDevCount.addAndGet(3);
+	}
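+
+	// Hypothetical usage sketch (host arrays assumed to already hold a valid CSR
+	// layout; names are illustrative):
+	//   CSRPointer dev = CSRPointer.allocateEmpty(gCtx, nnz, rows);
+	//   CSRPointer.copyToDevice(dev, rows, nnz, hostRowPtr, hostColInd, hostVals);
+	// copyToHost below is the mirror image for device-to-host transfers.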
+
+	/**
+	 * Static method to copy a CSR sparse matrix from Device to host
+	 *
+	 * @param src    [input] source location (on GPU)
+	 * @param rows   [input] number of rows
+	 * @param nnz    [input] number of non-zeroes
+	 * @param rowPtr [output] pre-allocated integer array of row pointers of size (rows+1)
+	 * @param colInd [output] pre-allocated integer array of column indices of size nnz
+	 * @param values [output] pre-allocated double array of values of size nnz
+	 */
+	public static void copyToHost(CSRPointer src, int rows, long nnz, int[] rowPtr, int[] colInd, double[] values) {
+		CSRPointer r = src;
+		long t0 = 0;
+		if (DMLScript.STATISTICS)
+			t0 = System.nanoTime();
+		cudaMemcpy(Pointer.to(rowPtr), r.rowPtr, getIntSizeOf(rows + 1), cudaMemcpyDeviceToHost);
+		cudaMemcpy(Pointer.to(colInd), r.colInd, getIntSizeOf(nnz), cudaMemcpyDeviceToHost);
+		cudaMemcpy(Pointer.to(values), r.val, getDoubleSizeOf(nnz), cudaMemcpyDeviceToHost);
+		if (DMLScript.STATISTICS)
+			GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - t0);
+		if (DMLScript.STATISTICS)
+			GPUStatistics.cudaFromDevCount.addAndGet(3);
+	}
+
+	/**
+	 * Estimates the number of non zero elements from the results of a sparse cusparseDgeam operation
+	 * C = a op(A) + b op(B)
+	 *
+	 * @param gCtx   a valid {@link GPUContext}
+	 * @param handle a valid {@link cusparseHandle}
+	 * @param A      Sparse Matrix A on GPU
+	 * @param B      Sparse Matrix B on GPU
+	 * @param m      Rows in A
+	 * @param n      Columns in B
+	 * @return CSR (compressed sparse row) pointer
+	 * @throws DMLRuntimeException if DMLRuntimeException occurs
+	 */
+	public static CSRPointer allocateForDgeam(GPUContext gCtx, cusparseHandle handle, CSRPointer A, CSRPointer B, int m,
+			int n) throws DMLRuntimeException {
+		if (A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE) {
+			throw new DMLRuntimeException("Number of non zeroes is larger than supported by cuSparse");
+		}
+		CSRPointer C = new CSRPointer(gCtx);
+		step1AllocateRowPointers(gCtx, handle, C, m);
+		step2GatherNNZGeam(gCtx, handle, A, B, C, m, n);
+		step3AllocateValNInd(gCtx, handle, C);
+		return C;
+	}
+
+	/**
+	 * Estimates the number of non-zero elements from the result of a sparse matrix multiplication C = A * B
+	 * and returns the {@link CSRPointer} to C with the appropriate GPU memory.
+	 *
+	 * @param gCtx   ?
+	 * @param handle a valid {@link cusparseHandle}
+	 * @param A      Sparse Matrix A on GPU
+	 * @param transA op - whether A is transposed (a cusparseOperation constant)
+	 * @param B      Sparse Matrix B on GPU
+	 * @param transB op - whether B is transposed (a cusparseOperation constant)
+	 * @param m      Rows in A
+	 * @param n      Columns in B
+	 * @param k      Columns in A / Rows in B
+	 * @return a {@link CSRPointer} instance that encapsulates the CSR matrix on GPU
+	 * @throws DMLRuntimeException if DMLRuntimeException occurs
+	 */
+	public static CSRPointer allocateForMatrixMultiply(GPUContext gCtx, cusparseHandle handle, CSRPointer A, int transA,
+			CSRPointer B, int transB, int m, int n, int k) throws DMLRuntimeException {
+		// Following the code example at http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrgemm and at
+		// https://github.com/jcuda/jcuda-matrix-utils/blob/master/JCudaMatrixUtils/src/test/java/org/jcuda/matrix/samples/JCusparseSampleDgemm.java
+		CSRPointer C = new CSRPointer(gCtx);
+		step1AllocateRowPointers(gCtx, handle, C, m);
+		step2GatherNNZGemm(gCtx, handle, A, transA, B, transB, C, m, n, k);
+		step3AllocateValNInd(gCtx, handle, C);
+		return C;
+	}
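+
+	// Sketch of the follow-up call (not made here; per the cuSPARSE csrgemm docs,
+	// see http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrgemm): once C
+	// is allocated, the multiply itself would be completed with
+	//   cusparseDcsrgemm(handle, transA, transB, m, n, k,
+	//       A.descr, toIntExact(A.nnz), A.val, A.rowPtr, A.colInd,
+	//       B.descr, toIntExact(B.nnz), B.val, B.rowPtr, B.colInd,
+	//       C.descr, C.val, C.rowPtr, C.colInd);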
+
+	/**
+	 * Factory method to allocate an empty CSR Sparse matrix on the GPU
+	 *
+	 * @param gCtx ?
+	 * @param nnz2 number of non-zeroes
+	 * @param rows number of rows
+	 * @return a {@link CSRPointer} instance that encapsulates the CSR matrix on GPU
+	 * @throws DMLRuntimeException if DMLRuntimeException occurs
+	 */
+	public static CSRPointer allocateEmpty(GPUContext gCtx, long nnz2, long rows) throws DMLRuntimeException {
+		LOG.trace(
+				"GPU : allocateEmpty from CSRPointer with nnz=" + nnz2 + " and rows=" + rows + ", GPUContext=" + gCtx);
+		assert nnz2
+				> -1 : "Incorrect usage of internal API, number of non zeroes is less than 0 when trying to allocate sparse data on GPU";
+		CSRPointer r = new CSRPointer(gCtx);
+		r.nnz = nnz2;
+		if (nnz2 == 0) {
+			// The convention for an empty sparse matrix is to just have an instance of the CSRPointer object
+			// with no memory allocated on the GPU.
+			return r;
+		}
+		gCtx.ensureFreeSpace(getDoubleSizeOf(nnz2) + getIntSizeOf(rows + 1) + getIntSizeOf(nnz2));
+		// increment the cudaCount by 1 for the allocation of all 3 arrays
+		r.val = gCtx.allocate(null, getDoubleSizeOf(nnz2));
+		r.rowPtr = gCtx.allocate(null, getIntSizeOf(rows + 1));
+		r.colInd = gCtx.allocate(null, getIntSizeOf(nnz2));
+		return r;
+	}
+
+	/**
+	 * Allocate row pointers of m+1 elements
+	 *
+	 * @param gCtx   a valid {@link GPUContext}
+	 * @param handle a valid {@link cusparseHandle}
+	 * @param C      Output matrix
+	 * @param rowsC  number of rows in C
+	 * @throws DMLRuntimeException ?
+	 */
+	private static void step1AllocateRowPointers(GPUContext gCtx, cusparseHandle handle, CSRPointer C, int rowsC)
+			throws DMLRuntimeException {
+		LOG.trace("GPU : step1AllocateRowPointers" + ", GPUContext=" + gCtx);
+		cusparseSetPointerMode(handle, cusparsePointerMode.CUSPARSE_POINTER_MODE_HOST);
+		//cudaDeviceSynchronize;
+		// Do not increment the cudaCount of allocations on GPU
+		C.rowPtr = gCtx.allocate(getIntSizeOf((long) rowsC + 1));
+	}
+
+	/**
+	 * Determine the total number of nonzero elements for the cusparseDgeam operation.
+	 * This is done from either (nnzC=*nnzTotalDevHostPtr) or (nnzC=csrRowPtrC(m)-csrRowPtrC(0))
+	 *
+	 * @param gCtx   a valid {@link GPUContext}
+	 * @param handle a valid {@link cusparseHandle}
+	 * @param A      Sparse Matrix A on GPU
+	 * @param B      Sparse Matrix B on GPU
+	 * @param C      Output Sparse Matrix C on GPU
+	 * @param m      Rows in C
+	 * @param n      Columns in C
+	 * @throws DMLRuntimeException ?
+	 */
+	private static void step2GatherNNZGeam(GPUContext gCtx, cusparseHandle handle, CSRPointer A, CSRPointer B,
+			CSRPointer C, int m, int n) throws DMLRuntimeException {
+		LOG.trace("GPU : step2GatherNNZGeam for DGEAM" + ", GPUContext=" + gCtx);
+		int[] CnnzArray = { -1 };
+		cusparseXcsrgeamNnz(handle, m, n, A.descr, toIntExact(A.nnz), A.rowPtr, A.colInd, B.descr, toIntExact(B.nnz),
+				B.rowPtr, B.colInd, C.descr, C.rowPtr, Pointer.to(CnnzArray));
+		//cudaDeviceSynchronize;
+		if (CnnzArray[0] != -1) {
+			C.nnz = CnnzArray[0];
+		} else {
+			int baseArray[] = { 0 };
+			cudaMemcpy(Pointer.to(CnnzArray), C.rowPtr.withByteOffset(getIntSizeOf(m)), getIntSizeOf(1),
+					cudaMemcpyDeviceToHost);
+			cudaMemcpy(Pointer.to(baseArray), C.rowPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost);
+			C.nnz = CnnzArray[0] - baseArray[0];
+		}
+	}
+
+	/**
+	 * Determine the total number of nonzero elements for the cusparseDgemm operation.
+	 *
+	 * @param gCtx   a valid {@link GPUContext}
+	 * @param handle a valid {@link cusparseHandle}
+	 * @param A      Sparse Matrix A on GPU
+	 * @param transA op - whether A is transposed
+	 * @param B      Sparse Matrix B on GPU
+	 * @param transB op - whether B is transposed
+	 * @param C      Output Sparse Matrix C on GPU
+	 * @param m      Number of rows of sparse matrix op ( A ) and C
+	 * @param n      Number of columns of sparse matrix op ( B ) and C
+	 * @param k      Number of columns/rows of sparse matrix op ( A ) / op ( B )
+	 * @throws DMLRuntimeException ?
+	 */
+	private static void step2GatherNNZGemm(GPUContext gCtx, cusparseHandle handle, CSRPointer A, int transA,
+			CSRPointer B, int transB, CSRPointer C, int m, int n, int k) throws DMLRuntimeException {
+		LOG.trace("GPU : step2GatherNNZGemm for DGEMM" + ", GPUContext=" + gCtx);
+		int[] CnnzArray = { -1 };
+		if (A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE) {
+			throw new DMLRuntimeException("Number of non zeroes is larger than supported by cuSparse");
+		}
+		cusparseXcsrgemmNnz(handle, transA, transB, m, n, k, A.descr, toIntExact(A.nnz), A.rowPtr, A.colInd, B.descr,
+				toIntExact(B.nnz), B.rowPtr, B.colInd, C.descr, C.rowPtr, Pointer.to(CnnzArray));
+		//cudaDeviceSynchronize;
+		if (CnnzArray[0] != -1) {
+			C.nnz = CnnzArray[0];
+		} else {
+			int baseArray[] = { 0 };
+			cudaMemcpy(Pointer.to(CnnzArray), C.rowPtr.withByteOffset(getIntSizeOf(m)), getIntSizeOf(1),
+					cudaMemcpyDeviceToHost);
+			cudaMemcpy(Pointer.to(baseArray), C.rowPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost);
+			C.nnz = CnnzArray[0] - baseArray[0];
+		}
+	}
+
+	/**
+	 * Allocate val and index pointers.
+	 *
+	 * @param gCtx   a valid {@link GPUContext}
+	 * @param handle a valid {@link cusparseHandle}
+	 * @param C      Output sparse matrix on GPU
+	 * @throws DMLRuntimeException ?
+	 */
+	private static void step3AllocateValNInd(GPUContext gCtx, cusparseHandle handle, CSRPointer C)
+			throws DMLRuntimeException {
+		LOG.trace("GPU : step3AllocateValNInd" + ", GPUContext=" + gCtx);
+		// Increment cudaCount by one when all three arrays of CSR sparse array are allocated
+		C.val = gCtx.allocate(null, getDoubleSizeOf(C.nnz));
+		C.colInd = gCtx.allocate(null, getIntSizeOf(C.nnz));
+	}
+
+	// ==============================================================================================
+
+	// The following methods estimate the memory needed for sparse matrices that are
+	// results of operations on other sparse matrices using the cuSparse Library.
+	// The operation is C = op(A) binaryOperation op(B), C is the output and A & B are the inputs
+	// op = whether to transpose or not
+	// binaryOperation = for cuSparse, the operations +, - and * (matmul) are supported
+
+	// From CuSparse Manual,
+	// Since A and B have different sparsity patterns, cuSPARSE adopts a two-step approach
+	// to complete sparse matrix C. In the first step, the user allocates csrRowPtrC of m+1
+	// elements and uses function cusparseXcsrgeamNnz() to determine csrRowPtrC
+	// and the total number of nonzero elements. In the second step, the user gathers nnzC
+	// (number of nonzero elements of matrix C) from either (nnzC=*nnzTotalDevHostPtr)
+	// or (nnzC=csrRowPtrC(m)-csrRowPtrC(0)) and allocates csrValC, csrColIndC of
+	// nnzC elements respectively, then finally calls function cusparse[S|D|C|Z]csrgeam()
+	// to complete matrix C.
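+	//
+	// For the geam case, the completing call would look like (illustrative sketch;
+	// alpha and beta are host Pointers to the scalars a and b):
+	//   cusparseDcsrgeam(handle, m, n,
+	//       alpha, A.descr, toIntExact(A.nnz), A.val, A.rowPtr, A.colInd,
+	//       beta, B.descr, toIntExact(B.nnz), B.val, B.rowPtr, B.colInd,
+	//       C.descr, C.val, C.rowPtr, C.colInd);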
+
+	public CSRPointer clone(int rows) throws DMLRuntimeException {
+		CSRPointer me = this;
+		CSRPointer that = new CSRPointer(me.getGPUContext());
+
+		that.allocateMatDescrPointer();
+		long totalSize = estimateSize(me.nnz, rows);
+		that.gpuContext.ensureFreeSpace(totalSize);
+
+		that.nnz = me.nnz;
+		// val holds nnz doubles; rowPtr holds rows+1 ints; colInd holds nnz ints.
+		// Sizing the index arrays with Sizeof.DOUBLE would over-allocate them and
+		// read past the end of the source buffers during the copies below.
+		that.val = allocate(getDoubleSizeOf(that.nnz));
+		that.rowPtr = allocate(getIntSizeOf(rows + 1));
+		that.colInd = allocate(getIntSizeOf(that.nnz));
+
+		cudaMemcpy(that.val, me.val, getDoubleSizeOf(that.nnz), cudaMemcpyDeviceToDevice);
+		cudaMemcpy(that.rowPtr, me.rowPtr, getIntSizeOf(rows + 1), cudaMemcpyDeviceToDevice);
+		cudaMemcpy(that.colInd, me.colInd, getIntSizeOf(that.nnz), cudaMemcpyDeviceToDevice);
+
+		return that;
+	}
+
+	private Pointer allocate(long size) throws DMLRuntimeException {
+		return getGPUContext().allocate(size);
+	}
+
+	private void cudaFreeHelper(Pointer toFree, boolean eager) throws DMLRuntimeException {
+		getGPUContext().cudaFreeHelper(toFree, eager);
+	}
+
+	private GPUContext getGPUContext() {
+		return gpuContext;
+	}
+
+	// ==============================================================================================
+
+	/**
+	 * Check for ultra sparsity
+	 *
+	 * @param rows number of rows
+	 * @param cols number of columns
+	 * @return true if ultra sparse
+	 */
+	public boolean isUltraSparse(int rows, int cols) {
+		double sp = ((double) nnz / rows / cols);
+		return sp < ULTRA_SPARSITY_TURN_POINT;
+	}
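+
+	// For example (illustrative numbers): a 10000 x 10000 matrix is treated as
+	// ultra-sparse only when nnz < 0.0004 * 10000 * 10000 = 40,000.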
+
+	/**
+	 * Initializes {@link #descr} to CUSPARSE_MATRIX_TYPE_GENERAL,
+	 * the default that works for DGEMM.
+	 */
+	private void allocateMatDescrPointer() {
+		this.descr = getDefaultCuSparseMatrixDescriptor();
+	}
+
+	/**
+	 * Copies this CSR matrix on the GPU to a dense column-major matrix
+	 * on the GPU. This is a temporary matrix for operations such as
+	 * cusparseDcsrmv.
+	 * Since the allocated matrix is temporary, bookkeeping is not updated.
+	 * The caller is responsible for calling "free" on the returned Pointer object
+	 *
+	 * @param cusparseHandle a valid {@link cusparseHandle}
+	 * @param cublasHandle   a valid {@link cublasHandle}
+	 * @param rows           number of rows in this CSR matrix
+	 * @param cols           number of columns in this CSR matrix
+	 * @return A {@link Pointer} to the allocated dense matrix (in column-major format)
+	 * @throws DMLRuntimeException if DMLRuntimeException occurs
+	 */
+	public Pointer toColumnMajorDenseMatrix(cusparseHandle cusparseHandle, cublasHandle cublasHandle, int rows,
+			int cols) throws DMLRuntimeException {
+		LOG.trace("GPU : sparse -> column major dense (inside CSRPointer) on " + this + ", GPUContext="
+				+ getGPUContext());
+		long size = ((long) rows) * getDoubleSizeOf((long) cols);
+		Pointer A = allocate(size);
+		// If this sparse block is empty, the allocated dense matrix, initialized to zeroes, will be returned.
+		if (val != null && rowPtr != null && colInd != null && nnz > 0) {
+			// Note: cusparseDcsr2dense method cannot handle empty blocks
+			cusparseDcsr2dense(cusparseHandle, rows, cols, descr, val, rowPtr, colInd, A, rows);
+			//cudaDeviceSynchronize;
+		} else {
+			LOG.debug("in CSRPointer, the values array, row pointers array or column indices array was null");
+		}
+		return A;
+	}
+
+	/**
+	 * Calls cudaFree lazily on the allocated {@link Pointer} instances
+	 *
+	 * @throws DMLRuntimeException ?
+	 */
+	public void deallocate() throws DMLRuntimeException {
+		deallocate(false);
+	}
+
+	/**
+	 * Calls cudaFree lazily or eagerly on the allocated {@link Pointer} instances
+	 *
+	 * @param eager whether to do eager or lazy cudaFrees
+	 * @throws DMLRuntimeException ?
+	 */
+	public void deallocate(boolean eager) throws DMLRuntimeException {
+		if (nnz > 0) {
+			cudaFreeHelper(val, eager);
+			cudaFreeHelper(rowPtr, eager);
+			cudaFreeHelper(colInd, eager);
+		}
+	}
+
+	@Override
+	public String toString() {
+		return "CSRPointer{" + "nnz=" + nnz + '}';
+	}
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
index ce5c5ff..ef000c2 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
@@ -31,25 +31,24 @@ import jcuda.driver.CUstream;
  * Java Wrapper to specify CUDA execution configuration for launching custom kernels
  */
 public class ExecutionConfig {
-	public int gridDimX; 
+	public int gridDimX;
 	public int gridDimY = 1;
 	public int gridDimZ = 1;
-	public int blockDimX; 
+	public int blockDimX;
 	public int blockDimY = 1;
 	public int blockDimZ = 1;
 	public int sharedMemBytes = 0;
 	public CUstream stream = null;
 
-
-	
 	private static HashMap<Integer, Integer> maxBlockDimForDevice = new HashMap<Integer, Integer>();
 
 	/**
 	 * Convenience constructor for setting the number of blocks, number of threads and the
 	 * shared memory size
-	 * @param gridDimX					Number of blocks (for CUDA Kernel)
-	 * @param blockDimX					Number of threads per block (for CUDA Kernel)
-	 * @param sharedMemBytes		Amount of Shared memory (for CUDA Kernel)
+	 *
+	 * @param gridDimX       Number of blocks (for CUDA Kernel)
+	 * @param blockDimX      Number of threads per block (for CUDA Kernel)
+	 * @param sharedMemBytes Amount of Shared memory (for CUDA Kernel)
 	 */
 	public ExecutionConfig(int gridDimX, int blockDimX, int sharedMemBytes) {
 		this.gridDimX = gridDimX;
@@ -58,13 +57,13 @@ public class ExecutionConfig {
 	}
 
 	/**
-	 * Use this for simple vector operations and use following in the kernel 
-	 * <code> 
-	 * int index = blockIdx.x * blockDim.x + threadIdx.x 
+	 * Use this for simple vector operations and use the following in the kernel
+	 * <code>
+	 * int index = blockIdx.x * blockDim.x + threadIdx.x
 	 * </code>
-	 * 
+	 * <p>
 	 * This tries to schedule as few grid blocks as possible.
-	 * 
+	 *
 	 * @param numCells number of cells
 	 * @return execution configuration
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
@@ -72,19 +71,19 @@ public class ExecutionConfig {
 	public static ExecutionConfig getConfigForSimpleVectorOperations(int numCells) throws DMLRuntimeException {
 		int deviceNumber = 0;
 		int blockDimX = getMaxBlockDim(deviceNumber);
-		int gridDimX = (int)Math.ceil((double)numCells / blockDimX);
+		int gridDimX = (int) Math.ceil((double) numCells / blockDimX);
 		return new ExecutionConfig(gridDimX, blockDimX);
 	}
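+
+	// Illustrative numbers (assuming a device max block dimension of 1024): for
+	// numCells = 2500, blockDimX = 1024 and gridDimX = ceil(2500/1024) = 3, i.e.
+	// three blocks of 1024 threads cover all 2500 cells.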
-	
+
 	/**
-	 * Use this for simple matrix operations and use following in the kernel 
-	 * <code> 
+	 * Use this for simple matrix operations and use the following in the kernel
+	 * <code>
 	 * int ix = blockIdx.x * blockDim.x + threadIdx.x;
 	 * int iy = blockIdx.y * blockDim.y + threadIdx.y;
 	 * </code>
-	 * 
+	 * <p>
 	 * This tries to schedule as few grid blocks as possible.
-	 * 
+	 *
 	 * @param rlen number of rows
 	 * @param clen number of columns
 	 * @return execution configuration
@@ -94,45 +93,45 @@ public class ExecutionConfig {
 		int deviceNumber = 0;
 		int maxBlockDim = getMaxBlockDim(deviceNumber);
 		int blockDimX = (int) Math.min(maxBlockDim, rlen);
-		int gridDimX = (int)Math.ceil((double)rlen / blockDimX);
-		int blockDimY = (int)Math.min(Math.floor(((double)maxBlockDim)/blockDimX), clen);
-		int gridDimY = (int)Math.ceil((double)clen / blockDimY);
+		int gridDimX = (int) Math.ceil((double) rlen / blockDimX);
+		int blockDimY = (int) Math.min(Math.floor(((double) maxBlockDim) / blockDimX), clen);
+		int gridDimY = (int) Math.ceil((double) clen / blockDimY);
 		return new ExecutionConfig(gridDimX, gridDimY, blockDimX, blockDimY);
 	}
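+
+	// Illustrative numbers (assuming a device max block dimension of 1024): for
+	// rlen = 50 and clen = 2000, blockDimX = 50, gridDimX = 1,
+	// blockDimY = min(floor(1024/50), 2000) = 20 and gridDimY = ceil(2000/20) = 100,
+	// giving 1 x 100 blocks of 50 x 20 threads each.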
-	
+
 	public ExecutionConfig(int gridDimX, int blockDimX) {
 		this.gridDimX = gridDimX;
 		this.blockDimX = blockDimX;
 	}
-	
+
 	public ExecutionConfig(int gridDimX, int gridDimY, int blockDimX, int blockDimY) {
 		this.gridDimX = gridDimX;
 		this.gridDimY = gridDimY;
 		this.blockDimX = blockDimX;
 		this.blockDimY = blockDimY;
 	}
-	
-	
+
 	/**
-     * Get the CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X of the given device
-	 * 
+	 * Get the CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X of the given device
+	 *
 	 * @param deviceNumber device number of the given device
 	 * @return The maximum block dimension, in x-direction
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
-    private static int getMaxBlockDim(int deviceNumber) throws DMLRuntimeException {
-//    	return 32;
-    	// TODO: Use JCudaDriver.cuOccupancyMaxPotentialBlockSize to chose the block size that maximizes occupancy
-    	Integer ret = maxBlockDimForDevice.get(deviceNumber);
-    	if(ret == null) {
-    		CUdevice device = new CUdevice();
-            JCudaKernels.checkResult(jcuda.driver.JCudaDriver.cuDeviceGet(device, deviceNumber));
-            int maxBlockDimX[] =  {0};
-            jcuda.driver.JCudaDriver.cuDeviceGetAttribute(maxBlockDimX, CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, device);
-            maxBlockDimForDevice.put(deviceNumber, maxBlockDimX[0]);
-            return maxBlockDimX[0];
-    	}
-        return ret;
-    }
-    
-    }
+	private static int getMaxBlockDim(int deviceNumber) throws DMLRuntimeException {
+		//    	return 32;
+		// TODO: Use JCudaDriver.cuOccupancyMaxPotentialBlockSize to choose the block size that maximizes occupancy
+		Integer ret = maxBlockDimForDevice.get(deviceNumber);
+		if (ret == null) {
+			CUdevice device = new CUdevice();
+			JCudaKernels.checkResult(jcuda.driver.JCudaDriver.cuDeviceGet(device, deviceNumber));
+			int maxBlockDimX[] = { 0 };
+			jcuda.driver.JCudaDriver
+					.cuDeviceGetAttribute(maxBlockDimX, CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, device);
+			maxBlockDimForDevice.put(deviceNumber, maxBlockDimX[0]);
+			return maxBlockDimX[0];
+		}
+		return ret;
+	}
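+
+	// Hypothetical end-to-end sketch (the kernel name "fill" and the signature of
+	// this project's JCudaKernels.launchKernel(String, ExecutionConfig, Object...)
+	// are assumptions here):
+	//   ExecutionConfig cfg = ExecutionConfig.getConfigForSimpleVectorOperations(n);
+	//   kernels.launchKernel("fill", cfg, outputPointer, value, n);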
+
+}