You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by ni...@apache.org on 2017/08/24 21:41:33 UTC
[3/5] systemml git commit: [SYSTEMML-1793] Support matrix range indexing on GPU

http://git-wip-us.apache.org/repos/asf/systemml/blob/628ffad1/src/main/java/org/apache/sysml/hops/IndexingOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/IndexingOp.java b/src/main/java/org/apache/sysml/hops/IndexingOp.java
index 5f2ce34..6a3ddf4 100644
--- a/src/main/java/org/apache/sysml/hops/IndexingOp.java
+++ b/src/main/java/org/apache/sysml/hops/IndexingOp.java
@@ -19,6 +19,7 @@
 
 package org.apache.sysml.hops;
 
+import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.hops.AggBinaryOp.SparkAggType;
 import org.apache.sysml.hops.rewrite.HopRewriteUtils;
 import org.apache.sysml.lops.Aggregate;
@@ -97,7 +98,13 @@ public class IndexingOp extends Hop
 	
 	@Override
 	public boolean isGPUEnabled() {
-		return false;
+		if(!DMLScript.USE_ACCELERATOR) {
+			return false;
+		}
+		else {
+			// only matrix indexing is supported on GPU
+			return (getDataType() == DataType.MATRIX);
+		}
 	}
 
 	@Override
@@ -172,7 +179,7 @@ public class IndexingOp extends Hop
 					setLineNumbers(reindex);
 					setLops(reindex);
 				}
-				else //CP
+				else //CP or GPU
 				{
 					Lop dummy = Data.createLiteralLop(ValueType.INT, Integer.toString(-1));
 					RangeBasedReIndex reindex = new RangeBasedReIndex(

http://git-wip-us.apache.org/repos/asf/systemml/blob/628ffad1/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
index 36f57b4..8abfc69 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
@@ -27,6 +27,7 @@ import org.apache.sysml.runtime.instructions.gpu.BuiltinBinaryGPUInstruction;
 import org.apache.sysml.runtime.instructions.gpu.BuiltinUnaryGPUInstruction;
 import org.apache.sysml.runtime.instructions.gpu.ConvolutionGPUInstruction;
 import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
+import org.apache.sysml.runtime.instructions.gpu.MatrixIndexingGPUInstruction;
 import org.apache.sysml.runtime.instructions.gpu.MatrixMatrixAxpyGPUInstruction;
 import org.apache.sysml.runtime.instructions.gpu.GPUInstruction.GPUINSTRUCTION_TYPE;
 import org.apache.sysml.runtime.instructions.gpu.MMTSJGPUInstruction;
@@ -128,6 +129,10 @@ public class GPUInstructionParser  extends InstructionParser
 		String2GPUInstructionType.put( ">"    , GPUINSTRUCTION_TYPE.RelationalBinary);
 		String2GPUInstructionType.put( "<="   , GPUINSTRUCTION_TYPE.RelationalBinary);
 		String2GPUInstructionType.put( ">="   , GPUINSTRUCTION_TYPE.RelationalBinary);
+		
+		// Indexing 
+		// right indexing: output = X[1:3, 4:5]
+		String2GPUInstructionType.put( "rangeReIndex", GPUINSTRUCTION_TYPE.MatrixIndexing); 
 	}
 	
 	public static GPUInstruction parseSingleInstruction (String str ) 
@@ -187,6 +192,9 @@ public class GPUInstructionParser  extends InstructionParser
 			case RelationalBinary:
 				return RelationalBinaryGPUInstruction.parseInstruction(str);
 
+			case MatrixIndexing:
+				return MatrixIndexingGPUInstruction.parseInstruction(str);
+				
 			default: 
 				throw new DMLRuntimeException("Invalid GPU Instruction Type: " + gputype );
 		}

http://git-wip-us.apache.org/repos/asf/systemml/blob/628ffad1/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
index b962eb7..0f0b28e 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
@@ -43,7 +43,8 @@ public abstract class GPUInstruction extends Instruction
 		ArithmeticBinary,
 		BuiltinUnary,
 		BuiltinBinary,
-		Builtin
+		Builtin,
+		MatrixIndexing
 	};
 
 	// Memory/conversions
@@ -112,6 +113,9 @@ public abstract class GPUInstruction extends Instruction
 	public final static String MISC_TIMER_REDUCE_ALL_KERNEL =                "rallk";  // time spent in reduce all kernel
 	public final static String MISC_TIMER_REDUCE_ROW_KERNEL =                "rrowk";  // time spent in reduce row kernel
 	public final static String MISC_TIMER_REDUCE_COL_KERNEL =                "rcolk";  // time spent in reduce column kernel
+	
+	public final static String MISC_TIMER_RIX_DENSE_OP =                     "drix";    // time spent in the right indexing dense kernel
+	public final static String MISC_TIMER_RIX_SPARSE_DENSE_OP =              "sdrix";   // time spent in the right indexing sparse dense kernel
 
 	// Deep learning operators
 	public final static String MISC_TIMER_ACTIVATION_FORWARD_LIB =         "nnaf";  // time spent in cudnnActivationForward

http://git-wip-us.apache.org/repos/asf/systemml/blob/628ffad1/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixIndexingGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixIndexingGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixIndexingGPUInstruction.java
new file mode 100644
index 0000000..5e2c8fc
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixIndexingGPUInstruction.java
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.instructions.gpu;
+
+import org.apache.sysml.parser.Expression.DataType;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
+import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
+import org.apache.sysml.runtime.instructions.InstructionUtils;
+import org.apache.sysml.runtime.instructions.cp.CPOperand;
+import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
+import org.apache.sysml.runtime.matrix.operators.Operator;
+import org.apache.sysml.runtime.matrix.operators.SimpleOperator;
+import org.apache.sysml.runtime.util.IndexRange;
+import org.apache.sysml.utils.GPUStatistics;
+
+public class MatrixIndexingGPUInstruction extends GPUInstruction {
+	CPOperand rowLower, rowUpper, colLower, colUpper;
+	CPOperand input1; CPOperand input2; CPOperand output;
+	
+	public MatrixIndexingGPUInstruction(Operator op, CPOperand in, 
+			CPOperand rl, CPOperand ru, CPOperand cl, CPOperand cu, CPOperand out, String opcode, String istr){
+		super(op, opcode, istr);
+		_gputype = GPUINSTRUCTION_TYPE.MatrixIndexing;
+		rowLower = rl;
+		rowUpper = ru;
+		colLower = cl;
+		colUpper = cu;
+		input1 = in;
+		output = out;
+	}
+	
+	public MatrixIndexingGPUInstruction(Operator op, CPOperand lhsInput, CPOperand rhsInput, 
+			CPOperand rl, CPOperand ru, CPOperand cl, CPOperand cu, CPOperand out, String opcode, String istr){
+		super(op, opcode, istr);
+		_gputype = GPUINSTRUCTION_TYPE.MatrixIndexing;
+		rowLower = rl;
+		rowUpper = ru;
+		colLower = cl;
+		colUpper = cu;
+		input1 = lhsInput;
+		input2 = rhsInput;
+		output = out;
+	}
+	
+	public static MatrixIndexingGPUInstruction parseInstruction ( String str ) throws DMLRuntimeException {
+		String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
+		String opcode = parts[0];
+		
+		if ( opcode.equalsIgnoreCase("rangeReIndex") ) {
+			if ( parts.length == 7 ) {
+				// Example: rangeReIndex:mVar1:Var2:Var3:Var4:Var5:mVar6
+				CPOperand in, rl, ru, cl, cu, out;
+				in = new CPOperand();
+				rl = new CPOperand();
+				ru = new CPOperand();
+				cl = new CPOperand();
+				cu = new CPOperand();
+				out = new CPOperand();
+				in.split(parts[1]);
+				rl.split(parts[2]);
+				ru.split(parts[3]);
+				cl.split(parts[4]);
+				cu.split(parts[5]);
+				out.split(parts[6]);
+				if( in.getDataType()==DataType.MATRIX )
+					return new MatrixIndexingGPUInstruction(new SimpleOperator(null), in, rl, ru, cl, cu, out, opcode, str);
+				else 
+					throw new DMLRuntimeException("Can index only on Matrices in GPU");
+			}
+			else {
+				throw new DMLRuntimeException("Invalid number of operands in instruction: " + str);
+			}
+		} 
+		else if ( opcode.equalsIgnoreCase("leftIndex")) {
+			if ( parts.length == 8 ) {
+				// Example: leftIndex:mVar1:mvar2:Var3:Var4:Var5:Var6:mVar7
+				CPOperand lhsInput, rhsInput, rl, ru, cl, cu, out;
+				lhsInput = new CPOperand();
+				rhsInput = new CPOperand();
+				rl = new CPOperand();
+				ru = new CPOperand();
+				cl = new CPOperand();
+				cu = new CPOperand();
+				out = new CPOperand();
+				lhsInput.split(parts[1]);
+				rhsInput.split(parts[2]);
+				rl.split(parts[3]);
+				ru.split(parts[4]);
+				cl.split(parts[5]);
+				cu.split(parts[6]);
+				out.split(parts[7]);
+				if( lhsInput.getDataType()==DataType.MATRIX )
+					return new MatrixIndexingGPUInstruction(new SimpleOperator(null), lhsInput, rhsInput, rl, ru, cl, cu, out, opcode, str);
+				else 
+					throw new DMLRuntimeException("Can index only on Matrices in GPU");
+			}
+			else {
+				throw new DMLRuntimeException("Invalid number of operands in instruction: " + str);
+			}
+		}
+		else {
+			throw new DMLRuntimeException("Unknown opcode while parsing a MatrixIndexingGPUInstruction: " + str);
+		}
+	}
+
+	@Override
+	public void processInstruction(ExecutionContext ec)
+			throws DMLRuntimeException {
+		GPUStatistics.incrementNoOfExecutedGPUInst();
+		String opcode = getOpcode();
+		
+		IndexRange ixrange = getIndexRange(ec);
+		if ( opcode.equalsIgnoreCase("rangeReIndex") ) {
+			MatrixObject mat1 = getMatrixInputForGPUInstruction(ec, input1.getName());
+			LibMatrixCUDA.sliceOperations(ec, ec.getGPUContext(0), getExtendedOpcode(), mat1, ixrange, output.getName());
+			ec.releaseMatrixInputForGPUInstruction(input1.getName());
+			ec.releaseMatrixOutputForGPUInstruction(output.getName());
+		}
+		else {
+			throw new DMLRuntimeException("Unsupported GPU operator:" + opcode);
+		}
+	}
+	
+	IndexRange getIndexRange(ExecutionContext ec) throws DMLRuntimeException {
+		return new IndexRange( //rl, ru, cl, ru
+			(int)(ec.getScalarInput(rowLower.getName(), rowLower.getValueType(), rowLower.isLiteral()).getLongValue()-1),
+			(int)(ec.getScalarInput(rowUpper.getName(), rowUpper.getValueType(), rowUpper.isLiteral()).getLongValue()-1),
+			(int)(ec.getScalarInput(colLower.getName(), colLower.getValueType(), colLower.isLiteral()).getLongValue()-1),
+			(int)(ec.getScalarInput(colUpper.getName(), colUpper.getValueType(), colUpper.isLiteral()).getLongValue()-1));		
+	}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/systemml/blob/628ffad1/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
index 7244938..a5bc299 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
@@ -49,6 +49,18 @@ import jcuda.jcusparse.cusparsePointerMode;
 /**
  * Compressed Sparse Row (CSR) format for CUDA
  * Generalized matrix multiply is implemented for CSR format in the cuSparse library among other operations
+ * 
+ * Since we assume that the matrix is stored with zero-based indexing (i.e. CUSPARSE_INDEX_BASE_ZERO),
+ * the matrix
+ * 1.0 4.0 0.0 0.0 0.0 
+ * 0.0 2.0 3.0 0.0 0.0 
+ * 5.0 0.0 0.0 7.0 8.0 
+ * 0.0 0.0 9.0 0.0 6.0
+ * 
+ * is stored as
+ * val = 1.0 4.0 2.0 3.0 5.0 7.0 8.0 9.0 6.0 
+ * rowPtr = 0.0 2.0 4.0 7.0 9.0 
+ * colInd = 0.0 1.0 1.0 2.0 0.0 3.0 4.0 2.0 4.0
  */
 public class CSRPointer {
 
@@ -184,9 +196,9 @@ public class CSRPointer {
 		cudaMemcpy(r.colInd, Pointer.to(colInd), getIntSizeOf(nnz), cudaMemcpyHostToDevice);
 		cudaMemcpy(r.val, Pointer.to(values), getDoubleSizeOf(nnz), cudaMemcpyHostToDevice);
 		if (DMLScript.STATISTICS)
-			GPUStatistics.cudaToDevTime.addAndGet(System.nanoTime() - t0);
+			GPUStatistics.cudaToDevTime.add(System.nanoTime() - t0);
 		if (DMLScript.STATISTICS)
-			GPUStatistics.cudaToDevCount.addAndGet(3);
+			GPUStatistics.cudaToDevCount.add(3);
 	}
 
 	/**
@@ -208,9 +220,9 @@ public class CSRPointer {
 		cudaMemcpy(Pointer.to(colInd), r.colInd, getIntSizeOf(nnz), cudaMemcpyDeviceToHost);
 		cudaMemcpy(Pointer.to(values), r.val, getDoubleSizeOf(nnz), cudaMemcpyDeviceToHost);
 		if (DMLScript.STATISTICS)
-			GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - t0);
+			GPUStatistics.cudaFromDevTime.add(System.nanoTime() - t0);
 		if (DMLScript.STATISTICS)
-			GPUStatistics.cudaFromDevCount.addAndGet(3);
+			GPUStatistics.cudaFromDevCount.add(3);
 	}
 
 	/**

http://git-wip-us.apache.org/repos/asf/systemml/blob/628ffad1/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
index f107f47..84d181b 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -147,7 +147,7 @@ public class GPUContext {
 
 		if (DMLScript.STATISTICS)
 			GPUStatistics.cudaLibrariesInitTime = System.nanoTime() - start;
-		
+
 		LOG.info(" GPU memory - Total: " + (total[0] * (1e-6)) + " MB, Available: " + (free[0] * (1e-6)) + " MB on "
 				+ this);
 
@@ -269,7 +269,7 @@ public class GPUContext {
 				freeCUDASpaceMap.remove(size);
 			if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
 				GPUStatistics
-						.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_REUSE, System.nanoTime() - t0);
+				.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_REUSE, System.nanoTime() - t0);
 		} else {
 			LOG.trace(
 					"GPU : in allocate from instruction " + instructionName + ", allocating new block of size " + (size
@@ -280,9 +280,9 @@ public class GPUContext {
 			A = new Pointer();
 			cudaMalloc(A, size);
 			if (DMLScript.STATISTICS)
-				GPUStatistics.cudaAllocTime.getAndAdd(System.nanoTime() - t0);
+				GPUStatistics.cudaAllocTime.add(System.nanoTime() - t0);
 			if (DMLScript.STATISTICS)
-				GPUStatistics.cudaAllocCount.getAndAdd(statsCount);
+				GPUStatistics.cudaAllocCount.add(statsCount);
 			if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
 				GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_ALLOCATE,
 						System.nanoTime() - t0);
@@ -298,9 +298,9 @@ public class GPUContext {
 		if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
 			GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_SET_ZERO, end - t1);
 		if (DMLScript.STATISTICS)
-			GPUStatistics.cudaMemSet0Time.getAndAdd(end - t1);
+			GPUStatistics.cudaMemSet0Time.add(end - t1);
 		if (DMLScript.STATISTICS)
-			GPUStatistics.cudaMemSet0Count.getAndAdd(1);
+			GPUStatistics.cudaMemSet0Count.add(1);
 		cudaBlockSizeMap.put(A, size);
 		return A;
 
@@ -349,32 +349,32 @@ public class GPUContext {
 		long t0 = 0;
 		assert cudaBlockSizeMap.containsKey(
 				toFree) : "ERROR : Internal state corrupted, cache block size map is not aware of a block it trying to free up";
-		long size = cudaBlockSizeMap.get(toFree);
-		if (eager) {
-			LOG.trace("GPU : eagerly freeing cuda memory [ " + toFree + " ] for instruction " + instructionName + " on "
-					+ this);
-			if (DMLScript.STATISTICS)
-				t0 = System.nanoTime();
-			cudaFree(toFree);
-			cudaBlockSizeMap.remove(toFree);
-			if (DMLScript.STATISTICS)
-				GPUStatistics.cudaDeAllocTime.addAndGet(System.nanoTime() - t0);
-			if (DMLScript.STATISTICS)
-				GPUStatistics.cudaDeAllocCount.addAndGet(1);
-			if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
-				GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_CUDA_FREE,
-						System.nanoTime() - t0);
-		} else {
-			LOG.trace("GPU : lazily freeing cuda memory for instruction " + instructionName + " on " + this);
-			LinkedList<Pointer> freeList = freeCUDASpaceMap.get(size);
-			if (freeList == null) {
-				freeList = new LinkedList<Pointer>();
-				freeCUDASpaceMap.put(size, freeList);
-			}
-			if (freeList.contains(toFree))
-				throw new RuntimeException("GPU : Internal state corrupted, double free");
-			freeList.add(toFree);
-		}
+				long size = cudaBlockSizeMap.get(toFree);
+				if (eager) {
+					LOG.trace("GPU : eagerly freeing cuda memory [ " + toFree + " ] for instruction " + instructionName + " on "
+							+ this);
+					if (DMLScript.STATISTICS)
+						t0 = System.nanoTime();
+					cudaFree(toFree);
+					cudaBlockSizeMap.remove(toFree);
+					if (DMLScript.STATISTICS)
+						GPUStatistics.cudaDeAllocTime.add(System.nanoTime() - t0);
+					if (DMLScript.STATISTICS)
+						GPUStatistics.cudaDeAllocCount.add(1);
+					if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
+						GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_CUDA_FREE,
+								System.nanoTime() - t0);
+				} else {
+					LOG.trace("GPU : lazily freeing cuda memory for instruction " + instructionName + " on " + this);
+					LinkedList<Pointer> freeList = freeCUDASpaceMap.get(size);
+					if (freeList == null) {
+						freeList = new LinkedList<Pointer>();
+						freeCUDASpaceMap.put(size, freeList);
+					}
+					if (freeList.contains(toFree))
+						throw new RuntimeException("GPU : Internal state corrupted, double free");
+					freeList.add(toFree);
+				}
 	}
 
 	/**
@@ -426,7 +426,7 @@ public class GPUContext {
 	 */
 	protected void evict(String instructionName, final long neededSize) throws DMLRuntimeException {
 		LOG.trace("GPU : evict called from " + instructionName + " for size " + neededSize + " on " + this);
-		GPUStatistics.cudaEvictionCount.addAndGet(1);
+		GPUStatistics.cudaEvictionCount.add(1);
 		// Release the set of free blocks maintained in a GPUObject.freeCUDASpaceMap
 		// to free up space
 		LRUCacheMap<Long, LinkedList<Pointer>> lruCacheMap = freeCUDASpaceMap;

http://git-wip-us.apache.org/repos/asf/systemml/blob/628ffad1/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index 94ceb36..c3e23f3 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -32,9 +32,7 @@ import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
 
 import java.util.Arrays;
-import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicLong;
-
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysml.api.DMLScript;
@@ -100,12 +98,12 @@ public class GPUObject {
 	/**
 	 * number of read/write locks on this object (this GPUObject is being used in a current instruction)
 	 */
-	protected AtomicInteger locks = new AtomicInteger(0);
+	protected AtomicLong locks = new AtomicLong();
 
 	/**
 	 * Timestamp, needed by {@link GPUContext#evict(long)}
 	 */
-	AtomicLong timestamp = new AtomicLong(0);
+	AtomicLong timestamp = new AtomicLong();
 
 	/**
 	 * Whether this block is in sparse format
@@ -131,7 +129,7 @@ public class GPUObject {
 			that.allocateTensorDescriptor(me.tensorShape[0], me.tensorShape[1], me.tensorShape[2], me.tensorShape[3]);
 		}
 		that.dirty = me.dirty;
-		that.locks = new AtomicInteger(me.locks.get());
+		that.locks = new AtomicLong(me.locks.get());
 		that.timestamp = new AtomicLong(me.timestamp.get());
 		that.isSparse = me.isSparse;
 
@@ -238,9 +236,9 @@ public class GPUObject {
 			t2 = System.nanoTime();
 		cudaMemcpy(Pointer.to(nnzC), nnzTotalDevHostPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost);
 		if (DMLScript.STATISTICS)
-			GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - t2);
+			GPUStatistics.cudaFromDevTime.add(System.nanoTime() - t2);
 		if (DMLScript.STATISTICS)
-			GPUStatistics.cudaFromDevCount.addAndGet(1);
+			GPUStatistics.cudaFromDevCount.add(1);
 
 		if (nnzC[0] == -1) {
 			throw new DMLRuntimeException(
@@ -353,9 +351,9 @@ public class GPUObject {
 						cols));
 		// TODO: What if mat.getNnz() is -1 ?
 		if (DMLScript.STATISTICS)
-			GPUStatistics.cudaDenseToSparseTime.addAndGet(System.nanoTime() - t0);
+			GPUStatistics.cudaDenseToSparseTime.add(System.nanoTime() - t0);
 		if (DMLScript.STATISTICS)
-			GPUStatistics.cudaDenseToSparseCount.addAndGet(1);
+			GPUStatistics.cudaDenseToSparseCount.add(1);
 	}
 
 	/**
@@ -430,9 +428,9 @@ public class GPUObject {
 		if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
 			GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, end - start);
 		if (DMLScript.STATISTICS)
-			GPUStatistics.cudaSparseToDenseTime.addAndGet(end - start);
+			GPUStatistics.cudaSparseToDenseTime.add(end - start);
 		if (DMLScript.STATISTICS)
-			GPUStatistics.cudaSparseToDenseCount.addAndGet(1);
+			GPUStatistics.cudaSparseToDenseCount.add(1);
 	}
 
 	/**
@@ -560,8 +558,8 @@ public class GPUObject {
 		// If the fill value is 0, no need to call the special kernel, the allocate memsets the allocated region to 0
 		if (v != 0)
 			getGPUContext().getKernels()
-					.launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations(numElems),
-							getJcudaDenseMatrixPtr(), v, numElems);
+			.launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations(numElems),
+					getJcudaDenseMatrixPtr(), v, numElems);
 	}
 
 	/**
@@ -657,12 +655,12 @@ public class GPUObject {
 	 * @throws DMLRuntimeException if there is no locked GPU Object or if could not obtain a {@link GPUContext}
 	 */
 	private void updateReleaseLocks(int l) throws DMLRuntimeException {
-		int newLocks = locks.addAndGet(l);
+		int newLocks = (int) locks.addAndGet(l);
 		if (newLocks < 0) {
 			throw new CacheException("Internal state error : Invalid number of locks on a GPUObject");
 		}
 
-		LOG.trace("GPU : updateReleaseLocks, new number of locks is " + locks.get() + ", on " + this + ", GPUContext="
+		LOG.trace("GPU : updateReleaseLocks, new number of locks is " + newLocks + ", on " + this + ", GPUContext="
 				+ getGPUContext());
 		GPUContext.EvictionPolicy evictionPolicy = getGPUContext().evictionPolicy;
 		switch (evictionPolicy) {
@@ -802,18 +800,18 @@ public class GPUObject {
 					csrBlock = new SparseBlockCSR(toIntExact(mat.getNumRows()), cooBlock.rowIndexes(),
 							cooBlock.indexes(), cooBlock.values());
 					if (DMLScript.STATISTICS)
-						GPUStatistics.cudaSparseConversionTime.addAndGet(System.nanoTime() - t0);
+						GPUStatistics.cudaSparseConversionTime.add(System.nanoTime() - t0);
 					if (DMLScript.STATISTICS)
-						GPUStatistics.cudaSparseConversionCount.incrementAndGet();
+						GPUStatistics.cudaSparseConversionCount.increment();
 				} else if (block instanceof SparseBlockMCSR) {
 					if (DMLScript.STATISTICS)
 						t0 = System.nanoTime();
 					SparseBlockMCSR mcsrBlock = (SparseBlockMCSR) block;
 					csrBlock = new SparseBlockCSR(mcsrBlock.getRows(), toIntExact(mcsrBlock.size()));
 					if (DMLScript.STATISTICS)
-						GPUStatistics.cudaSparseConversionTime.addAndGet(System.nanoTime() - t0);
+						GPUStatistics.cudaSparseConversionTime.add(System.nanoTime() - t0);
 					if (DMLScript.STATISTICS)
-						GPUStatistics.cudaSparseConversionCount.incrementAndGet();
+						GPUStatistics.cudaSparseConversionCount.increment();
 				} else {
 					throw new DMLRuntimeException("Unsupported sparse matrix format for CUDA operations");
 				}
@@ -848,9 +846,9 @@ public class GPUObject {
 		mat.release();
 
 		if (DMLScript.STATISTICS)
-			GPUStatistics.cudaToDevTime.addAndGet(System.nanoTime() - start);
+			GPUStatistics.cudaToDevTime.add(System.nanoTime() - start);
 		if (DMLScript.STATISTICS)
-			GPUStatistics.cudaToDevCount.addAndGet(1);
+			GPUStatistics.cudaToDevCount.add(1);
 	}
 
 	public static int toIntExact(long l) throws DMLRuntimeException {
@@ -882,9 +880,9 @@ public class GPUObject {
 			mat.release();
 
 			if (DMLScript.STATISTICS)
-				GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - start);
+				GPUStatistics.cudaFromDevTime.add(System.nanoTime() - start);
 			if (DMLScript.STATISTICS)
-				GPUStatistics.cudaFromDevCount.addAndGet(1);
+				GPUStatistics.cudaFromDevCount.add(1);
 		} else if (getJcudaSparseMatrixPtr() != null) {
 			if (!LibMatrixCUDA.isInSparseFormat(getGPUContext(), mat))
 				throw new DMLRuntimeException(
@@ -912,9 +910,9 @@ public class GPUObject {
 				mat.acquireModify(tmp);
 				mat.release();
 				if (DMLScript.STATISTICS)
-					GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - start);
+					GPUStatistics.cudaFromDevTime.add(System.nanoTime() - start);
 				if (DMLScript.STATISTICS)
-					GPUStatistics.cudaFromDevCount.addAndGet(1);
+					GPUStatistics.cudaFromDevCount.add(1);
 			}
 		} else {
 			throw new DMLRuntimeException(