You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by na...@apache.org on 2017/06/10 19:07:11 UTC
[3/3] systemml git commit: [FIX] Fixed nested parfor for GPUs
[FIX] Fixed nested parfor for GPUs
Additionally
- Fixed intellij codestyle accordingly
- Fixed formatting of some GPU related source files
Closes #532
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/f5871756
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/f5871756
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/f5871756
Branch: refs/heads/master
Commit: f587175647a84a3825b174b4d29c0398be17331f
Parents: 0bcae49
Author: Nakul Jindal <na...@gmail.com>
Authored: Sat Jun 10 12:06:47 2017 -0700
Committer: Nakul Jindal <na...@gmail.com>
Committed: Sat Jun 10 12:06:47 2017 -0700
----------------------------------------------------------------------
dev/code-style/systemml-style-intellij.xml | 37 +-
.../apache/sysml/api/ScriptExecutorUtils.java | 17 +-
.../controlprogram/ParForProgramBlock.java | 19 +-
.../context/ExecutionContext.java | 64 +-
.../controlprogram/parfor/LocalParWorker.java | 2 +-
.../cp/FunctionCallCPInstruction.java | 12 +-
.../gpu/AggregateBinaryGPUInstruction.java | 4 +-
.../gpu/AggregateUnaryGPUInstruction.java | 2 +-
.../gpu/ConvolutionGPUInstruction.java | 18 +-
.../instructions/gpu/MMTSJGPUInstruction.java | 2 +-
.../gpu/MatrixBuiltinGPUInstruction.java | 30 +-
.../MatrixMatrixArithmeticGPUInstruction.java | 2 +-
.../gpu/MatrixMatrixAxpyGPUInstruction.java | 2 +-
.../gpu/MatrixMatrixBuiltinGPUInstruction.java | 2 +-
.../instructions/gpu/ReorgGPUInstruction.java | 2 +-
.../ScalarMatrixArithmeticGPUInstruction.java | 2 +-
.../instructions/gpu/context/CSRPointer.java | 922 ++++++-------
.../gpu/context/ExecutionConfig.java | 85 +-
.../instructions/gpu/context/GPUContext.java | 1257 +++++++++---------
.../gpu/context/GPUContextPool.java | 266 ++--
.../instructions/gpu/context/GPUObject.java | 454 ++++---
.../instructions/gpu/context/JCudaKernels.java | 141 +-
.../runtime/matrix/data/LibMatrixCUDA.java | 42 +-
.../runtime/matrix/data/LibMatrixDNNHelper.java | 1 +
.../org/apache/sysml/test/gpu/GPUTests.java | 47 +-
.../sysml/test/gpu/NeuralNetworkOpTests.java | 106 +-
26 files changed, 1917 insertions(+), 1621 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/dev/code-style/systemml-style-intellij.xml
----------------------------------------------------------------------
diff --git a/dev/code-style/systemml-style-intellij.xml b/dev/code-style/systemml-style-intellij.xml
index 1ad3209..b4a53b4 100644
--- a/dev/code-style/systemml-style-intellij.xml
+++ b/dev/code-style/systemml-style-intellij.xml
@@ -1,28 +1,27 @@
<!--
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements. See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership. The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied. See the License for the
+* specific language governing permissions and limitations
+* under the License.
-->
-
<code_scheme name="SystemML Format">
<option name="CLASS_COUNT_TO_USE_IMPORT_ON_DEMAND" value="999" />
<option name="NAMES_COUNT_TO_USE_IMPORT_ON_DEMAND" value="999" />
<option name="IMPORT_LAYOUT_TABLE">
<value>
- <package name="" withSubpackages="true" static="false" />
+ <package name="" withSubpackages="true" static="true" />
<emptyLine />
<package name="java" withSubpackages="true" static="false" />
<emptyLine />
@@ -32,7 +31,7 @@
<emptyLine />
<package name="com" withSubpackages="true" static="false" />
<emptyLine />
- <package name="" withSubpackages="true" static="true" />
+ <package name="" withSubpackages="true" static="false" />
</value>
</option>
<codeStyleSettings language="JAVA">
http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
index 674a011..2895aa4 100644
--- a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
+++ b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
@@ -19,6 +19,8 @@
package org.apache.sysml.api;
+import java.util.List;
+
import org.apache.sysml.api.mlcontext.ScriptExecutor;
import org.apache.sysml.conf.DMLConfig;
import org.apache.sysml.hops.codegen.SpoofCompiler;
@@ -79,23 +81,22 @@ public class ScriptExecutorUtils {
// GPUs
GPUContextPool.PER_PROCESS_MAX_GPUS = dmlconf.getIntValue(DMLConfig.MAX_GPUS_PER_PROCESS);
Statistics.startRunTimer();
- GPUContext gCtx = null;
try {
// run execute (w/ exception handling to ensure proper shutdown)
if (DMLScript.USE_ACCELERATOR && ec != null) {
- gCtx = GPUContextPool.getFromPool();
- if (gCtx == null) {
+ List<GPUContext> gCtxs = GPUContextPool.reserveAllGPUContexts();
+ if (gCtxs == null) {
throw new DMLRuntimeException(
"GPU : Could not create GPUContext, either no GPU or all GPUs currently in use");
}
- gCtx.initializeThread();
- ec.setGPUContext(gCtx);
+ gCtxs.get(0).initializeThread();
+ ec.setGPUContexts(gCtxs);
}
rtprog.execute(ec);
} finally { // ensure cleanup/shutdown
- if (DMLScript.USE_ACCELERATOR && ec.getGPUContext() != null) {
- ec.getGPUContext().clearTemporaryMemory();
- GPUContextPool.returnToPool(ec.getGPUContext());
+ if (DMLScript.USE_ACCELERATOR && !ec.getGPUContexts().isEmpty()) {
+ ec.getGPUContexts().forEach(gCtx -> gCtx.clearTemporaryMemory());
+ GPUContextPool.freeAllGPUContexts();
}
if (dmlconf.getBooleanValue(DMLConfig.CODEGEN))
SpoofCompiler.cleanupCodeGenerator();
http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java b/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java
index c9dcc22..95e28e7 100644
--- a/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java
@@ -632,9 +632,7 @@ public class ParForProgramBlock extends ForProgramBlock
{
case LOCAL: //create parworkers as local threads
if (DMLScript.USE_ACCELERATOR) {
- GPUContextPool.returnToPool(ec.getGPUContext());
- ec.setGPUContext(null);
- setDegreeOfParallelism(GPUContextPool.getDeviceCount());
+ setDegreeOfParallelism(ec.getNumGPUContexts());
}
executeLocalParFor(ec, iterVar, from, to, incr);
break;
@@ -757,7 +755,7 @@ public class ParForProgramBlock extends ForProgramBlock
{
//create parallel workers as (lazy) deep copies
//including preparation of update-in-place variables
- workers[i] = createParallelWorker( _pwIDs[i], queue, ec );
+ workers[i] = createParallelWorker( _pwIDs[i], queue, ec, i);
threads[i] = new Thread( workers[i] );
threads[i].setPriority(Thread.MAX_PRIORITY);
}
@@ -833,11 +831,9 @@ public class ParForProgramBlock extends ForProgramBlock
// the main thread to use the GPUContext
if (DMLScript.USE_ACCELERATOR) {
for (int i = 0; i < _numThreads; i++) {
- GPUContext gCtx = workers[i].getExecutionContext().getGPUContext();
- GPUContextPool.returnToPool(gCtx);
+ workers[i].getExecutionContext().setGPUContexts(null);
}
- ec.setGPUContext(GPUContextPool.getFromPool());
- ec.getGPUContext().initializeThread();
+ ec.getGPUContext(0).initializeThread();
}
}
finally
@@ -1386,10 +1382,11 @@ public class ParForProgramBlock extends ForProgramBlock
* @param pwID parworker id
* @param queue task queue
* @param ec execution context
+ * @param index the index of the worker
* @return local parworker
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
- private LocalParWorker createParallelWorker(long pwID, LocalTaskQueue<Task> queue, ExecutionContext ec)
+ private LocalParWorker createParallelWorker(long pwID, LocalTaskQueue<Task> queue, ExecutionContext ec, int index)
throws DMLRuntimeException
{
LocalParWorker pw = null;
@@ -1420,9 +1417,9 @@ public class ParForProgramBlock extends ForProgramBlock
ExecutionContext cpEc = ProgramConverter.createDeepCopyExecutionContext(ec);
// If GPU mode is enabled, gets a GPUContext from the pool of GPUContexts
- // and sets it in the ExecutionContext
+ // and sets it in the ExecutionContext of the parfor
if (DMLScript.USE_ACCELERATOR){
- cpEc.setGPUContext(GPUContextPool.getFromPool());
+ cpEc.setGPUContexts(Arrays.asList(ec.getGPUContext(index)));
}
//prepare basic update-in-place variables (vars dropped on result merge)
http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
index 735f394..fb179f5 100644
--- a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
+++ b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
@@ -20,7 +20,9 @@
package org.apache.sysml.runtime.controlprogram.context;
import java.util.ArrayList;
+import java.util.Collection;
import java.util.HashMap;
+import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -66,7 +68,10 @@ public class ExecutionContext {
//debugging (optional)
protected DebugState _dbState = null;
- protected GPUContext _gpuContext = null;
+ /**
+ * List of {@link GPUContext}s owned by this {@link ExecutionContext}
+ */
+ protected List<GPUContext> _gpuContexts = new ArrayList<>();
protected ExecutionContext()
{
@@ -99,13 +104,42 @@ public class ExecutionContext {
_variables = vars;
}
- public GPUContext getGPUContext() {
- return _gpuContext;
+ /**
+ * Get the i-th GPUContext
+ * @param index index of the GPUContext
+ * @return a valid GPUContext or null if the indexed GPUContext does not exist.
+ */
+ public GPUContext getGPUContext(int index) {
+ try {
+ return _gpuContexts.get(index);
+ } catch (IndexOutOfBoundsException e){
+ return null;
+ }
}
- public void setGPUContext(GPUContext _gpuContext) {
- this._gpuContext = _gpuContext;
- }
+ /**
+ * Sets the list of GPUContexts
+ * @param gpuContexts a collection of GPUContexts
+ */
+ public void setGPUContexts(List<GPUContext> gpuContexts){
+ _gpuContexts = gpuContexts;
+ }
+
+ /**
+ * Gets the list of GPUContexts
+ * @return a list of GPUContexts
+ */
+ public List<GPUContext> getGPUContexts() {
+ return _gpuContexts;
+ }
+
+ /**
+ * Gets the number of GPUContexts
+ * @return number of GPUContexts
+ */
+ public int getNumGPUContexts() {
+ return _gpuContexts.size();
+ }
/* -------------------------------------------------------
* Methods to handle variables and associated data objects
@@ -238,7 +272,7 @@ public class ExecutionContext {
throws DMLRuntimeException
{
MatrixObject mo = allocateGPUMatrixObject(varName);
- boolean allocated = mo.getGPUObject(getGPUContext()).acquireDeviceModifyDense();
+ boolean allocated = mo.getGPUObject(getGPUContext(0)).acquireDeviceModifyDense();
mo.getMatrixCharacteristics().setNonZeros(-1);
return new Pair<MatrixObject, Boolean>(mo, allocated);
}
@@ -257,7 +291,7 @@ public class ExecutionContext {
{
MatrixObject mo = allocateGPUMatrixObject(varName);
mo.getMatrixCharacteristics().setNonZeros(nnz);
- boolean allocated = mo.getGPUObject(getGPUContext()).acquireDeviceModifySparse();
+ boolean allocated = mo.getGPUObject(getGPUContext(0)).acquireDeviceModifySparse();
return new Pair<MatrixObject, Boolean>(mo, allocated);
}
@@ -269,12 +303,12 @@ public class ExecutionContext {
*/
public MatrixObject allocateGPUMatrixObject(String varName) throws DMLRuntimeException {
MatrixObject mo = getMatrixObject(varName);
- if( mo.getGPUObject(getGPUContext()) == null ) {
- GPUObject newGObj = getGPUContext().createGPUObject(mo);
+ if( mo.getGPUObject(getGPUContext(0)) == null ) {
+ GPUObject newGObj = getGPUContext(0).createGPUObject(mo);
// The lock is added here for an output block
// so that any block currently in use is not deallocated by eviction on the GPU
newGObj.addLock();
- mo.setGPUObject(getGPUContext(), newGObj);
+ mo.setGPUObject(getGPUContext(0), newGObj);
}
return mo;
}
@@ -282,7 +316,7 @@ public class ExecutionContext {
public Pair<MatrixObject, Boolean> getMatrixInputForGPUInstruction(String varName)
throws DMLRuntimeException
{
- GPUContext gCtx = getGPUContext();
+ GPUContext gCtx = getGPUContext(0);
boolean copied = false;
MatrixObject mo = getMatrixObject(varName);
if(mo == null) {
@@ -322,7 +356,7 @@ public class ExecutionContext {
throws DMLRuntimeException
{
MatrixObject mo = getMatrixObject(varName);
- mo.getGPUObject(getGPUContext()).releaseInput();
+ mo.getGPUObject(getGPUContext(0)).releaseInput();
}
/**
@@ -374,10 +408,10 @@ public class ExecutionContext {
public void releaseMatrixOutputForGPUInstruction(String varName) throws DMLRuntimeException {
MatrixObject mo = getMatrixObject(varName);
- if(mo.getGPUObject(getGPUContext()) == null || !mo.getGPUObject(getGPUContext()).isAllocated()) {
+ if(mo.getGPUObject(getGPUContext(0)) == null || !mo.getGPUObject(getGPUContext(0)).isAllocated()) {
throw new DMLRuntimeException("No output is allocated on GPU");
}
- mo.getGPUObject(getGPUContext()).releaseOutput();
+ mo.getGPUObject(getGPUContext(0)).releaseOutput();
}
public void setMatrixOutput(String varName, MatrixBlock outputData)
http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java
index c4684ec..636b1f8 100644
--- a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java
+++ b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java
@@ -83,7 +83,7 @@ public class LocalParWorker extends ParWorker implements Runnable
// Initialize this GPUContext to this thread
if (DMLScript.USE_ACCELERATOR)
- _ec.getGPUContext().initializeThread();
+ _ec.getGPUContext(0).initializeThread();
//setup compiler config for worker thread
ConfigurationManager.setLocalConfig(_cconf);
http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java
index 9cc6bb2..3cd2633 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java
@@ -168,9 +168,9 @@ public class FunctionCallCPInstruction extends CPInstruction
// and copy the function arguments into the created table.
ExecutionContext fn_ec = ExecutionContextFactory.createContext(false, ec.getProgram());
if (DMLScript.USE_ACCELERATOR) {
- fn_ec.setGPUContext(ec.getGPUContext());
- ec.setGPUContext(null);
- fn_ec.getGPUContext().initializeThread();
+ fn_ec.setGPUContexts(ec.getGPUContexts());
+ ec.setGPUContexts(null);
+ fn_ec.getGPUContext(0).initializeThread();
}
fn_ec.setVariables(functionVariables);
// execute the function block
@@ -206,9 +206,9 @@ public class FunctionCallCPInstruction extends CPInstruction
ec.unpinVariables(_boundInputParamNames, pinStatus);
if (DMLScript.USE_ACCELERATOR) {
- ec.setGPUContext(fn_ec.getGPUContext());
- fn_ec.setGPUContext(null);
- ec.getGPUContext().initializeThread();
+ ec.setGPUContexts(fn_ec.getGPUContexts());
+ fn_ec.setGPUContexts(null);
+ ec.getGPUContext(0).initializeThread();
}
// add the updated binding for each return variable to the variables in original symbol table
http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
index 2531c17..0c0a4b2 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
@@ -96,7 +96,7 @@ public class AggregateBinaryGPUInstruction extends GPUInstruction
int clen = (int) (_isRightTransposed ? m2.getNumRows() : m2.getNumColumns());
ec.setMetaData(_output.getName(), rlen, clen);
- LibMatrixCUDA.matmult(ec, ec.getGPUContext(), getExtendedOpcode(), m1, m2, _output.getName(), _isLeftTransposed, _isRightTransposed);
+ LibMatrixCUDA.matmult(ec, ec.getGPUContext(0), getExtendedOpcode(), m1, m2, _output.getName(), _isLeftTransposed, _isRightTransposed);
//release inputs/outputs
ec.releaseMatrixInputForGPUInstruction(_input1.getName());
@@ -113,6 +113,6 @@ public class AggregateBinaryGPUInstruction extends GPUInstruction
@SuppressWarnings("unused")
private boolean isSparse(ExecutionContext ec, String var) throws DMLRuntimeException {
MatrixObject mo = ec.getMatrixObject(var);
- return LibMatrixCUDA.isInSparseFormat(ec.getGPUContext(), mo);
+ return LibMatrixCUDA.isInSparseFormat(ec.getGPUContext(0), mo);
}
}
http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java
index 8bdd397..5d01820 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateUnaryGPUInstruction.java
@@ -93,7 +93,7 @@ public class AggregateUnaryGPUInstruction extends GPUInstruction {
ec.setMetaData(_output.getName(), rlen, 1);
}
- LibMatrixCUDA.unaryAggregate(ec, ec.getGPUContext(), getExtendedOpcode(), in1, _output.getName(), (AggregateUnaryOperator)_optr);
+ LibMatrixCUDA.unaryAggregate(ec, ec.getGPUContext(0), getExtendedOpcode(), in1, _output.getName(), (AggregateUnaryOperator)_optr);
//release inputs/outputs
ec.releaseMatrixInputForGPUInstruction(_input1.getName());
http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
index 9d4cd1f..e5ea097 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
@@ -186,9 +186,9 @@ public class ConvolutionGPUInstruction extends GPUInstruction
ec.setMetaData(_output.getName(), input.getNumRows(), input.getNumColumns());
MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
if(instOpcode.equalsIgnoreCase("bias_add"))
- LibMatrixCUDA.biasAdd(ec.getGPUContext(), getExtendedOpcode(), input, bias, out);
+ LibMatrixCUDA.biasAdd(ec.getGPUContext(0), getExtendedOpcode(), input, bias, out);
else if(instOpcode.equalsIgnoreCase("bias_multiply"))
- LibMatrixCUDA.biasMultiply(ec.getGPUContext(), getExtendedOpcode(), input, bias, out);
+ LibMatrixCUDA.biasMultiply(ec.getGPUContext(0), getExtendedOpcode(), input, bias, out);
// release inputs/outputs
ec.releaseMatrixInputForGPUInstruction(_input1.getName());
ec.releaseMatrixInputForGPUInstruction(_input2.getName());
@@ -202,7 +202,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction
MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
ec.setMetaData(_output.getName(), input.getNumRows(), input.getNumColumns());
- LibMatrixCUDA.reluBackward(ec.getGPUContext(), getExtendedOpcode(), input, dout, out);
+ LibMatrixCUDA.reluBackward(ec.getGPUContext(0), getExtendedOpcode(), input, dout, out);
// release inputs/outputs
ec.releaseMatrixInputForGPUInstruction(_input1.getName());
ec.releaseMatrixInputForGPUInstruction(_input2.getName());
@@ -253,7 +253,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction
ec.setMetaData(_output.getName(), N, K * P * Q);
MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
- LibMatrixCUDA.conv2d(ec.getGPUContext(), getExtendedOpcode(), image, filter, out, N, C, H, W,
+ LibMatrixCUDA.conv2d(ec.getGPUContext(0), getExtendedOpcode(), image, filter, out, N, C, H, W,
K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
}
else if (instOpcode.equalsIgnoreCase("conv2d_bias_add")) {
@@ -268,7 +268,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction
ec.setMetaData(_output.getName(), N, K * P * Q);
MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
- LibMatrixCUDA.conv2dBiasAdd(ec.getGPUContext(), getExtendedOpcode(), image, bias, filter, out, N, C, H, W,
+ LibMatrixCUDA.conv2dBiasAdd(ec.getGPUContext(0), getExtendedOpcode(), image, bias, filter, out, N, C, H, W,
K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
}
else if (instOpcode.equalsIgnoreCase("conv2d_backward_filter")) {
@@ -283,7 +283,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction
ec.setMetaData(_output.getName(), K, C * R * S);
MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
- LibMatrixCUDA.conv2dBackwardFilter(ec.getGPUContext(), getExtendedOpcode(), image, dout, out, N, C, H, W,
+ LibMatrixCUDA.conv2dBackwardFilter(ec.getGPUContext(0), getExtendedOpcode(), image, dout, out, N, C, H, W,
K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
// TODO: For now always copy the device data to host
// ec.gpuCtx.copyDeviceToHost(outputBlock);
@@ -300,7 +300,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction
ec.setMetaData(_output.getName(), N, C * H * W);
MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
- LibMatrixCUDA.conv2dBackwardData(ec.getGPUContext(), getExtendedOpcode(), filter, dout, out, N, C, H, W,
+ LibMatrixCUDA.conv2dBackwardData(ec.getGPUContext(0), getExtendedOpcode(), filter, dout, out, N, C, H, W,
K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
}
else if (instOpcode.equalsIgnoreCase("maxpooling")) {
@@ -313,7 +313,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction
ec.setMetaData(_output.getName(), N, C * P * Q);
MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
if(instOpcode.equalsIgnoreCase("maxpooling"))
- LibMatrixCUDA.maxpooling(ec.getGPUContext(), getExtendedOpcode(), image, out, N, C, H, W,
+ LibMatrixCUDA.maxpooling(ec.getGPUContext(0), getExtendedOpcode(), image, out, N, C, H, W,
K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
}
else if (instOpcode.equalsIgnoreCase("maxpooling_backward")) {
@@ -328,7 +328,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction
ec.setMetaData(_output.getName(), N, C * H * W);
MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, _output.getName());
- LibMatrixCUDA.maxpoolingBackward(ec.getGPUContext(), getExtendedOpcode(), image, dout, out, N, C, H, W,
+ LibMatrixCUDA.maxpoolingBackward(ec.getGPUContext(0), getExtendedOpcode(), image, dout, out, N, C, H, W,
K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
}
else {
http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java
index c147a6f..55656f0 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java
@@ -107,7 +107,7 @@ public class MMTSJGPUInstruction extends GPUInstruction
//execute operations
ec.setMetaData(_output.getName(), rlen, clen);
- LibMatrixCUDA.matmultTSMM(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName(), isLeftTransposed);
+ LibMatrixCUDA.matmultTSMM(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName(), isLeftTransposed);
ec.releaseMatrixInputForGPUInstruction(_input.getName());
ec.releaseMatrixOutputForGPUInstruction(_output.getName());
http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
index 7b50285..beeacee 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java
@@ -44,35 +44,35 @@ public class MatrixBuiltinGPUInstruction extends BuiltinUnaryGPUInstruction {
switch(opcode) {
case "sel+":
- LibMatrixCUDA.relu(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+ LibMatrixCUDA.relu(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
case "exp":
- LibMatrixCUDA.exp(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+ LibMatrixCUDA.exp(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
case "sqrt":
- LibMatrixCUDA.sqrt(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+ LibMatrixCUDA.sqrt(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
case "log":
- LibMatrixCUDA.log(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+ LibMatrixCUDA.log(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
case "round":
- LibMatrixCUDA.round(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+ LibMatrixCUDA.round(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
case "floor":
- LibMatrixCUDA.floor(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+ LibMatrixCUDA.floor(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
case "ceil":
- LibMatrixCUDA.ceil(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+ LibMatrixCUDA.ceil(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
case "abs":
- LibMatrixCUDA.abs(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+ LibMatrixCUDA.abs(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
case "sin":
- LibMatrixCUDA.sin(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+ LibMatrixCUDA.sin(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
case "cos":
- LibMatrixCUDA.cos(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+ LibMatrixCUDA.cos(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
case "tan":
- LibMatrixCUDA.tan(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+ LibMatrixCUDA.tan(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
case "asin":
- LibMatrixCUDA.asin(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+ LibMatrixCUDA.asin(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
case "acos":
- LibMatrixCUDA.acos(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+ LibMatrixCUDA.acos(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
case "atan":
- LibMatrixCUDA.atan(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+ LibMatrixCUDA.atan(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
case "sign":
- LibMatrixCUDA.sign(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName()); break;
+ LibMatrixCUDA.sign(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName()); break;
default:
throw new DMLRuntimeException("Unsupported GPU operator:" + opcode);
}
http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java
index 9573a1a..a03f9b1 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixArithmeticGPUInstruction.java
@@ -71,7 +71,7 @@ public class MatrixMatrixArithmeticGPUInstruction extends ArithmeticBinaryGPUIns
ec.setMetaData(_output.getName(), (int)rlen, (int)clen);
BinaryOperator bop = (BinaryOperator) _optr;
- LibMatrixCUDA.matrixScalarArithmetic(ec, ec.getGPUContext(), getExtendedOpcode(), in1, in2, _output.getName(), isLeftTransposed, isRightTransposed, bop);
+ LibMatrixCUDA.matrixScalarArithmetic(ec, ec.getGPUContext(0), getExtendedOpcode(), in1, in2, _output.getName(), isLeftTransposed, isRightTransposed, bop);
ec.releaseMatrixInputForGPUInstruction(_input1.getName());
ec.releaseMatrixInputForGPUInstruction(_input2.getName());
http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java
index 58905d6..e430e29 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixAxpyGPUInstruction.java
@@ -103,7 +103,7 @@ public class MatrixMatrixAxpyGPUInstruction extends ArithmeticBinaryGPUInstructi
" and input2:" + rlen2 + " X " + clen2);
}
- LibMatrixCUDA.axpy(ec, ec.getGPUContext(), getExtendedOpcode(), in1, in2, _output.getName(), multiplier*scalar.getDoubleValue());
+ LibMatrixCUDA.axpy(ec, ec.getGPUContext(0), getExtendedOpcode(), in1, in2, _output.getName(), multiplier*scalar.getDoubleValue());
ec.releaseMatrixInputForGPUInstruction(_input1.getName());
ec.releaseMatrixInputForGPUInstruction(_input2.getName());
http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixBuiltinGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixBuiltinGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixBuiltinGPUInstruction.java
index 8936735..e60a3d7 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixBuiltinGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixMatrixBuiltinGPUInstruction.java
@@ -46,7 +46,7 @@ public class MatrixMatrixBuiltinGPUInstruction extends BuiltinBinaryGPUInstructi
if(opcode.equals("solve")) {
ec.setMetaData(output.getName(), mat1.getNumColumns(), 1);
- LibMatrixCUDA.solve(ec, ec.getGPUContext(), getExtendedOpcode(), mat1, mat2, output.getName());
+ LibMatrixCUDA.solve(ec, ec.getGPUContext(0), getExtendedOpcode(), mat1, mat2, output.getName());
} else {
throw new DMLRuntimeException("Unsupported GPU operator:" + opcode);
http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java
index 53d56a3..bc63d12 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ReorgGPUInstruction.java
@@ -81,7 +81,7 @@ public class ReorgGPUInstruction extends GPUInstruction
//execute operation
ec.setMetaData(_output.getName(), rlen, clen);
- LibMatrixCUDA.transpose(ec, ec.getGPUContext(), getExtendedOpcode(), mat, _output.getName());
+ LibMatrixCUDA.transpose(ec, ec.getGPUContext(0), getExtendedOpcode(), mat, _output.getName());
//release inputs/outputs
ec.releaseMatrixInputForGPUInstruction(_input.getName());
http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java
index 64cb6c4..ea4665a 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ScalarMatrixArithmeticGPUInstruction.java
@@ -60,7 +60,7 @@ public class ScalarMatrixArithmeticGPUInstruction extends ArithmeticBinaryGPUIns
ScalarOperator sc_op = (ScalarOperator) _optr;
sc_op.setConstant(constant.getDoubleValue());
- LibMatrixCUDA.matrixScalarArithmetic(ec, ec.getGPUContext(), getExtendedOpcode(), in1, _output.getName(), isTransposed, sc_op);
+ LibMatrixCUDA.matrixScalarArithmetic(ec, ec.getGPUContext(0), getExtendedOpcode(), in1, _output.getName(), isTransposed, sc_op);
ec.releaseMatrixInputForGPUInstruction(mat.getName());
ec.releaseMatrixOutputForGPUInstruction(_output.getName());
http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
index 0ff9d14..b15dd69 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
@@ -52,453 +52,477 @@ import jcuda.jcusparse.cusparsePointerMode;
*/
public class CSRPointer {
- private static final Log LOG = LogFactory.getLog(CSRPointer.class.getName());
-
- private static final double ULTRA_SPARSITY_TURN_POINT = 0.0004;
-
- /** {@link GPUContext} instance to track the GPU to do work on */
- private final GPUContext gpuContext;
-
- public static cusparseMatDescr matrixDescriptor;
-
- /** Number of non zeroes */
- public long nnz;
-
- /** double array of non zero values */
- public Pointer val;
-
- /** integer array of start of all rows and end of last row + 1 */
- public Pointer rowPtr;
-
- /** integer array of nnz values' column indices */
- public Pointer colInd;
-
- /** descriptor of matrix, only CUSPARSE_MATRIX_TYPE_GENERAL supported */
- public cusparseMatDescr descr;
-
-
- public CSRPointer clone(int rows) throws DMLRuntimeException {
- CSRPointer me = this;
- CSRPointer that = new CSRPointer(me.getGPUContext());
-
- that.allocateMatDescrPointer();
- long totalSize = estimateSize(me.nnz, rows);
- that.gpuContext.ensureFreeSpace(totalSize);
-
- that.nnz = me.nnz;
- that.val = allocate(that.nnz * Sizeof.DOUBLE);
- that.rowPtr = allocate(rows * Sizeof.DOUBLE);
- that.colInd = allocate(that.nnz * Sizeof.DOUBLE);
-
- cudaMemcpy(that.val, me.val, that.nnz * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
- cudaMemcpy(that.rowPtr, me.rowPtr, rows * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
- cudaMemcpy(that.colInd, me.colInd, that.nnz * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
-
- return that;
- }
-
- /**
- * Default constructor to help with Factory method {@link #allocateEmpty(GPUContext, long, long)}
- * @param gCtx a valid {@link GPUContext}
- */
- private CSRPointer(GPUContext gCtx) {
- gpuContext = gCtx;
- val = new Pointer();
- rowPtr = new Pointer();
- colInd = new Pointer();
- allocateMatDescrPointer();
- }
-
-// private Pointer allocate(String instName, long size) throws DMLRuntimeException {
-// return getGPUContext().allocate(instName, size);
-// }
-
- private Pointer allocate(long size) throws DMLRuntimeException {
- return getGPUContext().allocate(size);
- }
-
-// private void cudaFreeHelper(Pointer toFree) throws DMLRuntimeException {
-// getGPUContext().cudaFreeHelper(toFree);
-// }
-
- private void cudaFreeHelper(Pointer toFree, boolean eager) throws DMLRuntimeException {
- getGPUContext().cudaFreeHelper(toFree, eager);
- }
-
-// private void cudaFreeHelper(String instName, Pointer toFree, boolean eager) throws DMLRuntimeException {
-// getGPUContext().cudaFreeHelper(instName, toFree, eager);
-// }
-
- private static long getDoubleSizeOf(long numElems) {
- return numElems * ((long)jcuda.Sizeof.DOUBLE);
- }
-
- private static long getIntSizeOf(long numElems) {
- return numElems * ((long)jcuda.Sizeof.INT);
- }
-
- private GPUContext getGPUContext() {
- return gpuContext;
- }
-
- public static int toIntExact(long l) throws DMLRuntimeException {
- if (l < Integer.MIN_VALUE || l > Integer.MAX_VALUE) {
- throw new DMLRuntimeException("Cannot be cast to int:" + l);
- }
- return (int) l;
- }
-
- /**
- * @return Singleton default matrix descriptor object
- * (set with CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_INDEX_BASE_ZERO)
- */
- public static cusparseMatDescr getDefaultCuSparseMatrixDescriptor() {
- if (matrixDescriptor == null) {
- // Code from JCuda Samples - http://www.jcuda.org/samples/JCusparseSample.java
- matrixDescriptor = new cusparseMatDescr();
- cusparseCreateMatDescr(matrixDescriptor);
- cusparseSetMatType(matrixDescriptor, CUSPARSE_MATRIX_TYPE_GENERAL);
- cusparseSetMatIndexBase(matrixDescriptor, CUSPARSE_INDEX_BASE_ZERO);
- }
- return matrixDescriptor;
- }
-
- /**
- * Estimate the size of a CSR matrix in GPU memory
- * Size of pointers is not needed and is not added in
- *
- * @param nnz2 number of non zeroes
- * @param rows number of rows
- * @return size estimate
- */
- public static long estimateSize(long nnz2, long rows) {
- long sizeofValArray = getDoubleSizeOf(nnz2);
- long sizeofRowPtrArray = getIntSizeOf(rows + 1);
- long sizeofColIndArray = getIntSizeOf(nnz2);
- long sizeofDescr = getIntSizeOf(4);
- // From the CUSPARSE documentation, the cusparseMatDescr in native code is represented as:
- // typedef struct {
- // cusparseMatrixType_t MatrixType;
- // cusparseFillMode_t FillMode;
- // cusparseDiagType_t DiagType;
- // cusparseIndexBase_t IndexBase;
- // } cusparseMatDescr_t;
- long tot = sizeofValArray + sizeofRowPtrArray + sizeofColIndArray + sizeofDescr;
- return tot;
- }
-
- /**
- * Static method to copy a CSR sparse matrix from Host to Device
- *
- * @param dest [input] destination location (on GPU)
- * @param rows number of rows
- * @param nnz number of non-zeroes
- * @param rowPtr integer array of row pointers
- * @param colInd integer array of column indices
- * @param values double array of non zero values
- */
- public static void copyToDevice(CSRPointer dest, int rows, long nnz, int[] rowPtr, int[] colInd, double[] values) {
- CSRPointer r = dest;
- long t0 = 0;
- if (DMLScript.STATISTICS) t0 = System.nanoTime();
- r.nnz = nnz;
- cudaMemcpy(r.rowPtr, Pointer.to(rowPtr), getIntSizeOf(rows + 1), cudaMemcpyHostToDevice);
- cudaMemcpy(r.colInd, Pointer.to(colInd), getIntSizeOf(nnz), cudaMemcpyHostToDevice);
- cudaMemcpy(r.val, Pointer.to(values), getDoubleSizeOf(nnz), cudaMemcpyHostToDevice);
- if (DMLScript.STATISTICS) GPUStatistics.cudaToDevTime.addAndGet(System.nanoTime() - t0);
- if (DMLScript.STATISTICS) GPUStatistics.cudaToDevCount.addAndGet(3);
- }
-
- /**
- * Static method to copy a CSR sparse matrix from Device to host
- *
- * @param src [input] source location (on GPU)
- * @param rows [input] number of rows
- * @param nnz [input] number of non-zeroes
- * @param rowPtr [output] pre-allocated integer array of row pointers of size (rows+1)
- * @param colInd [output] pre-allocated integer array of column indices of size nnz
- * @param values [output] pre-allocated double array of values of size nnz
- */
- public static void copyToHost(CSRPointer src, int rows, long nnz, int[] rowPtr, int[] colInd, double[] values) {
- CSRPointer r = src;
- long t0 = 0;
- if (DMLScript.STATISTICS) t0 = System.nanoTime();
- cudaMemcpy(Pointer.to(rowPtr), r.rowPtr, getIntSizeOf(rows + 1), cudaMemcpyDeviceToHost);
- cudaMemcpy(Pointer.to(colInd), r.colInd, getIntSizeOf(nnz), cudaMemcpyDeviceToHost);
- cudaMemcpy(Pointer.to(values), r.val, getDoubleSizeOf(nnz), cudaMemcpyDeviceToHost);
- if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - t0);
- if (DMLScript.STATISTICS) GPUStatistics.cudaFromDevCount.addAndGet(3);
- }
-
- /**
- * Estimates the number of non zero elements from the results of a sparse cusparseDgeam operation
- * C = a op(A) + b op(B)
- * @param gCtx a valid {@link GPUContext}
- * @param handle a valid {@link cusparseHandle}
- * @param A Sparse Matrix A on GPU
- * @param B Sparse Matrix B on GPU
- * @param m Rows in A
- * @param n Columns in Bs
- * @return CSR (compressed sparse row) pointer
- * @throws DMLRuntimeException if DMLRuntimeException occurs
- */
- public static CSRPointer allocateForDgeam(GPUContext gCtx, cusparseHandle handle, CSRPointer A, CSRPointer B, int m, int n)
- throws DMLRuntimeException {
- if (A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE) {
- throw new DMLRuntimeException("Number of non zeroes is larger than supported by cuSparse");
- }
- CSRPointer C = new CSRPointer(gCtx);
- step1AllocateRowPointers(gCtx, handle, C, m);
- step2GatherNNZGeam(gCtx, handle, A, B, C, m, n);
- step3AllocateValNInd(gCtx, handle, C);
- return C;
- }
-
- /**
- * Estimates the number of non-zero elements from the result of a sparse matrix multiplication C = A * B
- * and returns the {@link CSRPointer} to C with the appropriate GPU memory.
- *
- * @param gCtx ?
- * @param handle a valid {@link cusparseHandle}
- * @param A Sparse Matrix A on GPU
- * @param transA 'T' if A is to be transposed, 'N' otherwise
- * @param B Sparse Matrix B on GPU
- * @param transB 'T' if B is to be transposed, 'N' otherwise
- * @param m Rows in A
- * @param n Columns in B
- * @param k Columns in A / Rows in B
- * @return a {@link CSRPointer} instance that encapsulates the CSR matrix on GPU
- * @throws DMLRuntimeException if DMLRuntimeException occurs
- */
- public static CSRPointer allocateForMatrixMultiply(GPUContext gCtx, cusparseHandle handle, CSRPointer A, int transA, CSRPointer B, int transB, int m, int n, int k)
- throws DMLRuntimeException {
- // Following the code example at http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrgemm and at
- // https://github.com/jcuda/jcuda-matrix-utils/blob/master/JCudaMatrixUtils/src/test/java/org/jcuda/matrix/samples/JCusparseSampleDgemm.java
- CSRPointer C = new CSRPointer(gCtx);
- step1AllocateRowPointers(gCtx, handle, C, m);
- step2GatherNNZGemm(gCtx, handle, A, transA, B, transB, C, m, n, k);
- step3AllocateValNInd(gCtx, handle, C);
- return C;
- }
-
- /**
- * Check for ultra sparsity
- *
- * @param rows number of rows
- * @param cols number of columns
- * @return true if ultra sparse
- */
- public boolean isUltraSparse(int rows, int cols) {
- double sp = ((double) nnz / rows / cols);
- return sp < ULTRA_SPARSITY_TURN_POINT;
- }
-
-// ==============================================================================================
-
-// The following methods estimate the memory needed for sparse matrices that are
-// results of operations on other sparse matrices using the cuSparse Library.
-// The operation is C = op(A) binaryOperation op(B), C is the output and A & B are the inputs
-// op = whether to transpose or not
-// binaryOperation = For cuSparse, +, - are *(matmul) are supported
-
-// From CuSparse Manual,
-// Since A and B have different sparsity patterns, cuSPARSE adopts a two-step approach
-// to complete sparse matrix C. In the first step, the user allocates csrRowPtrC of m+1
-// elements and uses function cusparseXcsrgeamNnz() to determine csrRowPtrC
-// and the total number of nonzero elements. In the second step, the user gathers nnzC
-//(number of nonzero elements of matrix C) from either (nnzC=*nnzTotalDevHostPtr)
-// or (nnzC=csrRowPtrC(m)-csrRowPtrC(0)) and allocates csrValC, csrColIndC of
-// nnzC elements respectively, then finally calls function cusparse[S|D|C|Z]csrgeam()
-// to complete matrix C.
-
- /**
- * Initializes {@link #descr} to CUSPARSE_MATRIX_TYPE_GENERAL,
- * the default that works for DGEMM.
- */
- private void allocateMatDescrPointer() {
- this.descr = getDefaultCuSparseMatrixDescriptor();
- }
-
- /**
- * Factory method to allocate an empty CSR Sparse matrix on the GPU
- *
- * @param gCtx ?
- * @param nnz2 number of non-zeroes
- * @param rows number of rows
- * @return a {@link CSRPointer} instance that encapsulates the CSR matrix on GPU
- * @throws DMLRuntimeException if DMLRuntimeException occurs
- */
- public static CSRPointer allocateEmpty(GPUContext gCtx, long nnz2, long rows) throws DMLRuntimeException {
- LOG.trace("GPU : allocateEmpty from CSRPointer with nnz=" + nnz2 + " and rows=" + rows + ", GPUContext=" + gCtx);
- assert nnz2 > -1 : "Incorrect usage of internal API, number of non zeroes is less than 0 when trying to allocate sparse data on GPU";
- CSRPointer r = new CSRPointer(gCtx);
- r.nnz = nnz2;
- if (nnz2 == 0) {
- // The convention for an empty sparse matrix is to just have an instance of the CSRPointer object
- // with no memory allocated on the GPU.
- return r;
- }
- gCtx.ensureFreeSpace(getDoubleSizeOf(nnz2) + getIntSizeOf(rows + 1) + getIntSizeOf(nnz2));
- // increment the cudaCount by 1 for the allocation of all 3 arrays
- r.val = gCtx.allocate(null, getDoubleSizeOf(nnz2));
- r.rowPtr = gCtx.allocate(null, getIntSizeOf(rows + 1));
- r.colInd = gCtx.allocate(null, getIntSizeOf(nnz2));
- return r;
- }
-
- /**
- * Allocate row pointers of m+1 elements
- * @param gCtx a valid {@link GPUContext}
- * @param handle a valid {@link cusparseHandle}
- * @param C Output matrix
- * @param rowsC number of rows in C
- * @throws DMLRuntimeException ?
- */
- private static void step1AllocateRowPointers(GPUContext gCtx, cusparseHandle handle, CSRPointer C, int rowsC) throws DMLRuntimeException {
- LOG.trace("GPU : step1AllocateRowPointers" + ", GPUContext=" + gCtx);
- cusparseSetPointerMode(handle, cusparsePointerMode.CUSPARSE_POINTER_MODE_HOST);
- //cudaDeviceSynchronize;
- // Do not increment the cudaCount of allocations on GPU
- C.rowPtr = gCtx.allocate(getIntSizeOf((long) rowsC + 1));
- }
-
- /**
- * Determine total number of nonzero element for the cusparseDgeam operation.
- * This is done from either (nnzC=*nnzTotalDevHostPtr) or (nnzC=csrRowPtrC(m)-csrRowPtrC(0))
- * @param gCtx a valid {@link GPUContext}
- * @param handle a valid {@link cusparseHandle}
- * @param A Sparse Matrix A on GPU
- * @param B Sparse Matrix B on GPU
- * @param C Output Sparse Matrix C on GPU
- * @param m Rows in C
- * @param n Columns in C
- * @throws DMLRuntimeException ?
- */
- private static void step2GatherNNZGeam(GPUContext gCtx, cusparseHandle handle, CSRPointer A, CSRPointer B, CSRPointer C, int m, int n) throws DMLRuntimeException {
- LOG.trace("GPU : step2GatherNNZGeam for DGEAM" + ", GPUContext=" + gCtx);
- int[] CnnzArray = {-1};
- cusparseXcsrgeamNnz(handle, m, n,
- A.descr, toIntExact(A.nnz), A.rowPtr, A.colInd,
- B.descr, toIntExact(B.nnz), B.rowPtr, B.colInd,
- C.descr, C.rowPtr, Pointer.to(CnnzArray));
- //cudaDeviceSynchronize;
- if (CnnzArray[0] != -1) {
- C.nnz = CnnzArray[0];
- } else {
- int baseArray[] = {0};
- cudaMemcpy(Pointer.to(CnnzArray), C.rowPtr.withByteOffset(getIntSizeOf(m)), getIntSizeOf(1), cudaMemcpyDeviceToHost);
- cudaMemcpy(Pointer.to(baseArray), C.rowPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost);
- C.nnz = CnnzArray[0] - baseArray[0];
- }
- }
-
-// ==============================================================================================
-
- /**
- * Determine total number of nonzero element for the cusparseDgemm operation.
- * @param gCtx a valid {@link GPUContext}
- * @param handle a valid {@link cusparseHandle}
- * @param A Sparse Matrix A on GPU
- * @param transA op - whether A is transposed
- * @param B Sparse Matrix B on GPU
- * @param transB op - whether B is transposed
- * @param C Output Sparse Matrix C on GPU
- * @param m Number of rows of sparse matrix op ( A ) and C
- * @param n Number of columns of sparse matrix op ( B ) and C
- * @param k Number of columns/rows of sparse matrix op ( A ) / op ( B )
- * @throws DMLRuntimeException ?
- */
- private static void step2GatherNNZGemm(GPUContext gCtx, cusparseHandle handle, CSRPointer A, int transA, CSRPointer B, int transB, CSRPointer C, int m, int n, int k) throws DMLRuntimeException {
- LOG.trace("GPU : step2GatherNNZGemm for DGEMM" + ", GPUContext=" + gCtx);
- int[] CnnzArray = {-1};
- if (A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE) {
- throw new DMLRuntimeException("Number of non zeroes is larger than supported by cuSparse");
- }
- cusparseXcsrgemmNnz(handle, transA, transB, m, n, k,
- A.descr, toIntExact(A.nnz), A.rowPtr, A.colInd,
- B.descr, toIntExact(B.nnz), B.rowPtr, B.colInd,
- C.descr, C.rowPtr, Pointer.to(CnnzArray));
- //cudaDeviceSynchronize;
- if (CnnzArray[0] != -1) {
- C.nnz = CnnzArray[0];
- } else {
- int baseArray[] = {0};
- cudaMemcpy(Pointer.to(CnnzArray), C.rowPtr.withByteOffset(getIntSizeOf(m)), getIntSizeOf(1), cudaMemcpyDeviceToHost);
- cudaMemcpy(Pointer.to(baseArray), C.rowPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost);
- C.nnz = CnnzArray[0] - baseArray[0];
- }
- }
-
- /**
- * Allocate val and index pointers.
- * @param gCtx a valid {@link GPUContext}
- * @param handle a valid {@link cusparseHandle}
- * @param C Output sparse matrix on GPU
- * @throws DMLRuntimeException ?
- */
- private static void step3AllocateValNInd(GPUContext gCtx, cusparseHandle handle, CSRPointer C) throws DMLRuntimeException {
- LOG.trace("GPU : step3AllocateValNInd" + ", GPUContext=" + gCtx);
- // Increment cudaCount by one when all three arrays of CSR sparse array are allocated
- C.val = gCtx.allocate(null, getDoubleSizeOf(C.nnz));
- C.colInd = gCtx.allocate(null, getIntSizeOf(C.nnz));
- }
-
- /**
- * Copies this CSR matrix on the GPU to a dense column-major matrix
- * on the GPU. This is a temporary matrix for operations such as
- * cusparseDcsrmv.
- * Since the allocated matrix is temporary, bookkeeping is not updated.
- * The caller is responsible for calling "free" on the returned Pointer object
- *
- * @param cusparseHandle a valid {@link cusparseHandle}
- * @param cublasHandle a valid {@link cublasHandle}
- * @param rows number of rows in this CSR matrix
- * @param cols number of columns in this CSR matrix
- * @throws DMLRuntimeException if DMLRuntimeException occurs
- * @return A {@link Pointer} to the allocated dense matrix (in column-major format)
- */
- public Pointer toColumnMajorDenseMatrix(cusparseHandle cusparseHandle, cublasHandle cublasHandle, int rows, int cols) throws DMLRuntimeException {
- LOG.trace("GPU : sparse -> column major dense (inside CSRPointer) on " + this + ", GPUContext=" + getGPUContext());
- long size = ((long) rows) * getDoubleSizeOf((long) cols);
- Pointer A = allocate(size);
- // If this sparse block is empty, the allocated dense matrix, initialized to zeroes, will be returned.
- if (val != null && rowPtr != null && colInd != null && nnz > 0) {
- // Note: cusparseDcsr2dense method cannot handle empty blocks
- cusparseDcsr2dense(cusparseHandle, rows, cols, descr, val, rowPtr, colInd, A, rows);
- //cudaDeviceSynchronize;
- } else {
- LOG.debug("in CSRPointer, the values array, row pointers array or column indices array was null");
- }
- return A;
- }
-
- /**
- * Calls cudaFree lazily on the allocated {@link Pointer} instances
- *
- * @throws DMLRuntimeException ?
- */
- public void deallocate() throws DMLRuntimeException {
- deallocate(false);
- }
-
- /**
- * Calls cudaFree lazily or eagerly on the allocated {@link Pointer} instances
- *
- * @param eager whether to do eager or lazy cudaFrees
- * @throws DMLRuntimeException ?
- */
- public void deallocate(boolean eager) throws DMLRuntimeException {
- if (nnz > 0) {
- cudaFreeHelper(val, eager);
- cudaFreeHelper(rowPtr, eager);
- cudaFreeHelper(colInd, eager);
- }
- }
-
- @Override
- public String toString() {
- return "CSRPointer{" +
- "nnz=" + nnz +
- '}';
- }
+ private static final Log LOG = LogFactory.getLog(CSRPointer.class.getName());
+
+ private static final double ULTRA_SPARSITY_TURN_POINT = 0.0004;
+ public static cusparseMatDescr matrixDescriptor;
+ /**
+ * {@link GPUContext} instance to track the GPU to do work on
+ */
+ private final GPUContext gpuContext;
+ /**
+ * Number of non zeroes
+ */
+ public long nnz;
+
+ /**
+ * double array of non zero values
+ */
+ public Pointer val;
+
+ /**
+ * integer array of start of all rows and end of last row + 1
+ */
+ public Pointer rowPtr;
+
+ /**
+ * integer array of nnz values' column indices
+ */
+ public Pointer colInd;
+
+ /**
+ * descriptor of matrix, only CUSPARSE_MATRIX_TYPE_GENERAL supported
+ */
+ public cusparseMatDescr descr;
+
+ /**
+ * Default constructor to help with Factory method {@link #allocateEmpty(GPUContext, long, long)}
+ *
+ * @param gCtx a valid {@link GPUContext}
+ */
+ private CSRPointer(GPUContext gCtx) {
+ gpuContext = gCtx;
+ val = new Pointer();
+ rowPtr = new Pointer();
+ colInd = new Pointer();
+ allocateMatDescrPointer();
+ }
+
+ private static long getDoubleSizeOf(long numElems) {
+ return numElems * ((long) jcuda.Sizeof.DOUBLE);
+ }
+
+ // private Pointer allocate(String instName, long size) throws DMLRuntimeException {
+ // return getGPUContext().allocate(instName, size);
+ // }
+
+ private static long getIntSizeOf(long numElems) {
+ return numElems * ((long) jcuda.Sizeof.INT);
+ }
+
+ // private void cudaFreeHelper(Pointer toFree) throws DMLRuntimeException {
+ // getGPUContext().cudaFreeHelper(toFree);
+ // }
+
+ public static int toIntExact(long l) throws DMLRuntimeException {
+ if (l < Integer.MIN_VALUE || l > Integer.MAX_VALUE) {
+ throw new DMLRuntimeException("Cannot be cast to int:" + l);
+ }
+ return (int) l;
+ }
+
+ // private void cudaFreeHelper(String instName, Pointer toFree, boolean eager) throws DMLRuntimeException {
+ // getGPUContext().cudaFreeHelper(instName, toFree, eager);
+ // }
+
+ /**
+ * @return Singleton default matrix descriptor object
+ * (set with CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_INDEX_BASE_ZERO)
+ */
+ public static cusparseMatDescr getDefaultCuSparseMatrixDescriptor() {
+ if (matrixDescriptor == null) {
+ // Code from JCuda Samples - http://www.jcuda.org/samples/JCusparseSample.java
+ matrixDescriptor = new cusparseMatDescr();
+ cusparseCreateMatDescr(matrixDescriptor);
+ cusparseSetMatType(matrixDescriptor, CUSPARSE_MATRIX_TYPE_GENERAL);
+ cusparseSetMatIndexBase(matrixDescriptor, CUSPARSE_INDEX_BASE_ZERO);
+ }
+ return matrixDescriptor;
+ }
+
+ /**
+ * Estimate the size of a CSR matrix in GPU memory
+ * Size of pointers is not needed and is not added in
+ *
+ * @param nnz2 number of non zeroes
+ * @param rows number of rows
+ * @return size estimate
+ */
+ public static long estimateSize(long nnz2, long rows) {
+ long sizeofValArray = getDoubleSizeOf(nnz2);
+ long sizeofRowPtrArray = getIntSizeOf(rows + 1);
+ long sizeofColIndArray = getIntSizeOf(nnz2);
+ long sizeofDescr = getIntSizeOf(4);
+ // From the CUSPARSE documentation, the cusparseMatDescr in native code is represented as:
+ // typedef struct {
+ // cusparseMatrixType_t MatrixType;
+ // cusparseFillMode_t FillMode;
+ // cusparseDiagType_t DiagType;
+ // cusparseIndexBase_t IndexBase;
+ // } cusparseMatDescr_t;
+ long tot = sizeofValArray + sizeofRowPtrArray + sizeofColIndArray + sizeofDescr;
+ return tot;
+ }
+
+ /**
+ * Static method to copy a CSR sparse matrix from Host to Device
+ *
+ * @param dest [input] destination location (on GPU)
+ * @param rows number of rows
+ * @param nnz number of non-zeroes
+ * @param rowPtr integer array of row pointers
+ * @param colInd integer array of column indices
+ * @param values double array of non zero values
+ */
+ public static void copyToDevice(CSRPointer dest, int rows, long nnz, int[] rowPtr, int[] colInd, double[] values) {
+ CSRPointer r = dest;
+ long t0 = 0;
+ if (DMLScript.STATISTICS)
+ t0 = System.nanoTime();
+ r.nnz = nnz;
+ cudaMemcpy(r.rowPtr, Pointer.to(rowPtr), getIntSizeOf(rows + 1), cudaMemcpyHostToDevice);
+ cudaMemcpy(r.colInd, Pointer.to(colInd), getIntSizeOf(nnz), cudaMemcpyHostToDevice);
+ cudaMemcpy(r.val, Pointer.to(values), getDoubleSizeOf(nnz), cudaMemcpyHostToDevice);
+ if (DMLScript.STATISTICS)
+ GPUStatistics.cudaToDevTime.addAndGet(System.nanoTime() - t0);
+ if (DMLScript.STATISTICS)
+ GPUStatistics.cudaToDevCount.addAndGet(3);
+ }
+
+ /**
+ * Static method to copy a CSR sparse matrix from Device to host
+ *
+ * @param src [input] source location (on GPU)
+ * @param rows [input] number of rows
+ * @param nnz [input] number of non-zeroes
+ * @param rowPtr [output] pre-allocated integer array of row pointers of size (rows+1)
+ * @param colInd [output] pre-allocated integer array of column indices of size nnz
+ * @param values [output] pre-allocated double array of values of size nnz
+ */
+ public static void copyToHost(CSRPointer src, int rows, long nnz, int[] rowPtr, int[] colInd, double[] values) {
+ CSRPointer r = src;
+ long t0 = 0;
+ if (DMLScript.STATISTICS)
+ t0 = System.nanoTime();
+ cudaMemcpy(Pointer.to(rowPtr), r.rowPtr, getIntSizeOf(rows + 1), cudaMemcpyDeviceToHost);
+ cudaMemcpy(Pointer.to(colInd), r.colInd, getIntSizeOf(nnz), cudaMemcpyDeviceToHost);
+ cudaMemcpy(Pointer.to(values), r.val, getDoubleSizeOf(nnz), cudaMemcpyDeviceToHost);
+ if (DMLScript.STATISTICS)
+ GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - t0);
+ if (DMLScript.STATISTICS)
+ GPUStatistics.cudaFromDevCount.addAndGet(3);
+ }
+
+ /**
+ * Estimates the number of non zero elements from the results of a sparse cusparseDgeam operation
+ * C = a op(A) + b op(B)
+ *
+ * @param gCtx a valid {@link GPUContext}
+ * @param handle a valid {@link cusparseHandle}
+ * @param A Sparse Matrix A on GPU
+ * @param B Sparse Matrix B on GPU
+ * @param m Rows in A
+ * @param n Columns in Bs
+ * @return CSR (compressed sparse row) pointer
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ public static CSRPointer allocateForDgeam(GPUContext gCtx, cusparseHandle handle, CSRPointer A, CSRPointer B, int m,
+ int n) throws DMLRuntimeException {
+ if (A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE) {
+ throw new DMLRuntimeException("Number of non zeroes is larger than supported by cuSparse");
+ }
+ CSRPointer C = new CSRPointer(gCtx);
+ step1AllocateRowPointers(gCtx, handle, C, m);
+ step2GatherNNZGeam(gCtx, handle, A, B, C, m, n);
+ step3AllocateValNInd(gCtx, handle, C);
+ return C;
+ }
+
+ /**
+ * Estimates the number of non-zero elements from the result of a sparse matrix multiplication C = A * B
+ * and returns the {@link CSRPointer} to C with the appropriate GPU memory.
+ *
+ * @param gCtx ?
+ * @param handle a valid {@link cusparseHandle}
+ * @param A Sparse Matrix A on GPU
+ * @param transA 'T' if A is to be transposed, 'N' otherwise
+ * @param B Sparse Matrix B on GPU
+ * @param transB 'T' if B is to be transposed, 'N' otherwise
+ * @param m Rows in A
+ * @param n Columns in B
+ * @param k Columns in A / Rows in B
+ * @return a {@link CSRPointer} instance that encapsulates the CSR matrix on GPU
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ public static CSRPointer allocateForMatrixMultiply(GPUContext gCtx, cusparseHandle handle, CSRPointer A, int transA,
+ CSRPointer B, int transB, int m, int n, int k) throws DMLRuntimeException {
+ // Following the code example at http://docs.nvidia.com/cuda/cusparse/#cusparse-lt-t-gt-csrgemm and at
+ // https://github.com/jcuda/jcuda-matrix-utils/blob/master/JCudaMatrixUtils/src/test/java/org/jcuda/matrix/samples/JCusparseSampleDgemm.java
+ CSRPointer C = new CSRPointer(gCtx);
+ step1AllocateRowPointers(gCtx, handle, C, m);
+ step2GatherNNZGemm(gCtx, handle, A, transA, B, transB, C, m, n, k);
+ step3AllocateValNInd(gCtx, handle, C);
+ return C;
+ }
+
+ /**
+ * Factory method to allocate an empty CSR Sparse matrix on the GPU
+ *
+ * @param gCtx ?
+ * @param nnz2 number of non-zeroes
+ * @param rows number of rows
+ * @return a {@link CSRPointer} instance that encapsulates the CSR matrix on GPU
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ public static CSRPointer allocateEmpty(GPUContext gCtx, long nnz2, long rows) throws DMLRuntimeException {
+ LOG.trace(
+ "GPU : allocateEmpty from CSRPointer with nnz=" + nnz2 + " and rows=" + rows + ", GPUContext=" + gCtx);
+ assert nnz2
+ > -1 : "Incorrect usage of internal API, number of non zeroes is less than 0 when trying to allocate sparse data on GPU";
+ CSRPointer r = new CSRPointer(gCtx);
+ r.nnz = nnz2;
+ if (nnz2 == 0) {
+ // The convention for an empty sparse matrix is to just have an instance of the CSRPointer object
+ // with no memory allocated on the GPU.
+ return r;
+ }
+ gCtx.ensureFreeSpace(getDoubleSizeOf(nnz2) + getIntSizeOf(rows + 1) + getIntSizeOf(nnz2));
+ // increment the cudaCount by 1 for the allocation of all 3 arrays
+ r.val = gCtx.allocate(null, getDoubleSizeOf(nnz2));
+ r.rowPtr = gCtx.allocate(null, getIntSizeOf(rows + 1));
+ r.colInd = gCtx.allocate(null, getIntSizeOf(nnz2));
+ return r;
+ }
+
+ /**
+ * Allocate row pointers of m+1 elements
+ *
+ * @param gCtx a valid {@link GPUContext}
+ * @param handle a valid {@link cusparseHandle}
+ * @param C Output matrix
+ * @param rowsC number of rows in C
+ * @throws DMLRuntimeException ?
+ */
+ private static void step1AllocateRowPointers(GPUContext gCtx, cusparseHandle handle, CSRPointer C, int rowsC)
+ throws DMLRuntimeException {
+ LOG.trace("GPU : step1AllocateRowPointers" + ", GPUContext=" + gCtx);
+ cusparseSetPointerMode(handle, cusparsePointerMode.CUSPARSE_POINTER_MODE_HOST);
+ //cudaDeviceSynchronize;
+ // Do not increment the cudaCount of allocations on GPU
+ C.rowPtr = gCtx.allocate(getIntSizeOf((long) rowsC + 1));
+ }
+
+ /**
+ * Determine total number of nonzero element for the cusparseDgeam operation.
+ * This is done from either (nnzC=*nnzTotalDevHostPtr) or (nnzC=csrRowPtrC(m)-csrRowPtrC(0))
+ *
+ * @param gCtx a valid {@link GPUContext}
+ * @param handle a valid {@link cusparseHandle}
+ * @param A Sparse Matrix A on GPU
+ * @param B Sparse Matrix B on GPU
+ * @param C Output Sparse Matrix C on GPU
+ * @param m Rows in C
+ * @param n Columns in C
+ * @throws DMLRuntimeException ?
+ */
+ private static void step2GatherNNZGeam(GPUContext gCtx, cusparseHandle handle, CSRPointer A, CSRPointer B,
+ CSRPointer C, int m, int n) throws DMLRuntimeException {
+ LOG.trace("GPU : step2GatherNNZGeam for DGEAM" + ", GPUContext=" + gCtx);
+ int[] CnnzArray = { -1 };
+ cusparseXcsrgeamNnz(handle, m, n, A.descr, toIntExact(A.nnz), A.rowPtr, A.colInd, B.descr, toIntExact(B.nnz),
+ B.rowPtr, B.colInd, C.descr, C.rowPtr, Pointer.to(CnnzArray));
+ //cudaDeviceSynchronize;
+ if (CnnzArray[0] != -1) {
+ C.nnz = CnnzArray[0];
+ } else {
+ int baseArray[] = { 0 };
+ cudaMemcpy(Pointer.to(CnnzArray), C.rowPtr.withByteOffset(getIntSizeOf(m)), getIntSizeOf(1),
+ cudaMemcpyDeviceToHost);
+ cudaMemcpy(Pointer.to(baseArray), C.rowPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost);
+ C.nnz = CnnzArray[0] - baseArray[0];
+ }
+ }
+
+ /**
+ * Determine total number of nonzero element for the cusparseDgemm operation.
+ *
+ * @param gCtx a valid {@link GPUContext}
+ * @param handle a valid {@link cusparseHandle}
+ * @param A Sparse Matrix A on GPU
+ * @param transA op - whether A is transposed
+ * @param B Sparse Matrix B on GPU
+ * @param transB op - whether B is transposed
+ * @param C Output Sparse Matrix C on GPU
+ * @param m Number of rows of sparse matrix op ( A ) and C
+ * @param n Number of columns of sparse matrix op ( B ) and C
+ * @param k Number of columns/rows of sparse matrix op ( A ) / op ( B )
+ * @throws DMLRuntimeException ?
+ */
+ private static void step2GatherNNZGemm(GPUContext gCtx, cusparseHandle handle, CSRPointer A, int transA,
+ CSRPointer B, int transB, CSRPointer C, int m, int n, int k) throws DMLRuntimeException {
+ LOG.trace("GPU : step2GatherNNZGemm for DGEMM" + ", GPUContext=" + gCtx);
+ int[] CnnzArray = { -1 };
+ if (A.nnz >= Integer.MAX_VALUE || B.nnz >= Integer.MAX_VALUE) {
+ throw new DMLRuntimeException("Number of non zeroes is larger than supported by cuSparse");
+ }
+ cusparseXcsrgemmNnz(handle, transA, transB, m, n, k, A.descr, toIntExact(A.nnz), A.rowPtr, A.colInd, B.descr,
+ toIntExact(B.nnz), B.rowPtr, B.colInd, C.descr, C.rowPtr, Pointer.to(CnnzArray));
+ //cudaDeviceSynchronize;
+ if (CnnzArray[0] != -1) {
+ C.nnz = CnnzArray[0];
+ } else {
+ int baseArray[] = { 0 };
+ cudaMemcpy(Pointer.to(CnnzArray), C.rowPtr.withByteOffset(getIntSizeOf(m)), getIntSizeOf(1),
+ cudaMemcpyDeviceToHost);
+ cudaMemcpy(Pointer.to(baseArray), C.rowPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost);
+ C.nnz = CnnzArray[0] - baseArray[0];
+ }
+ }
+
+ /**
+ * Allocate val and index pointers.
+ *
+ * @param gCtx a valid {@link GPUContext}
+ * @param handle a valid {@link cusparseHandle}
+ * @param C Output sparse matrix on GPU
+ * @throws DMLRuntimeException ?
+ */
+ private static void step3AllocateValNInd(GPUContext gCtx, cusparseHandle handle, CSRPointer C)
+ throws DMLRuntimeException {
+ LOG.trace("GPU : step3AllocateValNInd" + ", GPUContext=" + gCtx);
+ // Increment cudaCount by one when all three arrays of CSR sparse array are allocated
+ C.val = gCtx.allocate(null, getDoubleSizeOf(C.nnz));
+ C.colInd = gCtx.allocate(null, getIntSizeOf(C.nnz));
+ }
+
+ // ==============================================================================================
+
+ // The following methods estimate the memory needed for sparse matrices that are
+ // results of operations on other sparse matrices using the cuSparse Library.
+ // The operation is C = op(A) binaryOperation op(B), C is the output and A & B are the inputs
+ // op = whether to transpose or not
+ // binaryOperation = For cuSparse, +, - are *(matmul) are supported
+
+ // From CuSparse Manual,
+ // Since A and B have different sparsity patterns, cuSPARSE adopts a two-step approach
+ // to complete sparse matrix C. In the first step, the user allocates csrRowPtrC of m+1
+ // elements and uses function cusparseXcsrgeamNnz() to determine csrRowPtrC
+ // and the total number of nonzero elements. In the second step, the user gathers nnzC
+ //(number of nonzero elements of matrix C) from either (nnzC=*nnzTotalDevHostPtr)
+ // or (nnzC=csrRowPtrC(m)-csrRowPtrC(0)) and allocates csrValC, csrColIndC of
+ // nnzC elements respectively, then finally calls function cusparse[S|D|C|Z]csrgeam()
+ // to complete matrix C.
+
+ public CSRPointer clone(int rows) throws DMLRuntimeException {
+ CSRPointer me = this;
+ CSRPointer that = new CSRPointer(me.getGPUContext());
+
+ that.allocateMatDescrPointer();
+ long totalSize = estimateSize(me.nnz, rows);
+ that.gpuContext.ensureFreeSpace(totalSize);
+
+ that.nnz = me.nnz;
+ that.val = allocate(that.nnz * Sizeof.DOUBLE);
+ that.rowPtr = allocate(rows * Sizeof.DOUBLE);
+ that.colInd = allocate(that.nnz * Sizeof.DOUBLE);
+
+ cudaMemcpy(that.val, me.val, that.nnz * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
+ cudaMemcpy(that.rowPtr, me.rowPtr, rows * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
+ cudaMemcpy(that.colInd, me.colInd, that.nnz * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
+
+ return that;
+ }
+
+ private Pointer allocate(long size) throws DMLRuntimeException {
+ return getGPUContext().allocate(size);
+ }
+
+ private void cudaFreeHelper(Pointer toFree, boolean eager) throws DMLRuntimeException {
+ getGPUContext().cudaFreeHelper(toFree, eager);
+ }
+
+ private GPUContext getGPUContext() {
+ return gpuContext;
+ }
+
+ // ==============================================================================================
+
+ /**
+ * Check for ultra sparsity
+ *
+ * @param rows number of rows
+ * @param cols number of columns
+ * @return true if ultra sparse
+ */
+ public boolean isUltraSparse(int rows, int cols) {
+ double sp = ((double) nnz / rows / cols);
+ return sp < ULTRA_SPARSITY_TURN_POINT;
+ }
+
+ /**
+ * Initializes {@link #descr} to CUSPARSE_MATRIX_TYPE_GENERAL,
+ * the default that works for DGEMM.
+ */
+ private void allocateMatDescrPointer() {
+ this.descr = getDefaultCuSparseMatrixDescriptor();
+ }
+
+ /**
+ * Copies this CSR matrix on the GPU to a dense column-major matrix
+ * on the GPU. This is a temporary matrix for operations such as
+ * cusparseDcsrmv.
+ * Since the allocated matrix is temporary, bookkeeping is not updated.
+ * The caller is responsible for calling "free" on the returned Pointer object
+ *
+ * @param cusparseHandle a valid {@link cusparseHandle}
+ * @param cublasHandle a valid {@link cublasHandle}
+ * @param rows number of rows in this CSR matrix
+ * @param cols number of columns in this CSR matrix
+ * @return A {@link Pointer} to the allocated dense matrix (in column-major format)
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ public Pointer toColumnMajorDenseMatrix(cusparseHandle cusparseHandle, cublasHandle cublasHandle, int rows,
+ int cols) throws DMLRuntimeException {
+ LOG.trace("GPU : sparse -> column major dense (inside CSRPointer) on " + this + ", GPUContext="
+ + getGPUContext());
+ long size = ((long) rows) * getDoubleSizeOf((long) cols);
+ Pointer A = allocate(size);
+ // If this sparse block is empty, the allocated dense matrix, initialized to zeroes, will be returned.
+ if (val != null && rowPtr != null && colInd != null && nnz > 0) {
+ // Note: cusparseDcsr2dense method cannot handle empty blocks
+ cusparseDcsr2dense(cusparseHandle, rows, cols, descr, val, rowPtr, colInd, A, rows);
+ //cudaDeviceSynchronize;
+ } else {
+ LOG.debug("in CSRPointer, the values array, row pointers array or column indices array was null");
+ }
+ return A;
+ }
+
+ /**
+ * Calls cudaFree lazily on the allocated {@link Pointer} instances
+ *
+ * @throws DMLRuntimeException ?
+ */
+ public void deallocate() throws DMLRuntimeException {
+ deallocate(false);
+ }
+
+ /**
+ * Calls cudaFree lazily or eagerly on the allocated {@link Pointer} instances
+ *
+ * @param eager whether to do eager or lazy cudaFrees
+ * @throws DMLRuntimeException ?
+ */
+ public void deallocate(boolean eager) throws DMLRuntimeException {
+ if (nnz > 0) {
+ cudaFreeHelper(val, eager);
+ cudaFreeHelper(rowPtr, eager);
+ cudaFreeHelper(colInd, eager);
+ }
+ }
+
+ @Override
+ public String toString() {
+ return "CSRPointer{" + "nnz=" + nnz + '}';
+ }
}
http://git-wip-us.apache.org/repos/asf/systemml/blob/f5871756/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
index ce5c5ff..ef000c2 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
@@ -31,25 +31,24 @@ import jcuda.driver.CUstream;
* Java Wrapper to specify CUDA execution configuration for launching custom kernels
*/
public class ExecutionConfig {
- public int gridDimX;
+ public int gridDimX;
public int gridDimY = 1;
public int gridDimZ = 1;
- public int blockDimX;
+ public int blockDimX;
public int blockDimY = 1;
public int blockDimZ = 1;
public int sharedMemBytes = 0;
public CUstream stream = null;
-
-
private static HashMap<Integer, Integer> maxBlockDimForDevice = new HashMap<Integer, Integer>();
/**
* Convenience constructor for setting the number of blocks, number of threads and the
* shared memory size
- * @param gridDimX Number of blocks (for CUDA Kernel)
- * @param blockDimX Number of threads per block (for CUDA Kernel)
- * @param sharedMemBytes Amount of Shared memory (for CUDA Kernel)
+ *
+ * @param gridDimX Number of blocks (for CUDA Kernel)
+ * @param blockDimX Number of threads per block (for CUDA Kernel)
+ * @param sharedMemBytes Amount of Shared memory (for CUDA Kernel)
*/
public ExecutionConfig(int gridDimX, int blockDimX, int sharedMemBytes) {
this.gridDimX = gridDimX;
@@ -58,13 +57,13 @@ public class ExecutionConfig {
}
/**
- * Use this for simple vector operations and use following in the kernel
- * <code>
- * int index = blockIdx.x * blockDim.x + threadIdx.x
+ * Use this for simple vector operations and use following in the kernel
+ * <code>
+ * int index = blockIdx.x * blockDim.x + threadIdx.x
* </code>
- *
+ * <p>
* This tries to schedule as minimum grids as possible.
- *
+ *
* @param numCells number of cells
* @return execution configuration
* @throws DMLRuntimeException if DMLRuntimeException occurs
@@ -72,19 +71,19 @@ public class ExecutionConfig {
public static ExecutionConfig getConfigForSimpleVectorOperations(int numCells) throws DMLRuntimeException {
int deviceNumber = 0;
int blockDimX = getMaxBlockDim(deviceNumber);
- int gridDimX = (int)Math.ceil((double)numCells / blockDimX);
+ int gridDimX = (int) Math.ceil((double) numCells / blockDimX);
return new ExecutionConfig(gridDimX, blockDimX);
}
-
+
/**
- * Use this for simple matrix operations and use following in the kernel
- * <code>
+ * Use this for simple matrix operations and use following in the kernel
+ * <code>
* int ix = blockIdx.x * blockDim.x + threadIdx.x;
* int iy = blockIdx.y * blockDim.y + threadIdx.y;
* </code>
- *
+ * <p>
* This tries to schedule as minimum grids as possible.
- *
+ *
* @param rlen number of rows
* @param clen number of columns
* @return execution configuration
@@ -94,45 +93,45 @@ public class ExecutionConfig {
int deviceNumber = 0;
int maxBlockDim = getMaxBlockDim(deviceNumber);
int blockDimX = (int) Math.min(maxBlockDim, rlen);
- int gridDimX = (int)Math.ceil((double)rlen / blockDimX);
- int blockDimY = (int)Math.min(Math.floor(((double)maxBlockDim)/blockDimX), clen);
- int gridDimY = (int)Math.ceil((double)clen / blockDimY);
+ int gridDimX = (int) Math.ceil((double) rlen / blockDimX);
+ int blockDimY = (int) Math.min(Math.floor(((double) maxBlockDim) / blockDimX), clen);
+ int gridDimY = (int) Math.ceil((double) clen / blockDimY);
return new ExecutionConfig(gridDimX, gridDimY, blockDimX, blockDimY);
}
-
+
public ExecutionConfig(int gridDimX, int blockDimX) {
this.gridDimX = gridDimX;
this.blockDimX = blockDimX;
}
-
+
public ExecutionConfig(int gridDimX, int gridDimY, int blockDimX, int blockDimY) {
this.gridDimX = gridDimX;
this.gridDimY = gridDimY;
this.blockDimX = blockDimX;
this.blockDimY = blockDimY;
}
-
-
+
/**
- * Get the CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X of the given device
- *
+ * Get the CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X of the given device
+ *
* @param deviceNumber device number of the given device
* @return The maximum block dimension, in x-direction
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
- private static int getMaxBlockDim(int deviceNumber) throws DMLRuntimeException {
-// return 32;
- // TODO: Use JCudaDriver.cuOccupancyMaxPotentialBlockSize to chose the block size that maximizes occupancy
- Integer ret = maxBlockDimForDevice.get(deviceNumber);
- if(ret == null) {
- CUdevice device = new CUdevice();
- JCudaKernels.checkResult(jcuda.driver.JCudaDriver.cuDeviceGet(device, deviceNumber));
- int maxBlockDimX[] = {0};
- jcuda.driver.JCudaDriver.cuDeviceGetAttribute(maxBlockDimX, CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, device);
- maxBlockDimForDevice.put(deviceNumber, maxBlockDimX[0]);
- return maxBlockDimX[0];
- }
- return ret;
- }
-
- }
+ private static int getMaxBlockDim(int deviceNumber) throws DMLRuntimeException {
+ // return 32;
+ // TODO: Use JCudaDriver.cuOccupancyMaxPotentialBlockSize to chose the block size that maximizes occupancy
+ Integer ret = maxBlockDimForDevice.get(deviceNumber);
+ if (ret == null) {
+ CUdevice device = new CUdevice();
+ JCudaKernels.checkResult(jcuda.driver.JCudaDriver.cuDeviceGet(device, deviceNumber));
+ int maxBlockDimX[] = { 0 };
+ jcuda.driver.JCudaDriver
+ .cuDeviceGetAttribute(maxBlockDimX, CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, device);
+ maxBlockDimForDevice.put(deviceNumber, maxBlockDimX[0]);
+ return maxBlockDimX[0];
+ }
+ return ret;
+ }
+
+}