You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by na...@apache.org on 2017/08/08 20:29:53 UTC
systemml git commit: [MINOR] bug fixes in the GPU backend
Repository: systemml
Updated Branches:
refs/heads/master 98a9d653d -> 815ca4f2a
[MINOR] bug fixes in the GPU backend
- Each thread is assigned a cuda library handle
- JCudaKernels is also made thread safe
- Removed setting GPUContext to null
- Bug fix in initial gpu budget estimate
- CUDA kernels use blockIdx.x and threadIdx.x only
Closes #607
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/815ca4f2
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/815ca4f2
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/815ca4f2
Branch: refs/heads/master
Commit: 815ca4f2aedcbe491d10a873db99a9b5e6f29226
Parents: 98a9d65
Author: Nakul Jindal <na...@gmail.com>
Authored: Tue Aug 8 13:29:11 2017 -0700
Committer: Nakul Jindal <na...@gmail.com>
Committed: Tue Aug 8 13:29:11 2017 -0700
----------------------------------------------------------------------
src/main/cpp/kernels/SystemML.cu | 54 +--
src/main/cpp/kernels/SystemML.ptx | 333 +++++++++----------
.../controlprogram/ParForProgramBlock.java | 3 -
.../controlprogram/parfor/LocalParWorker.java | 12 +-
.../cp/FunctionCallCPInstruction.java | 7 -
.../gpu/context/ExecutionConfig.java | 26 +-
.../instructions/gpu/context/GPUContext.java | 94 +++---
.../gpu/context/GPUContextPool.java | 2 +-
.../instructions/gpu/context/JCudaKernels.java | 5 +-
.../org/apache/sysml/test/gpu/GPUTests.java | 18 +
.../test/gpu/MatrixMultiplicationOpTest.java | 1 +
11 files changed, 303 insertions(+), 252 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/systemml/blob/815ca4f2/src/main/cpp/kernels/SystemML.cu
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu
index 297269f..dcd64b2 100644
--- a/src/main/cpp/kernels/SystemML.cu
+++ b/src/main/cpp/kernels/SystemML.cu
@@ -35,12 +35,13 @@ nvcc -ptx -arch=sm_30 SystemML.cu
*/
extern "C"
__global__ void copy_u2l_dense(double* ret, int dim, int N) {
- int ix = blockIdx.x * blockDim.x + threadIdx.x;
- int iy = blockIdx.y * blockDim.y + threadIdx.y;
+ int tid = blockIdx.x * blockDim.x + threadIdx.x;
+ int ix = tid / dim;
+ int iy = tid % dim;
int id_dest = iy * dim + ix;
if(iy > ix && id_dest < N) {
// TODO: Potential to reduce the number of threads by half
- int id_src = ix * dim + iy;
+ int id_src = tid;
ret[id_dest] = ret[id_src];
}
}
@@ -104,8 +105,9 @@ __forceinline__ __device__ double binaryOp(double x, double y, int op) {
extern "C"
__global__ void relu(double* A, double* ret, int rlen, int clen) {
- int ix = blockIdx.x * blockDim.x + threadIdx.x;
- int iy = blockIdx.y * blockDim.y + threadIdx.y;
+ int tid = blockIdx.x * blockDim.x + threadIdx.x;
+ int ix = tid / clen;
+ int iy = tid % clen;
if(ix < rlen && iy < clen) {
int index = ix * clen + iy;
ret[index] = max(0.0, A[index]);
@@ -115,8 +117,9 @@ __global__ void relu(double* A, double* ret, int rlen, int clen) {
// This method computes the backpropagation errors for previous layer of relu operation
extern "C"
__global__ void relu_backward(double* X, double* dout, double* ret, int rlen, int clen) {
- int ix = blockIdx.x * blockDim.x + threadIdx.x;
- int iy = blockIdx.y * blockDim.y + threadIdx.y;
+ int tid = blockIdx.x * blockDim.x + threadIdx.x;
+ int ix = tid / clen;
+ int iy = tid % clen;
if(ix < rlen && iy < clen) {
int index = ix * clen + iy;
ret[index] = X[index] > 0 ? dout[index] : 0;
@@ -129,8 +132,9 @@ __global__ void relu_backward(double* X, double* dout, double* ret, int rlen, i
// This operation is often followed by conv2d and hence we have introduced bias_add(input, bias) built-in function
extern "C"
__global__ void bias_add(double* input, double* bias, double* ret, int rlen, int clen, int PQ) {
- int ix = blockIdx.x * blockDim.x + threadIdx.x;
- int iy = blockIdx.y * blockDim.y + threadIdx.y;
+ int tid = blockIdx.x * blockDim.x + threadIdx.x;
+ int ix = tid / clen;
+ int iy = tid % clen;
if(ix < rlen && iy < clen) {
int index = ix * clen + iy;
int biasIndex = iy / PQ;
@@ -141,8 +145,9 @@ __global__ void bias_add(double* input, double* bias, double* ret, int rlen, in
// Performs the operation "ret <- A + alpha*B", where B is a vector
extern "C"
__global__ void daxpy_matrix_vector(double* A, double* B, double alpha, double* ret, int rlenA, int clenA, int rlenB, int clenB) {
- int ix = blockIdx.x * blockDim.x + threadIdx.x;
- int iy = blockIdx.y * blockDim.y + threadIdx.y;
+ int tid = blockIdx.x * blockDim.x + threadIdx.x;
+ int ix = tid / clenA;
+ int iy = tid % clenA;
if(ix < rlenA && iy < clenA) {
int index = ix * clenA + iy;
if(rlenB == 1) {
@@ -157,8 +162,9 @@ __global__ void daxpy_matrix_vector(double* A, double* B, double alpha, double*
// Performs similar operation as bias_add except elementwise multiplication instead of add
extern "C"
__global__ void bias_multiply(double* input, double* bias, double* ret, int rlen, int clen, int PQ) {
- int ix = blockIdx.x * blockDim.x + threadIdx.x;
- int iy = blockIdx.y * blockDim.y + threadIdx.y;
+ int tid = blockIdx.x * blockDim.x + threadIdx.x;
+ int ix = tid / clen;
+ int iy = tid % clen;
if(ix < rlen && iy < clen) {
int index = ix * clen + iy;
int biasIndex = iy / PQ;
@@ -169,8 +175,9 @@ __global__ void bias_multiply(double* input, double* bias, double* ret, int rle
// Compares the value and set
extern "C"
__global__ void compare_and_set(double* A, double* ret, int rlen, int clen, double compareVal, double tol, double ifEqualsVal, double ifLessThanVal, double ifGreaterThanVal) {
- int ix = blockIdx.x * blockDim.x + threadIdx.x;
- int iy = blockIdx.y * blockDim.y + threadIdx.y;
+ int tid = blockIdx.x * blockDim.x + threadIdx.x;
+ int ix = tid / clen;
+ int iy = tid % clen;
int index = ix * clen + iy;
if(ix < rlen && iy < clen) {
if(abs(A[index]-compareVal) < tol)
@@ -199,8 +206,9 @@ __global__ void compare_and_set(double* A, double* ret, int rlen, int clen, dou
extern "C"
__global__ void matrix_matrix_cellwise_op(double* A, double* B, double* C,
int maxRlen, int maxClen, int vectorAStatus, int vectorBStatus, int op) {
- int ix = blockIdx.x * blockDim.x + threadIdx.x;
- int iy = blockIdx.y * blockDim.y + threadIdx.y;
+ int tid = blockIdx.x * blockDim.x + threadIdx.x;
+ int ix = tid / maxClen;
+ int iy = tid % maxClen;
if(ix < maxRlen && iy < maxClen) {
int outIndex = ix * maxClen + iy;
@@ -273,8 +281,10 @@ __global__ void fill(double* A, double scalar, int lenA) {
*/
extern "C"
__global__ void cbind(double *A, double *B, double *C, int rowsA, int colsA, int rowsB, int colsB) {
- int ix = blockIdx.x * blockDim.x + threadIdx.x;
- int iy = blockIdx.y * blockDim.y + threadIdx.y;
+ int maxClen = max(colsA, colsB);
+ int tid = blockIdx.x * blockDim.x + threadIdx.x;
+ int ix = tid / maxClen;
+ int iy = tid % maxClen;
int colsC = colsA + colsB;
int rowsC = rowsA;
@@ -310,8 +320,10 @@ __global__ void cbind(double *A, double *B, double *C, int rowsA, int colsA, int
*/
extern "C"
__global__ void rbind(double *A, double *B, double *C, int rowsA, int colsA, int rowsB, int colsB) {
- int ix = blockIdx.x * blockDim.x + threadIdx.x;
- int iy = blockIdx.y * blockDim.y + threadIdx.y;
+ int maxClen = max(colsA, colsB);
+ int tid = blockIdx.x * blockDim.x + threadIdx.x;
+ int ix = tid / maxClen;
+ int iy = tid % maxClen;
int rowsC = rowsA + rowsB;
int colsC = colsA;
http://git-wip-us.apache.org/repos/asf/systemml/blob/815ca4f2/src/main/cpp/kernels/SystemML.ptx
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.ptx b/src/main/cpp/kernels/SystemML.ptx
index 6884d5b..7778317 100644
--- a/src/main/cpp/kernels/SystemML.ptx
+++ b/src/main/cpp/kernels/SystemML.ptx
@@ -34,36 +34,33 @@
)
{
.reg .pred %p<4>;
- .reg .b32 %r<13>;
+ .reg .b32 %r<10>;
.reg .f64 %fd<2>;
.reg .b64 %rd<7>;
ld.param.u64 %rd1, [copy_u2l_dense_param_0];
- ld.param.u32 %r4, [copy_u2l_dense_param_1];
- ld.param.u32 %r5, [copy_u2l_dense_param_2];
- mov.u32 %r6, %ntid.x;
- mov.u32 %r7, %ctaid.x;
- mov.u32 %r8, %tid.x;
- mad.lo.s32 %r1, %r6, %r7, %r8;
- mov.u32 %r9, %ntid.y;
- mov.u32 %r10, %ctaid.y;
- mov.u32 %r11, %tid.y;
- mad.lo.s32 %r2, %r9, %r10, %r11;
- mad.lo.s32 %r3, %r2, %r4, %r1;
- setp.gt.s32 %p1, %r2, %r1;
- setp.lt.s32 %p2, %r3, %r5;
+ ld.param.u32 %r3, [copy_u2l_dense_param_1];
+ ld.param.u32 %r4, [copy_u2l_dense_param_2];
+ mov.u32 %r5, %ntid.x;
+ mov.u32 %r6, %ctaid.x;
+ mov.u32 %r7, %tid.x;
+ mad.lo.s32 %r1, %r5, %r6, %r7;
+ div.s32 %r8, %r1, %r3;
+ rem.s32 %r9, %r1, %r3;
+ mad.lo.s32 %r2, %r9, %r3, %r8;
+ setp.gt.s32 %p1, %r9, %r8;
+ setp.lt.s32 %p2, %r2, %r4;
and.pred %p3, %p1, %p2;
@!%p3 bra BB0_2;
bra.uni BB0_1;
BB0_1:
cvta.to.global.u64 %rd2, %rd1;
- mad.lo.s32 %r12, %r1, %r4, %r2;
- mul.wide.s32 %rd3, %r12, 8;
+ mul.wide.s32 %rd3, %r1, 8;
add.s64 %rd4, %rd2, %rd3;
ld.global.f64 %fd1, [%rd4];
- mul.wide.s32 %rd5, %r3, 8;
+ mul.wide.s32 %rd5, %r2, 8;
add.s64 %rd6, %rd2, %rd5;
st.global.f64 [%rd6], %fd1;
@@ -80,7 +77,7 @@ BB0_2:
)
{
.reg .pred %p<4>;
- .reg .b32 %r<12>;
+ .reg .b32 %r<10>;
.reg .f64 %fd<4>;
.reg .b64 %rd<8>;
@@ -93,20 +90,18 @@ BB0_2:
mov.u32 %r6, %ntid.x;
mov.u32 %r7, %tid.x;
mad.lo.s32 %r1, %r6, %r5, %r7;
- mov.u32 %r8, %ntid.y;
- mov.u32 %r9, %ctaid.y;
- mov.u32 %r10, %tid.y;
- mad.lo.s32 %r2, %r8, %r9, %r10;
- setp.lt.s32 %p1, %r1, %r4;
- setp.lt.s32 %p2, %r2, %r3;
+ div.s32 %r2, %r1, %r3;
+ setp.lt.s32 %p1, %r2, %r4;
+ setp.gt.s32 %p2, %r3, -1;
and.pred %p3, %p1, %p2;
@!%p3 bra BB1_2;
bra.uni BB1_1;
BB1_1:
+ rem.s32 %r8, %r1, %r3;
cvta.to.global.u64 %rd3, %rd1;
- mad.lo.s32 %r11, %r1, %r3, %r2;
- mul.wide.s32 %rd4, %r11, 8;
+ mad.lo.s32 %r9, %r2, %r3, %r8;
+ mul.wide.s32 %rd4, %r9, 8;
add.s64 %rd5, %rd3, %rd4;
ld.global.f64 %fd1, [%rd5];
mov.f64 %fd2, 0d0000000000000000;
@@ -129,7 +124,7 @@ BB1_2:
)
{
.reg .pred %p<5>;
- .reg .b32 %r<12>;
+ .reg .b32 %r<10>;
.reg .f64 %fd<6>;
.reg .b64 %rd<14>;
@@ -143,21 +138,19 @@ BB1_2:
mov.u32 %r6, %ctaid.x;
mov.u32 %r7, %tid.x;
mad.lo.s32 %r1, %r5, %r6, %r7;
- mov.u32 %r8, %ntid.y;
- mov.u32 %r9, %ctaid.y;
- mov.u32 %r10, %tid.y;
- mad.lo.s32 %r2, %r8, %r9, %r10;
- setp.lt.s32 %p1, %r1, %r4;
- setp.lt.s32 %p2, %r2, %r3;
+ div.s32 %r2, %r1, %r3;
+ setp.lt.s32 %p1, %r2, %r4;
+ setp.gt.s32 %p2, %r3, -1;
and.pred %p3, %p1, %p2;
@!%p3 bra BB2_4;
bra.uni BB2_1;
BB2_1:
+ rem.s32 %r8, %r1, %r3;
cvta.to.global.u64 %rd5, %rd2;
- mad.lo.s32 %r11, %r1, %r3, %r2;
- cvt.s64.s32 %rd1, %r11;
- mul.wide.s32 %rd6, %r11, 8;
+ mad.lo.s32 %r9, %r2, %r3, %r8;
+ cvt.s64.s32 %rd1, %r9;
+ mul.wide.s32 %rd6, %r9, 8;
add.s64 %rd7, %rd5, %rd6;
ld.global.f64 %fd4, [%rd7];
mov.f64 %fd5, 0d0000000000000000;
@@ -190,7 +183,7 @@ BB2_4:
)
{
.reg .pred %p<4>;
- .reg .b32 %r<14>;
+ .reg .b32 %r<12>;
.reg .f64 %fd<4>;
.reg .b64 %rd<12>;
@@ -205,24 +198,22 @@ BB2_4:
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %tid.x;
mad.lo.s32 %r1, %r7, %r6, %r8;
- mov.u32 %r9, %ntid.y;
- mov.u32 %r10, %ctaid.y;
- mov.u32 %r11, %tid.y;
- mad.lo.s32 %r2, %r9, %r10, %r11;
- setp.lt.s32 %p1, %r1, %r5;
- setp.lt.s32 %p2, %r2, %r3;
+ div.s32 %r2, %r1, %r3;
+ setp.lt.s32 %p1, %r2, %r5;
+ setp.gt.s32 %p2, %r3, -1;
and.pred %p3, %p1, %p2;
@!%p3 bra BB3_2;
bra.uni BB3_1;
BB3_1:
+ rem.s32 %r9, %r1, %r3;
cvta.to.global.u64 %rd4, %rd1;
- mad.lo.s32 %r12, %r1, %r3, %r2;
- mul.wide.s32 %rd5, %r12, 8;
+ mad.lo.s32 %r10, %r2, %r3, %r9;
+ mul.wide.s32 %rd5, %r10, 8;
add.s64 %rd6, %rd4, %rd5;
- div.s32 %r13, %r2, %r4;
+ div.s32 %r11, %r9, %r4;
cvta.to.global.u64 %rd7, %rd2;
- mul.wide.s32 %rd8, %r13, 8;
+ mul.wide.s32 %rd8, %r11, 8;
add.s64 %rd9, %rd7, %rd8;
ld.global.f64 %fd1, [%rd9];
ld.global.f64 %fd2, [%rd6];
@@ -248,7 +239,7 @@ BB3_2:
)
{
.reg .pred %p<5>;
- .reg .b32 %r<13>;
+ .reg .b32 %r<11>;
.reg .f64 %fd<7>;
.reg .b64 %rd<14>;
@@ -264,22 +255,20 @@ BB3_2:
mov.u32 %r6, %ntid.x;
mov.u32 %r7, %ctaid.x;
mov.u32 %r8, %tid.x;
- mad.lo.s32 %r1, %r6, %r7, %r8;
- mov.u32 %r9, %ntid.y;
- mov.u32 %r10, %ctaid.y;
- mov.u32 %r11, %tid.y;
- mad.lo.s32 %r2, %r9, %r10, %r11;
+ mad.lo.s32 %r9, %r6, %r7, %r8;
+ div.s32 %r1, %r9, %r3;
+ rem.s32 %r2, %r9, %r3;
setp.lt.s32 %p1, %r1, %r5;
- setp.lt.s32 %p2, %r2, %r3;
+ setp.gt.s32 %p2, %r3, -1;
and.pred %p3, %p1, %p2;
@!%p3 bra BB4_4;
bra.uni BB4_1;
BB4_1:
cvta.to.global.u64 %rd6, %rd4;
- mad.lo.s32 %r12, %r1, %r3, %r2;
+ mad.lo.s32 %r10, %r1, %r3, %r2;
cvta.to.global.u64 %rd7, %rd3;
- mul.wide.s32 %rd8, %r12, 8;
+ mul.wide.s32 %rd8, %r10, 8;
add.s64 %rd9, %rd7, %rd8;
ld.global.f64 %fd1, [%rd9];
add.s64 %rd2, %rd6, %rd8;
@@ -317,7 +306,7 @@ BB4_4:
)
{
.reg .pred %p<4>;
- .reg .b32 %r<14>;
+ .reg .b32 %r<12>;
.reg .f64 %fd<4>;
.reg .b64 %rd<12>;
@@ -332,24 +321,22 @@ BB4_4:
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %tid.x;
mad.lo.s32 %r1, %r7, %r6, %r8;
- mov.u32 %r9, %ntid.y;
- mov.u32 %r10, %ctaid.y;
- mov.u32 %r11, %tid.y;
- mad.lo.s32 %r2, %r9, %r10, %r11;
- setp.lt.s32 %p1, %r1, %r5;
- setp.lt.s32 %p2, %r2, %r3;
+ div.s32 %r2, %r1, %r3;
+ setp.lt.s32 %p1, %r2, %r5;
+ setp.gt.s32 %p2, %r3, -1;
and.pred %p3, %p1, %p2;
@!%p3 bra BB5_2;
bra.uni BB5_1;
BB5_1:
+ rem.s32 %r9, %r1, %r3;
cvta.to.global.u64 %rd4, %rd1;
- mad.lo.s32 %r12, %r1, %r3, %r2;
- mul.wide.s32 %rd5, %r12, 8;
+ mad.lo.s32 %r10, %r2, %r3, %r9;
+ mul.wide.s32 %rd5, %r10, 8;
add.s64 %rd6, %rd4, %rd5;
- div.s32 %r13, %r2, %r4;
+ div.s32 %r11, %r9, %r4;
cvta.to.global.u64 %rd7, %rd2;
- mul.wide.s32 %rd8, %r13, 8;
+ mul.wide.s32 %rd8, %r11, 8;
add.s64 %rd9, %rd7, %rd8;
ld.global.f64 %fd1, [%rd9];
ld.global.f64 %fd2, [%rd6];
@@ -376,7 +363,7 @@ BB5_2:
)
{
.reg .pred %p<6>;
- .reg .b32 %r<12>;
+ .reg .b32 %r<10>;
.reg .f64 %fd<9>;
.reg .b64 %rd<8>;
@@ -394,13 +381,11 @@ BB5_2:
mov.u32 %r5, %ntid.x;
mov.u32 %r6, %tid.x;
mad.lo.s32 %r7, %r5, %r4, %r6;
- mov.u32 %r8, %ntid.y;
- mov.u32 %r9, %ctaid.y;
- mov.u32 %r10, %tid.y;
- mad.lo.s32 %r11, %r8, %r9, %r10;
- mad.lo.s32 %r1, %r7, %r3, %r11;
- setp.lt.s32 %p1, %r7, %r2;
- setp.lt.s32 %p2, %r11, %r3;
+ div.s32 %r8, %r7, %r3;
+ rem.s32 %r9, %r7, %r3;
+ mad.lo.s32 %r1, %r8, %r3, %r9;
+ setp.lt.s32 %p1, %r8, %r2;
+ setp.gt.s32 %p2, %r3, -1;
and.pred %p3, %p1, %p2;
@!%p3 bra BB6_6;
bra.uni BB6_1;
@@ -451,7 +436,7 @@ BB6_6:
)
{
.reg .pred %p<73>;
- .reg .b32 %r<68>;
+ .reg .b32 %r<66>;
.reg .f64 %fd<56>;
.reg .b64 %rd<19>;
@@ -467,13 +452,11 @@ BB6_6:
mov.u32 %r15, %ntid.x;
mov.u32 %r16, %ctaid.x;
mov.u32 %r17, %tid.x;
- mad.lo.s32 %r1, %r15, %r16, %r17;
- mov.u32 %r18, %ntid.y;
- mov.u32 %r19, %ctaid.y;
- mov.u32 %r20, %tid.y;
- mad.lo.s32 %r2, %r18, %r19, %r20;
+ mad.lo.s32 %r18, %r15, %r16, %r17;
+ div.s32 %r1, %r18, %r10;
+ rem.s32 %r2, %r18, %r10;
setp.lt.s32 %p2, %r1, %r14;
- setp.lt.s32 %p3, %r2, %r10;
+ setp.gt.s32 %p3, %r10, -1;
and.pred %p4, %p2, %p3;
@!%p4 bra BB7_77;
bra.uni BB7_1;
@@ -481,34 +464,34 @@ BB6_6:
BB7_1:
mad.lo.s32 %r3, %r1, %r10, %r2;
setp.eq.s32 %p5, %r11, 1;
- mov.u32 %r66, %r1;
+ mov.u32 %r64, %r1;
@%p5 bra BB7_5;
setp.ne.s32 %p6, %r11, 2;
- mov.u32 %r67, %r3;
+ mov.u32 %r65, %r3;
@%p6 bra BB7_4;
- mov.u32 %r67, %r2;
+ mov.u32 %r65, %r2;
BB7_4:
- mov.u32 %r61, %r67;
- mov.u32 %r4, %r61;
- mov.u32 %r66, %r4;
+ mov.u32 %r59, %r65;
+ mov.u32 %r4, %r59;
+ mov.u32 %r64, %r4;
BB7_5:
- mov.u32 %r5, %r66;
+ mov.u32 %r5, %r64;
setp.eq.s32 %p7, %r12, 1;
- mov.u32 %r64, %r1;
+ mov.u32 %r62, %r1;
@%p7 bra BB7_9;
setp.ne.s32 %p8, %r12, 2;
- mov.u32 %r65, %r3;
+ mov.u32 %r63, %r3;
@%p8 bra BB7_8;
- mov.u32 %r65, %r2;
+ mov.u32 %r63, %r2;
BB7_8:
- mov.u32 %r64, %r65;
+ mov.u32 %r62, %r63;
BB7_9:
cvta.to.global.u64 %rd5, %rd3;
@@ -516,7 +499,7 @@ BB7_9:
mul.wide.s32 %rd7, %r5, 8;
add.s64 %rd8, %rd6, %rd7;
ld.global.f64 %fd1, [%rd8];
- mul.wide.s32 %rd9, %r64, 8;
+ mul.wide.s32 %rd9, %r62, 8;
add.s64 %rd10, %rd5, %rd9;
ld.global.f64 %fd2, [%rd10];
mov.f64 %fd55, 0d7FEFFFFFFFFFFFFF;
@@ -570,10 +553,10 @@ BB7_58:
.reg .b32 %temp;
mov.b64 {%temp, %r9}, %fd2;
}
- bfe.u32 %r33, %r9, 20, 11;
- add.s32 %r34, %r33, -1012;
+ bfe.u32 %r31, %r9, 20, 11;
+ add.s32 %r32, %r31, -1012;
mov.b64 %rd15, %fd2;
- shl.b64 %rd1, %rd15, %r34;
+ shl.b64 %rd1, %rd15, %r32;
setp.eq.s64 %p53, %rd1, -9223372036854775808;
abs.f64 %fd19, %fd1;
// Callseq Start 0
@@ -603,14 +586,14 @@ BB7_58:
BB7_59:
{
.reg .b32 %temp;
- mov.b64 {%temp, %r35}, %fd54;
+ mov.b64 {%temp, %r33}, %fd54;
}
- xor.b32 %r36, %r35, -2147483648;
+ xor.b32 %r34, %r33, -2147483648;
{
.reg .b32 %temp;
- mov.b64 {%r37, %temp}, %fd54;
+ mov.b64 {%r35, %temp}, %fd54;
}
- mov.b64 %fd54, {%r37, %r36};
+ mov.b64 %fd54, {%r35, %r34};
BB7_60:
mov.f64 %fd53, %fd54;
@@ -619,12 +602,12 @@ BB7_60:
bra.uni BB7_61;
BB7_63:
- selp.b32 %r38, %r8, 0, %p53;
- or.b32 %r39, %r38, 2146435072;
+ selp.b32 %r36, %r8, 0, %p53;
+ or.b32 %r37, %r36, 2146435072;
setp.lt.s32 %p59, %r9, 0;
- selp.b32 %r40, %r39, %r38, %p59;
- mov.u32 %r41, 0;
- mov.b64 %fd53, {%r41, %r40};
+ selp.b32 %r38, %r37, %r36, %p59;
+ mov.u32 %r39, 0;
+ mov.b64 %fd53, {%r39, %r38};
bra.uni BB7_64;
BB7_35:
@@ -638,10 +621,10 @@ BB7_35:
BB7_52:
cvt.rni.s64.f64 %rd11, %fd1;
cvt.rni.s64.f64 %rd12, %fd2;
- cvt.u32.u64 %r27, %rd11;
- cvt.u32.u64 %r28, %rd12;
- or.b32 %r29, %r28, %r27;
- setp.eq.s32 %p45, %r29, 0;
+ cvt.u32.u64 %r25, %rd11;
+ cvt.u32.u64 %r26, %rd12;
+ or.b32 %r27, %r26, %r25;
+ setp.eq.s32 %p45, %r27, 0;
selp.f64 %fd55, 0d0000000000000000, 0d3FF0000000000000, %p45;
bra.uni BB7_76;
@@ -701,17 +684,17 @@ BB7_46:
{
.reg .b32 %temp;
- mov.b64 {%temp, %r24}, %fd55;
+ mov.b64 {%temp, %r22}, %fd55;
}
- and.b32 %r25, %r24, 2147483647;
- setp.ne.s32 %p42, %r25, 2146435072;
+ and.b32 %r23, %r22, 2147483647;
+ setp.ne.s32 %p42, %r23, 2146435072;
@%p42 bra BB7_50;
{
.reg .b32 %temp;
- mov.b64 {%r26, %temp}, %fd55;
+ mov.b64 {%r24, %temp}, %fd55;
}
- setp.eq.s32 %p43, %r26, 0;
+ setp.eq.s32 %p43, %r24, 0;
@%p43 bra BB7_76;
BB7_50:
@@ -781,10 +764,10 @@ BB7_33:
BB7_34:
cvt.rni.s64.f64 %rd13, %fd1;
cvt.rni.s64.f64 %rd14, %fd2;
- cvt.u32.u64 %r30, %rd13;
- cvt.u32.u64 %r31, %rd14;
- and.b32 %r32, %r31, %r30;
- setp.eq.s32 %p46, %r32, 0;
+ cvt.u32.u64 %r28, %rd13;
+ cvt.u32.u64 %r29, %rd14;
+ and.b32 %r30, %r29, %r28;
+ setp.eq.s32 %p46, %r30, 0;
selp.f64 %fd55, 0d0000000000000000, 0d3FF0000000000000, %p46;
bra.uni BB7_76;
@@ -820,17 +803,17 @@ BB7_41:
{
.reg .b32 %temp;
- mov.b64 {%temp, %r21}, %fd55;
+ mov.b64 {%temp, %r19}, %fd55;
}
- and.b32 %r22, %r21, 2147483647;
- setp.ne.s32 %p36, %r22, 2146435072;
+ and.b32 %r20, %r19, 2147483647;
+ setp.ne.s32 %p36, %r20, 2146435072;
@%p36 bra BB7_45;
{
.reg .b32 %temp;
- mov.b64 {%r23, %temp}, %fd55;
+ mov.b64 {%r21, %temp}, %fd55;
}
- setp.eq.s32 %p37, %r23, 0;
+ setp.eq.s32 %p37, %r21, 0;
@%p37 bra BB7_76;
BB7_45:
@@ -850,10 +833,10 @@ BB7_64:
add.f64 %fd26, %fd1, %fd2;
{
.reg .b32 %temp;
- mov.b64 {%temp, %r42}, %fd26;
+ mov.b64 {%temp, %r40}, %fd26;
}
- and.b32 %r43, %r42, 2146435072;
- setp.ne.s32 %p60, %r43, 2146435072;
+ and.b32 %r41, %r40, 2146435072;
+ setp.ne.s32 %p60, %r41, 2146435072;
mov.f64 %fd52, %fd25;
@%p60 bra BB7_73;
@@ -867,51 +850,51 @@ BB7_64:
mov.f64 %fd52, %fd51;
@%p62 bra BB7_73;
- and.b32 %r44, %r9, 2147483647;
- setp.ne.s32 %p63, %r44, 2146435072;
+ and.b32 %r42, %r9, 2147483647;
+ setp.ne.s32 %p63, %r42, 2146435072;
@%p63 bra BB7_69;
{
.reg .b32 %temp;
- mov.b64 {%r45, %temp}, %fd2;
+ mov.b64 {%r43, %temp}, %fd2;
}
- setp.eq.s32 %p64, %r45, 0;
+ setp.eq.s32 %p64, %r43, 0;
@%p64 bra BB7_72;
BB7_69:
- and.b32 %r46, %r8, 2147483647;
- setp.ne.s32 %p65, %r46, 2146435072;
+ and.b32 %r44, %r8, 2147483647;
+ setp.ne.s32 %p65, %r44, 2146435072;
mov.f64 %fd49, %fd25;
mov.f64 %fd52, %fd49;
@%p65 bra BB7_73;
{
.reg .b32 %temp;
- mov.b64 {%r47, %temp}, %fd1;
+ mov.b64 {%r45, %temp}, %fd1;
}
- setp.ne.s32 %p66, %r47, 0;
+ setp.ne.s32 %p66, %r45, 0;
mov.f64 %fd52, %fd25;
@%p66 bra BB7_73;
- shr.s32 %r48, %r9, 31;
- and.b32 %r49, %r48, -2146435072;
- add.s32 %r50, %r49, 2146435072;
- or.b32 %r51, %r50, -2147483648;
- selp.b32 %r52, %r51, %r50, %p1;
- mov.u32 %r53, 0;
- mov.b64 %fd52, {%r53, %r52};
+ shr.s32 %r46, %r9, 31;
+ and.b32 %r47, %r46, -2146435072;
+ add.s32 %r48, %r47, 2146435072;
+ or.b32 %r49, %r48, -2147483648;
+ selp.b32 %r50, %r49, %r48, %p1;
+ mov.u32 %r51, 0;
+ mov.b64 %fd52, {%r51, %r50};
bra.uni BB7_73;
BB7_72:
setp.gt.f64 %p67, %fd19, 0d3FF0000000000000;
- selp.b32 %r54, 2146435072, 0, %p67;
- xor.b32 %r55, %r54, 2146435072;
+ selp.b32 %r52, 2146435072, 0, %p67;
+ xor.b32 %r53, %r52, 2146435072;
setp.lt.s32 %p68, %r9, 0;
- selp.b32 %r56, %r55, %r54, %p68;
+ selp.b32 %r54, %r53, %r52, %p68;
setp.eq.f64 %p69, %fd1, 0dBFF0000000000000;
- selp.b32 %r57, 1072693248, %r56, %p69;
- mov.u32 %r58, 0;
- mov.b64 %fd52, {%r58, %r57};
+ selp.b32 %r55, 1072693248, %r54, %p69;
+ mov.u32 %r56, 0;
+ mov.b64 %fd52, {%r56, %r55};
BB7_73:
setp.eq.f64 %p70, %fd2, 0d0000000000000000;
@@ -1825,7 +1808,7 @@ BB9_2:
)
{
.reg .pred %p<7>;
- .reg .b32 %r<19>;
+ .reg .b32 %r<18>;
.reg .f64 %fd<3>;
.reg .b64 %rd<15>;
@@ -1841,11 +1824,10 @@ BB9_2:
mov.u32 %r8, %ntid.x;
mov.u32 %r9, %ctaid.x;
mov.u32 %r10, %tid.x;
- mad.lo.s32 %r1, %r8, %r9, %r10;
- mov.u32 %r11, %ntid.y;
- mov.u32 %r12, %ctaid.y;
- mov.u32 %r13, %tid.y;
- mad.lo.s32 %r2, %r11, %r12, %r13;
+ mad.lo.s32 %r11, %r8, %r9, %r10;
+ max.s32 %r12, %r4, %r6;
+ div.s32 %r1, %r11, %r12;
+ rem.s32 %r2, %r11, %r12;
add.s32 %r3, %r6, %r4;
setp.lt.s32 %p1, %r1, %r7;
setp.lt.s32 %p2, %r2, %r4;
@@ -1855,12 +1837,12 @@ BB9_2:
BB10_1:
cvta.to.global.u64 %rd5, %rd2;
- mad.lo.s32 %r14, %r1, %r4, %r2;
- mul.wide.s32 %rd6, %r14, 8;
+ mad.lo.s32 %r13, %r1, %r4, %r2;
+ mul.wide.s32 %rd6, %r13, 8;
add.s64 %rd7, %rd5, %rd6;
ld.global.f64 %fd1, [%rd7];
- mad.lo.s32 %r15, %r1, %r3, %r2;
- mul.wide.s32 %rd8, %r15, 8;
+ mad.lo.s32 %r14, %r1, %r3, %r2;
+ mul.wide.s32 %rd8, %r14, 8;
add.s64 %rd9, %rd1, %rd8;
st.global.f64 [%rd9], %fd1;
@@ -1873,13 +1855,13 @@ BB10_2:
BB10_3:
cvta.to.global.u64 %rd10, %rd3;
- mad.lo.s32 %r16, %r1, %r6, %r2;
- mul.wide.s32 %rd11, %r16, 8;
+ mad.lo.s32 %r15, %r1, %r6, %r2;
+ mul.wide.s32 %rd11, %r15, 8;
add.s64 %rd12, %rd10, %rd11;
ld.global.f64 %fd2, [%rd12];
- mad.lo.s32 %r17, %r1, %r3, %r4;
- add.s32 %r18, %r17, %r2;
- mul.wide.s32 %rd13, %r18, 8;
+ add.s32 %r16, %r2, %r4;
+ mad.lo.s32 %r17, %r1, %r3, %r16;
+ mul.wide.s32 %rd13, %r17, 8;
add.s64 %rd14, %rd1, %rd13;
st.global.f64 [%rd14], %fd2;
@@ -1899,7 +1881,7 @@ BB10_4:
)
{
.reg .pred %p<7>;
- .reg .b32 %r<17>;
+ .reg .b32 %r<16>;
.reg .f64 %fd<3>;
.reg .b64 %rd<14>;
@@ -1915,11 +1897,10 @@ BB10_4:
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %ctaid.x;
mov.u32 %r9, %tid.x;
- mad.lo.s32 %r1, %r7, %r8, %r9;
- mov.u32 %r10, %ntid.y;
- mov.u32 %r11, %ctaid.y;
- mov.u32 %r12, %tid.y;
- mad.lo.s32 %r2, %r10, %r11, %r12;
+ mad.lo.s32 %r10, %r7, %r8, %r9;
+ max.s32 %r11, %r4, %r6;
+ div.s32 %r1, %r10, %r11;
+ rem.s32 %r2, %r10, %r11;
setp.lt.s32 %p1, %r1, %r3;
setp.lt.s32 %p2, %r2, %r4;
and.pred %p3, %p1, %p2;
@@ -1928,8 +1909,8 @@ BB10_4:
BB11_1:
cvta.to.global.u64 %rd5, %rd2;
- mad.lo.s32 %r13, %r1, %r4, %r2;
- mul.wide.s32 %rd6, %r13, 8;
+ mad.lo.s32 %r12, %r1, %r4, %r2;
+ mul.wide.s32 %rd6, %r12, 8;
add.s64 %rd7, %rd5, %rd6;
ld.global.f64 %fd1, [%rd7];
add.s64 %rd8, %rd1, %rd6;
@@ -1944,13 +1925,13 @@ BB11_2:
BB11_3:
cvta.to.global.u64 %rd9, %rd3;
- mad.lo.s32 %r14, %r1, %r6, %r2;
- mul.wide.s32 %rd10, %r14, 8;
+ mad.lo.s32 %r13, %r1, %r6, %r2;
+ mul.wide.s32 %rd10, %r13, 8;
add.s64 %rd11, %rd9, %rd10;
ld.global.f64 %fd2, [%rd11];
- add.s32 %r15, %r1, %r3;
- mad.lo.s32 %r16, %r15, %r4, %r2;
- mul.wide.s32 %rd12, %r16, 8;
+ add.s32 %r14, %r1, %r3;
+ mad.lo.s32 %r15, %r14, %r4, %r2;
+ mul.wide.s32 %rd12, %r15, 8;
add.s64 %rd13, %rd1, %rd12;
st.global.f64 [%rd13], %fd2;
http://git-wip-us.apache.org/repos/asf/systemml/blob/815ca4f2/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java b/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java
index a2d361c..169c3bb 100644
--- a/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/controlprogram/ParForProgramBlock.java
@@ -828,9 +828,6 @@ public class ParForProgramBlock extends ForProgramBlock
// Frees up the GPUContexts used in the threaded Parfor and sets
// the main thread to use the GPUContext
if (DMLScript.USE_ACCELERATOR) {
- for (int i = 0; i < _numThreads; i++) {
- workers[i].getExecutionContext().setGPUContexts(null);
- }
ec.getGPUContext(0).initializeThread();
}
}
http://git-wip-us.apache.org/repos/asf/systemml/blob/815ca4f2/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java
index 636b1f8..f77c22e 100644
--- a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java
+++ b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/LocalParWorker.java
@@ -25,6 +25,7 @@ import org.apache.sysml.api.DMLScript;
import org.apache.sysml.conf.CompilerConfig;
import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.hops.OptimizerUtils;
+import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext;
import org.apache.sysml.runtime.controlprogram.parfor.stat.Stat;
import org.apache.sysml.runtime.controlprogram.parfor.stat.StatisticMonitor;
@@ -82,8 +83,15 @@ public class LocalParWorker extends ParWorker implements Runnable
}
// Initialize this GPUContext to this thread
- if (DMLScript.USE_ACCELERATOR)
- _ec.getGPUContext(0).initializeThread();
+ if (DMLScript.USE_ACCELERATOR) {
+ try {
+ _ec.getGPUContext(0).initializeThread();
+ } catch(DMLRuntimeException e) {
+ LOG.error("Error executing task because of failure in GPU backend: ",e);
+ LOG.error("Stopping LocalParWorker.");
+ return;
+ }
+ }
//setup compiler config for worker thread
ConfigurationManager.setLocalConfig(_cconf);
http://git-wip-us.apache.org/repos/asf/systemml/blob/815ca4f2/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java
index 3cd2633..77c48a7 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/FunctionCallCPInstruction.java
@@ -169,7 +169,6 @@ public class FunctionCallCPInstruction extends CPInstruction
ExecutionContext fn_ec = ExecutionContextFactory.createContext(false, ec.getProgram());
if (DMLScript.USE_ACCELERATOR) {
fn_ec.setGPUContexts(ec.getGPUContexts());
- ec.setGPUContexts(null);
fn_ec.getGPUContext(0).initializeThread();
}
fn_ec.setVariables(functionVariables);
@@ -205,12 +204,6 @@ public class FunctionCallCPInstruction extends CPInstruction
// Unpin the pinned variables
ec.unpinVariables(_boundInputParamNames, pinStatus);
- if (DMLScript.USE_ACCELERATOR) {
- ec.setGPUContexts(fn_ec.getGPUContexts());
- fn_ec.setGPUContexts(null);
- ec.getGPUContext(0).initializeThread();
- }
-
// add the updated binding for each return variable to the variables in original symbol table
for (int i=0; i< fpb.getOutputParams().size(); i++){
http://git-wip-us.apache.org/repos/asf/systemml/blob/815ca4f2/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
index ef000c2..5a0a772 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/ExecutionConfig.java
@@ -89,16 +89,34 @@ public class ExecutionConfig {
* @return execution configuration
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
- public static ExecutionConfig getConfigForSimpleMatrixOperations(int rlen, int clen) throws DMLRuntimeException {
+ public static ExecutionConfig getConfigForMatrixOperations(int rlen, int clen) throws DMLRuntimeException {
int deviceNumber = 0;
int maxBlockDim = getMaxBlockDim(deviceNumber);
int blockDimX = (int) Math.min(maxBlockDim, rlen);
int gridDimX = (int) Math.ceil((double) rlen / blockDimX);
int blockDimY = (int) Math.min(Math.floor(((double) maxBlockDim) / blockDimX), clen);
int gridDimY = (int) Math.ceil((double) clen / blockDimY);
+ if (gridDimY > 65535)
+ throw new DMLRuntimeException("Internal Error: gridDimY must be less than 65535 for all supported CUDA compute capabilites!");
return new ExecutionConfig(gridDimX, gridDimY, blockDimX, blockDimY);
}
+ /**
+ * Use this for simple vector operations and use following in the kernel
+ * <code>
+ * int index = blockIdx.x * blockDim.x + threadIdx.x
+ * </code>
+ * <p>
+ * @param rlen number of rows
+ * @param clen number of columns
+ * @return execution configuration
+ * @throws DMLRuntimeException if DMLRuntimeException occurs
+ */
+ public static ExecutionConfig getConfigForSimpleMatrixOperations(int rlen, int clen) throws DMLRuntimeException {
+ return getConfigForSimpleVectorOperations(rlen * clen);
+ }
+
+
public ExecutionConfig(int gridDimX, int blockDimX) {
this.gridDimX = gridDimX;
this.blockDimX = blockDimX;
@@ -134,4 +152,10 @@ public class ExecutionConfig {
return ret;
}
+ @Override
+ public String toString() {
+ return "ExecutionConfig{" + "gridDimX=" + gridDimX + ", gridDimY=" + gridDimY + ", gridDimZ=" + gridDimZ
+ + ", blockDimX=" + blockDimX + ", blockDimY=" + blockDimY + ", blockDimZ=" + blockDimZ
+ + ", sharedMemBytes=" + sharedMemBytes + '}';
+ }
}
http://git-wip-us.apache.org/repos/asf/systemml/blob/815ca4f2/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
index b3c19ef..4c0562d 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -108,27 +108,27 @@ public class GPUContext {
/**
* cudnnHandle for Deep Neural Network operations on the GPU
*/
- private cudnnHandle cudnnHandle;
+ private final ThreadLocal<cudnnHandle> cudnnHandle = new ThreadLocal<>();
/**
* cublasHandle for BLAS operations on the GPU
*/
- private cublasHandle cublasHandle;
+ private final ThreadLocal<cublasHandle> cublasHandle = new ThreadLocal<>();
/**
* cusparseHandle for certain sparse BLAS operations on the GPU
*/
- private cusparseHandle cusparseHandle;
+ private final ThreadLocal<cusparseHandle> cusparseHandle = new ThreadLocal<>();
/**
* cusolverDnHandle for invoking solve() function on dense matrices on the GPU
*/
- private cusolverDnHandle cusolverDnHandle;
+ private final ThreadLocal<cusolverDnHandle> cusolverDnHandle = new ThreadLocal<>();
/**
* cusolverSpHandle for invoking solve() function on sparse matrices on the GPU
*/
- private cusolverSpHandle cusolverSpHandle;
+ private final ThreadLocal<cusolverSpHandle> cusolverSpHandle = new ThreadLocal<>();
/**
* to launch custom CUDA kernel, specific to the active GPU for this GPUContext
*/
- private JCudaKernels kernels;
+ private final ThreadLocal<JCudaKernels> kernels = new ThreadLocal<>();
protected GPUContext(int deviceNum) throws DMLRuntimeException {
this.deviceNum = deviceNum;
@@ -140,28 +140,51 @@ public class GPUContext {
long total[] = { 0 };
cudaMemGetInfo(free, total);
- long start = System.nanoTime();
- cudnnHandle = new cudnnHandle();
- cudnnCreate(cudnnHandle);
- cublasHandle = new cublasHandle();
- cublasCreate(cublasHandle);
+ long start = -1;
+ if (DMLScript.STATISTICS)
+ start = System.nanoTime();
+ initializeCudaLibraryHandles();
+
+ if (DMLScript.STATISTICS)
+ GPUStatistics.cudaLibrariesInitTime = System.nanoTime() - start;
+
+ LOG.info(" GPU memory - Total: " + (total[0] * (1e-6)) + " MB, Available: " + (free[0] * (1e-6)) + " MB on "
+ + this);
+
+ }
+
+ private void initializeCudaLibraryHandles() throws DMLRuntimeException {
+ if (cudnnHandle.get() == null) {
+ cudnnHandle.set(new cudnnHandle());
+ cudnnCreate(cudnnHandle.get());
+ }
+
+ if (cublasHandle.get() == null) {
+ cublasHandle.set(new cublasHandle());
+ cublasCreate(cublasHandle.get());
+ }
// For cublas v2, cublasSetPointerMode tells Cublas whether to expect scalar arguments on device or on host
// This applies to arguments like "alpha" in Dgemm, and "y" in Ddot.
// cublasSetPointerMode(LibMatrixCUDA.cublasHandle, cublasPointerMode.CUBLAS_POINTER_MODE_DEVICE);
- cusparseHandle = new cusparseHandle();
- cusparseCreate(cusparseHandle);
- cusolverDnHandle = new cusolverDnHandle();
- cusolverDnCreate(cusolverDnHandle);
- cusolverSpHandle = new cusolverSpHandle();
- cusolverSpCreate(cusolverSpHandle);
+ if (cusparseHandle.get() == null) {
+ cusparseHandle.set(new cusparseHandle());
+ cusparseCreate(cusparseHandle.get());
+ }
- kernels = new JCudaKernels(deviceNum);
+ if (cusolverDnHandle.get() == null) {
+ cusolverDnHandle.set(new cusolverDnHandle());
+ cusolverDnCreate(cusolverDnHandle.get());
+ }
- GPUStatistics.cudaLibrariesInitTime = System.nanoTime() - start;
- LOG.info(" GPU memory - Total: " + (total[0] * (1e-6)) + " MB, Available: " + (free[0] * (1e-6)) + " MB on "
- + this);
+ if (cusolverSpHandle.get() == null) {
+ cusolverSpHandle.set(new cusolverSpHandle());
+ cusolverSpCreate(cusolverSpHandle.get());
+ }
+ if (kernels.get() == null) {
+ kernels.set(new JCudaKernels());
+ }
}
public static int cudaGetDevice() {
@@ -181,8 +204,9 @@ public class GPUContext {
* If in a multi-threaded env like parfor, this method must be called when in the
* appropriate thread
*/
- public void initializeThread() {
+ public void initializeThread() throws DMLRuntimeException {
cudaSetDevice(deviceNum);
+ initializeCudaLibraryHandles();
}
/**
@@ -595,27 +619,27 @@ public class GPUContext {
}
public cudnnHandle getCudnnHandle() {
- return cudnnHandle;
+ return cudnnHandle.get();
}
public cublasHandle getCublasHandle() {
- return cublasHandle;
+ return cublasHandle.get();
}
public cusparseHandle getCusparseHandle() {
- return cusparseHandle;
+ return cusparseHandle.get();
}
public cusolverDnHandle getCusolverDnHandle() {
- return cusolverDnHandle;
+ return cusolverDnHandle.get();
}
public cusolverSpHandle getCusolverSpHandle() {
- return cusolverSpHandle;
+ return cusolverSpHandle.get();
}
public JCudaKernels getKernels() {
- return kernels;
+ return kernels.get();
}
/**
@@ -626,15 +650,11 @@ public class GPUContext {
public void destroy() throws DMLRuntimeException {
LOG.trace("GPU : this context was destroyed, this = " + this.toString());
clearMemory();
- cudnnDestroy(cudnnHandle);
- cublasDestroy(cublasHandle);
- cusparseDestroy(cusparseHandle);
- cusolverDnDestroy(cusolverDnHandle);
- cusolverSpDestroy(cusolverSpHandle);
- cudnnHandle = null;
- cublasHandle = null;
- cusparseHandle = null;
-
+ cudnnDestroy(cudnnHandle.get());
+ cublasDestroy(cublasHandle.get());
+ cusparseDestroy(cusparseHandle.get());
+ cusolverDnDestroy(cusolverDnHandle.get());
+ cusolverSpDestroy(cusolverSpHandle.get());
}
/**
http://git-wip-us.apache.org/repos/asf/systemml/blob/815ca4f2/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java
index a9b1333..e030180 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java
@@ -130,7 +130,7 @@ public class GPUContextPool {
// initially available memory is set to the GPU with the lowest memory
// This is because at runtime, we wouldn't know which GPU a certain
// operation gets scheduled on
- long minAvailableMemory = Integer.MAX_VALUE;
+ long minAvailableMemory = Long.MAX_VALUE;
for (GPUContext gCtx : pool) {
gCtx.initializeThread();
minAvailableMemory = Math.min(minAvailableMemory, gCtx.getAvailableMemory());
http://git-wip-us.apache.org/repos/asf/systemml/blob/815ca4f2/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
index 246aecc..9cfab2b 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
@@ -49,17 +49,14 @@ public class JCudaKernels {
private final static String ptxFileName = "/kernels/SystemML.ptx";
private HashMap<String, CUfunction> kernels = new HashMap<String, CUfunction>();
private CUmodule module;
- // private final int deviceNum;
/**
* Loads the kernels in the file ptxFileName. Though cubin files are also supported, we will stick with
* ptx file as they are target-independent similar to Java's .class files.
*
- * @param deviceNum the device number for which to initiate the driver API
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
- JCudaKernels(int deviceNum) throws DMLRuntimeException {
- // this.deviceNum = deviceNum;
+ JCudaKernels() throws DMLRuntimeException {
module = new CUmodule();
// Load the kernels specified in the ptxFileName file
checkResult(cuModuleLoadDataEx(module, initKernels(ptxFileName), 0, new int[0], Pointer.to(new int[0])));
http://git-wip-us.apache.org/repos/asf/systemml/blob/815ca4f2/src/test/java/org/apache/sysml/test/gpu/GPUTests.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/GPUTests.java b/src/test/java/org/apache/sysml/test/gpu/GPUTests.java
index 195968a..d40b7a1 100644
--- a/src/test/java/org/apache/sysml/test/gpu/GPUTests.java
+++ b/src/test/java/org/apache/sysml/test/gpu/GPUTests.java
@@ -98,6 +98,24 @@ public abstract class GPUTests extends AutomatedTestBase {
}
/**
+ * Generates an input matrix which is a sequence of integers
+ * @param spark valid instance of {@link SparkSession}
+ * @param m number of rows
+ * @param n number of columns
+ * @return a matrix with a sequence of integers
+ */
+ protected Matrix generateIntegerSequenceMatrix(SparkSession spark, int m, int n) {
+ MLContext genMLC = new MLContext(spark);
+ String scriptStr;
+ scriptStr = "temp = seq(1, " + (m*n) + ")" +
+ "in1 = matrix(temp, rows=" + m + ", cols=" + n + ")";
+ Script generateScript = ScriptFactory.dmlFromString(scriptStr).out("in1");
+ Matrix in1 = genMLC.execute(generateScript).getMatrix("in1");
+ genMLC.close();
+ return in1;
+ }
+
+ /**
* Generates a random input matrix with a given size and sparsity
*
* @param spark valid instance of {@link SparkSession}
http://git-wip-us.apache.org/repos/asf/systemml/blob/815ca4f2/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java b/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
index f7c7851..81bc254 100644
--- a/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
+++ b/src/test/java/org/apache/sysml/test/gpu/MatrixMultiplicationOpTest.java
@@ -153,6 +153,7 @@ public class MatrixMultiplicationOpTest extends GPUTests {
for (int j = 0; j < sparsities.length; j++) {
int side = sizes[i];
double sparsity = sparsities[j];
+ System.out.println("Transpose Self matrix multiply, size = " + side + ", sparsity = " + sparsity);
Matrix X = generateInputMatrix(spark, side, side, sparsity, seed);
HashMap<String, Object> inputs = new HashMap<>();
inputs.put("X", X);