You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by jx...@apache.org on 2018/01/30 18:45:29 UTC
[incubator-mxnet] branch master updated: Expand gpu-kernel-launch
synchronous error checking. (#9560)
This is an automated email from the ASF dual-hosted git repository.
jxie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
new a0a52b3 Expand gpu-kernel-launch synchronous error checking. (#9560)
a0a52b3 is described below
commit a0a52b38a9046a3ede20c900ff64154642e8e2da
Author: Dick Carter <di...@comcast.net>
AuthorDate: Tue Jan 30 10:45:25 2018 -0800
Expand gpu-kernel-launch synchronous error checking. (#9560)
* Expand gpu-kernel-launch synchronous error checking.
* Added python test that passes only after PR code is merged.
* Print SKIP message to stderr so it appears in CI log.
* Add python version to test skip message.
* Added diagnostic info when skipping test.
* Improve robustness of test.
---
src/common/random_generator.cu | 1 +
src/operator/contrib/count_sketch.cu | 2 ++
src/operator/contrib/ctc_include/detail/gpu_ctc.h | 4 +++
src/operator/linalg_impl.h | 2 ++
src/operator/mxnet_op.h | 2 ++
src/operator/nn/sequence_mask-inl.h | 1 +
src/operator/nn/softmax-inl.h | 2 ++
src/operator/pad.cu | 12 +++++++
src/operator/roi_pooling.cu | 2 ++
src/operator/svm_output.cu | 2 ++
src/operator/tensor/broadcast_reduce-inl.cuh | 6 ++++
tests/python/gpu/test_operator_gpu.py | 38 +++++++++++++++++++++++
12 files changed, 74 insertions(+)
diff --git a/src/common/random_generator.cu b/src/common/random_generator.cu
index f6f31cf..930e5e0 100644
--- a/src/common/random_generator.cu
+++ b/src/common/random_generator.cu
@@ -55,6 +55,7 @@ void RandGenerator<gpu, float>::Seed(mshadow::Stream<gpu> *s, uint32_t seed) {
states_,
RandGenerator<gpu, float>::kNumRandomStates,
seed);
+ MSHADOW_CUDA_POST_KERNEL_CHECK(rand_generator_seed_kernel);
s->Wait();
}
diff --git a/src/operator/contrib/count_sketch.cu b/src/operator/contrib/count_sketch.cu
index b849f4c..373ff3e 100644
--- a/src/operator/contrib/count_sketch.cu
+++ b/src/operator/contrib/count_sketch.cu
@@ -129,6 +129,7 @@ inline void CountSketchForward(const Tensor<gpu, 2, DType> &out,
nthreads, out_ptr+bstart*out_dim, h_ptr,
s_ptr, in_ptr+bstart*in_dim, batchlen,
in_dim, out_dim);
+ MSHADOW_CUDA_POST_KERNEL_CHECK(sketch_forward_kernel);
// cudaThreadSynchronize();
bstart = (i+1)*batchlen;
}
@@ -164,6 +165,7 @@ inline void CountSketchBackward(const Tensor<gpu, 2, DType> &in_grad,
nthreads, in_grad_ptr+bstart*in_dim, h_ptr,
s_ptr, out_grad_ptr+bstart*out_dim, batchlen,
in_dim, out_dim);
+ MSHADOW_CUDA_POST_KERNEL_CHECK(sketch_backward_kernel);
bstart = (i+1)*batchlen;
}
}
diff --git a/src/operator/contrib/ctc_include/detail/gpu_ctc.h b/src/operator/contrib/ctc_include/detail/gpu_ctc.h
index c9cab80..c249046 100644
--- a/src/operator/contrib/ctc_include/detail/gpu_ctc.h
+++ b/src/operator/contrib/ctc_include/detail/gpu_ctc.h
@@ -404,6 +404,10 @@ GpuCTC<ProbT>::compute_log_probs(const ProbT* const activations) {
(ctc_helper::identity<ProbT>(), log_probs_,
denoms_, out_dim_, num_elements);
+ cuda_status = cudaGetLastError();
+ if (cuda_status != cudaSuccess)
+ return CTC_STATUS_EXECUTION_FAILED;
+
return CTC_STATUS_SUCCESS;
}
diff --git a/src/operator/linalg_impl.h b/src/operator/linalg_impl.h
index b2a672f..d128617 100644
--- a/src/operator/linalg_impl.h
+++ b/src/operator/linalg_impl.h
@@ -641,6 +641,7 @@ void linalg_potri<gpu, DType>(const Tensor<gpu, 2, DType>& A, bool lower, Stream
static_cast<int>((A.MSize() + kBaseThreadNum - 1) / kBaseThreadNum)); \
linalgInitIdentityGPU<<<ngrid, kBaseThreadNum, 0, mshadow::Stream<gpu>::GetStream(s)>>> \
(static_cast<DType *>(buffer.dptr), A.MSize(), A.stride_, A.MSize()); \
+ MSHADOW_CUDA_POST_KERNEL_CHECK(linalgInitIdentityGPU); \
Tensor<gpu, 2, DType> B((DType *)buffer.dptr, A.shape_, A.stride_, s); \
linalg_trsm(A, B, DType(1.0), false, lower, !lower, s); \
linalg_trsm(A, B, DType(1.0), false, lower, lower, s); \
@@ -664,6 +665,7 @@ void linalg_batch_potri<gpu, DType>(const Tensor<gpu, 3, DType>& A, bool lower,
static_cast<int>((A.MSize() + kBaseThreadNum - 1) / kBaseThreadNum)); \
linalgInitIdentityGPU<<<ngrid, kBaseThreadNum, 0, mshadow::Stream<gpu>::GetStream(s)>>> \
(static_cast<DType *>(buffer.dptr), A.size(1)*A.stride_, A.stride_, A.MSize()); \
+ MSHADOW_CUDA_POST_KERNEL_CHECK(linalgInitIdentityGPU); \
Tensor<gpu, 3, DType> B((DType *)buffer.dptr, A.shape_, A.stride_, s); \
linalg_batch_trsm(A, B, DType(1.0), false, lower, !lower, s); \
linalg_batch_trsm(A, B, DType(1.0), false, lower, lower, s); \
diff --git a/src/operator/mxnet_op.h b/src/operator/mxnet_op.h
index 5a36954..cd52524 100644
--- a/src/operator/mxnet_op.h
+++ b/src/operator/mxnet_op.h
@@ -563,6 +563,7 @@ struct Kernel<OP, gpu> {
mxnet_generic_kernel<OP, Args...>
<<<ngrid, kBaseThreadNum, 0, mshadow::Stream<gpu>::GetStream(s)>>>(
N, args...);
+ MSHADOW_CUDA_POST_KERNEL_CHECK(mxnet_generic_kernel);
}
template<typename ...Args>
@@ -572,6 +573,7 @@ struct Kernel<OP, gpu> {
mxnet_generic_kernel_ex<OP, Args...>
<<<ngrid, kBaseThreadNum, 0, mshadow::Stream<gpu>::GetStream(s)>>>(
N, args...);
+ MSHADOW_CUDA_POST_KERNEL_CHECK(mxnet_generic_kernel_ex);
}
};
#endif // __CUDACC__
diff --git a/src/operator/nn/sequence_mask-inl.h b/src/operator/nn/sequence_mask-inl.h
index a3b41f6..df98116 100644
--- a/src/operator/nn/sequence_mask-inl.h
+++ b/src/operator/nn/sequence_mask-inl.h
@@ -66,6 +66,7 @@ inline void SequenceMask(const mshadow::Tensor<gpu, 3, DType> &dst,
CheckLaunchParam(dimGrid, dimBlock, "SequenceMask");
cudaStream_t stream = Stream<gpu>::GetStream(dst.stream_);
SequenceMaskKernel<kBaseThreadBits, DType><<<dimGrid, dimBlock, 0, stream>>>(dst, lengths, value);
+ MSHADOW_CUDA_POST_KERNEL_CHECK(SequenceMaskKernel);
}
#endif
diff --git a/src/operator/nn/softmax-inl.h b/src/operator/nn/softmax-inl.h
index 2badecf..080bc08 100644
--- a/src/operator/nn/softmax-inl.h
+++ b/src/operator/nn/softmax-inl.h
@@ -173,6 +173,7 @@ inline void Softmax(Stream<gpu> *s, DType *in, DType *out,
softmax_compute_kernel<x_bits, OP, DType, ndim>
<<<N, x_size, 0, mshadow::Stream<gpu>::GetStream(s)>>>(
in, out, M, axis, sshape, stride);
+ MSHADOW_CUDA_POST_KERNEL_CHECK(softmax_compute_kernel);
}
@@ -216,6 +217,7 @@ inline void SoftmaxGrad(Stream<gpu> *s, DType *out, DType *ograd,
softmax_gradient_kernel<x_bits, OP1, OP2, DType, ndim>
<<<N, x_size, 0, mshadow::Stream<gpu>::GetStream(s)>>>(
out, ograd, igrad, M, axis, sshape, stride);
+ MSHADOW_CUDA_POST_KERNEL_CHECK(softmax_gradient_kernel);
}
#endif
diff --git a/src/operator/pad.cu b/src/operator/pad.cu
index 54242a4..372683a 100644
--- a/src/operator/pad.cu
+++ b/src/operator/pad.cu
@@ -78,6 +78,7 @@ inline void image_pad_edge(Tensor<gpu, 4, DType> dst,
image_2d_pad_edge_kernel<kBaseThreadBits,
DType><<<dimGrid, dimBlock, 0, stream>>>(dst, src,
padT, padL);
+ MSHADOW_CUDA_POST_KERNEL_CHECK(image_2d_pad_edge_kernel);
}
template <int n_bits, typename DType>
@@ -119,6 +120,7 @@ inline void image_pad_edge_grad(Tensor<gpu, 4, DType> grad_in,
image_2d_pad_edge_grad_kernel<kBaseThreadBits,
DType><<<dimGrid, dimBlock, 0, stream>>>(
grad_in, grad_out, padT, padL);
+ MSHADOW_CUDA_POST_KERNEL_CHECK(image_2d_pad_edge_grad_kernel);
}
// Case 2: Constant Padding
@@ -166,6 +168,7 @@ inline void image_pad_constant(Tensor<gpu, 4, DType> dst,
image_2d_pad_constant_kernel<kBaseThreadBits,
DType><<<dimGrid, dimBlock, 0, stream>>>(
dst, src, padT, padL, constant);
+ MSHADOW_CUDA_POST_KERNEL_CHECK(image_2d_pad_constant_kernel);
}
template <int n_bits, typename DType>
@@ -202,6 +205,7 @@ inline void image_pad_constant_grad(Tensor<gpu, 4, DType> grad_in,
image_2d_pad_constant_grad_kernel<kBaseThreadBits,
DType><<<dimGrid, dimBlock, 0, stream>>>(
grad_in, grad_out, padT, padL);
+ MSHADOW_CUDA_POST_KERNEL_CHECK(image_2d_pad_constant_grad_kernel);
}
@@ -257,6 +261,7 @@ inline void image_pad_reflect(Tensor<gpu, 4, DType> dst,
image_2d_pad_reflect_kernel<kBaseThreadBits,
DType><<<dimGrid, dimBlock, 0, stream>>>(dst, src,
padT, padL);
+ MSHADOW_CUDA_POST_KERNEL_CHECK(image_2d_pad_reflect_kernel);
}
template <int n_bits, typename DType>
@@ -307,6 +312,7 @@ inline void image_pad_reflect_grad(Tensor<gpu, 4, DType> grad_in,
image_2d_pad_reflect_grad_kernel<kBaseThreadBits,
DType><<<dimGrid, dimBlock, 0, stream>>>(
grad_in, grad_out, padT, padL);
+ MSHADOW_CUDA_POST_KERNEL_CHECK(image_2d_pad_reflect_grad_kernel);
}
@@ -365,6 +371,7 @@ inline void image_pad_edge(Tensor<gpu, 5, DType> dst,
image_3d_pad_edge_kernel<kBaseThreadBits,
DType><<<dimGrid, dimBlock, 0, stream>>>(
dst, src, padF, padT, padL);
+ MSHADOW_CUDA_POST_KERNEL_CHECK(image_3d_pad_edge_kernel);
}
template <int n_bits, typename DType>
@@ -416,6 +423,7 @@ inline void image_pad_edge_grad(Tensor<gpu, 5, DType> grad_in,
image_3d_pad_edge_grad_kernel<kBaseThreadBits,
DType><<<dimGrid, dimBlock, 0, stream>>>(
grad_in, grad_out, padF, padT, padL);
+ MSHADOW_CUDA_POST_KERNEL_CHECK(image_3d_pad_edge_grad_kernel);
}
// Case 2: Constant Padding
@@ -473,6 +481,7 @@ inline void image_pad_constant(Tensor<gpu, 5, DType> dst,
image_3d_pad_constant_kernel<kBaseThreadBits,
DType><<<dimGrid, dimBlock, 0, stream>>>(
dst, src, padF, padT, padL, constant);
+ MSHADOW_CUDA_POST_KERNEL_CHECK(image_3d_pad_constant_kernel);
}
template <int n_bits, typename DType>
@@ -515,6 +524,7 @@ inline void image_pad_constant_grad(Tensor<gpu, 5, DType> grad_in,
image_3d_pad_constant_grad_kernel<kBaseThreadBits,
DType><<<dimGrid, dimBlock, 0, stream>>>(
grad_in, grad_out, padF, padT, padL);
+ MSHADOW_CUDA_POST_KERNEL_CHECK(image_3d_pad_constant_grad_kernel);
}
// Case 3: Reflection Padding
@@ -578,6 +588,7 @@ inline void image_pad_reflect(Tensor<gpu, 5, DType> dst,
image_3d_pad_reflect_kernel<kBaseThreadBits,
DType><<<dimGrid, dimBlock, 0, stream>>>(
dst, src, padF, padT, padL);
+ MSHADOW_CUDA_POST_KERNEL_CHECK(image_3d_pad_reflect_kernel);
}
template <int n_bits, typename DType>
@@ -670,6 +681,7 @@ inline void image_pad_reflect_grad(Tensor<gpu, 5, DType> grad_in,
image_3d_pad_reflect_grad_kernel<kBaseThreadBits,
DType><<<dimGrid, dimBlock, 0, stream>>>(
grad_in, grad_out, padF, padT, padL);
+ MSHADOW_CUDA_POST_KERNEL_CHECK(image_3d_pad_reflect_grad_kernel);
}
////////////////////////////////////////////////////////////////////////////////
diff --git a/src/operator/roi_pooling.cu b/src/operator/roi_pooling.cu
index 0f637b0..066c2ff 100644
--- a/src/operator/roi_pooling.cu
+++ b/src/operator/roi_pooling.cu
@@ -129,6 +129,7 @@ inline void ROIPoolForward(const Tensor<gpu, 4, Dtype> &out,
ROIPoolForwardKernel<Dtype><<<dimGrid, dimBlock, 0, stream>>>(
count, bottom_data, spatial_scale, channels, height, width,
pooled_height, pooled_width, bottom_rois, top_data, argmax_data);
+ MSHADOW_CUDA_POST_KERNEL_CHECK(ROIPoolForwardKernel);
}
template<typename Dtype>
@@ -232,6 +233,7 @@ inline void ROIPoolBackwardAcc(const Tensor<gpu, 4, Dtype> &in_grad,
ROIPoolBackwardAccKernel<Dtype><<<dimGrid, dimBlock, 0, stream>>>(
count, top_diff, argmax_data, num_rois, spatial_scale, channels, height, width,
pooled_height, pooled_width, bottom_diff, bottom_rois);
+ MSHADOW_CUDA_POST_KERNEL_CHECK(ROIPoolBackwardAccKernel);
}
} // namespace cuda
diff --git a/src/operator/svm_output.cu b/src/operator/svm_output.cu
index d950107..fa11a6c 100644
--- a/src/operator/svm_output.cu
+++ b/src/operator/svm_output.cu
@@ -62,6 +62,7 @@ inline void L1_SVM(const DType & margin,
cudaStream_t stream = Stream<gpu>::GetStream(dst.stream_);
L1_SVMKernel<cuda::kBaseThreadBits, DType> <<<dimGrid, dimBlock, 0, stream >>>
(margin, reg_coef, dst, label, src);
+ MSHADOW_CUDA_POST_KERNEL_CHECK(L1_SVMKernel);
}
@@ -98,6 +99,7 @@ inline void L2_SVM(const DType & margin,
cudaStream_t stream = Stream<gpu>::GetStream(dst.stream_);
L2_SVMKernel<cuda::kBaseThreadBits, DType> <<<dimGrid, dimBlock, 0, stream >>>
(margin, reg_coef, dst, label, src);
+ MSHADOW_CUDA_POST_KERNEL_CHECK(L2_SVMKernel);
}
} // namespace mshadow
diff --git a/src/operator/tensor/broadcast_reduce-inl.cuh b/src/operator/tensor/broadcast_reduce-inl.cuh
index f5d0152..630fef6 100644
--- a/src/operator/tensor/broadcast_reduce-inl.cuh
+++ b/src/operator/tensor/broadcast_reduce-inl.cuh
@@ -507,6 +507,7 @@ void ReduceImpl(cudaStream_t stream, const TBlob& small, const OpReqType req,
<<< config.kernel_1.gridDim, config.kernel_1.blockDim, 0, stream >>>(
config.N, req == kAddTo, big.dptr<DType>(), small.dptr<DType>(), big.shape_.get<ndim>(),
small.shape_.get<ndim>());
+ MSHADOW_CUDA_POST_KERNEL_CHECK(reduce_kernel_M1);
} else {
DType* small_dptr = small.dptr<DType>();
@@ -531,11 +532,13 @@ void ReduceImpl(cudaStream_t stream, const TBlob& small, const OpReqType req,
small.shape_.get<ndim>(), config.rshape, config.rstride, config.Mnext,
config.kernel_1.do_transpose);
});
+ MSHADOW_CUDA_POST_KERNEL_CHECK(reduce_kernel);
if (config.Mnext > 1) {
reduce_lines_kernel<Reducer, DType>
<<< config.kernel_2.gridSize, config.kernel_2.blockSize, 0, stream >>>
(config.N, config.Mnext, req == kAddTo, config.N, small_dptr, small.dptr<DType>());
+ MSHADOW_CUDA_POST_KERNEL_CHECK(reduce_lines_kernel);
}
}
}
@@ -550,6 +553,7 @@ void ReduceImpl(cudaStream_t stream, const TBlob& small, const TBlob& lhs, const
config.N, req == kAddTo, big.dptr<DType>(), lhs.dptr<DType>(), rhs.dptr<DType>(),
small.dptr<DType>(), big.shape_.get<ndim>(), lhs.shape_.get<ndim>(),
rhs.shape_.get<ndim>(), small.shape_.get<ndim>());
+ MSHADOW_CUDA_POST_KERNEL_CHECK(reduce_kernel_M1);
} else {
DType* small_dptr = small.dptr<DType>();
bool addto = (req == kAddTo);
@@ -574,12 +578,14 @@ void ReduceImpl(cudaStream_t stream, const TBlob& small, const TBlob& lhs, const
rhs.shape_.get<ndim>(), small.shape_.get<ndim>(), config.rshape, config.lhs_shape,
config.rhs_shape, config.rstride, config.lhs_stride, config.rhs_stride, config.Mnext,
config.kernel_1.do_transpose);
+ MSHADOW_CUDA_POST_KERNEL_CHECK(reduce_kernel);
});
if (config.Mnext > 1) {
reduce_lines_kernel<Reducer, DType>
<<< config.kernel_2.gridSize, config.kernel_2.blockSize, 0, stream >>>
(config.N, config.Mnext, req == kAddTo, config.N, small_dptr, small.dptr<DType>());
+ MSHADOW_CUDA_POST_KERNEL_CHECK(reduce_lines_kernel);
}
}
}
diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index 52aca09..55bb30c 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -15,9 +15,11 @@
# specific language governing permissions and limitations
# under the License.
+from __future__ import print_function
import sys
import os
import time
+import multiprocessing as mp
import unittest
import mxnet as mx
import numpy as np
@@ -1478,6 +1480,42 @@ def test_cross_device_autograd():
assert_almost_equal(dx, x.grad.asnumpy())
+
+# The following 2 functions launch 0-thread kernels, an error that should be caught and signaled.
+def kernel_error_check_imperative():
+ os.environ['MXNET_ENGINE_TYPE'] = 'NaiveEngine'
+ a = mx.nd.array([1,2,3],ctx=mx.gpu(0))
+ b = mx.nd.array([],ctx=mx.gpu(0))
+ c = (a / b).asnumpy()
+
+def kernel_error_check_symbolic():
+ os.environ['MXNET_ENGINE_TYPE'] = 'NaiveEngine'
+ a = mx.sym.Variable('a')
+ b = mx.sym.Variable('b')
+ c = a / b
+ f = c.bind(mx.gpu(0), { 'a':mx.nd.array([1,2,3],ctx=mx.gpu(0)),
+ 'b':mx.nd.array([],ctx=mx.gpu(0))})
+ f.forward()
+ g = f.outputs[0].asnumpy()
+
+def test_kernel_error_checking():
+ # Running tests that may throw exceptions out of worker threads will stop CI testing
+ # if not run in a separate process (with its own address space for CUDA compatibility).
+ try:
+ mpctx = mp.get_context('spawn')
+ except:
+ print('SKIP: python%s.%s lacks the required process fork-exec support ... ' %
+ sys.version_info[0:2], file=sys.stderr, end='')
+ else:
+ with discard_stderr():
+ for f in [kernel_error_check_imperative, kernel_error_check_symbolic]:
+ p = mpctx.Process(target=f)
+ p.start()
+ p.join()
+ assert p.exitcode != 0,\
+ "Expected a synchronous kernel error from %s(), none seen." % f.__name__
+
+
if __name__ == '__main__':
import nose
nose.runmodule()
--
To stop receiving notification emails like this one, please contact
jxie@apache.org.