You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by ro...@apache.org on 2019/12/04 18:07:54 UTC
[incubator-mxnet] 01/01: Revert "migrate cudaMemcpy to
cudaMemcpyAsync+cudaStreamSynchronize (#16790)"
This is an automated email from the ASF dual-hosted git repository.
roywei pushed a commit to branch revert-16790-no_memcpy
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
commit 7e561a3800827121f3d367c3a8252bb228ce5a4b
Author: Lai Wei <ro...@gmail.com>
AuthorDate: Wed Dec 4 10:07:09 2019 -0800
Revert "migrate cudaMemcpy to cudaMemcpyAsync+cudaStreamSynchronize (#16790)"
This reverts commit 42d3182e5abd2ebbacb45027a08c793d30d46a50.
---
src/kvstore/kvstore_utils.cu | 9 +-
src/ndarray/ndarray_function.cu | 10 +--
src/operator/contrib/adamw-inl.h | 11 ++-
src/operator/contrib/adamw.cc | 2 +-
src/operator/contrib/adamw.cu | 8 +-
src/operator/contrib/boolean_mask.cu | 16 ++--
src/operator/contrib/index_array.cu | 19 ++--
src/operator/contrib/multi_proposal.cu | 110 +++++++++++-------------
src/operator/contrib/proposal.cu | 33 +++----
src/operator/numpy/np_boolean_mask_assign.cu | 11 +--
src/operator/numpy/np_nonzero_op.cu | 14 ++-
src/operator/numpy/np_unique_op.cu | 17 ++--
src/operator/numpy/random/dist_common.cc | 4 +-
src/operator/numpy/random/dist_common.cu | 14 +--
src/operator/numpy/random/dist_common.h | 4 +-
src/operator/numpy/random/np_bernoulli_op.h | 2 +-
src/operator/numpy/random/np_multinomial_op.cu | 8 +-
src/operator/numpy/random/np_multinomial_op.h | 4 +-
src/operator/numpy/random/np_normal_op.h | 4 +-
src/operator/tensor/cast_storage-inl.cuh | 8 +-
src/operator/tensor/dot-inl.cuh | 8 +-
src/operator/tensor/elemwise_binary_op_basic.cu | 5 +-
src/operator/tensor/indexing_op.cu | 15 ++--
src/operator/tensor/matrix_op.cu | 5 +-
src/operator/tensor/square_sum.cu | 4 +-
25 files changed, 145 insertions(+), 200 deletions(-)
diff --git a/src/kvstore/kvstore_utils.cu b/src/kvstore/kvstore_utils.cu
index 92b203c..2dab5bc 100644
--- a/src/kvstore/kvstore_utils.cu
+++ b/src/kvstore/kvstore_utils.cu
@@ -82,17 +82,16 @@ size_t UniqueImplGPU(NDArray *workspace, mshadow::Stream<gpu> *s,
#else
thrust::sort(thrust::cuda::par.on(stream),
dptr, dptr + size, thrust::greater<IType>());
- CUDA_CALL(cudaMemcpyAsync(sort_output_ptr, dptr, sort_output_bytes,
- cudaMemcpyDeviceToDevice, stream));
+ CUDA_CALL(cudaMemcpy(sort_output_ptr, dptr, sort_output_bytes,
+ cudaMemcpyDeviceToDevice));
#endif
// execute unique kernel
cub::DeviceSelect::Unique(temp_storage, unique_temp_bytes, sort_output_ptr, dptr,
num_selected_ptr, size, stream);
// retrieve num selected unique values
size_t num_selected_out = 0;
- CUDA_CALL(cudaMemcpyAsync(&num_selected_out, num_selected_ptr, num_selected_bytes,
- cudaMemcpyDeviceToHost, stream));
- CUDA_CALL(cudaStreamSynchronize(stream));
+ CUDA_CALL(cudaMemcpy(&num_selected_out, num_selected_ptr, num_selected_bytes,
+ cudaMemcpyDeviceToHost));
return num_selected_out;
}
diff --git a/src/ndarray/ndarray_function.cu b/src/ndarray/ndarray_function.cu
index 79bc345..da7b60d 100644
--- a/src/ndarray/ndarray_function.cu
+++ b/src/ndarray/ndarray_function.cu
@@ -129,13 +129,12 @@ void ElementwiseSumRspImpl(mshadow::Stream<gpu>* s,
IType* row_flg = NULL;
void* d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
- cudaStream_t stream = mshadow::Stream<gpu>::GetStream(s);
cub::DeviceScan::InclusiveSum(d_temp_storage,
temp_storage_bytes,
row_flg,
row_flg,
num_rows,
- stream);
+ mshadow::Stream<gpu>::GetStream(s));
mshadow::Tensor<gpu, 1, char> workspace = rsc
.get_space_typed<gpu, 1, char>(mshadow::Shape1(num_rows * sizeof(IType) +
temp_storage_bytes), s);
@@ -159,12 +158,11 @@ void ElementwiseSumRspImpl(mshadow::Stream<gpu>* s,
row_flg,
row_flg,
num_rows,
- stream);
+ mshadow::Stream<gpu>::GetStream(s));
// Get total number of output non-zero rows from GPU and allocate out data and row_idx
dim_t nnr_out = 0;
- CUDA_CALL(cudaMemcpyAsync(&nnr_out, &row_flg[num_rows-1], sizeof(dim_t),
- cudaMemcpyDeviceToHost, stream));
- CUDA_CALL(cudaStreamSynchronize(stream));
+ CUDA_CALL(cudaMemcpy(&nnr_out, &row_flg[num_rows-1], sizeof(dim_t),
+ cudaMemcpyDeviceToHost));
out->CheckAndAlloc({mshadow::Shape1(nnr_out)});
IType* out_row_idx = out->aux_data(kIdx).dptr<IType>();
DType* out_data = out->data().dptr<DType>();
diff --git a/src/operator/contrib/adamw-inl.h b/src/operator/contrib/adamw-inl.h
index 6f48333..fd139de 100644
--- a/src/operator/contrib/adamw-inl.h
+++ b/src/operator/contrib/adamw-inl.h
@@ -442,15 +442,14 @@ static inline void MultiAdamWUpdate(const nnvm::NodeAttrs& attrs,
}
template<typename xpu>
-void GetScaleFloat(mshadow::Stream<xpu> *s, const TBlob &scale_blob, float *pScalef);
+void GetScaleFloat(const TBlob &scale_blob, float *pScalef);
template<typename xpu>
-bool PrepareInputBlobs(const OpContext &ctx,
- const std::vector<TBlob> &inputs,
+bool PrepareInputBlobs(const std::vector<TBlob> &inputs,
std::vector<TBlob> *inputs_wo_scale,
float *pScalef) {
const size_t num_in = inputs.size() - 1;
- GetScaleFloat<xpu>(ctx.get_stream<xpu>(), inputs[num_in], pScalef);
+ GetScaleFloat<xpu>(inputs[num_in], pScalef);
if (!std::isfinite(*pScalef) || *pScalef == 0)
return false;
@@ -469,7 +468,7 @@ inline void MPUpdate(const nnvm::NodeAttrs& attrs,
const std::vector<TBlob> &outputs) {
std::vector<TBlob> inputs_wo_scale;
float scalef;
- if (!PrepareInputBlobs<xpu>(ctx, inputs, &inputs_wo_scale, &scalef))
+ if (!PrepareInputBlobs<xpu>(inputs, &inputs_wo_scale, &scalef))
return;
F::Forward(attrs, ctx, inputs_wo_scale, req, outputs, scalef);
@@ -483,7 +482,7 @@ inline void multiMPUpdate(const nnvm::NodeAttrs& attrs,
const std::vector<TBlob> &outputs) {
std::vector<TBlob> inputs_wo_scale;
float scalef;
- if (!PrepareInputBlobs<xpu>(ctx, inputs, &inputs_wo_scale, &scalef))
+ if (!PrepareInputBlobs<xpu>(inputs, &inputs_wo_scale, &scalef))
return;
if (!MP)
diff --git a/src/operator/contrib/adamw.cc b/src/operator/contrib/adamw.cc
index effae5c..2c730f0 100644
--- a/src/operator/contrib/adamw.cc
+++ b/src/operator/contrib/adamw.cc
@@ -119,7 +119,7 @@ the update is skipped.
.add_arguments(AdamWParam::__FIELDS__());
template<>
-void GetScaleFloat<cpu>(mshadow::Stream<cpu> *s, const TBlob &scale_blob, float *pScalef) {
+void GetScaleFloat<cpu>(const TBlob &scale_blob, float *pScalef) {
MSHADOW_REAL_TYPE_SWITCH(scale_blob.type_flag_, DType,
*pScalef = static_cast<float>(*scale_blob.dptr<DType>());
)
diff --git a/src/operator/contrib/adamw.cu b/src/operator/contrib/adamw.cu
index 2b0040e..81b13c9 100644
--- a/src/operator/contrib/adamw.cu
+++ b/src/operator/contrib/adamw.cu
@@ -29,13 +29,11 @@ namespace mxnet {
namespace op {
template<>
-void GetScaleFloat<gpu>(mshadow::Stream<gpu> *s, const TBlob &scale_blob, float *pScalef) {
+void GetScaleFloat<gpu>(const TBlob &scale_blob, float *pScalef) {
MSHADOW_REAL_TYPE_SWITCH(scale_blob.type_flag_, DType, {
DType scale = 0;
- cudaStream_t stream = mshadow::Stream<gpu>::GetStream(s);
- CUDA_CALL(cudaMemcpyAsync(&scale, scale_blob.dptr<DType>(), sizeof(DType),
- cudaMemcpyDeviceToHost, stream));
- CUDA_CALL(cudaStreamSynchronize(stream));
+ CUDA_CALL(cudaMemcpy(&scale, scale_blob.dptr<DType>(), sizeof(DType),
+ cudaMemcpyDeviceToHost));
*pScalef = static_cast<float>(scale);
})
}
diff --git a/src/operator/contrib/boolean_mask.cu b/src/operator/contrib/boolean_mask.cu
index 95f5614..a5ef4a7 100644
--- a/src/operator/contrib/boolean_mask.cu
+++ b/src/operator/contrib/boolean_mask.cu
@@ -46,7 +46,6 @@ inline void BooleanMaskForward<gpu>(const nnvm::NodeAttrs& attrs,
CHECK_EQ(data.shape()[axis], idx.shape()[0]);
CHECK_EQ(idx.shape().ndim(), 1U);
Stream<gpu>* s = ctx.get_stream<gpu>();
- cudaStream_t stream = Stream<gpu>::GetStream(s);
// count the number of 1s in `idx`, so that we could know the output dimension
size_t idx_size = idx.shape()[0];
int32_t valid_num = 0;
@@ -59,7 +58,7 @@ inline void BooleanMaskForward<gpu>(const nnvm::NodeAttrs& attrs,
prefix_sum,
prefix_sum,
idx_size,
- stream);
+ Stream<gpu>::GetStream(s));
size_t buffer_size = idx_size * sizeof(int32_t);
temp_storage_bytes += buffer_size;
// Allocate memory on GPU and allocate pointer
@@ -77,11 +76,9 @@ inline void BooleanMaskForward<gpu>(const nnvm::NodeAttrs& attrs,
prefix_sum,
prefix_sum,
idx_size,
- stream);
- CUDA_CALL(cudaMemcpyAsync(&valid_num, &prefix_sum[idx_size - 1], sizeof(int32_t),
- cudaMemcpyDeviceToHost, stream));
- CUDA_CALL(cudaStreamSynchronize(stream));
-
+ Stream<gpu>::GetStream(s));
+ CUDA_CALL(cudaMemcpy(&valid_num, &prefix_sum[idx_size - 1], sizeof(int32_t),
+ cudaMemcpyDeviceToHost));
// Set the output shape forcefully
mxnet::TShape data_shape = data.shape();
data_shape[axis] = valid_num;
@@ -113,7 +110,6 @@ inline void BooleanMaskBackward<gpu>(const nnvm::NodeAttrs& attrs,
const NDArray& idx = inputs[2];
const NDArray& igrad_data = outputs[0];
Stream<gpu>* s = ctx.get_stream<gpu>();
- cudaStream_t stream = Stream<gpu>::GetStream(s);
// Count the number of 1s in `idx`, so that we could know the output dimension
size_t idx_size = idx.shape()[0];
int32_t* prefix_sum = nullptr;
@@ -125,7 +121,7 @@ inline void BooleanMaskBackward<gpu>(const nnvm::NodeAttrs& attrs,
prefix_sum,
prefix_sum,
idx_size,
- stream);
+ Stream<gpu>::GetStream(s));
size_t buffer_size = idx_size * sizeof(int32_t);
temp_storage_bytes += buffer_size;
// Allocate memory on GPU and allocate pointer
@@ -143,7 +139,7 @@ inline void BooleanMaskBackward<gpu>(const nnvm::NodeAttrs& attrs,
prefix_sum,
prefix_sum,
idx_size,
- stream);
+ Stream<gpu>::GetStream(s));
size_t input_size = igrad_data.shape().Size();
size_t col_size = input_size / idx_size;
// Backward pass
diff --git a/src/operator/contrib/index_array.cu b/src/operator/contrib/index_array.cu
index dae61ca..ddba6a8 100644
--- a/src/operator/contrib/index_array.cu
+++ b/src/operator/contrib/index_array.cu
@@ -41,8 +41,7 @@ void IndexArrayForwardGPU(const nnvm::NodeAttrs &attrs,
const TShape inshape = in_data.shape_;
const int ndim = inshape.ndim();
- Stream<gpu> *s = ctx.get_stream<gpu>();
- cudaStream_t stream = Stream<gpu>::GetStream(s);
+ Stream<gpu> *stream = ctx.get_stream<gpu>();
using namespace mxnet_op;
@@ -56,24 +55,24 @@ void IndexArrayForwardGPU(const nnvm::NodeAttrs &attrs,
IndexArrayBuildSelectedAxesWorkspace(axes, index_products, cpu_workspace.data(), ndim);
Tensor<gpu, 1, int64_t> workspace =
- ctx.requested[0].get_space_typed<gpu, 1, int64_t>(Shape1(2 * naxes), s);
+ ctx.requested[0].get_space_typed<gpu, 1, int64_t>(Shape1(2 * naxes), stream);
- CUDA_CALL(cudaMemcpyAsync(workspace.dptr_, cpu_workspace.data(), sizeof(int64_t) * (2 * naxes),
- cudaMemcpyHostToDevice, stream));
+ CUDA_CALL(cudaMemcpy(workspace.dptr_, cpu_workspace.data(), sizeof(int64_t) * (2 * naxes),
+ cudaMemcpyHostToDevice));
MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {
- Kernel<IndexArrayKernel<req_type>, gpu>::Launch(s, in_data.Size(),
+ Kernel<IndexArrayKernel<req_type>, gpu>::Launch(stream, in_data.Size(),
out_data.dptr<int64_t>(), naxes, workspace.dptr_);
});
} else {
Tensor<gpu, 1, dim_t> workspace =
- ctx.requested[0].get_space_typed<gpu, 1, dim_t>(Shape1(ndim), s);
+ ctx.requested[0].get_space_typed<gpu, 1, dim_t>(Shape1(ndim), stream);
- CUDA_CALL(cudaMemcpyAsync(workspace.dptr_, inshape.data(), sizeof(dim_t) * ndim,
- cudaMemcpyHostToDevice, stream));
+ CUDA_CALL(cudaMemcpy(workspace.dptr_, inshape.data(), sizeof(dim_t) * ndim,
+ cudaMemcpyHostToDevice));
MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {
- Kernel<IndexArrayDefaultKernel<req_type>, gpu>::Launch(s, in_data.Size(),
+ Kernel<IndexArrayDefaultKernel<req_type>, gpu>::Launch(stream, in_data.Size(),
out_data.dptr<int64_t>(), ndim, workspace.dptr_);
});
}
diff --git a/src/operator/contrib/multi_proposal.cu b/src/operator/contrib/multi_proposal.cu
index 1aa852a..4552ae4 100644
--- a/src/operator/contrib/multi_proposal.cu
+++ b/src/operator/contrib/multi_proposal.cu
@@ -324,8 +324,7 @@ __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
}
}
-void _nms(mshadow::Stream<gpu> *s,
- const mshadow::Tensor<gpu, 2>& boxes,
+void _nms(const mshadow::Tensor<gpu, 2>& boxes,
const float nms_overlap_thresh,
const int rpn_post_nms_top_n,
int *keep,
@@ -350,13 +349,10 @@ void _nms(mshadow::Stream<gpu> *s,
mask_dev);
FRCNN_CUDA_CHECK(cudaPeekAtLastError());
std::vector<uint64_t> mask_host(boxes_num * col_blocks);
-
- cudaStream_t stream = mshadow::Stream<gpu>::GetStream(s);
- FRCNN_CUDA_CHECK(cudaMemcpyAsync(&mask_host[0],
- mask_dev,
- sizeof(uint64_t) * boxes_num * col_blocks,
- cudaMemcpyDeviceToHost, stream));
- FRCNN_CUDA_CHECK(cudaStreamSynchronize(stream));
+ FRCNN_CUDA_CHECK(cudaMemcpy(&mask_host[0],
+ mask_dev,
+ sizeof(uint64_t) * boxes_num * col_blocks,
+ cudaMemcpyDeviceToHost));
std::vector<uint64_t> remv(col_blocks);
memset(&remv[0], 0, sizeof(uint64_t) * col_blocks);
@@ -480,12 +476,8 @@ class MultiProposalGPUOp : public Operator{
sizeof(float) * num_images * count_anchors * 5));
Tensor<xpu, 3> workspace_proposals(workspace_proposals_ptr,
Shape3(num_images, count_anchors, 5));
-
- cudaStream_t stream = mshadow::Stream<gpu>::GetStream(s);
-
- FRCNN_CUDA_CHECK(cudaMemcpyAsync(workspace_proposals.dptr_, &anchors[0],
- sizeof(float) * anchors.size(),
- cudaMemcpyHostToDevice, stream));
+ FRCNN_CUDA_CHECK(cudaMemcpy(workspace_proposals.dptr_, &anchors[0],
+ sizeof(float) * anchors.size(), cudaMemcpyHostToDevice));
// Copy proposals to a mesh grid
dim3 dimGrid((count + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock);
@@ -537,50 +529,50 @@ class MultiProposalGPUOp : public Operator{
FRCNN_CUDA_CHECK(cudaMalloc(&keep, sizeof(int) * rpn_pre_nms_top_n));
for (int b = 0; b < num_images; b++) {
- CheckLaunchParam(dimGrid, dimBlock, "CopyScore");
- CopyScoreKernel << <dimGrid, dimBlock >> >(
- count_anchors, workspace_proposals.dptr_ + b * count_anchors * 5,
- score.dptr_, order.dptr_);
- FRCNN_CUDA_CHECK(cudaPeekAtLastError());
-
- // argsort score, save order
- thrust::stable_sort_by_key(thrust::device,
- score.dptr_,
- score.dptr_ + score.size(0),
- order.dptr_,
- thrust::greater<real_t>());
- FRCNN_CUDA_CHECK(cudaPeekAtLastError());
-
- // Reorder proposals according to order
-
- dimGrid.x = (rpn_pre_nms_top_n + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock;
- CheckLaunchParam(dimGrid, dimBlock, "ReorderProposals");
- ReorderProposalsKernel << <dimGrid, dimBlock >> >(
- rpn_pre_nms_top_n, workspace_proposals.dptr_ + b * count_anchors * 5,
- order.dptr_, workspace_ordered_proposals.dptr_);
- FRCNN_CUDA_CHECK(cudaPeekAtLastError());
-
- // perform nms
- std::vector<int> _keep(workspace_ordered_proposals.size(0));
- int out_size = 0;
- _nms(s, workspace_ordered_proposals,
- param_.threshold,
- rpn_post_nms_top_n,
- &_keep[0],
- &out_size);
-
- // copy nms result to gpu
- FRCNN_CUDA_CHECK(cudaMemcpyAsync(keep, &_keep[0], sizeof(int) * _keep.size(),
- cudaMemcpyHostToDevice, stream));
-
- // copy results after nms
- dimGrid.x = (param_.rpn_post_nms_top_n + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock;
- CheckLaunchParam(dimGrid, dimBlock, "PrepareOutput");
- PrepareOutput << <dimGrid, dimBlock >> >(
- param_.rpn_post_nms_top_n, workspace_ordered_proposals.dptr_, keep, out_size, b,
- out.dptr_ + b * param_.rpn_post_nms_top_n * 5,
- out_score.dptr_ + b * param_.rpn_post_nms_top_n);
- FRCNN_CUDA_CHECK(cudaPeekAtLastError());
+ CheckLaunchParam(dimGrid, dimBlock, "CopyScore");
+ CopyScoreKernel << <dimGrid, dimBlock >> >(
+ count_anchors, workspace_proposals.dptr_ + b * count_anchors * 5,
+ score.dptr_, order.dptr_);
+ FRCNN_CUDA_CHECK(cudaPeekAtLastError());
+
+ // argsort score, save order
+ thrust::stable_sort_by_key(thrust::device,
+ score.dptr_,
+ score.dptr_ + score.size(0),
+ order.dptr_,
+ thrust::greater<real_t>());
+ FRCNN_CUDA_CHECK(cudaPeekAtLastError());
+
+ // Reorder proposals according to order
+
+ dimGrid.x = (rpn_pre_nms_top_n + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock;
+ CheckLaunchParam(dimGrid, dimBlock, "ReorderProposals");
+ ReorderProposalsKernel << <dimGrid, dimBlock >> >(
+ rpn_pre_nms_top_n, workspace_proposals.dptr_ + b * count_anchors * 5,
+ order.dptr_, workspace_ordered_proposals.dptr_);
+ FRCNN_CUDA_CHECK(cudaPeekAtLastError());
+
+ // perform nms
+ std::vector<int> _keep(workspace_ordered_proposals.size(0));
+ int out_size = 0;
+ _nms(workspace_ordered_proposals,
+ param_.threshold,
+ rpn_post_nms_top_n,
+ &_keep[0],
+ &out_size);
+
+ // copy nms result to gpu
+ FRCNN_CUDA_CHECK(cudaMemcpy(keep, &_keep[0], sizeof(int) * _keep.size(),
+ cudaMemcpyHostToDevice));
+
+ // copy results after nms
+ dimGrid.x = (param_.rpn_post_nms_top_n + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock;
+ CheckLaunchParam(dimGrid, dimBlock, "PrepareOutput");
+ PrepareOutput << <dimGrid, dimBlock >> >(
+ param_.rpn_post_nms_top_n, workspace_ordered_proposals.dptr_, keep, out_size, b,
+ out.dptr_ + b * param_.rpn_post_nms_top_n * 5,
+ out_score.dptr_ + b * param_.rpn_post_nms_top_n);
+ FRCNN_CUDA_CHECK(cudaPeekAtLastError());
}
// free temporary memory
FRCNN_CUDA_CHECK(cudaFree(keep));
diff --git a/src/operator/contrib/proposal.cu b/src/operator/contrib/proposal.cu
index b107dfa..446c92b 100644
--- a/src/operator/contrib/proposal.cu
+++ b/src/operator/contrib/proposal.cu
@@ -305,8 +305,7 @@ __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
}
}
-void _nms(mshadow::Stream<gpu> *s,
- const mshadow::Tensor<gpu, 2>& boxes,
+void _nms(const mshadow::Tensor<gpu, 2>& boxes,
const float nms_overlap_thresh,
const int rpn_post_nms_top_n,
int *keep,
@@ -331,12 +330,10 @@ void _nms(mshadow::Stream<gpu> *s,
mask_dev);
FRCNN_CUDA_CHECK(cudaPeekAtLastError());
std::vector<uint64_t> mask_host(boxes_num * col_blocks);
- cudaStream_t stream = mshadow::Stream<gpu>::GetStream(s);
- FRCNN_CUDA_CHECK(cudaMemcpyAsync(&mask_host[0],
- mask_dev,
- sizeof(uint64_t) * boxes_num * col_blocks,
- cudaMemcpyDeviceToHost, stream));
- FRCNN_CUDA_CHECK(cudaStreamSynchronize(stream));
+ FRCNN_CUDA_CHECK(cudaMemcpy(&mask_host[0],
+ mask_dev,
+ sizeof(uint64_t) * boxes_num * col_blocks,
+ cudaMemcpyDeviceToHost));
std::vector<uint64_t> remv(col_blocks);
memset(&remv[0], 0, sizeof(uint64_t) * col_blocks);
@@ -459,10 +456,9 @@ class ProposalGPUOp : public Operator{
float* workspace_proposals_ptr = NULL;
FRCNN_CUDA_CHECK(cudaMalloc(&workspace_proposals_ptr, sizeof(float) * count * 5));
Tensor<xpu, 2> workspace_proposals(workspace_proposals_ptr, Shape2(count, 5));
- cudaStream_t stream = mshadow::Stream<gpu>::GetStream(s);
- FRCNN_CUDA_CHECK(cudaMemcpyAsync(workspace_proposals.dptr_,
- &anchors[0], sizeof(float) * anchors.size(),
- cudaMemcpyHostToDevice, stream));
+ FRCNN_CUDA_CHECK(cudaMemcpy(workspace_proposals.dptr_,
+ &anchors[0], sizeof(float) * anchors.size(),
+ cudaMemcpyHostToDevice));
// Copy proposals to a mesh grid
dim3 dimGrid((count + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock);
@@ -475,10 +471,9 @@ class ProposalGPUOp : public Operator{
// im_info is small, we want to copy them to cpu
std::vector<float> cpu_im_info(3);
- FRCNN_CUDA_CHECK(cudaMemcpyAsync(&cpu_im_info[0], im_info.dptr_,
- sizeof(float) * cpu_im_info.size(),
- cudaMemcpyDeviceToHost, stream));
- FRCNN_CUDA_CHECK(cudaStreamSynchronize(stream));
+ FRCNN_CUDA_CHECK(cudaMemcpy(&cpu_im_info[0], im_info.dptr_,
+ sizeof(float) * cpu_im_info.size(),
+ cudaMemcpyDeviceToHost));
// prevent padded predictions
int real_height = static_cast<int>(cpu_im_info[0] / param_.feature_stride);
@@ -548,7 +543,7 @@ class ProposalGPUOp : public Operator{
// perform nms
std::vector<int> _keep(workspace_ordered_proposals.size(0));
int out_size = 0;
- _nms(s, workspace_ordered_proposals,
+ _nms(workspace_ordered_proposals,
param_.threshold,
rpn_post_nms_top_n,
&_keep[0],
@@ -557,8 +552,8 @@ class ProposalGPUOp : public Operator{
// copy nms result to gpu
int* keep;
FRCNN_CUDA_CHECK(cudaMalloc(&keep, sizeof(int) * _keep.size()));
- FRCNN_CUDA_CHECK(cudaMemcpyAsync(keep, &_keep[0], sizeof(int) * _keep.size(),
- cudaMemcpyHostToDevice, stream));
+ FRCNN_CUDA_CHECK(cudaMemcpy(keep, &_keep[0], sizeof(int) * _keep.size(),
+ cudaMemcpyHostToDevice));
// copy results after nms
dimGrid.x = (param_.rpn_post_nms_top_n + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock;
diff --git a/src/operator/numpy/np_boolean_mask_assign.cu b/src/operator/numpy/np_boolean_mask_assign.cu
index 2ccc4ff..e3b0330 100644
--- a/src/operator/numpy/np_boolean_mask_assign.cu
+++ b/src/operator/numpy/np_boolean_mask_assign.cu
@@ -113,7 +113,6 @@ size_t* GetValidNumGPU(const OpContext &ctx, const DType *idx, const size_t idx_
void* d_temp_storage = nullptr;
size_t temp_storage_bytes = 0;
Stream<gpu>* s = ctx.get_stream<gpu>();
- cudaStream_t stream = Stream<gpu>::GetStream(s);
// Calculate total temporary memory size
cub::DeviceScan::ExclusiveSum(d_temp_storage,
@@ -121,7 +120,7 @@ size_t* GetValidNumGPU(const OpContext &ctx, const DType *idx, const size_t idx_
prefix_sum,
prefix_sum,
idx_size + 1,
- stream);
+ Stream<gpu>::GetStream(s));
size_t buffer_size = (idx_size + 1) * sizeof(size_t);
temp_storage_bytes += buffer_size;
// Allocate memory on GPU and allocate pointer
@@ -145,7 +144,7 @@ size_t* GetValidNumGPU(const OpContext &ctx, const DType *idx, const size_t idx_
prefix_sum,
prefix_sum,
idx_size + 1,
- stream);
+ Stream<gpu>::GetStream(s));
return prefix_sum;
}
@@ -175,10 +174,8 @@ void NumpyBooleanAssignForwardGPU(const nnvm::NodeAttrs& attrs,
MSHADOW_TYPE_SWITCH(mask.type_flag_, MType, {
prefix_sum = GetValidNumGPU<MType>(ctx, mask.dptr<MType>(), mask_size);
});
- cudaStream_t stream = mshadow::Stream<gpu>::GetStream(s);
- CUDA_CALL(cudaMemcpyAsync(&valid_num, &prefix_sum[mask_size], sizeof(size_t),
- cudaMemcpyDeviceToHost, stream));
- CUDA_CALL(cudaStreamSynchronize(stream));
+ CUDA_CALL(cudaMemcpy(&valid_num, &prefix_sum[mask_size], sizeof(size_t),
+ cudaMemcpyDeviceToHost));
}
// If there's no True in mask, return directly
if (valid_num == 0) return;
diff --git a/src/operator/numpy/np_nonzero_op.cu b/src/operator/numpy/np_nonzero_op.cu
index a31222e..c732d2c 100644
--- a/src/operator/numpy/np_nonzero_op.cu
+++ b/src/operator/numpy/np_nonzero_op.cu
@@ -63,7 +63,6 @@ void NonzeroForwardGPU(const nnvm::NodeAttrs& attrs,
}
int32_t valid_num = 0;
Stream<gpu>* stream = ctx.get_stream<gpu>();
- cudaStream_t cuda_stream = Stream<gpu>::GetStream(stream);
int32_t* prefix_sum = nullptr;
void* d_temp_storage = nullptr;
size_t temp_storage_bytes = 0;
@@ -73,7 +72,7 @@ void NonzeroForwardGPU(const nnvm::NodeAttrs& attrs,
prefix_sum,
prefix_sum,
in_size,
- cuda_stream);
+ Stream<gpu>::GetStream(stream));
size_t buffer_size = in_size * sizeof(int32_t);
temp_storage_bytes += buffer_size;
// Allocate memory on GPU and allocate pointer
@@ -91,18 +90,17 @@ void NonzeroForwardGPU(const nnvm::NodeAttrs& attrs,
prefix_sum,
prefix_sum,
in_size,
- cuda_stream);
- CUDA_CALL(cudaMemcpyAsync(&valid_num, &prefix_sum[in_size - 1], sizeof(int32_t),
- cudaMemcpyDeviceToHost, cuda_stream));
- CUDA_CALL(cudaStreamSynchronize(cuda_stream));
+ Stream<gpu>::GetStream(stream));
+ CUDA_CALL(cudaMemcpy(&valid_num, &prefix_sum[in_size - 1], sizeof(int32_t),
+ cudaMemcpyDeviceToHost));
// 0-dim
if (0 == in.shape().ndim()) {
mxnet::TShape s(2, 1);
if (valid_num) {
const_cast<NDArray &>(out).Init(s);
int64_t temp = 0;
- CUDA_CALL(cudaMemcpyAsync(out.data().dptr<int64_t>(), &temp, sizeof(int64_t),
- cudaMemcpyHostToDevice, cuda_stream));
+ CUDA_CALL(cudaMemcpy(out.data().dptr<int64_t>(), &temp, sizeof(int64_t),
+ cudaMemcpyHostToDevice));
} else {
s[0] = 0;
const_cast<NDArray &>(out).Init(s);
diff --git a/src/operator/numpy/np_unique_op.cu b/src/operator/numpy/np_unique_op.cu
index 22fd1d1..4d90a45 100644
--- a/src/operator/numpy/np_unique_op.cu
+++ b/src/operator/numpy/np_unique_op.cu
@@ -97,7 +97,6 @@ void NumpyUniqueGPUNoneAxisImpl(const NumpyUniqueParam& param,
const std::vector<NDArray> &outputs) {
MXNET_NO_FLOAT16_TYPE_SWITCH(outputs[0].dtype(), DType, {
mshadow::Stream<gpu> *stream = ctx.get_stream<gpu>();
- cudaStream_t cuda_stream = mshadow::Stream<gpu>::GetStream(stream);
auto policy = thrust::cuda::par.on(stream->stream_);
DType* input_data = inputs[0].data().dptr<DType>();
@@ -121,9 +120,8 @@ void NumpyUniqueGPUNoneAxisImpl(const NumpyUniqueParam& param,
thrust::device_vector<int32_t> prefix_sum(input_size, 0);
thrust::inclusive_scan(policy, mask.begin(), mask.end(), prefix_sum.begin());
int32_t valid_num = 0;
- CUDA_CALL(cudaMemcpyAsync(&valid_num, thrust::raw_pointer_cast(&prefix_sum[input_size - 1]),
- sizeof(int32_t), cudaMemcpyDeviceToHost, cuda_stream));
- CUDA_CALL(cudaStreamSynchronize(cuda_stream));
+ CUDA_CALL(cudaMemcpy(&valid_num, thrust::raw_pointer_cast(&prefix_sum[input_size - 1]),
+ sizeof(int32_t), cudaMemcpyDeviceToHost));
// set the output shape forcefully
mxnet::TShape s(1, valid_num);
const_cast<NDArray &>(outputs[0]).Init(s);
@@ -182,7 +180,6 @@ void NumpyUniqueGPUImpl(const NumpyUniqueParam& param,
using namespace mshadow;
using namespace mshadow::expr;
Stream<gpu> *stream = ctx.get_stream<gpu>();
- cudaStream_t cuda_stream = Stream<gpu>::GetStream(stream);
auto policy = thrust::cuda::par.on(stream->stream_);
const index_t actual_axis =
param.axis.value() + ((param.axis.value() < 0) ? inputs[0].shape().ndim() : 0);
@@ -217,9 +214,8 @@ void NumpyUniqueGPUImpl(const NumpyUniqueParam& param,
thrust::device_vector<int32_t> prefix_sum(temp_shape[0], 0);
thrust::inclusive_scan(policy, mask.begin(), mask.end(), prefix_sum.begin());
int32_t valid_num = 0;
- CUDA_CALL(cudaMemcpyAsync(&valid_num, thrust::raw_pointer_cast(&prefix_sum[temp_shape[0] - 1]),
- sizeof(int32_t), cudaMemcpyDeviceToHost, cuda_stream));
- CUDA_CALL(cudaStreamSynchronize(cuda_stream));
+ CUDA_CALL(cudaMemcpy(&valid_num, thrust::raw_pointer_cast(&prefix_sum[temp_shape[0] - 1]),
+ sizeof(int32_t), cudaMemcpyDeviceToHost));
// store the temp output data, reuse the space of 'input_tensor'
Tensor<gpu, 3, DType> temp_tensor(workspace.dptr_,
Shape3(valid_num, temp_shape[1], temp_shape[2]), stream);
@@ -286,12 +282,11 @@ void NumpyUniqueGPUForward(const nnvm::NodeAttrs& attrs,
CHECK(!param.axis.has_value() || param.axis.value() == -1 || param.axis.value() == 0)
<< "Axis can only be -1 or 0 for scalor tensor";
Stream<gpu> *s = ctx.get_stream<gpu>();
- cudaStream_t stream = Stream<gpu>::GetStream(s);
mxnet::TShape shape_1(1, 1);
const_cast<NDArray &>(outputs[0]).Init(shape_1);
MSHADOW_TYPE_SWITCH(outputs[0].dtype(), DType, {
- CUDA_CALL(cudaMemcpyAsync(outputs[0].data().dptr<DType>(), inputs[0].data().dptr<DType>(),
- sizeof(DType), cudaMemcpyDeviceToDevice, stream));
+ CUDA_CALL(cudaMemcpy(outputs[0].data().dptr<DType>(), inputs[0].data().dptr<DType>(),
+ sizeof(DType), cudaMemcpyDeviceToDevice));
});
int output_flag = 0;
if (param.return_index) {
diff --git a/src/operator/numpy/random/dist_common.cc b/src/operator/numpy/random/dist_common.cc
index 18a2085..9255656 100644
--- a/src/operator/numpy/random/dist_common.cc
+++ b/src/operator/numpy/random/dist_common.cc
@@ -30,12 +30,12 @@ namespace mxnet {
namespace op {
template <>
-void _copy<cpu>(mshadow::Stream<cpu> *s, float *dst, float *src) {
+void _copy<cpu>(float *dst, float *src) {
*dst = *src;
}
template <>
-void _copy<cpu>(mshadow::Stream<cpu> *s, double *dst, double *src) {
+void _copy<cpu>(double *dst, double *src) {
*dst = *src;
}
diff --git a/src/operator/numpy/random/dist_common.cu b/src/operator/numpy/random/dist_common.cu
index dbd313b..7dde012 100644
--- a/src/operator/numpy/random/dist_common.cu
+++ b/src/operator/numpy/random/dist_common.cu
@@ -30,19 +30,13 @@ namespace mxnet {
namespace op {
template <>
-void _copy<gpu>(mshadow::Stream<gpu> *s, float *dst, float *src) {
- cudaStream_t stream = mshadow::Stream<gpu>::GetStream(s);
- CUDA_CALL(cudaMemcpyAsync(dst, src, sizeof(float), cudaMemcpyDeviceToHost,
- stream));
- CUDA_CALL(cudaStreamSynchronize(stream));
+void _copy<gpu>(float *dst, float *src) {
+CUDA_CALL(cudaMemcpy(dst, src, sizeof(float), cudaMemcpyDeviceToHost));
}
template <>
-void _copy<gpu>(mshadow::Stream<gpu> *s, double *dst, double *src) {
- cudaStream_t stream = mshadow::Stream<gpu>::GetStream(s);
- CUDA_CALL(cudaMemcpyAsync(dst, src, sizeof(double), cudaMemcpyDeviceToHost,
- stream));
- CUDA_CALL(cudaStreamSynchronize(stream));
+void _copy<gpu>(double *dst, double *src) {
+CUDA_CALL(cudaMemcpy(dst, src, sizeof(double), cudaMemcpyDeviceToHost));
}
} // namespace op
diff --git a/src/operator/numpy/random/dist_common.h b/src/operator/numpy/random/dist_common.h
index e835829..aafd10e 100644
--- a/src/operator/numpy/random/dist_common.h
+++ b/src/operator/numpy/random/dist_common.h
@@ -41,10 +41,10 @@ namespace mxnet {
namespace op {
template <typename xpu>
-void _copy(mshadow::Stream<xpu> *s, float *dst, float*src);
+void _copy(float *dst, float*src);
template <typename xpu>
-void _copy(mshadow::Stream<xpu> *s, double *dst, double*src);
+void _copy(double *dst, double*src);
inline int FillShape(const mxnet::TShape &lshape, const mxnet::TShape &rshape,
diff --git a/src/operator/numpy/random/np_bernoulli_op.h b/src/operator/numpy/random/np_bernoulli_op.h
index 0df1089..aa8e344 100644
--- a/src/operator/numpy/random/np_bernoulli_op.h
+++ b/src/operator/numpy/random/np_bernoulli_op.h
@@ -173,7 +173,7 @@ void NumpyBernoulliForward(const nnvm::NodeAttrs &attrs,
Kernel<check_legal_prob_kernel<IType>, xpu>::Launch(
s, inputs[0].Size(), inputs[0].dptr<IType>(), indicator_device_ptr);
});
- _copy<xpu>(s, &indicator_host, indicator_device_ptr);
+ _copy<xpu>(&indicator_host, indicator_device_ptr);
CHECK_GE(indicator_host, 0.0)
<< "ValueError: expect probs >= 0 && probs <= 1";
}
diff --git a/src/operator/numpy/random/np_multinomial_op.cu b/src/operator/numpy/random/np_multinomial_op.cu
index 132d67b..6aa1639 100644
--- a/src/operator/numpy/random/np_multinomial_op.cu
+++ b/src/operator/numpy/random/np_multinomial_op.cu
@@ -28,12 +28,10 @@ namespace mxnet {
namespace op {
template<typename DType>
-void CheckPvalGPU(const OpContext& ctx, DType* input, int prob_length) {
+void CheckPvalGPU(DType* input, int prob_length) {
std::vector<DType> pvals_(prob_length);
- cudaStream_t stream = mshadow::Stream<gpu>::GetStream(ctx.get_stream<gpu>());
- CUDA_CALL(cudaMemcpyAsync(&pvals_[0], input, sizeof(DType) * prob_length,
- cudaMemcpyDeviceToHost, stream));
- CUDA_CALL(cudaStreamSynchronize(stream));
+ CUDA_CALL(cudaMemcpy(&pvals_[0], input, sizeof(DType) * prob_length,
+ cudaMemcpyDeviceToHost));
DType sum = DType(0.0);
for (int i = 0; i < prob_length; ++i) {
sum += pvals_[i];
diff --git a/src/operator/numpy/random/np_multinomial_op.h b/src/operator/numpy/random/np_multinomial_op.h
index 9c5c73f..2350d20 100644
--- a/src/operator/numpy/random/np_multinomial_op.h
+++ b/src/operator/numpy/random/np_multinomial_op.h
@@ -100,7 +100,7 @@ inline bool NumpyMultinomialOpType(const nnvm::NodeAttrs& attrs,
}
template<typename DType>
-void CheckPvalGPU(const OpContext& ctx, DType* input, int prob_length);
+void CheckPvalGPU(DType* input, int prob_length);
template<typename DType>
void CheckPval(DType* input, int prob_length) {
@@ -188,7 +188,7 @@ void NumpyMultinomialForward(const nnvm::NodeAttrs& attrs,
if (std::is_same<xpu, cpu>::value) {
CheckPval<DType>(inputs[0].dptr<DType>(), prob_length);
} else {
- CheckPvalGPU<DType>(ctx, inputs[0].dptr<DType>(), prob_length);
+ CheckPvalGPU<DType>(inputs[0].dptr<DType>(), prob_length);
}
Kernel<multinomial_kernel, xpu>::Launch(
s, num_output, num_exp, prob_length,
diff --git a/src/operator/numpy/random/np_normal_op.h b/src/operator/numpy/random/np_normal_op.h
index 8cc4288..c74151f 100644
--- a/src/operator/numpy/random/np_normal_op.h
+++ b/src/operator/numpy/random/np_normal_op.h
@@ -181,7 +181,7 @@ void NumpyNormalForward(const nnvm::NodeAttrs &attrs,
Kernel<check_legal_scale_kernel<IType>, xpu>::Launch(
s, inputs[0].Size(), inputs[0].dptr<IType>(), indicator_device_ptr);
});
- _copy<xpu>(s, &indicator_host, indicator_device_ptr);
+ _copy<xpu>(&indicator_host, indicator_device_ptr);
CHECK_GE(indicator_host, 0.0) << "ValueError: scale < 0";
} else {
scalar_pos = 1;
@@ -206,7 +206,7 @@ void NumpyNormalForward(const nnvm::NodeAttrs &attrs,
Kernel<check_legal_scale_kernel<IType>, xpu>::Launch(
s, inputs[1].Size(), inputs[1].dptr<IType>(), indicator_device_ptr);
});
- _copy<xpu>(s, &indicator_host, indicator_device_ptr);
+ _copy<xpu>(&indicator_host, indicator_device_ptr);
CHECK_GE(indicator_host, 0.0) << "ValueError: scale < 0";
int ndim = FillShape(inputs[0].shape_, inputs[1].shape_, outputs[0].shape_,
&new_lshape, &new_hshape, &new_oshape);
diff --git a/src/operator/tensor/cast_storage-inl.cuh b/src/operator/tensor/cast_storage-inl.cuh
index 4c5d0d8..ee1531d 100644
--- a/src/operator/tensor/cast_storage-inl.cuh
+++ b/src/operator/tensor/cast_storage-inl.cuh
@@ -162,9 +162,7 @@ void CastStorageDnsRspGPUImpl_(const OpContext& ctx,
// Get total number of non-zero rows from device
dim_t nnr = 0;
- CUDA_CALL(cudaMemcpyAsync(&nnr, &row_flg[num_rows - 1], sizeof(dim_t),
- cudaMemcpyDeviceToHost, mshadow::Stream<gpu>::GetStream(s)));
- CUDA_CALL(cudaStreamSynchronize(mshadow::Stream<gpu>::GetStream(s)));
+ CUDA_CALL(cudaMemcpy(&nnr, &row_flg[num_rows - 1], sizeof(dim_t), cudaMemcpyDeviceToHost));
// Allocate rsp tensor row index array and fill
rsp->CheckAndAllocAuxData(rowsparse::kIdx, Shape1(nnr));
@@ -557,9 +555,7 @@ inline void CastStorageDnsCsrImpl(const OpContext& ctx,
// Receive total number of nnz values from device
IType nnz = 0;
- CUDA_CALL(cudaMemcpyAsync(&nnz, &(indptr[num_rows]), sizeof(IType), cudaMemcpyDeviceToHost,
- mshadow::Stream<gpu>::GetStream(s)));
- CUDA_CALL(cudaStreamSynchronize(mshadow::Stream<gpu>::GetStream(s)));
+ CUDA_CALL(cudaMemcpy(&nnz, &(indptr[num_rows]), sizeof(IType), cudaMemcpyDeviceToHost));
// Allocate column index array and data array of the csr matrix
csr->CheckAndAllocAuxData(csr::kIdx, Shape1(static_cast<dim_t>(nnz)));
diff --git a/src/operator/tensor/dot-inl.cuh b/src/operator/tensor/dot-inl.cuh
index b8244d3..d6fed4a 100644
--- a/src/operator/tensor/dot-inl.cuh
+++ b/src/operator/tensor/dot-inl.cuh
@@ -702,8 +702,7 @@ inline void DotCsrDnsRspImpl(const OpContext& ctx,
nnr_ptr, nnz, stream);
// retrieve num non-zero rows
size_t nnr = 0;
- CUDA_CALL(cudaMemcpyAsync(&nnr, nnr_ptr, nnr_bytes, cudaMemcpyDeviceToHost, stream));
- CUDA_CALL(cudaStreamSynchronize(stream));
+ CUDA_CALL(cudaMemcpy(&nnr, nnr_ptr, nnr_bytes, cudaMemcpyDeviceToHost));
// allocate data
ret->CheckAndAllocData(mshadow::Shape2(nnz, num_cols_r));
// generate lookup table
@@ -818,9 +817,8 @@ inline void DotCsrRspRspImpl(const OpContext& ctx,
num_cols_l,
mshadow::Stream<gpu>::GetStream(s));
dim_t nnr_out = 0;
- CUDA_CALL(cudaMemcpyAsync(&nnr_out, &row_flg_out[num_cols_l-1], sizeof(dim_t),
- cudaMemcpyDeviceToHost, mshadow::Stream<gpu>::GetStream(s)));
- CUDA_CALL(cudaStreamSynchronize(mshadow::Stream<gpu>::GetStream(s)));
+ CUDA_CALL(cudaMemcpy(&nnr_out, &row_flg_out[num_cols_l-1], sizeof(dim_t),
+ cudaMemcpyDeviceToHost));
if (0 == nnr_out) {
FillZerosRspImpl(s, *ret);
return;
diff --git a/src/operator/tensor/elemwise_binary_op_basic.cu b/src/operator/tensor/elemwise_binary_op_basic.cu
index e39f7e9..f88b8eb 100644
--- a/src/operator/tensor/elemwise_binary_op_basic.cu
+++ b/src/operator/tensor/elemwise_binary_op_basic.cu
@@ -115,9 +115,8 @@ void ElemwiseBinaryOp::RspRspOp(mshadow::Stream<gpu> *s,
num_rows,
mshadow::Stream<gpu>::GetStream(s));
nnvm::dim_t nnr_out = 0;
- CUDA_CALL(cudaMemcpyAsync(&nnr_out, &common_row_table[num_rows-1], sizeof(nnvm::dim_t),
- cudaMemcpyDeviceToHost, mshadow::Stream<gpu>::GetStream(s)));
- CUDA_CALL(cudaStreamSynchronize(mshadow::Stream<gpu>::GetStream(s)))
+ CUDA_CALL(cudaMemcpy(&nnr_out, &common_row_table[num_rows-1], sizeof(nnvm::dim_t),
+ cudaMemcpyDeviceToHost));
output.CheckAndAlloc({mshadow::Shape1(nnr_out)});
Kernel<FillRspRowIdxKernel, gpu>::Launch(
s, num_rows, output.aux_data(kIdx).dptr<IType>(), common_row_table, num_rows);
diff --git a/src/operator/tensor/indexing_op.cu b/src/operator/tensor/indexing_op.cu
index 8250efb..3ccf1f3 100644
--- a/src/operator/tensor/indexing_op.cu
+++ b/src/operator/tensor/indexing_op.cu
@@ -154,9 +154,8 @@ bool CheckIndexOutOfBound(mshadow::Stream<gpu> *s, const DType* data_ptr, size_t
int32_t is_valid = 0;
Kernel<set_zero, gpu>::Launch(s, 1, is_valid_ptr);
Kernel<is_valid_check, gpu>::Launch(s, data_size, is_valid_ptr, data_ptr, min, max);
- CUDA_CALL(cudaMemcpyAsync(&is_valid, is_valid_ptr, sizeof(char),
- cudaMemcpyDeviceToHost, mshadow::Stream<gpu>::GetStream(s)));
- CUDA_CALL(cudaStreamSynchronize(mshadow::Stream<gpu>::GetStream(s)));
+ CUDA_CALL(cudaMemcpy(&is_valid, is_valid_ptr, sizeof(char),
+ cudaMemcpyDeviceToHost));
return is_valid == 0;
}
@@ -308,9 +307,8 @@ void SparseEmbeddingDeterministicKernelLaunch(const OpContext& ctx,
grad_row_idx, grad_row_idx + data_size, data_size, Stream<gpu>::GetStream(s));
dim_t nnr = 0;
- CUDA_CALL(cudaMemcpyAsync(&nnr, grad_row_idx + data_size, sizeof(RType),
- cudaMemcpyDeviceToHost, mshadow::Stream<gpu>::GetStream(s)));
- CUDA_CALL(cudaStreamSynchronize(mshadow::Stream<gpu>::GetStream(s)));
+ CUDA_CALL(cudaMemcpy(&nnr, grad_row_idx + data_size, sizeof(RType),
+ cudaMemcpyDeviceToHost));
CHECK_EQ(output.shape().ndim(), 2) << "Unexcepted ndim";
output.CheckAndAllocData(Shape2(nnr, output.shape()[1]));
output.set_aux_shape(kIdx, Shape1(nnr));
@@ -412,9 +410,8 @@ inline void SparseEmbeddingOpBackwardRspImpl<gpu>(const bool deterministic,
num_rows,
mshadow::Stream<gpu>::GetStream(s));
dim_t nnr = 0;
- CUDA_CALL(cudaMemcpyAsync(&nnr, &prefix_sum[num_rows-1], sizeof(dim_t),
- cudaMemcpyDeviceToHost, mshadow::Stream<gpu>::GetStream(s)));
- CUDA_CALL(cudaStreamSynchronize(mshadow::Stream<gpu>::GetStream(s)));
+ CUDA_CALL(cudaMemcpy(&nnr, &prefix_sum[num_rows-1], sizeof(dim_t),
+ cudaMemcpyDeviceToHost));
if (nnr == 0) {
FillZerosRspImpl(s, output);
return;
diff --git a/src/operator/tensor/matrix_op.cu b/src/operator/tensor/matrix_op.cu
index 239e42c..b382c55 100644
--- a/src/operator/tensor/matrix_op.cu
+++ b/src/operator/tensor/matrix_op.cu
@@ -114,9 +114,8 @@ void SliceDimTwoCsrImpl<gpu>(const mxnet::TShape &begin, const mxnet::TShape &en
Stream<gpu>::GetStream(s));
// retrieve nnr
RType nnr = 0;
- CUDA_CALL(cudaMemcpyAsync(&nnr, &out_indptr[indptr_len-1], sizeof(RType),
- cudaMemcpyDeviceToHost, mshadow::Stream<gpu>::GetStream(s)));
- CUDA_CALL(cudaStreamSynchronize(mshadow::Stream<gpu>::GetStream(s)));
+ CUDA_CALL(cudaMemcpy(&nnr, &out_indptr[indptr_len-1], sizeof(RType),
+ cudaMemcpyDeviceToHost));
// returns zeros in csr format if nnr = 0
if (nnr == 0) {
diff --git a/src/operator/tensor/square_sum.cu b/src/operator/tensor/square_sum.cu
index 83287e0..0b40786 100644
--- a/src/operator/tensor/square_sum.cu
+++ b/src/operator/tensor/square_sum.cu
@@ -42,9 +42,7 @@ void CheckSameIdx<gpu>(const OpContext& ctx,
mxnet_op::Kernel<mxnet_op::set_zero, gpu>::Launch(s, 1, is_diff_ptr);
mxnet_op::Kernel<CheckSameIdxKernel, gpu>::Launch(s, idx_size,
ograd_idx, in_idx, is_diff_ptr);
- CUDA_CALL(cudaMemcpyAsync(&is_diff, is_diff_ptr, sizeof(int32_t),
- cudaMemcpyDeviceToHost, mshadow::Stream<gpu>::GetStream(s)));
- CUDA_CALL(cudaStreamSynchronize(mshadow::Stream<gpu>::GetStream(s)));
+ CUDA_CALL(cudaMemcpy(&is_diff, is_diff_ptr, sizeof(int32_t), cudaMemcpyDeviceToHost));
CHECK_EQ(is_diff, 0) << "SquareSumRspGradImpl only supports"
" equal ograd_row_idx and input_row_idx"
" when ograd and input are both"