Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2017/11/14 22:29:07 UTC

[GitHub] eric-haibin-lin commented on a change in pull request #8647: sparse embedding operator, gpu implementation

URL: https://github.com/apache/incubator-mxnet/pull/8647#discussion_r150982447
 
 

 ##########
 File path: src/operator/tensor/indexing_op.cu
 ##########
 @@ -24,14 +24,170 @@
 */
 
 #include "./indexing_op.h"
+#include "./util/tensor_util-inl.cuh"
+
 namespace mxnet {
 namespace op {
+
+/*! \brief Sets *out to 1 if any element of data lies outside [min, max]. */
+struct is_valid_check {
+  template<typename DType>
+  MSHADOW_XINLINE static void Map(int i, int32_t* out, const DType* data,
+                                  const DType min, const DType max) {
+    if (data[i] < min || data[i] > max) *out = 1;
+  }
+};
+
+
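+/*! \brief Kernel mapping one thread per gradient element: accumulates ograd
+ * into the compacted rows of the row_sparse weight gradient.
+ */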
+struct AddTakeGradRspGPUKernel {
+  template<typename DType, typename IType>
+  __device__ __forceinline__ static void Map(int tid,
+                                             DType* out,
+                                             const nnvm::dim_t* prefix_sum,
+                                             const IType* data,
+                                             const DType* ograd,
+                                             const nnvm::dim_t row_length) {
+    using nnvm::dim_t;
+    const dim_t data_i = tid / row_length;
+    const dim_t grad_i = tid % row_length;
+    const dim_t irow = static_cast<dim_t>(data[data_i]);
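+    // prefix_sum is an inclusive scan of the row-occupancy flags, so
+    // prefix_sum[irow] - 1 is irow's position among the compacted rows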
+    const dim_t rsp_row = prefix_sum[irow] - 1;
+    const DType val = ograd[data_i * row_length + grad_i];
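+    // multiple input indices may hit the same compacted row, so accumulate atomically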
+    atomicAdd(static_cast<DType*>(&(out[rsp_row * row_length + grad_i])), val);
+  }
+};
+
+template<>
+void SparseEmbeddingOpForwardRspImpl<gpu>(mshadow::Stream<gpu>* s,
+                                          const TBlob& data,
+                                          const NDArray& weight,
+                                          const OpReqType req,
+                                          const TBlob& output) {
+  if (req == kNullOp) return;
+  using namespace rowsparse;
+  using namespace mxnet_op;
+  // weight has no non-zero rows: write zeros to the output
+  if (req == kWriteTo && !weight.storage_initialized()) {
+    size_t out_size = output.shape_.Size();
+    MSHADOW_TYPE_SWITCH(output.type_flag_, DType, {
+      Kernel<set_zero, gpu>::Launch(s, out_size, output.dptr<DType>());
+    })
+    return;
+  }
+  // check out-of-bound indices
+  int32_t is_valid = 0;
+  MSHADOW_TYPE_SWITCH(data.type_flag_, DType, {
+    DType min = 0;
+    DType max = static_cast<DType>(weight.shape()[0] - 1);
+    // flag any out-of-bound index on the device and copy the flag back to host
+    DType* data_ptr = data.dptr<DType>();
+    size_t data_size = data.shape_.Size();
+    int32_t* is_valid_ptr = NULL;
+    CUDA_CALL(cudaMalloc(&is_valid_ptr, sizeof(int32_t)));
+    Kernel<set_zero, gpu>::Launch(s, 1, is_valid_ptr);
+    Kernel<is_valid_check, gpu>::Launch(s, data_size, is_valid_ptr, data_ptr, min, max);
+    CUDA_CALL(cudaMemcpy(&is_valid, is_valid_ptr, sizeof(int32_t),
+              cudaMemcpyDeviceToHost));
+    // free the device-side flag so each forward call doesn't leak an allocation
+    CUDA_CALL(cudaFree(is_valid_ptr));
+  })
+  CHECK_EQ(is_valid, 0) << "SparseEmbedding input contains data out of bounds";
+  // the weight is actually dense
+  if (weight.aux_shape(kIdx)[0] == weight.shape()[0]) {
+    EmbeddingOpForwardDnsImpl<gpu>(s, data, weight.data(), req, output);
+  } else {
+    EmbeddingOpForwardRspImpl<gpu>(s, data, weight, req, output);
+  }
+}
+
+
+template<>
+inline void SparseEmbeddingOpBackwardRspImpl<gpu>(const OpContext& ctx,
+                                                  const TBlob& ograd,
+                                                  const TBlob& data,
+                                                  const OpReqType req,
+                                                  const NDArray& output) {
+  using namespace mshadow;
+  using namespace mxnet_op;
+  using namespace mshadow::expr;
+  using namespace rowsparse;
+  using nnvm::dim_t;
+  if (req == kNullOp) return;
+  CHECK_EQ(req, kWriteTo) << "SparseEmbedding layer doesn't support "
+                          << "weight gradient calculation with req != write";
+
+  // Request temporary storage for marking non-zero rows and prefix sum
+  Stream<gpu> *s = ctx.get_stream<gpu>();
+  dim_t num_rows = output.shape()[0];
+  dim_t row_length = output.shape()[1];
+  dim_t data_size = static_cast<dim_t>(data.shape_.Size());
+  dim_t num_threads;
+
+  MSHADOW_TYPE_SWITCH(data.type_flag_, IType, {
+    MSHADOW_SGL_DBL_TYPE_SWITCH(ograd.type_flag_, DType, {
+      MSHADOW_IDX_TYPE_SWITCH(output.aux_type(kIdx), RType, {
+        dim_t* prefix_sum = NULL;
+        void* d_temp_storage = NULL;
+        size_t temp_storage_bytes = 0;
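+        // per the cub::DeviceScan contract, a call with d_temp_storage == NULL
+        // does no work and only writes the required size to temp_storage_bytes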
+        cub::DeviceScan::InclusiveSum(d_temp_storage,
+                                      temp_storage_bytes,
+                                      prefix_sum,
+                                      prefix_sum,
+                                      num_rows,
+                                      mshadow::Stream<gpu>::GetStream(s));
+        mshadow::Tensor<gpu, 1, char> workspace = ctx.requested[0]
+            .get_space_typed<gpu, 1, char>(Shape1(num_rows * sizeof(dim_t) +
+                                           temp_storage_bytes), s);
+        prefix_sum = reinterpret_cast<dim_t*>(workspace.dptr_);
+        d_temp_storage = workspace.dptr_ + num_rows*sizeof(dim_t);
+        num_threads = num_rows;
+        Kernel<set_zero, gpu>::Launch(s, num_threads, prefix_sum);
 
 Review comment:
   Also use Fill instead?
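  
  For context, a minimal sketch of what the suggested change might look like.
  It assumes the Fill helper from src/operator/mxnet_op.h (call sites elsewhere
  in the codebase use the form Fill<false>(s, blob, req, val)) and that wrapping
  the raw prefix_sum pointer in a TBlob is acceptable at this point; it would
  drop in where the set_zero launch sits today:
  
      // hedged sketch, not the PR's code: zero-fill prefix_sum through the
      // Fill helper instead of a hand-rolled set_zero launch; the signature
      // of Fill is assumed from other call sites
      Fill<false>(s, TBlob(prefix_sum, Shape1(num_rows), gpu::kDevMask),
                  kWriteTo, 0);
  
  If Fill special-cases a zero fill (e.g. via cudaMemset), this also saves a
  kernel launch.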
