Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2019/09/18 20:43:18 UTC

[GitHub] [incubator-mxnet] ptrendx commented on a change in pull request #16039: FullyConnected Bias performance improvement on GPU

ptrendx commented on a change in pull request #16039: FullyConnected Bias performance improvement on GPU
URL: https://github.com/apache/incubator-mxnet/pull/16039#discussion_r325883132
 
 

 ##########
 File path: src/operator/nn/fully_connected-inl.h
 ##########
 @@ -122,10 +171,147 @@ void FCForward(const OpContext &ctx, const FullyConnectedParam &param,
       << "Incomplete bias tensor detected: bias.data().shape[1] != weight.data().shape[0]."
          " This is not supported by FCForward. If bias is in row_sparse format, please"
          " make sure all row ids are present.";
-    out += repmat(bias, data.size(0));
+    AddBias(bias, data, out, s);
+  }
+}
+
+#if defined (__CUDACC__)
+
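+// Phase 1 of the bias-gradient reduction. `grad` is treated as an
+// other_dim x lead_dim row-major matrix; each block of 512 threads
+// (num_warps warps) accumulates partial column sums over its slice of rows
+// and writes them into row blockIdx.y of `temp_space`. Loads go through the
+// wider LType, so each thread handles values_per_read consecutive DType
+// elements.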
+template<typename LType, typename DType, typename AType>
+__global__ void AddBiasGradKernelPhase1(AType * temp_space, const DType* grad,
+                                        const size_t lead_dim, const size_t other_dim) {
+  constexpr int num_warps = 16;
+  constexpr int threads_per_warp = 32;
+  const int values_per_read = sizeof(LType) >= sizeof(DType) ? sizeof(LType) / sizeof(DType) : 1;
+  const size_t stride = lead_dim / values_per_read;
+  __shared__ AType scratch[threads_per_warp * num_warps * values_per_read];
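+  // Each thread stages its vectorized LType load in shared memory and reads
+  // it back element-wise as DType; acc[] keeps the running column sums in
+  // the (possibly wider) accumulation type AType.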
+  LType * my_scratch_load = &(reinterpret_cast<LType *>(scratch)[threadIdx.x]);
+  DType * my_values_load = reinterpret_cast<DType *>(my_scratch_load);
+  AType * my_values_acc = &(scratch[threadIdx.x * values_per_read]);
+  AType acc[values_per_read];  // NOLINT(*)
+#pragma unroll
+  for (int i = 0; i < values_per_read; ++i) {
+    acc[i] = 0;
+  }
+  const size_t offset = blockIdx.x * threads_per_warp;
+  const int my_warp = threadIdx.x / threads_per_warp;
+  const int my_id = threadIdx.x % threads_per_warp;
+  const LType* aligned_grad = reinterpret_cast<const LType*>(grad);
+  const int rows_per_block = (other_dim + gridDim.y - 1) / gridDim.y;
+  const size_t start_row = my_warp + rows_per_block * blockIdx.y;
+  const size_t end_row = min(other_dim, static_cast<size_t>(rows_per_block * (blockIdx.y + 1)));
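+  // Each warp walks the rows assigned to this blockIdx.y with a stride of
+  // num_warps, loading one LType (values_per_read DType values) per row and
+  // accumulating per-column sums in registers. Threads whose column offset
+  // lies past the end of a row skip the loads entirely.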
+  if (offset + my_id < stride) {
+    for (size_t i = start_row; i < end_row; i += num_warps) {
+      *my_scratch_load = aligned_grad[i * stride + offset + my_id];
+#pragma unroll
+      for (int j = 0; j < values_per_read; ++j) {
+        acc[j] += static_cast<AType>(my_values_load[j]);
+      }
+    }
+  }
+  __syncthreads();
+#pragma unroll
+  for (int i = 0; i < values_per_read; ++i) {
+    my_values_acc[i] = acc[i];
+  }
+
+  __syncthreads();
+
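+  // Tree-reduce the per-warp partial sums in shared memory (the loop starts
+  // at 8 because num_warps is 16); afterwards the slots of warp 0 hold the
+  // column sums for this block.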
+  for (int i = 8; i > 0; i /= 2) {
+    if (my_warp < i) {
+      const int shared_offset = values_per_read * i * threads_per_warp;
+#pragma unroll
+      for (int j = 0; j < values_per_read; ++j) {
+        my_values_acc[j] += my_values_acc[j + shared_offset];
+      }
+    }
+    __syncthreads();
+  }
+
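+  // The first threads of the block copy this block's partial sums (up to
+  // threads_per_warp * values_per_read values) into row blockIdx.y of
+  // temp_space, clipped to the number of remaining bias elements.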
+  if (threadIdx.x < min(threads_per_warp * values_per_read,
+                        static_cast<int>(lead_dim - values_per_read * offset))) {
+    const size_t offset_out = values_per_read * offset +
+                              blockIdx.y * lead_dim;
+    temp_space[offset_out + threadIdx.x] = scratch[threadIdx.x];
+  }
+}
+
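+// Phase 2: each thread sums one bias element across the n_blocks partial
+// rows produced by phase 1 and stores the result according to `req`.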
+template <typename DType, typename AType>
+__global__ void AddBiasGradKernelPhase2(const AType * temp_space, DType * out,
+                                        int lead_dim, int n_blocks, OpReqType req) {
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  if (tid < lead_dim) {
+    AType acc = 0;
+    for (int i = tid; i < lead_dim * n_blocks; i += lead_dim) {
+      acc += temp_space[i];
+    }
+    KERNEL_ASSIGN(out[tid], req, static_cast<DType>(acc));
   }
 }
 
+template<typename DType>
+void AddBiasGrad(const TBlob& in_grad,
+                 Tensor<gpu, 2, DType> grad,
+                 OpReqType req,
+                 int num_hidden,
+                 const OpContext& ctx) {
+  if (req == kNullOp) return;
+  using AType = typename mxnet_op::AccType<DType>::type;
+  mshadow::Stream<gpu> *s = ctx.get_stream<gpu>();
+  Tensor<gpu, 1, DType> gbias = in_grad.get<gpu, 1, DType>(s);
+  TBlob grad_blob = TBlob(grad);
+  TBlob gbias_blob = TBlob(gbias);
+  mxnet::TShape x(1, 0);
+  mxnet::TShape small;
+  if (shape_assign(&gbias_blob.shape_, Shape2(num_hidden, 1))) {
+    small = gbias_blob.shape_;
+  } else {
+    small = ReduceAxesShapeImpl(grad_blob.shape_, dmlc::optional<mxnet::TShape>(x), true, false);
+  }
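+  // N is the bias length (number of hidden units) and M the number of rows
+  // of `grad` being reduced. The vector load type LType is chosen from the
+  // row size in bytes (N * sizeof(DType)) so rows can be read with wider
+  // aligned loads.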
+  const int N = small.Size();
+  int ltype = mxnet::common::cuda::get_load_type(N * sizeof(DType));
+  const int M = grad_blob.shape_.Size() / N;
+  MXNET_LOAD_TYPE_SWITCH(ltype, LType, {
+    const unsigned int blocks_x = (N * sizeof(DType) + 32 * sizeof(LType) - 1) /
+                                  (32 * sizeof(LType));
+    const unsigned int preferred_number_of_blocks = 2 *
+                                                    MultiprocessorCount(ctx.run_ctx.ctx.dev_id);
+    const unsigned int blocks_y = std::max(preferred_number_of_blocks / blocks_x, 1u);
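+    // blocks_x covers the N bias elements in chunks of 32 LType loads;
+    // blocks_y splits the M rows so the grid has roughly two blocks per
+    // multiprocessor. The temp space holds one partial bias-gradient row of
+    // length N per blocks_y.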
+    const dim3 n_blocks = {blocks_x, blocks_y, 1};
+    auto scratch_space = ctx.requested[fullc::kTempSpace]
+                            .get_space_typed<gpu, 1, AType>(mshadow::Shape1(N * blocks_y), s);
+    auto stream = mshadow::Stream<gpu>::GetStream(s);
+    AddBiasGradKernelPhase1<LType><<<n_blocks, 512, 0, stream>>>(scratch_space.dptr_,
+                                                                 grad.dptr_, N, M);
+    AddBiasGradKernelPhase2<<<(N + 127) / 128, 128, 0, stream>>>(scratch_space.dptr_,
+                                                                 gbias.dptr_, N,
+                                                                 blocks_y, req);
+  });
+}
+#endif
+
 
 Review comment:
   ok

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services