Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2018/08/22 21:05:32 UTC

[GitHub] apeforest commented on a change in pull request #12261: Support fp16 in synchronized batchnorm

apeforest commented on a change in pull request #12261: Support fp16 in synchronized batchnorm
URL: https://github.com/apache/incubator-mxnet/pull/12261#discussion_r212105542
 
 

 ##########
 File path: src/operator/contrib/sync_batch_norm-inl.h
 ##########
 @@ -271,69 +273,102 @@ class SyncBatchNorm : public Operator {
     }
 
     Stream<xpu> *s = ctx.get_stream<xpu>();
-    const real_t scale = static_cast<real_t>(in_data[syncbatchnorm::kData].shape_[1]) /
-      static_cast<real_t>(in_data[syncbatchnorm::kData].shape_.Size());
-    Tensor<xpu, 4> data;
-    Tensor<xpu, 4> out;
-    if (in_data[syncbatchnorm::kData].ndim() == 2) {
-      Shape<4> dshape = Shape4(in_data[syncbatchnorm::kData].shape_[0],
-                               in_data[syncbatchnorm::kData].shape_[1], 1, 1);
-      data = in_data[syncbatchnorm::kData].get_with_shape<xpu, 4, real_t>(dshape, s);
-      out = out_data[syncbatchnorm::kOut].get_with_shape<xpu, 4, real_t>(dshape, s);
-    } else {
-      data = in_data[syncbatchnorm::kData].get<xpu, 4, real_t>(s);
-      out = out_data[syncbatchnorm::kOut].get<xpu, 4, real_t>(s);
-    }
-    Tensor<xpu, 1> slope = in_data[syncbatchnorm::kGamma].get<xpu, 1, real_t>(s);
-    Tensor<xpu, 1> bias = in_data[syncbatchnorm::kBeta].get<xpu, 1, real_t>(s);
-    Tensor<xpu, 1> moving_mean = aux_states[syncbatchnorm::kMovingMean].get<xpu, 1, real_t>(s);
-    Tensor<xpu, 1> moving_var = aux_states[syncbatchnorm::kMovingVar].get<xpu, 1, real_t>(s);
-
-    if (param_.fix_gamma) slope = 1.f;
-
-    // whether use global statistics
-    if (ctx.is_train && !param_.use_global_stats) {
-      // get my rank
-      Barrier *global_barrier = global_shared_barrier_forward.Register(param_.key, param_.ndev);
-      int myRank = global_shared_rank_forward.Register(param_.key, param_.ndev);
-      // get the mean and var
-      Tensor<xpu, 1> mean = out_data[syncbatchnorm::kMean].get<xpu, 1, real_t>(s);
-      Tensor<xpu, 1> var = out_data[syncbatchnorm::kVar].get<xpu, 1, real_t>(s);
-      CHECK(req[syncbatchnorm::kMean] == kNullOp || req[syncbatchnorm::kMean] == kWriteTo);
-      CHECK(req[syncbatchnorm::kVar] == kNullOp || req[syncbatchnorm::kVar] == kWriteTo);
-      // E(x) and E(x^2)
-      mean = scale * sumall_except_dim<1>(data);
-      var = scale * sumall_except_dim<1>(F<mshadow_op::square>(data));
-      SharedND<mshadow::Tensor<cpu, 1, real_t>> *sharedMean =
-        global_shared_mean.Register(param_.key, param_.ndev);
-      SharedND<mshadow::Tensor<cpu, 1, real_t>> *sharedVar =
-        global_shared_var.Register(param_.key, param_.ndev);
-      // copy to cpu, push and pull
-      Tensor<cpu, 1, real_t>* mean_cpu_ptr = sharedMean->Retrieve(mean.shape_, myRank);
-      Tensor<cpu, 1, real_t>* var_cpu_ptr = sharedVar->Retrieve(mean.shape_, myRank);
-      mshadow::Copy(*mean_cpu_ptr, mean, s);
-      mshadow::Copy(*var_cpu_ptr, var, s);
-      sharedMean->SetReady(myRank);
-      sharedVar->SetReady(myRank);
-      global_barrier->Wait();
-      Tensor<cpu, 1, real_t> mean_cpu = sharedMean->Pop(myRank);
-      Tensor<cpu, 1, real_t> var_cpu = sharedVar->Pop(myRank);
-      // copy back to gpu
-      mshadow::Copy(mean, mean_cpu, s);
-      mshadow::Copy(var, var_cpu, s);
-
-      var = var-F<mshadow_op::square>(mean);
-      Assign(out, req[syncbatchnorm::kOut], broadcast<1>(slope, out.shape_) *
-             (data - broadcast<1>(mean, data.shape_)) /
-             F<mshadow_op::square_root>(broadcast<1>(var + param_.eps, data.shape_)) +
-             broadcast<1>(bias, out.shape_));
-    } else {
-      Assign(out, req[syncbatchnorm::kOut], broadcast<1>(slope /
-                                          F<mshadow_op::square_root>(moving_var + param_.eps),
-                                          data.shape_) * data +
-             broadcast<1>(bias - (slope * moving_mean) /
-                          F<mshadow_op::square_root>(moving_var + param_.eps), data.shape_));
-    }
+    MSHADOW_TYPE_SWITCH(in_data[syncbatchnorm::kData].type_flag_, DType, {
+      const bool is_double = std::is_same<DType, double>::value;
+      CHECK_EQ(is_double, false)
+        << "Synchronized BatchNorm does not support double-precision floating number yet...";
+      const real_t scale = static_cast<real_t>(in_data[syncbatchnorm::kData].shape_[1]) /
+        static_cast<real_t>(in_data[syncbatchnorm::kData].shape_.Size());
+      const size_t data_size = in_data[syncbatchnorm::kData].Size();
+      Tensor<xpu, 4> data;
+      Tensor<xpu, 4> out;
+      Tensor<xpu, 1> workspace;
+      if (!std::is_same<DType, real_t>::value) {
 
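For readers following the hunk above: each device computes per-channel E[x] and E[x^2], the partial results are pushed to shared CPU buffers, averaged after the barrier, and the variance is then recovered as Var[x] = E[x^2] - E[x]^2 (the `var = var - square(mean)` line). Below is a minimal, self-contained sketch of that identity only, not MXNet code; the two inner vectors and all names are illustrative.

```
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  // Two "devices", each holding an equal-sized slice of one channel's data.
  std::vector<std::vector<float>> per_device = {{1.f, 2.f, 3.f}, {4.f, 5.f, 6.f}};

  double sum = 0.0, sum_sq = 0.0;
  std::size_t count = 0;
  for (const auto& dev : per_device) {   // stands in for the CPU push/pull + barrier
    for (float x : dev) {
      sum    += x;
      sum_sq += static_cast<double>(x) * x;
      ++count;
    }
  }
  const double mean = sum / count;                   // global E[x]
  const double var  = sum_sq / count - mean * mean;  // Var[x] = E[x^2] - E[x]^2

  std::cout << "mean=" << mean << " var=" << var << '\n';  // mean=3.5 var~2.917
}
```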
 Review comment:
   I think it's more accurate and clearer to the reader to write 
   ```
   if (std::is_same<DType, half_t>::value)
   ```
   here
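
For comparison, here is a small self-contained sketch (not the MXNet source; `HalfT`, `real_t`, and `Dispatch` are stand-ins) of the two compile-time checks being discussed. Once double has been rejected, `std::is_same<DType, half_t>` names the remaining case explicitly, while `!std::is_same<DType, real_t>` expresses the same condition indirectly.

```
#include <iostream>
#include <type_traits>

struct HalfT { unsigned short bits; };  // stand-in for mshadow::half::half_t
using real_t = float;                   // mshadow's real_t is single precision

template <typename DType>
void Dispatch() {
  static_assert(!std::is_same<DType, double>::value,
                "Synchronized BatchNorm does not support double precision");
  // Check as written in the hunk: take the conversion path whenever DType is
  // not the accumulation type (float).
  if (!std::is_same<DType, real_t>::value) {
    std::cout << "conversion path (DType != real_t)\n";
  }
  // Reviewer's suggestion: with double already excluded, the only remaining
  // non-float type is half, so naming it makes the intent explicit.
  if (std::is_same<DType, HalfT>::value) {
    std::cout << "conversion path (DType == half_t)\n";
  }
}

int main() {
  Dispatch<float>();  // takes neither branch
  Dispatch<HalfT>();  // takes both branches; equivalent once double is ruled out
}
```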

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services