Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2018/08/22 21:09:59 UTC

[GitHub] haojin2 commented on a change in pull request #12261: Support fp16 in synchronized batchnorm

URL: https://github.com/apache/incubator-mxnet/pull/12261#discussion_r212110460
 
 

 ##########
 File path: src/operator/contrib/sync_batch_norm-inl.h
 ##########
 @@ -271,69 +273,102 @@ class SyncBatchNorm : public Operator {
     }
 
     Stream<xpu> *s = ctx.get_stream<xpu>();
-    const real_t scale = static_cast<real_t>(in_data[syncbatchnorm::kData].shape_[1]) /
-      static_cast<real_t>(in_data[syncbatchnorm::kData].shape_.Size());
-    Tensor<xpu, 4> data;
-    Tensor<xpu, 4> out;
-    if (in_data[syncbatchnorm::kData].ndim() == 2) {
-      Shape<4> dshape = Shape4(in_data[syncbatchnorm::kData].shape_[0],
-                               in_data[syncbatchnorm::kData].shape_[1], 1, 1);
-      data = in_data[syncbatchnorm::kData].get_with_shape<xpu, 4, real_t>(dshape, s);
-      out = out_data[syncbatchnorm::kOut].get_with_shape<xpu, 4, real_t>(dshape, s);
-    } else {
-      data = in_data[syncbatchnorm::kData].get<xpu, 4, real_t>(s);
-      out = out_data[syncbatchnorm::kOut].get<xpu, 4, real_t>(s);
-    }
-    Tensor<xpu, 1> slope = in_data[syncbatchnorm::kGamma].get<xpu, 1, real_t>(s);
-    Tensor<xpu, 1> bias = in_data[syncbatchnorm::kBeta].get<xpu, 1, real_t>(s);
-    Tensor<xpu, 1> moving_mean = aux_states[syncbatchnorm::kMovingMean].get<xpu, 1, real_t>(s);
-    Tensor<xpu, 1> moving_var = aux_states[syncbatchnorm::kMovingVar].get<xpu, 1, real_t>(s);
-
-    if (param_.fix_gamma) slope = 1.f;
-
-    // whether use global statistics
-    if (ctx.is_train && !param_.use_global_stats) {
-      // get my rank
-      Barrier *global_barrier = global_shared_barrier_forward.Register(param_.key, param_.ndev);
-      int myRank = global_shared_rank_forward.Register(param_.key, param_.ndev);
-      // get the mean and var
-      Tensor<xpu, 1> mean = out_data[syncbatchnorm::kMean].get<xpu, 1, real_t>(s);
-      Tensor<xpu, 1> var = out_data[syncbatchnorm::kVar].get<xpu, 1, real_t>(s);
-      CHECK(req[syncbatchnorm::kMean] == kNullOp || req[syncbatchnorm::kMean] == kWriteTo);
-      CHECK(req[syncbatchnorm::kVar] == kNullOp || req[syncbatchnorm::kVar] == kWriteTo);
-      // E(x) and E(x^2)
-      mean = scale * sumall_except_dim<1>(data);
-      var = scale * sumall_except_dim<1>(F<mshadow_op::square>(data));
-      SharedND<mshadow::Tensor<cpu, 1, real_t>> *sharedMean =
-        global_shared_mean.Register(param_.key, param_.ndev);
-      SharedND<mshadow::Tensor<cpu, 1, real_t>> *sharedVar =
-        global_shared_var.Register(param_.key, param_.ndev);
-      // copy to cpu, push and pull
-      Tensor<cpu, 1, real_t>* mean_cpu_ptr = sharedMean->Retrieve(mean.shape_, myRank);
-      Tensor<cpu, 1, real_t>* var_cpu_ptr = sharedVar->Retrieve(mean.shape_, myRank);
-      mshadow::Copy(*mean_cpu_ptr, mean, s);
-      mshadow::Copy(*var_cpu_ptr, var, s);
-      sharedMean->SetReady(myRank);
-      sharedVar->SetReady(myRank);
-      global_barrier->Wait();
-      Tensor<cpu, 1, real_t> mean_cpu = sharedMean->Pop(myRank);
-      Tensor<cpu, 1, real_t> var_cpu = sharedVar->Pop(myRank);
-      // copy back to gpu
-      mshadow::Copy(mean, mean_cpu, s);
-      mshadow::Copy(var, var_cpu, s);
-
-      var = var-F<mshadow_op::square>(mean);
-      Assign(out, req[syncbatchnorm::kOut], broadcast<1>(slope, out.shape_) *
-             (data - broadcast<1>(mean, data.shape_)) /
-             F<mshadow_op::square_root>(broadcast<1>(var + param_.eps, data.shape_)) +
-             broadcast<1>(bias, out.shape_));
-    } else {
-      Assign(out, req[syncbatchnorm::kOut], broadcast<1>(slope /
-                                          F<mshadow_op::square_root>(moving_var + param_.eps),
-                                          data.shape_) * data +
-             broadcast<1>(bias - (slope * moving_mean) /
-                          F<mshadow_op::square_root>(moving_var + param_.eps), data.shape_));
-    }
+    MSHADOW_TYPE_SWITCH(in_data[syncbatchnorm::kData].type_flag_, DType, {
+      const bool is_double = std::is_same<DType, double>::value;
+      CHECK_EQ(is_double, false)
+        << "Synchronized BatchNorm does not support double-precision floating number yet...";
+      const real_t scale = static_cast<real_t>(in_data[syncbatchnorm::kData].shape_[1]) /
+        static_cast<real_t>(in_data[syncbatchnorm::kData].shape_.Size());
+      const size_t data_size = in_data[syncbatchnorm::kData].Size();
+      Tensor<xpu, 4> data;
+      Tensor<xpu, 4> out;
+      Tensor<xpu, 1> workspace;
+      if (!std::is_same<DType, real_t>::value) {
 
 Review comment:
   This is to make it easier to support float64 in the future.
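
   For clarity, a minimal standalone sketch of the pattern above (plain C++,
   no mshadow; half_t and ForwardImpl below are illustrative stand-ins, not
   MXNet code). The double check is a runtime CHECK rather than a
   static_assert because the type switch instantiates the template for every
   dtype anyway, so adding float64 later should only require deleting that
   guard; non-float inputs are first converted into a float workspace, which
   is what the new workspace tensor in the diff is for:

       #include <cstddef>
       #include <iostream>
       #include <type_traits>
       #include <vector>

       // Illustrative stand-in for mshadow::half::half_t; stores a float for brevity.
       struct half_t {
         float v;
         half_t(float x = 0.f) : v(x) {}
         operator float() const { return v; }
       };

       template <typename DType>
       void ForwardImpl(const std::vector<DType>& data) {
         // Runtime (not compile-time) rejection of double, mirroring the CHECK_EQ
         // in the diff: float64 support later only means removing this guard.
         const bool is_double = std::is_same<DType, double>::value;
         if (is_double) {
           std::cerr << "double-precision not supported yet\n";
           return;
         }
         // If DType is not float, copy-convert into a float workspace first,
         // which is the role of the workspace tensor introduced by the diff.
         std::vector<float> ws(data.size());
         for (std::size_t i = 0; i < data.size(); ++i)
           ws[i] = static_cast<float>(data[i]);
         // E(x) and E(x^2), then var = E(x^2) - E(x)^2, as in the forward pass.
         float sum = 0.f, sq = 0.f;
         for (float x : ws) { sum += x; sq += x * x; }
         const float mean = sum / ws.size();
         const float var = sq / ws.size() - mean * mean;
         std::cout << "mean=" << mean << " var=" << var << "\n";
       }

       int main() {
         ForwardImpl(std::vector<float>{1.f, 2.f, 3.f, 4.f});         // native float path
         ForwardImpl(std::vector<half_t>{half_t(1.f), half_t(2.f)});  // converted to float
         ForwardImpl(std::vector<double>{1.0, 2.0});                  // rejected at run time
         return 0;
       }

   This compiles with e.g. g++ -std=c++11; the double call prints the error
   message instead of statistics.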

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services