Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2018/06/22 00:24:02 UTC

[GitHub] szha closed pull request #11333: Fix issue of test_gru_bidirectional #11219 and add robust code

URL: https://github.com/apache/incubator-mxnet/pull/11333
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

diff --git a/src/operator/rnn_impl.h b/src/operator/rnn_impl.h
index e92a18218f9..fa8d671a200 100644
--- a/src/operator/rnn_impl.h
+++ b/src/operator/rnn_impl.h
@@ -142,6 +142,7 @@ void LstmForwardTraining(DType* ws,
   const int y_offset = T * N * H * 5;
   const int cell_size = N * H;
   int idx = 0;  // state & cell state's idx;
+  const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
   for (int i = 0; i < L; ++i) {
     const int input_size = i ? H * D : I;
     const int w_size = (input_size + H) * H * 4;
@@ -172,7 +173,10 @@ void LstmForwardTraining(DType* ws,
       }
     }
   }
-  memcpy(y_ptr, rs + y_offset, T * N * H * D * sizeof(DType));
+  #pragma omp parallel for num_threads(omp_threads)
+  for (int i = 0; i < T * N * H * D; ++i) {
+    y_ptr[i] = (rs + y_offset)[i];
+  }
 }
 
 template<typename DType>
@@ -334,9 +338,18 @@ void LstmBackwardSingleLayer(DType* ws,
   DType *c_ptr = bid ? rs + T * N * H * 7 : rs;
   const Tensor<cpu, 3, DType> c(c_ptr, Shape3(T, N, H));
   const Tensor<cpu, 4, DType> ifgo(c_ptr + T * N * H, Shape4(T, N, H, 4));
-  memset(dwh.dptr_, 0, H * H * 4 * sizeof(DType));
-  memset(dbx.dptr_, 0, H * 4 * sizeof(DType));
-  memset(dbh.dptr_, 0, H * 4 * sizeof(DType));
+  const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
+  if (req_params != kNullOp && req_params != kAddTo) {
+    #pragma omp parallel for num_threads(omp_threads)
+    for (int i = 0; i < H * 4 * H; ++i) {
+      dwh.dptr_[i] = 0;
+    }
+    #pragma omp parallel for num_threads(omp_threads)
+    for (int i = 0; i < 4 * H; ++i) {
+      dbx.dptr_[i] = 0;
+      dbh.dptr_[i] = 0;
+    }
+  }
   Tensor<cpu, 4, DType> difgo(ws, Shape4(T, N, 4, H));
   Tensor<cpu, 2, DType> dh(ws + T * N * H * 4, Shape2(N, H));
   Tensor<cpu, 2, DType> dc(dh.dptr_ + N * H, Shape2(N, H));
@@ -348,13 +361,28 @@ void LstmBackwardSingleLayer(DType* ws,
   const DType beta2 = 2.0;
   const int cell_size = N * H;
   if (dhy_ptr != NULL) {
-    memcpy(dh.dptr_, dhy_ptr, cell_size * sizeof(DType));
+    #pragma omp parallel for num_threads(omp_threads)
+    for (int i = 0; i < cell_size; ++i) {
+      dh.dptr_[i] = dhy_ptr[i];
+    }
+  } else {
+    #pragma omp parallel for num_threads(omp_threads)
+    for (int i = 0; i < cell_size; ++i) {
+      dh.dptr_[i] = 0;
+    }
   }
   if (dcy_ptr != NULL) {
-    memcpy(dc.dptr_, dcy_ptr, cell_size * sizeof(DType));
+    #pragma omp parallel for num_threads(omp_threads)
+    for (int i = 0; i < cell_size; ++i) {
+      dc.dptr_[i] = dcy_ptr[i];
+    }
+  } else {
+    #pragma omp parallel for num_threads(omp_threads)
+    for (int i = 0; i < cell_size; ++i) {
+      dc.dptr_[i] = 0;
+    }
   }
 
-  const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
   for (int i = T - 1; i >= 0; --i) {
     int t = bid ? T - 1 - i : i;
     int tnext = bid ? t + 1 : t - 1;
@@ -422,8 +450,11 @@ void LstmBackwardSingleLayer(DType* ws,
     } else {
       const Tensor<cpu, 2, DType> tmp_dbx(tmp_buf, Shape2(col, T));
       const Tensor<cpu, 2, DType> tmp_dbh(tmp_buf + col * T, Shape2(col, T));
-      memset(tmp_dbx.dptr_, 0, col * T * sizeof(DType));
-      memset(tmp_dbh.dptr_, 0, col * T * sizeof(DType));
+      #pragma omp parallel for num_threads(omp_threads)
+      for (int i = 0; i < col * T; ++i) {
+        tmp_dbx.dptr_[i] = 0;
+        tmp_dbh.dptr_[i] = 0;
+      }
       for (int t = T - 1; t >= 0; --t) {
         #pragma omp parallel for num_threads(omp_threads)
         for (int j = 0; j < col; ++j) {
@@ -618,11 +649,11 @@ void GruForwardInferenceSingleLayer(DType* ws,
     //  perform the second direction
     if (D == 2) {
       gemmC1_t = back_gemmC1 + (T - 1 - t) * N * 3 * H;
-      Tensor<cpu, 2, DType> dback_ht_1(back_ht_1, Shape2(N, D * H));
+      Tensor<cpu, 2, DType> dback_ht_1(back_ht_1 - H, Shape2(N, D * H));
       Tensor<cpu, 3, DType> dback_ht_1_tmp = Tensor<cpu, 3, DType>
           (reinterpret_cast<DType*>(tmp_buf), Shape3(D, H, N));
       dback_ht_1_tmp = reshape(dback_ht_1.T(), Shape3(D, H, N));
-      linalg_gemm(dback_ht_1_tmp[0], back_wh, dgemmC2, alpha, beta, true, true);
+      linalg_gemm(dback_ht_1_tmp[1], back_wh, dgemmC2, alpha, beta, true, true);
 
       #pragma omp parallel for num_threads(omp_threads)
       for (int i = 0; i < N; ++i) {
@@ -839,11 +870,11 @@ void GruForwardTrainingSingleLayer(DType* ws,
       zt = back_gateZ + (T - 1 - t) * N * H;
       nt = back_gateN + (T - 1 - t) * N * H;
       gemmC1_t = back_gemmC1 + (T - 1 - t) * N * 3 * H;
-      Tensor<cpu, 2, DType> dback_ht_1(back_ht_1, Shape2(N, D * H));
+      Tensor<cpu, 2, DType> dback_ht_1(back_ht_1 - H, Shape2(N, D * H));
       Tensor<cpu, 3, DType> dback_ht_1_tmp = Tensor<cpu, 3, DType>
           (reinterpret_cast<DType*>(tmp_buf), Shape3(D, H, N));
       dback_ht_1_tmp = reshape(dback_ht_1.T(), Shape3(D, H, N));
-      linalg_gemm(dback_ht_1_tmp[0], back_wh, dgemmC2, alpha, beta, true, true);
+      linalg_gemm(dback_ht_1_tmp[1], back_wh, dgemmC2, alpha, beta, true, true);
 
       DType* back_Mnht = back_Mnh + (T - 1 - t) * N * H;
       #pragma omp parallel for num_threads(omp_threads)
@@ -949,7 +980,11 @@ void GruForwardTraining(DType* ws,
     }
     wh_l = wx_l + I * 3 * H;
   }
-  memcpy(y_ptr, y_l, T * N * H * D * sizeof(DType));
+  const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
+  #pragma omp parallel for num_threads(omp_threads)
+  for (int i = 0; i < T * N * H * D; ++i) {
+    y_ptr[i] = y_l[i];
+  }
 }
 
 template <typename DType>
@@ -1012,6 +1047,17 @@ void GruBackwardSingleLayer(DType* ws,
   const Tensor<cpu, 2, DType> back_wx(back_wx_ptr, Shape2(H * 3, I));
   const Tensor<cpu, 2, DType> back_wh(back_wh_ptr, Shape2(H * 3, H));
   const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
+  if (req_params != kNullOp && req_params != kAddTo) {
+    #pragma omp parallel for num_threads(omp_threads)
+    for (int i = 0; i < D * H * 3 * H; ++i) {
+      dwh[i] = 0;
+    }
+    #pragma omp parallel for num_threads(omp_threads)
+    for (int i = 0; i < D * 3 * H; ++i) {
+      dbx[i] = 0;
+      dbh[i] = 0;
+    }
+  }
   #pragma omp parallel for num_threads(omp_threads)
   for (int i = 0; i < N * H; ++i) {
     if (dhy_ptr) {
@@ -1121,8 +1167,11 @@ void GruBackwardSingleLayer(DType* ws,
     } else {
       const Tensor<cpu, 2, DType> tmp_dbx(tmp_buf + T * N * D * H, Shape2(H * 3, T));
       const Tensor<cpu, 2, DType> tmp_dbh(tmp_buf + T * N * D * H + 3 * H * T, Shape2(H * 3, T));
-      memset(tmp_dbx.dptr_, 0, H * T * 3 * sizeof(DType));
-      memset(tmp_dbh.dptr_, 0, H * T * 3 * sizeof(DType));
+      #pragma omp parallel for num_threads(omp_threads)
+      for (int i = 0; i < H * T * 3; ++i) {
+        tmp_dbx.dptr_[i] = 0;
+        tmp_dbh.dptr_[i] = 0;
+      }
 
       for (int t = T - 1; t >= 0; --t) {
         #pragma omp parallel for num_threads(omp_threads)
@@ -1236,9 +1285,11 @@ void GruBackwardSingleLayer(DType* ws,
       } else {
         const Tensor<cpu, 2, DType> tmp_dbx(tmp_buf + T * N * D * H, Shape2(H * 3, T));
         const Tensor<cpu, 2, DType> tmp_dbh(tmp_buf + T * N * D * H + 3 * H * T, Shape2(H * 3, T));
-        memset(tmp_dbx.dptr_, 0, H * T * 3 * sizeof(DType));
-        memset(tmp_dbh.dptr_, 0, H * T * 3 * sizeof(DType));
-
+        #pragma omp parallel for num_threads(omp_threads)
+        for (int i = 0; i < H * T * 3; ++i) {
+          tmp_dbx.dptr_[i] = 0;
+          tmp_dbh.dptr_[i] = 0;
+        }
         for (int t = T - 1; t >= 0; --t) {
           #pragma omp parallel for num_threads(omp_threads)
           for (int i = 0; i < 3 * H; ++i) {
@@ -1272,7 +1323,10 @@ void GruBackwardSingleLayer(DType* ws,
     }
   }
   if (req_state != kNullOp) {
-    memcpy(dhx, dht1, N * H * D * sizeof(DType));
+    #pragma omp parallel for num_threads(omp_threads)
+    for (int i = 0; i < N * H * D; ++i) {
+      dhx[i] = dht1[i];
+    }
   }
 }
 
@@ -1335,6 +1389,7 @@ void GruBackward(DType* ws,
   Tensor<cpu, 3, DType> hx(hx_ptr, Shape3(L, D * N, H));
   int inputsize = I;
   DType* y_tmp = y_l - T * N * H * D;
+  const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
   for (int l = L - 1; l >= 0; --l) {
     if (l == 0) {
       I = inputsize;
@@ -1349,7 +1404,10 @@ void GruBackward(DType* ws,
                                   dhy_l, gateR_l, gateZ_l, gateN_l, Mnh_l, dx_l, dhx_l,
                                   dwx_l, dwh_l, dbx_l, dbh_l, req_data, req_params, req_state);
     if (l > 0) {
-      memcpy(dy_l, dx_l, T * N * H * D * sizeof(DType));
+      #pragma omp parallel for num_threads(omp_threads)
+      for (int i = 0; i < T * N * H * D; ++i) {
+        dy_l[i] = dx_l[i];
+      }
       gateR_l = gateR_l - T * D * N * H;
       gateZ_l = gateZ_l - T * D * N * H;
       gateN_l = gateN_l - T * D * N * H;
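
A note on the substance of the C++ hunks above: the actual correctness fix for #11219 is the pair of one-line changes in GruForwardInferenceSingleLayer and GruForwardTrainingSingleLayer, where the tensor origin moves from back_ht_1 to back_ht_1 - H and the gemm input changes from dback_ht_1_tmp[0] to dback_ht_1_tmp[1]. The standalone C++ sketch below is not MXNet code; it assumes the layer output stores each sample's D * H values as [forward H | backward H] (an assumption inferred from the diff, not stated in it) and shows why a view starting at the backward half mixes adjacent rows, while rewinding the origin by H and taking the second H-sized slice recovers the backward hidden state.

// Standalone illustration (not MXNet code) of the bidirectional layout fix.
// Assumption for this sketch: each of the N rows holds D*H values laid out
// as [forward H | backward H].
#include <cstdio>
#include <vector>

int main() {
  const int N = 2, H = 3, D = 2;
  std::vector<float> y(N * D * H);
  for (int n = 0; n < N; ++n) {
    for (int h = 0; h < H; ++h) {
      y[n * D * H + h] = 1.0f;       // forward slice of row n
      y[n * D * H + H + h] = 2.0f;   // backward slice of row n
    }
  }
  // back_ht_1 points at the backward half of row 0, i.e. offset +H.
  float* back_ht_1 = y.data() + H;
  // Viewing (back_ht_1, Shape2(N, D*H)) straddles adjacent rows; rewinding
  // by H restores row alignment, and the backward values are then the
  // second H-sized chunk of each row (slice [1] after the reshape).
  float* row_start = back_ht_1 - H;
  for (int n = 0; n < N; ++n) {
    std::printf("row %d backward[0] = %.1f\n", n, row_start[n * D * H + H]);  // prints 2.0
  }
  return 0;
}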
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index 0c68ae233b6..165d419987b 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -115,8 +115,6 @@ def test_gru_sym():
     check_rnn_consistency(fused, stack, T, N, I, H, 'add')
     check_rnn_consistency(fused, stack, T, N, I, H, 'null')
 
-
-@unittest.skip("test fails intermittently. temporarily disabled till it gets fixed. tracked at https://github.com/apache/incubator-mxnet/issues/11219")
 @with_seed()
 def test_gru_bidirectional():
     T, N, I, H = 5, 20, 800, 800
@@ -134,12 +132,11 @@ def test_gru_bidirectional():
                 mx.rnn.GRUCell(H, prefix='l1_'),
                 mx.rnn.GRUCell(H, prefix='r1_'),
                 output_prefix='bi_gru_1_'))
-    
+
     check_rnn_consistency(fused, stack, T, N, I, H, 'write')
     check_rnn_consistency(fused, stack, T, N, I, H, 'add')
     check_rnn_consistency(fused, stack, T, N, I, H, 'null')
 
-
 # Currently, fused LSTM operator doesn't support dropout.
 # Will change this test after dropout is supported
 @with_seed()

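Beyond that indexing fix, most of the remaining C++ hunks follow a single pattern: memcpy/memset calls on large DType buffers are replaced by loops parallelized with #pragma omp parallel for, and the gradient buffers dwh/dbx/dbh are zeroed only when req_params is neither kNullOp nor kAddTo, so the 'add' and 'null' gradient requests exercised by the re-enabled test do not clobber accumulated values. A minimal standalone sketch of that pattern follows; OpReqTypeSketch, copy_parallel, and zero_grad_if_needed are illustrative names, not MXNet's, and the thread count comes from omp_get_max_threads() rather than MXNet's engine helper.

// Standalone sketch (not MXNet code); build with -fopenmp.
#include <omp.h>
#include <vector>

enum OpReqTypeSketch { kNullOp, kWriteTo, kAddTo };

// Parallel copy in place of memcpy on a large buffer.
void copy_parallel(float* dst, const float* src, int n, int omp_threads) {
  #pragma omp parallel for num_threads(omp_threads)
  for (int i = 0; i < n; ++i) {
    dst[i] = src[i];
  }
}

// Zero a gradient buffer only when the request actually overwrites it.
void zero_grad_if_needed(float* grad, int n, OpReqTypeSketch req, int omp_threads) {
  if (req != kNullOp && req != kAddTo) {
    #pragma omp parallel for num_threads(omp_threads)
    for (int i = 0; i < n; ++i) {
      grad[i] = 0.0f;
    }
  }
}

int main() {
  const int n = 1 << 20;
  std::vector<float> src(n, 1.0f), dst(n, 0.0f), grad(n, 5.0f);
  const int omp_threads = omp_get_max_threads();
  copy_parallel(dst.data(), src.data(), n, omp_threads);
  zero_grad_if_needed(grad.data(), n, kWriteTo, omp_threads);  // buffer is zeroed
  zero_grad_if_needed(grad.data(), n, kAddTo, omp_threads);    // buffer left untouched
  return 0;
}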

 
