You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2019/01/12 06:16:00 UTC

[GitHub] eric-haibin-lin closed pull request #13852: Fix Tree Reduction on new instance type p3dn.24xlarge

eric-haibin-lin closed pull request #13852: Fix Tree Reduction on new instance type p3dn.24xlarge
URL: https://github.com/apache/incubator-mxnet/pull/13852
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/src/kvstore/comm_tree.h b/src/kvstore/comm_tree.h
index b62228cd288..11d99c02191 100644
--- a/src/kvstore/comm_tree.h
+++ b/src/kvstore/comm_tree.h
@@ -77,9 +77,6 @@ class CommDeviceTree : public CommDevice {
       //  BroadcastRowSparse
       InitMergeBuffer(devs_);
       InitMergeBufferTree();
-      if (dmlc::GetEnv("MXNET_ENABLE_GPU_P2P", 1)) {
-        EnableP2P();
-      }
     }
   }
 
@@ -328,7 +325,7 @@ class CommDeviceTree : public CommDevice {
   }
 
  private:
-  void EnableP2P() {
+  void EnableP2P(std::vector<int>* p2p) {
 #if MXNET_USE_CUDA
     std::vector<int> gpus;
     for (const auto& d : devs_) {
@@ -338,7 +335,8 @@ class CommDeviceTree : public CommDevice {
     }
     int n = static_cast<int>(gpus.size());
     int enabled = 0;
-    std::vector<int> p2p(n*n);
+    p2p->clear();
+    p2p->resize(n*n, 0);
     for (int i = 0; i < n; ++i) {
       mxnet::common::cuda::DeviceStore device_store(gpus[i]);
       for (int j = 0; j < n; j++) {
@@ -348,7 +346,7 @@ class CommDeviceTree : public CommDevice {
           cudaError_t e = cudaDeviceEnablePeerAccess(gpus[j], 0);
           if (e == cudaSuccess || e == cudaErrorPeerAccessAlreadyEnabled) {
             ++enabled;
-            p2p[i*n+j] = 1;
+            (*p2p)[i*n+j] = 1;
           }
         }
       }
@@ -362,7 +360,7 @@ class CommDeviceTree : public CommDevice {
       std::string access(n, '.');
       for (int i = 0; i < n; ++i) {
         for (int j = 0; j < n; ++j) {
-          access[j] = p2p[i*n+j] ? 'v' : '.';
+          access[j] = (*p2p)[i*n+j] ? 'v' : '.';
         }
         LOG(WARNING) << access;
       }
@@ -373,7 +371,9 @@ class CommDeviceTree : public CommDevice {
   void QueryTopology() {
 #if MXNET_USE_CUDA
     std::vector<float> link_matrix(devs_.size()*devs_.size());
-    GetP2PWeight(devs_, &link_matrix);
+    std::vector<int> p2p_matrix(devs_.size()*devs_.size());
+    EnableP2P(&p2p_matrix);
+    GetP2PWeight(devs_, p2p_matrix, &link_matrix);
     if (backtrack_)
       LOG(INFO) << "Using Backtracking to generate trees";
     else
diff --git a/src/kvstore/gpu_topology.h b/src/kvstore/gpu_topology.h
index 92bcf1a5d26..777fb47f994 100644
--- a/src/kvstore/gpu_topology.h
+++ b/src/kvstore/gpu_topology.h
@@ -123,6 +123,9 @@ inline bool IsConnected(const std::vector<T>& matrix, int num_gpus) {
 /**
  * \brief Generate adjacency matrix with row/col numbering from 0, 1, ..., n_gpu
  * \param devs is a vector of GPU contexts
+ * \param p2p_matrix is adjacency matrix of P2P connections where
+ *          0: no P2P connection
+ *          1: P2P connection
  * \param matrix is adjacency matrix of link topology graph
  *        where edge weight represents relative performance of NVIDIA GPUs
  *          0: Self-connection
@@ -131,7 +134,9 @@ inline bool IsConnected(const std::vector<T>& matrix, int num_gpus) {
  *          3: 2 NVLink connections
  */
 template <typename T>
-inline void GetP2PWeight(const std::vector<Context>& devs, std::vector<T>* matrix) {
+inline void GetP2PWeight(const std::vector<Context>& devs,
+                         const std::vector<int>& p2p_matrix,
+                         std::vector<T>* matrix) {
   int num_gpus = devs.size();
   int count    = 0;
   std::vector<int> zero_dev_id(num_gpus, -1);
@@ -161,11 +166,61 @@ inline void GetP2PWeight(const std::vector<Context>& devs, std::vector<T>* matri
     }
   }
 
-  // Check that all GPUs have at least 1 NVLink connection
-  int max_value = 0;
-  for (unsigned int i = 0; i < max.size(); ++i) {
-    if (max[i] > max_value)
-      max_value = max[i];
+  // Check that all P2P connections are detected by GetP2PAttribute
+  // If yes, then continue as before
+  // If not, then fall back to using p2p_matrix (from EnableP2P)
+  //
+  // We have observed that with CUDA 9.0 p3.16xlarge:
+  //
+  //   0 2 2 3 3 1 1 1    . v v v v . . .
+  //   2 0 3 2 1 3 1 1    v . v v . v . .
+  //   2 3 0 3 1 1 2 1    v v . v . . v .
+  //   3 2 3 0 1 1 1 2    v v v . . . . v
+  //   3 1 1 1 0 2 2 3    v . . . . v v v
+  //   1 3 1 1 2 0 3 2    . v . . v . v v
+  //   1 1 2 1 2 3 0 3    . . v . v v . v
+  //   1 1 1 2 3 2 3 0    . . . v v v v .
+  //
+  //        matrix           p2p_matrix
+  //
+  // Here, they are correctly detected, because the 2s and 3s correspond to
+  // links that have P2P connections between them. However for CUDA 9.2 p3.16xlarge:
+  //
+  //   0 2 2 1 1 1 1 1    . v v v v . . .
+  //   2 0 1 2 1 1 1 1    v . v v . v . .
+  //   2 1 0 1 1 1 2 1    v v . v . . v .
+  //   1 2 1 0 1 1 1 2    v v v . . . . v
+  //   1 1 1 1 0 2 2 1    v . . . . v v v
+  //   1 1 1 1 2 0 1 2    . v . . v . v v
+  //   1 1 2 1 2 1 0 1    . . v . v v . v
+  //   1 1 1 2 1 2 1 0    . . . v v v v .
+  //
+  //        matrix          p2p_matrix
+  //
+  // The fastest connections (3 - double NVLink) are not recognized as being any
+  if (kLogTree) {
+    PrintMatrix("matrix", *matrix, num_gpus, num_gpus);
+    PrintMatrix("p2p_matrix", p2p_matrix, num_gpus, num_gpus);
+  }
+
+  // different from (1 - non-P2P PCI-E). This is why we fallback to p2p_matrix.
+  bool matrix_correct = true;
+  for (unsigned i = 0; i < p2p_matrix.size(); ++i) {
+    if (p2p_matrix[i] > 0 && (*matrix)[i] == 1) {
+      matrix_correct = false;
+      break;
+    }
+  }
+
+  if (!matrix_correct) {
+    LOG(WARNING) << "cudaDeviceGetP2PAttribute incorrect. "
+                 << "Falling back to cudaDeviceEnablePeerAccess for topology detection";
+    for (unsigned i = 0; i < p2p_matrix.size(); ++i) {
+      if (p2p_matrix[i] > 0)
+        (*matrix)[i] = 2;
+      else
+        (*matrix)[i] = 1;
+    }
   }
 
   // If all GPUs are connected by NVLink, then we can use NVLink only
@@ -188,6 +243,8 @@ inline void GetP2PWeight(const std::vector<Context>& devs, std::vector<T>* matri
       matrix_value = (matrix_value == 1) ? 1./num_gpus : matrix_value;
     }
   }
+  if (kLogTree)
+    PrintMatrix("Weight", *matrix, num_gpus, num_gpus);
 
 #else
   LOG(WARNING) << "GPU required for link topology";


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services