You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by ha...@apache.org on 2019/01/12 06:16:19 UTC
[incubator-mxnet] branch master updated: Fix Tree Reduction on new
instance type p3dn.24xlarge (#13852)
This is an automated email from the ASF dual-hosted git repository.
haibin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
new 0d76675 Fix Tree Reduction on new instance type p3dn.24xlarge (#13852)
0d76675 is described below
commit 0d766750695ef7c0f3152a4f7556eb951ad24688
Author: Carl Yang <ca...@gmail.com>
AuthorDate: Fri Jan 11 22:15:58 2019 -0800
Fix Tree Reduction on new instance type p3dn.24xlarge (#13852)
* add fallback for gpu topology detection using CUDA 9.2
* add fallback for gpu topology detection using CUDA 9.2
* add log
* update 3rdparty to master
* add fallback for gpu topology detection using CUDA 9.2
* add log
* update 3rdparty to master
* bring 3rdparty packages to upstream/master
* rebase to master
* Update gpu_topology.h
---
src/kvstore/comm_tree.h | 16 +++++------
src/kvstore/gpu_topology.h | 69 ++++++++++++++++++++++++++++++++++++++++++----
2 files changed, 71 insertions(+), 14 deletions(-)
diff --git a/src/kvstore/comm_tree.h b/src/kvstore/comm_tree.h
index b62228c..11d99c0 100644
--- a/src/kvstore/comm_tree.h
+++ b/src/kvstore/comm_tree.h
@@ -77,9 +77,6 @@ class CommDeviceTree : public CommDevice {
// BroadcastRowSparse
InitMergeBuffer(devs_);
InitMergeBufferTree();
- if (dmlc::GetEnv("MXNET_ENABLE_GPU_P2P", 1)) {
- EnableP2P();
- }
}
}
@@ -328,7 +325,7 @@ class CommDeviceTree : public CommDevice {
}
private:
- void EnableP2P() {
+ void EnableP2P(std::vector<int>* p2p) {
#if MXNET_USE_CUDA
std::vector<int> gpus;
for (const auto& d : devs_) {
@@ -338,7 +335,8 @@ class CommDeviceTree : public CommDevice {
}
int n = static_cast<int>(gpus.size());
int enabled = 0;
- std::vector<int> p2p(n*n);
+ p2p->clear();
+ p2p->resize(n*n, 0);
for (int i = 0; i < n; ++i) {
mxnet::common::cuda::DeviceStore device_store(gpus[i]);
for (int j = 0; j < n; j++) {
@@ -348,7 +346,7 @@ class CommDeviceTree : public CommDevice {
cudaError_t e = cudaDeviceEnablePeerAccess(gpus[j], 0);
if (e == cudaSuccess || e == cudaErrorPeerAccessAlreadyEnabled) {
++enabled;
- p2p[i*n+j] = 1;
+ (*p2p)[i*n+j] = 1;
}
}
}
@@ -362,7 +360,7 @@ class CommDeviceTree : public CommDevice {
std::string access(n, '.');
for (int i = 0; i < n; ++i) {
for (int j = 0; j < n; ++j) {
- access[j] = p2p[i*n+j] ? 'v' : '.';
+ access[j] = (*p2p)[i*n+j] ? 'v' : '.';
}
LOG(WARNING) << access;
}
@@ -373,7 +371,9 @@ class CommDeviceTree : public CommDevice {
void QueryTopology() {
#if MXNET_USE_CUDA
std::vector<float> link_matrix(devs_.size()*devs_.size());
- GetP2PWeight(devs_, &link_matrix);
+ std::vector<int> p2p_matrix(devs_.size()*devs_.size());
+ EnableP2P(&p2p_matrix);
+ GetP2PWeight(devs_, p2p_matrix, &link_matrix);
if (backtrack_)
LOG(INFO) << "Using Backtracking to generate trees";
else
diff --git a/src/kvstore/gpu_topology.h b/src/kvstore/gpu_topology.h
index 92bcf1a..777fb47 100644
--- a/src/kvstore/gpu_topology.h
+++ b/src/kvstore/gpu_topology.h
@@ -123,6 +123,9 @@ inline bool IsConnected(const std::vector<T>& matrix, int num_gpus) {
/**
* \brief Generate adjacency matrix with row/col numbering from 0, 1, ..., n_gpu
* \param devs is a vector of GPU contexts
+ * \param p2p_matrix is adjacency matrix of P2P connections where
+ * 0: no P2P connection
+ * 1: P2P connection
* \param matrix is adjacency matrix of link topology graph
* where edge weight represents relative performance of NVIDIA GPUs
* 0: Self-connection
@@ -131,7 +134,9 @@ inline bool IsConnected(const std::vector<T>& matrix, int num_gpus) {
* 3: 2 NVLink connections
*/
template <typename T>
-inline void GetP2PWeight(const std::vector<Context>& devs, std::vector<T>* matrix) {
+inline void GetP2PWeight(const std::vector<Context>& devs,
+ const std::vector<int>& p2p_matrix,
+ std::vector<T>* matrix) {
int num_gpus = devs.size();
int count = 0;
std::vector<int> zero_dev_id(num_gpus, -1);
@@ -161,11 +166,61 @@ inline void GetP2PWeight(const std::vector<Context>& devs, std::vector<T>* matri
}
}
- // Check that all GPUs have at least 1 NVLink connection
- int max_value = 0;
- for (unsigned int i = 0; i < max.size(); ++i) {
- if (max[i] > max_value)
- max_value = max[i];
+ // Check that all P2P connections are detected by GetP2PAttribute
+ // If yes, then continue as before
+ // If not, then fall back to using p2p_matrix (from EnableP2P)
+ //
+ // We have observed that with CUDA 9.0 p3.16xlarge:
+ //
+ // 0 2 2 3 3 1 1 1 . v v v v . . .
+ // 2 0 3 2 1 3 1 1 v . v v . v . .
+ // 2 3 0 3 1 1 2 1 v v . v . . v .
+ // 3 2 3 0 1 1 1 2 v v v . . . . v
+ // 3 1 1 1 0 2 2 3 v . . . . v v v
+ // 1 3 1 1 2 0 3 2 . v . . v . v v
+ // 1 1 2 1 2 3 0 3 . . v . v v . v
+ // 1 1 1 2 3 2 3 0 . . . v v v v .
+ //
+ // matrix p2p_matrix
+ //
+ // Here, they are correctly detected, because the 2s and 3s correspond to
+ // links that have P2P connections between them. However for CUDA 9.2 p3.16xlarge:
+ //
+ // 0 2 2 1 1 1 1 1 . v v v v . . .
+ // 2 0 1 2 1 1 1 1 v . v v . v . .
+ // 2 1 0 1 1 1 2 1 v v . v . . v .
+ // 1 2 1 0 1 1 1 2 v v v . . . . v
+ // 1 1 1 1 0 2 2 1 v . . . . v v v
+ // 1 1 1 1 2 0 1 2 . v . . v . v v
+ // 1 1 2 1 2 1 0 1 . . v . v v . v
+ // 1 1 1 2 1 2 1 0 . . . v v v v .
+ //
+ // matrix p2p_matrix
+ //
+ // The fastest connections (3 - double NVLink) are not recognized as being any
+ if (kLogTree) {
+ PrintMatrix("matrix", *matrix, num_gpus, num_gpus);
+ PrintMatrix("p2p_matrix", p2p_matrix, num_gpus, num_gpus);
+ }
+
+ // different from (1 - non-P2P PCI-E). This is why we fallback to p2p_matrix.
+ bool matrix_correct = true;
+ for (unsigned i = 0; i < p2p_matrix.size(); ++i) {
+ if (p2p_matrix[i] > 0 && (*matrix)[i] == 1) {
+ matrix_correct = false;
+ break;
+ }
+ }
+
+ if (!matrix_correct) {
+ LOG(WARNING) << "cudaDeviceGetP2PAttribute incorrect. "
+ << "Falling back to cudaDeviceEnablePeerAccess for topology detection";
+ for (unsigned i = 0; i < p2p_matrix.size(); ++i) {
+ if (p2p_matrix[i] > 0)
+ (*matrix)[i] = 2;
+ else
+ (*matrix)[i] = 1;
+ }
}
// If all GPUs are connected by NVLink, then we can use NVLink only
@@ -188,6 +243,8 @@ inline void GetP2PWeight(const std::vector<Context>& devs, std::vector<T>* matri
matrix_value = (matrix_value == 1) ? 1./num_gpus : matrix_value;
}
}
+ if (kLogTree)
+ PrintMatrix("Weight", *matrix, num_gpus, num_gpus);
#else
LOG(WARNING) << "GPU required for link topology";