Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2018/11/28 15:53:09 UTC

[GitHub] lebeg closed pull request #12477: Added build with nccl

URL: https://github.com/apache/incubator-mxnet/pull/12477
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

diff --git a/3rdparty/ps-lite b/3rdparty/ps-lite
index 8a763892a97..226e82c5309 160000
--- a/3rdparty/ps-lite
+++ b/3rdparty/ps-lite
@@ -1 +1 @@
-Subproject commit 8a763892a973afc1acd3d4b469d05bb338a83a6e
+Subproject commit 226e82c53095d64866821245024d5342996787f4
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 42f6bffb920..e3036ba99f6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -11,7 +11,7 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/Utils.cmake)
 #Some things have order. This must be put in front alone
 mxnet_option(USE_CUDA             "Build with CUDA support"   ON)
 mxnet_option(USE_OLDCMAKECUDA     "Build with old cmake cuda" OFF)
-mxnet_option(USE_NCCL             "Use NVidia NCCL with CUDA" OFF)
+mxnet_option(USE_NCCL             "Use NVidia NCCL with CUDA" ON)
 mxnet_option(USE_OPENCV           "Build with OpenCV support" ON)
 mxnet_option(USE_OPENMP           "Build with Openmp support" ON)
 mxnet_option(USE_CUDNN            "Build with cudnn support"  ON) # one could set CUDNN_ROOT for search path
diff --git a/Makefile b/Makefile
index ad7f0ff3485..1fa5df34e60 100644
--- a/Makefile
+++ b/Makefile
@@ -74,8 +74,7 @@ endif
 include $(TPARTYDIR)/mshadow/make/mshadow.mk
 include $(DMLC_CORE)/make/dmlc.mk
 
-# all tge possible warning tread
-WARNFLAGS= -Wall -Wsign-compare
+WARNFLAGS = -Wall -Wsign-compare
 CFLAGS = -DMSHADOW_FORCE_STREAM $(WARNFLAGS)
 
 ifeq ($(DEV), 1)
diff --git a/ci/docker/Dockerfile.build.ubuntu_gpu b/ci/docker/Dockerfile.build.ubuntu_gpu
index 8fcbcbbb967..075be934493 100644
--- a/ci/docker/Dockerfile.build.ubuntu_gpu
+++ b/ci/docker/Dockerfile.build.ubuntu_gpu
@@ -60,6 +60,9 @@ RUN /work/ubuntu_caffe.sh
 COPY install/ubuntu_onnx.sh /work/
 RUN /work/ubuntu_onnx.sh
 
+COPY install/ubuntu_nccl.sh /work
+RUN /work/ubuntu_nccl.sh
+
 COPY install/ubuntu_docs.sh /work/
 COPY install/docs_requirements /work/
 RUN /work/ubuntu_docs.sh
diff --git a/ci/docker/install/ubuntu_nccl.sh b/ci/docker/install/ubuntu_nccl.sh
new file mode 100755
index 00000000000..965b6b4fae0
--- /dev/null
+++ b/ci/docker/install/ubuntu_nccl.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -ex
+
+apt update && apt install libnccl2 libnccl-dev -y --no-upgrade
diff --git a/ci/docker/install/ubuntu_tutorials.sh b/ci/docker/install/ubuntu_tutorials.sh
index 98774754e9b..51502723f71 100755
--- a/ci/docker/install/ubuntu_tutorials.sh
+++ b/ci/docker/install/ubuntu_tutorials.sh
@@ -22,6 +22,6 @@
 
 set -ex
 apt-get update || true
-apt-get install graphviz python-opencv
+apt-get install graphviz python-opencv -y
 pip2 install jupyter matplotlib Pillow opencv-python scikit-learn graphviz tqdm
 pip3 install jupyter matplotlib Pillow opencv-python scikit-learn graphviz tqdm
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 39631f9dc7e..6c7dd557aa3 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -305,6 +305,7 @@ build_centos7_gpu() {
         USE_CUDNN=1                               \
         USE_DIST_KVSTORE=1                        \
         CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
+        USE_NCCL=1                                \
         -j$(nproc)
 }
 
@@ -536,6 +537,7 @@ build_ubuntu_gpu_tensorrt() {
         USE_GPERFTOOLS=0                                     \
         ONNX_NAMESPACE=onnx                                  \
         CUDA_ARCH="-gencode arch=compute_70,code=compute_70" \
+        USE_NCCL=1                                           \
         -j$(nproc)
 }
 
@@ -554,6 +556,7 @@ build_ubuntu_gpu_mkldnn() {
         USE_CUDA_PATH=/usr/local/cuda             \
         USE_CUDNN=1                               \
         CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
+        USE_NCCL=1                                \
         -j$(nproc)
 }
 
@@ -571,6 +574,7 @@ build_ubuntu_gpu_mkldnn_nocudnn() {
         USE_CUDA_PATH=/usr/local/cuda             \
         USE_CUDNN=0                               \
         CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
+        USE_NCCL=1                                \
         -j$(nproc)
 }
 
@@ -588,6 +592,7 @@ build_ubuntu_gpu_cuda91_cudnn7() {
         USE_CPP_PACKAGE=1                         \
         USE_DIST_KVSTORE=1                        \
         CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
+        USE_NCCL=1                                \
         -j$(nproc)
 }
 
diff --git a/src/kvstore/kvstore_nccl.h b/src/kvstore/kvstore_nccl.h
index d0f397cc61b..3cce588d0e9 100644
--- a/src/kvstore/kvstore_nccl.h
+++ b/src/kvstore/kvstore_nccl.h
@@ -341,6 +341,7 @@ class KVStoreNCCL : public KVStoreLocal {
         }
       } else {
         auto& buf = merge_buf_[key];
+        (void)buf;
         int root = src.ctx().dev_id;
         assert(root == buf.ctx().dev_id);
         root_id = FindRootId(dst, root);
diff --git a/src/operator/tensor/elemwise_unary_op_basic.cc b/src/operator/tensor/elemwise_unary_op_basic.cc
index 301fc48d212..85a99492ca3 100644
--- a/src/operator/tensor/elemwise_unary_op_basic.cc
+++ b/src/operator/tensor/elemwise_unary_op_basic.cc
@@ -29,17 +29,19 @@
 namespace mxnet {
 namespace op {
 
+namespace {
+
 // infer storage function for _identity_with_attr_like_rhs op
-static bool IdentityAttrLikeRhsStorageType(const nnvm::NodeAttrs& attrs,
-                                           const int dev_mask,
-                                           DispatchMode* dispatch_mode,
-                                           std::vector<int> *in_attrs,
-                                           std::vector<int> *out_attrs) {
+bool IdentLikeRhsStorageType(const nnvm::NodeAttrs &attrs,
+                             const int /*dev_mask*/,
+                             DispatchMode *dispatch_mode,
+                             std::vector<int> *in_attrs,
+                             std::vector<int> *out_attrs) {
   CHECK_EQ(in_attrs->size(), 2U);
   CHECK_EQ(out_attrs->size(), 1U);
-  const auto& rhs_stype = in_attrs->at(1);
-  auto& lhs_stype = in_attrs->at(0);
-  auto& out_stype = out_attrs->at(0);
+  const auto &rhs_stype = in_attrs->at(1);
+  auto &lhs_stype = in_attrs->at(0);
+  auto &out_stype = out_attrs->at(0);
   bool dispatched = false;
 
   CHECK_NE(rhs_stype, kUndefinedStorage);
@@ -69,6 +71,8 @@ static bool IdentityAttrLikeRhsStorageType(const nnvm::NodeAttrs& attrs,
   return dispatched;
 }
 
+}  // namespace
+
 // relu
 MXNET_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(relu, cpu, mshadow_op::relu)
 .describe(R"code(Computes rectified linear.
@@ -337,7 +341,7 @@ NNVM_REGISTER_OP(_identity_with_attr_like_rhs)
 .set_attr<FComputeEx>("FComputeEx<cpu>", UnaryOp::IdentityComputeFirstItemEx<cpu>)
 .set_attr<nnvm::FInferShape>("FInferShape", ElemwiseShape<2, 1>)
 .set_attr<nnvm::FInferType>("FInferType", ElemwiseType<2, 1>)
-.set_attr<FInferStorageType>("FInferStorageType", IdentityAttrLikeRhsStorageType)
+.set_attr<FInferStorageType>("FInferStorageType", IdentLikeRhsStorageType)
 .set_attr<nnvm::FGradient>(
     "FGradient",  [](const nnvm::NodePtr& n,
                      const std::vector<nnvm::NodeEntry>& ograds) {
diff --git a/tests/python/gpu/test_nccl.py b/tests/python/gpu/test_nccl.py
index 40ef6fdfd0a..2361b0e39ea 100644
--- a/tests/python/gpu/test_nccl.py
+++ b/tests/python/gpu/test_nccl.py
@@ -17,37 +17,54 @@
 
 import mxnet as mx
 import numpy as np
+import copy
+import logging
+
 import unittest
-import os
 
-shapes = [(10), (100), (1000), (10000), (100000), (2,2), (2,3,4,5,6,7,8)]
-keys = [1,2,3,4,5,6,7]
-num_gpus = len(mx.test_utils.list_gpus())
+num_gpus = min(8, mx.context.num_gpus())
+
+
+class TestNCCL(unittest.TestCase):
+    shapes = [1, 10, 100, 1000, 10000, 100000, (2, 2), (2, 3, 4, 5, 6, 7, 8)]
+    tensors = {}
+    kv_nccl = None
+
+    @classmethod
+    def setUpClass(cls):
+        if num_gpus == 0:
+            raise unittest.SkipTest("No GPUs available")
+        if num_gpus < 2:
+            raise unittest.SkipTest("It makes sense to test NCCL functionality on more than 1 GPU only")
+        if mx.context.num_gpus() > 8:
+            logging.info("The machine has {} GPUs; the test will run on at most 8 of them.".format(mx.context.num_gpus()))
+            logging.info("PCI-E hardware limits the number of P2P peers to 8.")
 
+    def setUp(self):
+        self.kv_nccl = mx.kv.create('nccl')
+        for gpu_index in range(num_gpus):
+            shapes = copy.deepcopy(self.shapes)
+            np.random.shuffle(shapes)
+            self.tensors[gpu_index] = [np.random.random_sample(shape) for shape in shapes]
+            log = "GPU {}: {}".format(gpu_index, ' '.join(str(tensor.shape) for tensor in self.tensors[gpu_index]))
+            logging.info(log)
 
-if num_gpus > 8 :
-    print("The machine has {} gpus. We will run the test on 8 gpus.".format(num_gpus))
-    print("There is a limit for all PCI-E hardware on creating number of P2P peers. The limit is 8.")
-    num_gpus = 8;
+    def push_shapes(self):
+        for gpu_index in range(num_gpus):
+            tensors = [mx.nd.array(tensor, mx.gpu(gpu_index)) for tensor in self.tensors[gpu_index]]
+            self.kv_nccl.push(gpu_index, tensors)
 
-gpus = range(1, 1+num_gpus)
+    def test_push_pull(self):
+        self.push_shapes()
 
-@unittest.skip("Test requires NCCL library installed and enabled during build")
-def test_nccl_pushpull():
-    for shape, key in zip(shapes, keys):
-        for n_gpus in gpus:
-            kv_nccl = mx.kv.create('nccl')
-            a = mx.nd.ones(shape, mx.gpu(0))
-            cur_key = str(key*max(gpus)+n_gpus)
-            kv_nccl.init(cur_key, a)
-            arr_list = [mx.nd.ones(shape, mx.gpu(x)) for x in range(n_gpus)]
-            res = [mx.nd.zeros(shape, mx.gpu(x)) for x in range(n_gpus)]
-            kv_nccl.push(cur_key, arr_list)
-            kv_nccl.pull(cur_key, res)
-            for x in range(n_gpus):
-                assert(np.sum(np.abs((res[x]-n_gpus).asnumpy()))==0)
+        for gpu_index in range(num_gpus):
+            for gpu_index2 in range(num_gpus):
+                if gpu_index == gpu_index2:
+                    continue
+                pulled_tensors = [mx.nd.zeros(tensor.shape, mx.gpu(gpu_index)) for tensor in self.tensors[gpu_index2]]
+                self.kv_nccl.pull(gpu_index2, pulled_tensors)
+                assert all(np.allclose(pulled.asnumpy(), expected) for pulled, expected in zip(pulled_tensors, self.tensors[gpu_index2]))
 
-    print ("Passed")
 
 if __name__ == '__main__':
-    test_nccl_pushpull()
+    unittest.main()
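
For reference, below is a minimal usage sketch of the 'nccl' kvstore that this
change builds and tests. It is not part of the patch; it mirrors the push/pull
round trip of the old test and assumes an MXNet build with USE_NCCL=1 and at
least two GPUs:

    import mxnet as mx
    import numpy as np

    num_gpus = 2
    shape = (100,)
    kv = mx.kv.create('nccl')  # requires a build with USE_NCCL=1

    # Initialize the key, then push ones from every GPU; the default
    # updater stores the aggregated (summed) value for the key.
    kv.init('0', mx.nd.ones(shape, mx.gpu(0)))
    kv.push('0', [mx.nd.ones(shape, mx.gpu(i)) for i in range(num_gpus)])

    # Pull broadcasts the stored value back to every GPU; each element
    # should equal num_gpus, the sum of the pushed ones.
    out = [mx.nd.zeros(shape, mx.gpu(i)) for i in range(num_gpus)]
    kv.pull('0', out=out)
    assert all(np.allclose(o.asnumpy(), num_gpus) for o in out)

Pull returns the aggregated (summed) value of the pushed arrays, which is why
the old test asserted that every element of res[x] equals n_gpus.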


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services