Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2019/05/15 05:29:19 UTC

[GitHub] [incubator-mxnet] szha commented on a change in pull request #14542: Support SyncBatchNorm5D

szha commented on a change in pull request #14542: Support SyncBatchNorm5D
URL: https://github.com/apache/incubator-mxnet/pull/14542#discussion_r284091455
 
 

 ##########
 File path: tests/python/unittest/test_gluon.py
 ##########
 @@ -583,6 +583,126 @@ def test_batchnorm():
     check_layer_forward(layer, (2, 10, 10, 10))
 
 
+@with_seed()
+def test_sync_batchnorm():
+    def _check_batchnorm_result(input, num_devices=1, cuda=False):
+        from mxnet.gluon.utils import split_and_load
+
+        def _find_bn(module):
+            if isinstance(module, (mx.gluon.nn.BatchNorm, mx.gluon.contrib.nn.SyncBatchNorm)):
+                return module
+            elif isinstance(module.module, (mx.gluon.nn.BatchNorm, mx.gluon.contrib.nn.SyncBatchNorm)):
+                return module.module
+
+            raise RuntimeError('BN not found')
+
+        def _syncParameters(bn1, bn2, ctx):
+            ctx = input.context
+            bn2.gamma.set_data(bn1.gamma.data(ctx))
+            bn2.beta.set_data(bn1.beta.data(ctx))
+            bn2.running_mean.set_data(bn1.running_mean.data(ctx))
+            bn2.running_var.set_data(bn1.running_var.data(ctx))
+
+        input1 = input.copy()
+        input2 = input.copy()
+
+        if cuda:
+            input1 = input.as_in_context(mx.gpu(0))
+            ctx_list = [mx.gpu(i) for i in range(num_devices)]
+        else:
+            ctx_list = [mx.cpu(0) for _ in range(num_devices)]
+
+        nch = input.shape[1] if input.ndim > 1 else 1
+        bn1 = mx.gluon.nn.BatchNorm(in_channels=nch)
+        bn2 = mx.gluon.contrib.nn.SyncBatchNorm(
+            in_channels=nch, num_devices=num_devices)
+
+        bn1.initialize(ctx=ctx_list[0])
+        bn2.initialize(ctx=ctx_list)
+
+        # using the same values for gamma and beta
+        #_syncParameters(_find_bn(bn1), _find_bn(bn2), ctx_list[0])
+
+        input1.attach_grad()
+        inputs2 = split_and_load(input2, ctx_list, batch_axis=0)
+        for xi in inputs2:
+            xi.attach_grad()
+
+        with mx.autograd.record():
+            output1 = bn1(input1)
+            output2 = [bn2(xi) for xi in inputs2]
+            loss1 = (output1 ** 2).sum()
+            loss2 = [(output ** 2).sum() for output in output2]
+            mx.autograd.backward(loss1)
+            mx.autograd.backward(loss2)
+
+        output2 = mx.nd.concat(*[output.as_in_context(input.context)
+                                 for output in output2], dim=0)
+        # check bn1
+
+        momentum = 0.9
+        epsilon = 1e-5
+        axis = 1
+        data = input1
+        running_mean = mx.nd.zeros(nch, ctx=data.context)
+        running_var = mx.nd.ones(nch, ctx=data.context)
+
+        data_mean = data.mean(
+            axis=axis, exclude=True, keepdims=True)
+        data_var = (data - data_mean).square().mean(axis=axis,
+                                                    exclude=True, keepdims=True)
+
+        target_output = (data - data_mean) / (data_var + epsilon).sqrt()
+
+        # squeeze data_mean and data_var
+        data_mean_flat = data_mean.squeeze()
+        data_var_flat = data_var.squeeze()
+
+        running_mean = running_mean * momentum + \
+            data_mean_flat * (1 - momentum)
+        running_var = running_var * momentum + \
+            data_var_flat * (1 - momentum)
+
+        atol = 1e-2
+        rtol = 1e-2
+        assert_almost_equal(output1.asnumpy(), target_output.asnumpy(),
+                            atol=atol, rtol=rtol)
+        assert_almost_equal(_find_bn(bn1).running_mean.data(ctx_list[0]).asnumpy(),
+                            running_mean.asnumpy(),
+                            atol=atol, rtol=rtol)
+        assert_almost_equal(_find_bn(bn1).running_var.data(ctx_list[0]).asnumpy(),
+                            running_var.asnumpy(),
+                            atol=atol, rtol=rtol)
+        # assert forwarding
+        assert_almost_equal(input1.asnumpy(), input2.asnumpy(),
+                            atol=atol, rtol=rtol)
+        assert_almost_equal(output1.asnumpy(),
+                            output2.asnumpy(), atol=atol, rtol=rtol)
+        assert_almost_equal(_find_bn(bn1).running_mean.data(ctx_list[0]).asnumpy(),
+                            _find_bn(bn2).running_mean.data(ctx_list[0]).asnumpy(),
+                            atol=atol, rtol=rtol)
+        assert_almost_equal(_find_bn(bn1).running_var.data(ctx_list[0]).asnumpy(),
+                            _find_bn(bn2).running_var.data(ctx_list[0]).asnumpy(),
+                            atol=atol, rtol=rtol)
+        input2grad = mx.nd.concat(
+            *[output.grad.as_in_context(input.context) for output in inputs2], dim=0)
+        assert_almost_equal(input1.grad.asnumpy(),
+                            input2grad.asnumpy(), atol=atol, rtol=rtol)
+
+    cfgs = [(1, False)]
+    num_gpus = mx.context.num_gpus()
 
 Review comment:
   This line requires a GPU to be present whenever CUDA is installed; otherwise it throws the following error:
   ```
   ======================================================================
   ERROR: test_gluon.test_sync_batchnorm
   ----------------------------------------------------------------------
   Traceback (most recent call last):
     File "/usr/local/lib/python2.7/dist-packages/nose/case.py", line 197, in runTest
       self.test(*self.arg)
     File "/home/travis/build/dmlc/mxnet-distro/mxnet-build/tests/python/unittest/common.py", line 177, in test_new
       orig_test(*args, **kwargs)
     File "/home/travis/build/dmlc/mxnet-distro/mxnet-build/tests/python/unittest/test_gluon.py", line 693, in test_sync_batchnorm
       num_gpus = mx.context.num_gpus()
     File "/home/travis/build/dmlc/mxnet-distro/mxnet/context.py", line 258, in num_gpus
       check_call(_LIB.MXGetGPUCount(ctypes.byref(count)))
     File "/home/travis/build/dmlc/mxnet-distro/mxnet/base.py", line 254, in check_call
       raise MXNetError(py_str(_LIB.MXGetLastError()))
   MXNetError: [11:47:54] include/mxnet/base.h:427: Check failed: e == cudaSuccess (30 vs. 0) :  CUDA: unknown error
   Stack trace:
     [bt] (0) /home/travis/build/dmlc/mxnet-distro/mxnet/libmxnet.so(+0x4b60fb) [0x7f8d608830fb]
     [bt] (1) /home/travis/build/dmlc/mxnet-distro/mxnet/libmxnet.so(+0x2440eec) [0x7f8d6280deec]
     [bt] (2) /home/travis/build/dmlc/mxnet-distro/mxnet/libmxnet.so(MXGetGPUCount+0x19) [0x7f8d6280df79]
     [bt] (3) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call_unix64+0x4c) [0x7f8d9a2e1c7c]
     [bt] (4) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call+0x1fc) [0x7f8d9a2e15ac]
     [bt] (5) /usr/lib/python2.7/lib-dynload/_ctypes.x86_64-linux-gnu.so(_ctypes_callproc+0x48e) [0x7f8d9a4f85fe]
     [bt] (6) /usr/lib/python2.7/lib-dynload/_ctypes.x86_64-linux-gnu.so(+0x15f9e) [0x7f8d9a4f9f9e]
     [bt] (7) /usr/bin/python(PyEval_EvalFrameEx+0x965) [0x4c84a5]
     [bt] (8) /usr/bin/python(PyEval_EvalCodeEx+0x2ac) [0x4cfedc]
   -------------------- >> begin captured logging << --------------------
   common: INFO: Setting test np/mx/python random seeds, use MXNET_TEST_SEED=1179889124 to reproduce.
   --------------------- >> end captured logging << ---------------------
   ----------------------------------------------------------------------
   ```
   
   Can you please move this test to tests/python/gpu/test_gluon_contrib_gpu.py? @wkcn @zhreshold 
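   
   For reference, one way to keep a CPU-only variant of the test runnable in the unittest suite would be to guard the GPU-count query. A minimal sketch (the `_safe_num_gpus` helper and its try/except fallback are illustrative assumptions, not part of this PR):
   
   ```python
   import mxnet as mx
   from mxnet.base import MXNetError
   
   def _safe_num_gpus():
       # Illustrative helper (not in the PR): query the visible GPU count, but
       # fall back to 0 instead of failing when the CUDA runtime is installed
       # without a usable device.
       try:
           return mx.context.num_gpus()
       except MXNetError:
           return 0
   ```
   
   With such a guard the CPU-only configuration in `cfgs` would still run here, while GPU configurations would simply be skipped on machines without a device; moving the test to the GPU suite as suggested above avoids the issue entirely.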

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services