You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2017/12/12 02:35:50 UTC

[GitHub] huabinhuang1994 opened a new issue #9031: cuda error:Check failed: e == cudaSuccess || e == cudaErrorCudartUnloading CUDA: unknown error

huabinhuang1994 opened a new issue #9031:  cuda error:Check failed: e == cudaSuccess || e == cudaErrorCudartUnloading CUDA: unknown error
URL: https://github.com/apache/incubator-mxnet/issues/9031
 
 
   1. My GPU is a 1080 Ti and my system is Ubuntu 16.04.
   2. I installed CUDA 8.0 on my computer; the CPU versions of TensorFlow and PyTorch work fine on it.
   3. I installed the GPU version of MXNet from source.
   4. Here is my code:
   from mxnet import autograd
   import mxnet as mx
   import os,json,time,fire,ipdb,tqdm
   from mxnet import gluon
   from mxnet import image
   from mxnet import init
   from mxnet import nd
   from mxnet.gluon.data import vision
   import numpy as np
   from mxnet import initializer
   # Mini-batch size and number of target classes for the new output layer.
   batch_size = 28
   num_classes = 80

   # Context handles: one CPU context and one (default) GPU context.
   data_ctx = mx.cpu()
   model_ctx = mx.gpu()

   # Augmenters applied to training images: resize to 430, then a random
   # crop to 3x256x256; every colour/noise perturbation is set to 0.
   # mean=True / std=True presumably select the library's built-in
   # normalisation constants -- confirm against the CreateAugmenter docs.
   # Explicit per-channel values kept for reference (currently disabled):
   #   mean=np.array([0.4914, 0.4822, 0.4465])
   #   std=np.array([0.2023, 0.1994, 0.2010])
   train_auglist = image.CreateAugmenter(
       data_shape=(3, 256, 256),
       resize=430,
       rand_crop=True,
       rand_resize=False,
       rand_mirror=False,
       mean=True,
       std=True,
       brightness=0,
       contrast=0,
       saturation=0,
       hue=0,
       pca_noise=0,
       rand_gray=0,
       inter_method=2,
   )

   # Augmenters applied to validation images: identical settings except
   # that rand_crop is disabled.
   val_auglist = image.CreateAugmenter(
       data_shape=(3, 256, 256),
       resize=430,
       rand_crop=False,
       rand_resize=False,
       rand_mirror=False,
       mean=True,
       std=True,
       brightness=0,
       contrast=0,
       saturation=0,
       hue=0,
       pca_noise=0,
       rand_gray=0,
       inter_method=2,
   )
   def transform_test(data, label):
       """Prepare one (image, label) pair for evaluation.

       The image is cast to float32 and divided by 255, passed through
       every augmenter returned by ``image.CreateAugmenter`` (configured
       with a 3x256x256 data shape and fixed per-channel mean/std), and
       finally has its axes permuted with ``(2, 0, 1)`` (presumably
       HWC -> CHW; confirm against the caller's data layout).

       Returns a tuple ``(image, label)`` where the label has been turned
       into a float32 scalar.
       """
       scaled = data.astype('float32') / 255

       channel_mean = np.array([0.4914, 0.4822, 0.4465])
       channel_std = np.array([0.2023, 0.1994, 0.2010])
       augmenters = image.CreateAugmenter(
           data_shape=(3, 256, 256),
           mean=channel_mean,
           std=channel_std,
       )

       # Apply the augmenters in the order the library returned them.
       for augment in augmenters:
           scaled = augment(scaled)

       permuted = nd.transpose(scaled, (2, 0, 1))
       scalar_label = nd.array([label]).asscalar().astype('float32')
       return (permuted, scalar_label)
   def get_iterators(batch_size, data_shape=(3, 256, 256)):
       """Create the training and validation ``mx.image.ImageIter`` objects.

       Both iterators read RecordIO files (``.rec`` plus ``.idx``) from
       hard-coded paths under ``/home/hhb/dataset`` and expose their data
       and label under the names ``'data'`` and ``'softmax_label'``.
       The training iterator shuffles and uses ``train_auglist``; the
       validation iterator does not shuffle and uses ``val_auglist``.

       Returns a tuple ``(train, val)``.
       """
       # Settings common to both iterators.
       shared_args = dict(
           data_name='data',
           label_name='softmax_label',
           batch_size=batch_size,
           data_shape=data_shape,
       )

       train_iter = mx.image.ImageIter(
           path_imgrec='/home/hhb/dataset/scene_data_train.rec',
           path_imgidx='/home/hhb/dataset/scene_data_train.idx',
           shuffle=True,
           aug_list=train_auglist,
           **shared_args
       )
       val_iter = mx.image.ImageIter(
           path_imgrec='/home/hhb/dataset/scene_data_val.rec',
           path_imgidx='/home/hhb/dataset/scene_data_val.idx',
           shuffle=False,
           aug_list=val_auglist,
           **shared_args
       )
       return (train_iter, val_iter)
   def get_fine_tune_model(symbol, arg_params, num_classes, layer_name='flatten0'):
       """Replace the head of a network for fine-tuning.

       Takes the ``<layer_name>_output`` node from the symbol's internals,
       stacks a new ``FullyConnected`` layer named ``'fc1'`` with
       ``num_classes`` hidden units on top of it, and finishes with a
       ``SoftmaxOutput`` named ``'softmax'``.

       Returns a tuple ``(net, new_args)`` where ``new_args`` is a copy of
       ``arg_params`` without any entry whose key contains ``'fc1'``.
       """
       internals = symbol.get_internals()
       features = internals[layer_name + '_output']

       new_fc = mx.symbol.FullyConnected(data=features, num_hidden=num_classes, name='fc1')
       new_head = mx.symbol.SoftmaxOutput(data=new_fc, name='softmax')

       # Leave out parameters belonging to a layer called 'fc1'.
       kept_args = {}
       for key in arg_params:
           if 'fc1' in key:
               continue
           kept_args[key] = arg_params[key]
       return (new_head, kept_args)
   
   def fit(symbol, arg_params, aux_params, train, val, batch_size, num_gpus):
       """Train ``symbol`` with ``mx.mod.Module`` and return its validation score.

       Parameters:
           symbol: network symbol to train.
           arg_params, aux_params: initial parameter dictionaries; missing
               entries are allowed (``allow_missing=True``) and are filled
               in by the Xavier initializer.
           train, val: training and validation data iterators.
           batch_size: batch size reported to the ``Speedometer`` callback.
           num_gpus: number of GPUs to train on; ``0`` selects the CPU.

       Training runs for 8 epochs with SGD (learning rate 0.01), the
       ``'device'`` kvstore and accuracy as the evaluation metric.

       Returns whatever ``mod.score(val, metric)`` returns for an
       ``mx.metric.Accuracy`` metric.
       """
       # Bug fix: the GPU list used to be built and then thrown away, with
       # the module always bound to mx.cpu(); num_gpus had no effect.
       # Now the requested GPUs are used, with a CPU fallback for 0 GPUs.
       devices = [mx.gpu(i) for i in range(num_gpus)]
       context = devices if devices else mx.cpu()

       mod = mx.mod.Module(symbol=symbol, context=context)
       mod.fit(train, val,
            num_epoch=8,
            arg_params=arg_params,
            aux_params=aux_params,
            allow_missing=True,
            batch_end_callback=mx.callback.Speedometer(batch_size, 10),
            kvstore='device',
            optimizer='sgd',
            optimizer_params={'learning_rate': 0.01},
            initializer=mx.init.Xavier(rnd_type='gaussian', factor_type="in", magnitude=2),
            eval_metric='acc')
       metric = mx.metric.Accuracy()
       return mod.score(val, metric)
   # Load the saved 'resnet-50' checkpoint (second argument: 0).
   sym, arg_params, aux_params = mx.model.load_checkpoint('resnet-50', 0)
   # Build the fine-tuning symbol and the filtered parameter dictionary.
   new_sym, new_args = get_fine_tune_model(sym, arg_params, num_classes)
   # Create the data iterators, then train and score on one GPU.
   train, val = get_iterators(batch_size)
   mod_score = fit(
       new_sym, new_args, aux_params,
       train, val, batch_size,
       num_gpus=1,
   )
   
   
   Here is my error:
   /home/hhb/anaconda3/lib/python3.6/site-packages/urllib3/contrib/pyopenssl.py:46: DeprecationWarning: OpenSSL.rand is deprecated - you should use os.urandom instead
     import OpenSSL.SSL
   /home/hhb/anaconda3/lib/python3.6/site-packages/jedi/_compatibility.py:6: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses
     import imp
   [10:33:27] src/nnvm/legacy_json_util.cc:209: Loading symbol saved by previous version v0.8.0. Attempting to upgrade...
   [10:33:27] src/nnvm/legacy_json_util.cc:217: Symbol successfully upgraded!
   [10:33:28] /home/travis/build/dmlc/mxnet-distro/mxnet-build/dmlc-core/include/dmlc/logging.h:308: [10:33:28] src/storage/storage.cc:114: Check failed: e == cudaSuccess || e == cudaErrorCudartUnloading CUDA: unknown error
   
   Stack trace returned 10 entries:
   [bt] (0) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x28965c) [0x7fa0d737165c]
   [bt] (1) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x29c891e) [0x7fa0d9ab091e]
   [bt] (2) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x29ca21d) [0x7fa0d9ab221d]
   [bt] (3) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x29ca612) [0x7fa0d9ab2612]
   [bt] (4) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x24719cd) [0x7fa0d95599cd]
   [bt] (5) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x248f894) [0x7fa0d9577894]
   [bt] (6) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2476b9c) [0x7fa0d955eb9c]
   [bt] (7) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x247b048) [0x7fa0d9563048]
   [bt] (8) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2482e6a) [0x7fa0d956ae6a]
   [bt] (9) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2483564) [0x7fa0d956b564]
   
   Traceback (most recent call last):
     File "/home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/symbol/symbol.py", line 1488, in simple_bind
       ctypes.byref(exe_handle)))
     File "/home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/base.py", line 146, in check_call
       raise MXNetError(py_str(_LIB.MXGetLastError()))
   mxnet.base.MXNetError: [10:33:28] src/storage/storage.cc:114: Check failed: e == cudaSuccess || e == cudaErrorCudartUnloading CUDA: unknown error
   
   Stack trace returned 10 entries:
   [bt] (0) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x28965c) [0x7fa0d737165c]
   [bt] (1) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x29c891e) [0x7fa0d9ab091e]
   [bt] (2) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x29ca21d) [0x7fa0d9ab221d]
   [bt] (3) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x29ca612) [0x7fa0d9ab2612]
   [bt] (4) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x24719cd) [0x7fa0d95599cd]
   [bt] (5) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x248f894) [0x7fa0d9577894]
   [bt] (6) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2476b9c) [0x7fa0d955eb9c]
   [bt] (7) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x247b048) [0x7fa0d9563048]
   [bt] (8) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2482e6a) [0x7fa0d956ae6a]
   [bt] (9) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2483564) [0x7fa0d956b564]
   
   
   During handling of the above exception, another exception occurred:
   
   Traceback (most recent call last):
     File "/home/hhb/mxnetproject/new_scene.py", line 90, in <module>
       mod_score = fit(new_sym, new_args, aux_params, train, val, batch_size, num_gpus=1)
     File "/home/hhb/mxnetproject/new_scene.py", line 84, in fit
       eval_metric='acc')
     File "/home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/module/base_module.py", line 460, in fit
       for_training=True, force_rebind=force_rebind)
     File "/home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/module/module.py", line 428, in bind
       state_names=self._state_names)
     File "/home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/module/executor_group.py", line 237, in __init__
       self.bind_exec(data_shapes, label_shapes, shared_group)
     File "/home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/module/executor_group.py", line 333, in bind_exec
       shared_group))
     File "/home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/module/executor_group.py", line 611, in _bind_ith_exec
       shared_buffer=shared_data_arrays, **input_shapes)
     File "/home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/symbol/symbol.py", line 1494, in simple_bind
       raise RuntimeError(error_msg)
   RuntimeError: simple_bind error. Arguments:
   data: (28, 3, 256, 256)
   softmax_label: (28,)
   [10:33:28] src/storage/storage.cc:114: Check failed: e == cudaSuccess || e == cudaErrorCudartUnloading CUDA: unknown error
   
   Stack trace returned 10 entries:
   [bt] (0) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x28965c) [0x7fa0d737165c]
   [bt] (1) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x29c891e) [0x7fa0d9ab091e]
   [bt] (2) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x29ca21d) [0x7fa0d9ab221d]
   [bt] (3) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x29ca612) [0x7fa0d9ab2612]
   [bt] (4) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x24719cd) [0x7fa0d95599cd]
   [bt] (5) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x248f894) [0x7fa0d9577894]
   [bt] (6) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2476b9c) [0x7fa0d955eb9c]
   [bt] (7) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x247b048) [0x7fa0d9563048]
   [bt] (8) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2482e6a) [0x7fa0d956ae6a]
   [bt] (9) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2483564) [0x7fa0d956b564]
   
   I have tried everything I could find on the internet, but nothing worked for me. I don't know what to do next.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services