Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2018/01/13 01:55:29 UTC

[GitHub] fedorzh opened a new issue #9410: Training with the same parameters and seed gets significantly different results
URL: https://github.com/apache/incubator-mxnet/issues/9410
 
 
   Training with the same parameters and seeds produces significantly different results. Repro:
   
   ```
   
   # coding: utf-8
   
   # In[1]:
   
   
   import mxnet as mx
   from mxnet import nd, gluon, autograd, ndarray
   import numpy as np
   import random
   
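    # Cast image data and labels to float32 (applied by Gluon when the dataset is indexed).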
   def transform(data, label):
       return [dat.astype(np.float32) for dat in data], [lab.astype(np.float32) for lab in label]
   
   
   train_cifar_gluon = gluon.data.vision.CIFAR10(train=True, transform=transform)
   test_cifar_gluon = gluon.data.vision.CIFAR10(train=False, transform=transform)
   
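    # Copy the Gluon dataset into NumPy arrays in NCHW layout; np.rollaxis moves the channel axis to the front.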
   def convert_gluon_dataset_to_numpy(data):
       ds = data[:][0][0].shape
       X = np.empty((len(data[:][0]), ds[2], ds[0], ds[1]), dtype=np.float32)
       for i, example in enumerate(data[:][0]):
            X[i, :] = np.rollaxis(example.asnumpy(), 2)
       y = np.array(data[:][1])
       return X, y
   
   X, y = convert_gluon_dataset_to_numpy(train_cifar_gluon)
   X_test, y_test = convert_gluon_dataset_to_numpy(test_cifar_gluon)
   
   
   # In[2]:
   
   
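    # Run the network over X_ in fixed-order batches (shuffle=False) and stack the raw output scores.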
   def predict_scores(net, X_, batch_size, context):
       scores = None
       test_loaded = gluon.data.DataLoader(mx.nd.array(X_), batch_size, shuffle=False)
       for data in test_loaded:
           data = data.as_in_context(context)
           output = net(data).asnumpy()
           if scores is None:
               scores = output
           else:
               scores = np.append(scores, output, axis=0)
       return scores
   
   
   # In[3]:
   
   
    gpu_count = 1
    _ctx_list = [mx.gpu(i) for i in range(gpu_count)]
    _batch_size = 64
    epochs = 1
    _seed = 42
    _optimizer = 'sgd'
    _learning_rate = 0.1
    _xavier_magnitude = 2.
    _momentum = 0.9
    _wd = 0.0001
    _nclasses = 10
   
   
   # ### Try 1
   
   # In[4]:
   
   
   random.seed(_seed)
   mx.random.seed(_seed)
   np.random.seed(_seed)
   
   
   # In[5]:
   
   
   net = gluon.model_zoo.vision.get_model('resnet34_v2', pretrained=False, classes=_nclasses, ctx=_ctx_list)
   
   loss = gluon.loss.SoftmaxCrossEntropyLoss()
   
   
   # In[6]:
   
   
   net.collect_params().initialize(mx.init.Xavier(magnitude=_xavier_magnitude), ctx=_ctx_list, force_reinit=True)
   
   trainer = gluon.Trainer(net.collect_params(), _optimizer,
                           optimizer_params=dict(learning_rate=_learning_rate, momentum=_momentum,
                                                 wd=_wd),
                           kvstore='device' if len(_ctx_list) > 0 else 'local')
   
   train_data = mx.io.NDArrayIter(X, label=y, batch_size=_batch_size)
   for e in range(epochs):
       train_data.reset()
       for batch in train_data:
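            # When the last batch has fewer samples than there are contexts, trim the context list to match.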
           cur_contexts = _ctx_list
           if batch.data[0].shape[0] < len(_ctx_list):
               cur_contexts = cur_contexts[:batch.data[0].shape[0]]
           data = gluon.utils.split_and_load(batch.data[0], ctx_list=cur_contexts, batch_axis=0, even_split=False)
           label = gluon.utils.split_and_load(batch.label[0], ctx_list=cur_contexts, batch_axis=0, even_split=False)
           Ls = []
           with autograd.record():  # Start recording the derivatives
               for x_cur, y_cur in zip(data, label):
                   L = loss(net(x_cur), y_cur)
                   # store the loss and do backward after we have done forward
                   # on all GPUs for better speed on multiple GPUs.
                   Ls.append(L)
               for L in Ls:
                   L.backward()
           trainer.step(batch.data[0].shape[0])
           
       scores_test = predict_scores(net, X_test, _batch_size, _ctx_list[0])
       predictions_test = np.argmax(scores_test, axis=1)
       accuracy = np.mean(predictions_test == y_test)
       print('[Epoch %d] accuracy=%f' % (e, accuracy))
   
   
   # ### Try 2
   
   # In[7]:
   
   
   random.seed(_seed)
   mx.random.seed(_seed)
   np.random.seed(_seed)
   
   
   # In[8]:
   
   
   net = gluon.model_zoo.vision.get_model('resnet34_v2', pretrained=False, classes=_nclasses, ctx=_ctx_list)
   
   loss = gluon.loss.SoftmaxCrossEntropyLoss()
   
   
   # In[9]:
   
   
   net.collect_params().initialize(mx.init.Xavier(magnitude=_xavier_magnitude), ctx=_ctx_list, force_reinit=True)
   
   trainer = gluon.Trainer(net.collect_params(), _optimizer,
                           optimizer_params=dict(learning_rate=_learning_rate, momentum=_momentum,
                                                 wd=_wd),
                           kvstore='device' if len(_ctx_list) > 0 else 'local')
   
   train_data = mx.io.NDArrayIter(X, label=y, batch_size=_batch_size)
   for e in range(epochs):
       train_data.reset()
       for batch in train_data:
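            # When the last batch has fewer samples than there are contexts, trim the context list to match.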
           cur_contexts = _ctx_list
           if batch.data[0].shape[0] < len(_ctx_list):
               cur_contexts = cur_contexts[:batch.data[0].shape[0]]
           data = gluon.utils.split_and_load(batch.data[0], ctx_list=cur_contexts, batch_axis=0, even_split=False)
           label = gluon.utils.split_and_load(batch.label[0], ctx_list=cur_contexts, batch_axis=0, even_split=False)
           Ls = []
           with autograd.record():  # Start recording the derivatives
               for x_cur, y_cur in zip(data, label):
                   L = loss(net(x_cur), y_cur)
                   # store the loss and do backward after we have done forward
                   # on all GPUs for better speed on multiple GPUs.
                   Ls.append(L)
               for L in Ls:
                   L.backward()
           trainer.step(batch.data[0].shape[0])
           
       scores_test = predict_scores(net, X_test, _batch_size, _ctx_list[0])
       predictions_test = np.argmax(scores_test, axis=1)
       accuracy = np.mean(predictions_test == y_test)
       print('[Epoch %d] accuracy=%f' % (e, accuracy))
   ```
   
   Output:
   ```
   [Epoch 0] accuracy=0.346900
   [Epoch 0] accuracy=0.281900
   ```
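    
    The two runs above execute identical code with identical seeds and a fixed batch order (mx.io.NDArrayIter does not shuffle by default), yet the accuracies differ by more than six points. One possible source, which I have not verified, is cuDNN: its autotuning can select different convolution algorithms from run to run, and some of its backward kernels are nondeterministic on the GPU. A minimal diagnostic sketch along those lines (assuming the environment variable is read before the MXNet engine starts):
    
    ```
    # Sketch only, not part of the repro above: pin cuDNN algorithm selection
    # before importing mxnet, then seed every RNG as in the repro.
    import os
    os.environ['MXNET_CUDNN_AUTOTUNE_DEFAULT'] = '0'  # disable cuDNN autotuning
    
    import random
    import numpy as np
    import mxnet as mx
    
    _seed = 42
    random.seed(_seed)
    np.random.seed(_seed)
    mx.random.seed(_seed)  # seeds MXNet's RNGs; GPU kernels may still be nondeterministic
    ```
    
    If the accuracies still diverge with autotuning disabled, nondeterministic GPU kernels (e.g. atomic adds in cuDNN backward passes) would remain a plausible explanation.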
