You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2017/12/11 14:16:35 UTC
[GitHub] dbsxdbsx opened a new issue #9026: why is it so slow (MXNET0.12)even with NVIDIA V100 GPU?

dbsxdbsx opened a new issue #9026: why is it so slow (MXNET0.12)even with NVIDIA V100 GPU?
URL: https://github.com/apache/incubator-mxnet/issues/9026
 
 
   I tested my script on an AWS EC2 p3.2xlarge instance (GPU: V100). The .py file is as follows:
   ```
   '''Train one network on captchas whose digit count varies per example.

   The intended range is 4 to 10 digits, therefore the DataIter has to produce
   captcha examples of different lengths, even within a single batch.'''
   
   # Font file passed to ImageCaptcha as its only font when rendering images.
   font_name = 'segoeuib.ttf'
   
   import sys
   
   # Put "../../python" at the front of the import path and append the two
   # parent directories -- presumably to pick up a local MXNet checkout before
   # any installed copy; verify against the directory layout this runs from.
   sys.path.insert(0, "../../python")
   sys.path.append('../')
   sys.path.append('../../')
   import mxnet as mx
   import numpy as np
   import cv2, random
   from captcha.image import ImageCaptcha
   
   
   class OCRBatch(object):
       """Minimal batch container: lists of arrays plus the names they bind to.

       ``data`` and ``label`` are sequences of objects exposing a ``shape``
       attribute; ``data_names`` and ``label_names`` are the sequences of names
       paired with them position by position.
       """

       def __init__(self, data_names, data, label_names, label):
           self.data_names, self.data = data_names, data
           self.label_names, self.label = label_names, label

       @staticmethod
       def _describe(names, arrays):
           """Pair each name with the shape of the array at the same position."""
           return [(name, array.shape) for name, array in zip(names, arrays)]

       @property
       def provide_data(self):
           """List of ``(name, shape)`` tuples, one per data array."""
           return self._describe(self.data_names, self.data)

       @property
       def provide_label(self):
           """List of ``(name, shape)`` tuples, one per label array."""
           return self._describe(self.label_names, self.label)
   
   
   def gen_rand(capt_num):
       """Return a string of ``capt_num`` random decimal digits.

       Each digit is drawn independently with ``random.randint(0, 9)``, so the
       result depends on the state of the ``random`` module's generator.
       A ``capt_num`` of 0 yields the empty string.
       """
       # str.join builds the string in one pass instead of repeated "+=".
       return "".join(str(random.randint(0, 9)) for _ in range(capt_num))
   
   
   def get_label(capt_str, capt_max_num):
       """Convert a digit string into an integer label array of fixed length.

       Every character of ``capt_str`` becomes one integer. When the string is
       shorter than ``capt_max_num`` the tail is filled with the value 11;
       strings of length ``capt_max_num`` or more are returned without padding.
       """
       digits = list(map(int, capt_str))
       # 11 is the filler for "no character at this position". A negative
       # repeat count produces an empty list, so long inputs are left as is.
       filler = [11] * (capt_max_num - len(digits))
       return np.array(digits + filler)
   
   
   def gen_sample(captcha, width, height, capt_num):
       """Render one random captcha and return ``(digits, image)``.

       ``captcha`` must provide ``generate(text)`` returning an object whose
       ``getvalue()`` yields the encoded image bytes. The image is decoded as
       a colour image, resized to ``(width, height)``, scaled by 1/255 and
       transposed so that the last axis becomes the first.

       Returns a tuple of the digit string and the resulting float array.
       """
       num = gen_rand(capt_num)
       encoded = captcha.generate(num)
       # np.fromstring on raw bytes is deprecated; np.frombuffer reads the same
       # bytes as uint8 without copying them.
       img = np.frombuffer(encoded.getvalue(), dtype='uint8')
       img = cv2.imdecode(img, cv2.IMREAD_COLOR)
       img = cv2.resize(img, (width, height))

       # Scale pixel values by 1/255, then move axis 2 to the front
       # (presumably height x width x channel -> channel x height x width).
       img = np.multiply(img, 1 / 255.0)
       img = img.transpose(2, 0, 1)
       return (num, img)
   
   
   class OCRIter(mx.io.DataIter):
       """Data iterator that renders random captcha batches on the fly.

       Each pass yields ``count // batch_size`` batches. Every example is
       generated by ``gen_sample`` at iteration time, in Python, one image at
       a time, so batch production cost grows with ``batch_size``.

       Parameters
       ----------
       count : int
           Number of examples per pass; a trailing partial batch is dropped.
       batch_size : int
           Examples per batch.
       height, width : int
           Size each rendered image is resized to.
       """

       def __init__(self, count, batch_size, height, width):
           super(OCRIter, self).__init__()
           self.captcha = ImageCaptcha(fonts=[font_name])

           self.batch_size = batch_size
           self.count = count
           self.height = height
           self.width = width
           # Labels are always padded to this many positions by get_label.
           self.capt_max_num = 10
           self.provide_data = [('data', (batch_size, 3, height, width))]
           self.provide_label = [('softmax_label', (self.batch_size, self.capt_max_num))]

       def __iter__(self):
           # Floor division keeps the batch count an int on Python 3 as well;
           # "/" would produce a float there and range() would raise.
           for _ in range(self.count // self.batch_size):
               data = []
               label = []
               for _ in range(self.batch_size):
                   # Draw a plain int rather than a 1-element array, because
                   # the value ends up in range() inside gen_rand.
                   # NOTE(review): randint's upper bound is exclusive, so
                   # lengths are 4..9 and never 10 -- confirm this is intended.
                   capt_len = int(np.random.randint(4, 10))
                   digits, img = gen_sample(self.captcha, self.width, self.height, capt_len)
                   data.append(img)
                   label.append(get_label(digits, self.capt_max_num))

               data_all = [mx.nd.array(data)]
               label_all = [mx.nd.array(label)]
               data_names = ['data']
               label_names = ['softmax_label']

               yield OCRBatch(data_names, data_all, label_names, label_all)

       def reset(self):
           """Nothing to rewind: every pass generates fresh random samples."""
           pass
   
   
   def get_ocrnet():
       """Build the captcha network symbol.

       Four convolution -> pooling -> ReLU stages feed a 256-unit fully
       connected layer, followed by ten parallel 10-way fully connected heads.
       The head outputs are concatenated along dimension 0 and passed to a
       single SoftmaxOutput named "softmax"; the label is transposed and
       reshaped first, presumably so it lines up with the concatenated heads.
       """
       data = mx.symbol.Variable('data')
       label = mx.symbol.Variable('softmax_label')

       # (convolution kernel, pooling type) per stage. Operators are created
       # in the same order as an unrolled version would create them.
       stages = (
           ((5, 5), "max"),
           ((5, 5), "avg"),
           ((3, 3), "avg"),
           ((3, 3), "avg"),
       )
       net = data
       for kernel, pool_type in stages:
           net = mx.symbol.Convolution(data=net, kernel=kernel, num_filter=32)
           net = mx.symbol.Pooling(data=net, pool_type=pool_type, kernel=(2, 2), stride=(1, 1))
           net = mx.symbol.Activation(data=net, act_type="relu")

       flatten = mx.symbol.Flatten(data=net)
       fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=256)

       # One 10-way head per character position.
       heads = [mx.symbol.FullyConnected(data=fc1, num_hidden=10) for _ in range(10)]
       fc2 = mx.symbol.Concat(*heads, dim=0)

       label = mx.symbol.transpose(data=label)
       label = mx.symbol.Reshape(data=label, target_shape=(0,))
       return mx.symbol.SoftmaxOutput(data=fc2, label=label, name="softmax")
   
   
   def Accuracy(label, pred):
       """Fraction of consecutive ``capt_num``-row groups predicted entirely right.

       ``label`` is transposed and flattened, then compared row by row with the
       argmax of ``pred``. Rows are grouped in consecutive runs of ``capt_num``
       (a module-level global), and a group counts as a hit only when every row
       in it matches.

       NOTE(review): this is the older metric. Its author suspected the
       grouping does not correspond to one captcha per group; see Accuracy2
       for the revised indexing.

       Raises ZeroDivisionError when ``pred`` has fewer than ``capt_num`` rows.
       """
       label = label.T.reshape((-1,))
       # Floor division keeps the group count an int on Python 3 as well;
       # "/" would give a float there and range() would raise.
       group_count = pred.shape[0] // capt_num
       hit = 0
       for i in range(group_count):
           start = i * capt_num
           # all() stops at the first mismatch, like an early break.
           if all(np.argmax(pred[k]) == int(label[k])
                  for k in range(start, start + capt_num)):
               hit += 1
       return 1.0 * hit / group_count
   
   
   def Accuracy2(label, pred):
       """Fraction of examples whose ``capt_num`` positions are all predicted right.

       ``pred`` is read as ``capt_num`` stacked blocks of ``batch_size`` rows:
       the row for position ``j`` of example ``i`` is ``j * batch_size + i``.
       It is compared with ``label[i, j]``. ``capt_num`` is a module-level
       global.

       Raises ZeroDivisionError when ``pred`` has fewer than ``capt_num`` rows.
       """
       # Floor division keeps batch_size an int on Python 3 as well; with "/"
       # both range() and the row index below would receive a float.
       batch_size = pred.shape[0] // capt_num
       hit = 0
       for i in range(batch_size):
           # all() stops at the first mismatch, like an early break.
           if all(np.argmax(pred[j * batch_size + i]) == int(label[i, j])
                  for j in range(capt_num)):
               hit += 1
       return 1.0 * hit / batch_size
   
   
   import argparse
   
   
   def parse_args(description):
       """Parse the training options from the command line.

       ``description`` is shown in the argparse help text. Returns the
       namespace with ``batch_size``, ``train_exp_num``, ``epoch_num``,
       ``gpu_num`` and ``lr``.
       """
       parser = argparse.ArgumentParser(description=description)

       # (option name, value type, default)
       options = (
           ('batch_size', int, 8),
           ('train_exp_num', int, 2000),
           ('epoch_num', int, 50),
           ('gpu_num', int, 1),
           ('lr', float, 0.00075),
       )
       for name, value_type, default in options:
           parser.add_argument('--' + name, dest=name, type=value_type, default=default)

       return parser.parse_args()
   
   
   if __name__ == '__main__':
       import logging

       # Timestamped DEBUG logging so the Speedometer output can be followed.
       logging.basicConfig(level=logging.DEBUG, format='%(asctime)-15s %(message)s')

       args = parse_args('train 4 to 10 capt in 1 net')
       network = get_ocrnet()
       test_exp_num = 1000
       contexts = [mx.gpu(index) for index in range(args.gpu_num)]

       # Accuracy and Accuracy2 read this module-level value.
       capt_num = 10

       image_height, image_width = 30, 80
       data_train = OCRIter(args.train_exp_num, args.batch_size, image_height, image_width)
       data_test = OCRIter(test_exp_num, args.batch_size, image_height, image_width)

       # Module API; this replaces the older FeedForward-based training call.
       lenet_model = mx.mod.Module(symbol=network, context=contexts)
       fit_options = {
           'eval_data': data_test,
           'optimizer': 'sgd',
           'optimizer_params': {'learning_rate': args.lr, 'momentum': 0.9, 'wd': 0.00001},
           'eval_metric': [Accuracy, Accuracy2],
           'initializer': mx.init.Xavier(factor_type="in", magnitude=2.34),
           # Report throughput every 50 batches.
           'batch_end_callback': mx.callback.Speedometer(args.batch_size, 50),
           # Save a checkpoint with prefix "param" every 2 epochs.
           'epoch_end_callback': mx.callback.do_checkpoint(prefix='param', period=2),
           'num_epoch': args.epoch_num,
       }
       lenet_model.fit(data_train, **fit_options)
   ```
   On my own host (Windows 10, MXNet 0.12, GPU: 940M) I get nearly 110 samples/second with the default parameters, but surprisingly, on the p3.2xlarge I get only 170 samples/second. In detail, using `watch -n 1 nvidia-smi`, I found that the volatile GPU utilization is always near 0%, rising to 4% at most. Why? Is that just because I use a custom DataIter?
   
   
   
   
   

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services