Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2019/05/09 08:43:31 UTC

[GitHub] [incubator-mxnet] Marcovaldong removed a comment on issue #14796: The model gets stuck after several thousand batch_size

URL: https://github.com/apache/incubator-mxnet/issues/14796#issuecomment-487273345
 
 
   @lanking520  The code is as follows: 
   
    ```python
   from tqdm import tqdm
   import argparse
   import logging
   import math
   import os
   import time
   import collections
   import kaldi_io
   import mxnet as mx
   import numpy as np
   from mxnet import autograd, gluon
   from mxnet.gluon import utils as gutils
   from mxnet.gluon import contrib
   from DataLoader import SequentialLoader, TokenAcc, phone
   from DataLoader import SpectrogramDataset, AudioDataLoader, BucketingSampler
   from DataLoader import _batchify_fn
   from mxnet.gluon.data import DataLoader
   from model import Transducer
   
   parser = argparse.ArgumentParser(description='MXNet Autograd RNN/LSTM Acoustic Model on youdao.')
   parser.add_argument('--train_manifest', metavar='DIR',
                       help='path to train manifest csv', default='data/manifest.huiting.pinyin.train')  # 'data/train_manifest.csv')
   parser.add_argument('--val_manifest', metavar='DIR',
                       help='path to validation manifest csv', default='data/manifest.huiting.pinyin.test')  # 'data/val.csv')
   parser.add_argument('--sample_rate', default=16000, type=int, help='Sample rate')
   parser.add_argument('--num_workers', default=32, type=int, help='Number of workers used in data-loading')
   parser.add_argument('--labels_path', default='labels', help='Contains all characters for transcription')
   parser.add_argument('--window_size', default=.02, type=float, help='Window size for spectrogram in seconds')
   parser.add_argument('--window_stride', default=.01, type=float, help='Window stride for spectrogram in seconds')
   parser.add_argument('--window', default='hamming', help='Window type for spectrogram generation')
   parser.add_argument('--noise_dir', default=None,
                       help='Directory to inject noise into audio. If default, noise Inject not added')
    parser.add_argument('--noise_prob', default=0.4, type=float, help='Probability of noise being added per sample')
   parser.add_argument('--noise_min', default=0.0,
                       help='Minimum noise level to sample from. (1.0 means all noise, not original signal)', type=float)
   parser.add_argument('--noise_max', default=0.5,
                       help='Maximum noise levels to sample from. Maximum 1.0', type=float)
   
   parser.add_argument('--lr', type=float, default=1e-3,
                       help='initial learning rate')
   parser.add_argument('--epochs', type=int, default=200,
                       help='upper epoch limit')
   parser.add_argument('--batch_size', type=int, default=1, metavar='N',
                       help='batch size')
   parser.add_argument('--dropout', type=float, default=0.5,
                       help='dropout applied to layers (0 = no dropout)')
    parser.add_argument('--bi', default=True, action='store_true',
                        help='whether to use a bidirectional LSTM')
   parser.add_argument('--noise', type=float, default=0, 
                       help='add gaussian noise to inputs')
   parser.add_argument('--log-interval', type=int, default=50, # metavar='N',
                       help='report interval')
   parser.add_argument('--out', type=str, default='exp/rnnt_lr1e-3',
                       help='path to save the final model')
   parser.add_argument('--stdout', default=False, action='store_true')
   parser.add_argument('--init', type=str, default='',
                       help='Initial am & pm parameters')
   parser.add_argument('--begin_epoch', default=1, type=int, help='the epoch number from which to train')
   parser.add_argument('--initam', type=str, default='',
                       help='Initial am parameters')
   parser.add_argument('--initpm', type=str, default='',
                       help='Initial pm parameters')
   parser.add_argument('--gradclip', type=float, default=0)
    parser.add_argument('--schedule', default=True, help='whether to anneal the learning rate')
    parser.add_argument('--tmp', type=int, default=1000,
                        help='save a temporary checkpoint every this many batches (to guard against crashes)')
   args = parser.parse_args()
   
   os.makedirs(args.out, exist_ok=True)
   with open(os.path.join(args.out, 'args'), 'w') as f:
       f.write(str(args))
   if args.stdout: logging.basicConfig(format='%(asctime)s: %(message)s', datefmt="%m-%d %H:%M:%S", level=logging.INFO)
   else: logging.basicConfig(format='%(asctime)s: %(message)s', datefmt="%m-%d %H:%M:%S", filename=os.path.join(args.out, 'train.log'), level=logging.INFO)
   
   context = [mx.gpu(i) for i in [0, 1, 2, 3]]
   # context = [mx.gpu(i) for i in [6, 7]]
   # Dataset
   audio_conf = dict(sample_rate=args.sample_rate,
                         window_size=args.window_size,
                         window_stride=args.window_stride,
                         window=args.window,
                         noise_dir=args.noise_dir,
                         noise_prob=args.noise_prob,
                         noise_levels=(args.noise_min, args.noise_max))
   # trainset = SequentialLoader(audio_conf=audio_conf, manifest_filepath=args.train_manifest, batch_size=args.batch_size,
   #                             normalize=True, augment=False)
   # devset = SequentialLoader(audio_conf=audio_conf, manifest_filepath=args.val_manifest, batch_size=args.batch_size,
   #                           normalize=True, augment=False)
   train_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.train_manifest, normalize=True,
                                      augment=False)
   test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.val_manifest, normalize=True,
                                     augment=False)
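    # BucketingSampler (defined in the local DataLoader module) yields the batch indices;
    # Gluon's DataLoader assembles each batch with _batchify_fn in num_workers worker processes.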
   train_sampler = BucketingSampler(train_dataset, batch_size=args.batch_size)
   train_loader = DataLoader(train_dataset, batchify_fn=_batchify_fn, num_workers=args.num_workers,
                             batch_sampler=train_sampler)
   # train_loader = DataLoader(train_dataset, batch_size=args.batch_size, batchify_fn=_batchify_fn,
   #                           num_workers=args.num_workers, shuffle=True)
    # Use a smaller evaluation batch size, but never let it drop to zero.
    test_loader = DataLoader(test_dataset, batch_size=max(1, args.batch_size // 4), batchify_fn=_batchify_fn,
                             num_workers=args.num_workers)
   
   ###############################################################################
   # Build the model
   ###############################################################################
   
   model = Transducer(len(phone), 250, 3, args.dropout, bidirectional=args.bi)
   # model = Transducer(input_size=320, vocab_size=len(phone), num_hidden=250, num_layers=3, dropout=args.dropout,
   #                    blank=0, bidirectional=args.bi)
   if args.init:
       model.collect_params().load(args.init, context)
   elif args.initam or args.initpm:
       model.initialize(mx.init.Uniform(0.1), ctx=context)
       # NOTE only use lstm layer
       if args.initam:
           model.collect_params('transducer0_rnnmodel0_lstm0').load(args.initam, context, True, True)
       if args.initpm:
           model.collect_params('transducer0_lstm0').load(args.initpm, context, True, True)
   else:
       model.initialize(mx.init.Uniform(0.1), ctx=context)
   
   # trainer = gluon.Trainer(model.collect_params(), 'sgd',
   #                         {'learning_rate': args.lr,
   #                          'momentum': 0.9})
   trainer = gluon.Trainer(model.collect_params(), 'adam', 
                             {'learning_rate': args.lr, 
                              'beta1': 0.4})
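    # Adam with a non-default beta1; trainer.step(args.batch_size) below divides the
    # accumulated gradients by the batch size before applying the update.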
   
   def evaluate(model):
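        # Compute the transducer loss over the whole test set (forward only, no
        # autograd) and return the average loss per utterance.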
       losses = []
        for xs, ys, xlen, ylen in tqdm(test_loader):
           gpu_xs = gutils.split_and_load(xs, ctx_list=context)
           gpu_ys = gutils.split_and_load(ys, ctx_list=context)
           gpu_xlens = gutils.split_and_load(xlen, ctx_list=context)
           gpu_ylens = gutils.split_and_load(ylen, ctx_list=context)
   
           # with autograd.record():
           loss_gpus = [model(gpu_x, gpu_y, gpu_xlen, gpu_ylen) for gpu_x, gpu_y, gpu_xlen, gpu_ylen in
                        zip(gpu_xs, gpu_ys, gpu_xlens, gpu_ylens)]
           losses.append(sum([float(loss.sum().asscalar()) for loss in loss_gpus]))
       return sum(losses) / len(test_dataset)
   
   def train():
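        # Data-parallel training loop: each batch is split across the GPUs in
        # `context`, the per-GPU losses are backpropagated, and the Trainer
        # aggregates the gradients before updating the parameters.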
       best_model = None
       prev_loss = 1000
       for epoch in range(args.begin_epoch, args.epochs+1):
            print('Train the model for the %d-th epoch with learning rate %.2e' % (epoch, trainer.learning_rate))
            logging.info('Train the model for the %d-th epoch with learning rate %.2e' % (epoch, trainer.learning_rate))
           losses = []
           totl0 = 0
           start_time = time.time()
           tic = time.time()
            for i, (xs, ys, xlen, ylen) in enumerate(train_loader, 1):
                # Add Gaussian input noise (if requested) before splitting the batch
                # across GPUs; adding it afterwards would not affect the GPU copies.
                if args.noise > 0:
                    xs += mx.nd.normal(0, args.noise, xs.shape[-1], ctx=xs.context)
                gpu_xs = gutils.split_and_load(xs, ctx_list=context)
                gpu_ys = gutils.split_and_load(ys, ctx_list=context)
                gpu_xlens = gutils.split_and_load(xlen, ctx_list=context)
                gpu_ylens = gutils.split_and_load(ylen, ctx_list=context)
   
               with autograd.record():
                   loss_gpus = [model(gpu_x, gpu_y, gpu_xlen, gpu_ylen) for gpu_x, gpu_y, gpu_xlen, gpu_ylen in
                                zip(gpu_xs, gpu_ys, gpu_xlens, gpu_ylens)]
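                # Backpropagate each per-GPU loss; gradients accumulate on that
                # device's copy of the parameters.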
               for loss in loss_gpus:
                   loss.backward()
   
               losses.append(sum([float(loss.sum().asscalar()) for loss in loss_gpus]))
               # gradient clip
               if args.gradclip > 0:
                   grads = [p.grad(context) for p in model.collect_params().values()]
                   gluon.utils.clip_global_norm(grads, args.gradclip)
   
               trainer.step(args.batch_size)  # , ignore_stale_grad=True)
               # mx.nd.waitall()
               totl0 += losses[-1]
   
               if i % args.log_interval == 0:
                   l0 = totl0 / args.batch_size / args.log_interval
                   toc = time.time()
                   print("Epoch [%d / %d][%d / %d] loss %.2f  time %.2f s" % (epoch, args.epochs, i,
                                                                              len(train_loader), l0, toc - tic))
                   logging.info('[Epoch %d Batch %d] loss %.2f' % (epoch, i, l0))
                   totl0 = 0
                   tic = time.time()
   
               if i % args.tmp == 0:
                   tmp_path = "{}/params_tmp_epoch{:03d}".format(args.out, epoch)
                   model.collect_params().save(tmp_path)
   
           losses = sum(losses) / len(train_dataset)
           val_l = evaluate(model)
   
           print('[Epoch %d] time cost %.2fs, train loss %.2f; cv loss %.2f; lr %.2e' % (
               epoch, time.time()-start_time, losses, val_l, trainer.learning_rate))
           logging.info('[Epoch %d] time cost %.2fs, train loss %.2f; cv loss %.2f; lr %.2e' % (
               epoch, time.time()-start_time, losses, val_l, trainer.learning_rate))
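            # Keep the checkpoint with the best validation loss. If the loss got worse,
            # roll back to the best parameters and divide the learning rate by 5;
            # otherwise halve it after the epoch (when --schedule is set).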
   
           if val_l < prev_loss:
               prev_loss = val_l
               best_model = '{}/params_epoch{:03d}_tr{:.2f}_cv{:.2f}'.format(args.out, epoch, losses, val_l)
               model.collect_params().save(best_model)
               flag = 0
           else:
               model.collect_params().save('{}/params_epoch{:03d}_tr{:.2f}_cv{:.2f}_rejected'.format(args.out, epoch, losses, val_l))
               model.collect_params().load(best_model, context)
               if args.schedule:
                   trainer.set_learning_rate(trainer.learning_rate / 5)
               flag = 1
           if args.schedule and not flag:
               trainer.set_learning_rate(trainer.learning_rate / 2)
   
   if __name__ == '__main__':
       train()
   
   ```

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services