You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2019/05/09 08:43:31 UTC
[GitHub] [incubator-mxnet] Marcovaldong removed a comment on issue #14796:
The model gets stuck after several thousand batch_size
Marcovaldong removed a comment on issue #14796: The model gets stuck after several thousand batch_size
URL: https://github.com/apache/incubator-mxnet/issues/14796#issuecomment-487273345
@lanking520 The code is as follows:
```
from tqdm import tqdm
import argparse
import logging
import math
import os
import time
import collections
import kaldi_io
import mxnet as mx
import numpy as np
from mxnet import autograd, gluon
from mxnet.gluon import utils as gutils
from mxnet.gluon import contrib
from DataLoader import SequentialLoader, TokenAcc, phone
from DataLoader import SpectrogramDataset, AudioDataLoader, BucketingSampler
from DataLoader import _batchify_fn
from mxnet.gluon.data import DataLoader
from model import Transducer
# ---------------------------------------------------------------------------
# Command-line configuration.  NOTE: parse_args() runs at import time, so
# importing this module as a library consumes sys.argv.
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser(description='MXNet Autograd RNN/LSTM Acoustic Model on youdao.')
parser.add_argument('--train_manifest', metavar='DIR',
                    help='path to train manifest csv', default='data/manifest.huiting.pinyin.train')  # 'data/train_manifest.csv')
parser.add_argument('--val_manifest', metavar='DIR',
                    help='path to validation manifest csv', default='data/manifest.huiting.pinyin.test')  # 'data/val.csv')
parser.add_argument('--sample_rate', default=16000, type=int, help='Sample rate')
parser.add_argument('--num_workers', default=32, type=int, help='Number of workers used in data-loading')
parser.add_argument('--labels_path', default='labels', help='Contains all characters for transcription')
parser.add_argument('--window_size', default=.02, type=float, help='Window size for spectrogram in seconds')
parser.add_argument('--window_stride', default=.01, type=float, help='Window stride for spectrogram in seconds')
parser.add_argument('--window', default='hamming', help='Window type for spectrogram generation')
parser.add_argument('--noise_dir', default=None,
                    help='Directory to inject noise into audio. If default, noise Inject not added')
# BUGFIX: added type=float — without it, a value passed on the command line
# arrives as str and breaks any numeric comparison against it.
parser.add_argument('--noise_prob', default=0.4, type=float, help='Probability of noise being added per sample')
parser.add_argument('--noise_min', default=0.0,
                    help='Minimum noise level to sample from. (1.0 means all noise, not original signal)', type=float)
parser.add_argument('--noise_max', default=0.5,
                    help='Maximum noise levels to sample from. Maximum 1.0', type=float)
parser.add_argument('--lr', type=float, default=1e-3,
                    help='initial learning rate')
parser.add_argument('--epochs', type=int, default=200,
                    help='upper epoch limit')
parser.add_argument('--batch_size', type=int, default=1, metavar='N',
                    help='batch size')
parser.add_argument('--dropout', type=float, default=0.5,
                    help='dropout applied to layers (0 = no dropout)')
# NOTE(review): default=True together with action='store_true' means this
# flag is always True and cannot be disabled from the CLI — confirm intent.
parser.add_argument('--bi', default=True, action='store_true',
                    help='whether use bidirectional lstm')
parser.add_argument('--noise', type=float, default=0,
                    help='add gaussian noise to inputs')
parser.add_argument('--log-interval', type=int, default=50,  # metavar='N',
                    help='report interval')
parser.add_argument('--out', type=str, default='exp/rnnt_lr1e-3',
                    help='path to save the final model')
parser.add_argument('--stdout', default=False, action='store_true')
parser.add_argument('--init', type=str, default='',
                    help='Initial am & pm parameters')
parser.add_argument('--begin_epoch', default=1, type=int, help='the epoch number from which to train')
parser.add_argument('--initam', type=str, default='',
                    help='Initial am parameters')
parser.add_argument('--initpm', type=str, default='',
                    help='Initial pm parameters')
parser.add_argument('--gradclip', type=float, default=0)
# NOTE(review): any CLI value here (even the string 'False') is truthy;
# only the default matters as written — consider action='store_true'.
parser.add_argument('--schedule', default=True, help='whether to annealing the learning rate')
# BUGFIX: added type=int — `i % args.tmp` in train() raises TypeError when
# this arrives from the command line as a str.
parser.add_argument('--tmp', default=1000, type=int, help='how many epoch to save params for preventing crash')
args = parser.parse_args()
# Create the output directory and persist the exact invocation arguments
# next to the run artifacts for reproducibility.
os.makedirs(args.out, exist_ok=True)
with open(os.path.join(args.out, 'args'), 'w') as f:
    f.write(str(args))
# Log to stdout when --stdout is given, otherwise to <out>/train.log.
if args.stdout: logging.basicConfig(format='%(asctime)s: %(message)s', datefmt="%m-%d %H:%M:%S", level=logging.INFO)
else: logging.basicConfig(format='%(asctime)s: %(message)s', datefmt="%m-%d %H:%M:%S", filename=os.path.join(args.out, 'train.log'), level=logging.INFO)
# Data-parallel training over four GPUs (hard-coded device ids).
context = [mx.gpu(i) for i in [0, 1, 2, 3]]
# context = [mx.gpu(i) for i in [6, 7]]
# Dataset
# Spectrogram feature-extraction configuration shared by train and test sets.
audio_conf = dict(sample_rate=args.sample_rate,
                  window_size=args.window_size,
                  window_stride=args.window_stride,
                  window=args.window,
                  noise_dir=args.noise_dir,
                  noise_prob=args.noise_prob,
                  noise_levels=(args.noise_min, args.noise_max))
# trainset = SequentialLoader(audio_conf=audio_conf, manifest_filepath=args.train_manifest, batch_size=args.batch_size,
#                             normalize=True, augment=False)
# devset = SequentialLoader(audio_conf=audio_conf, manifest_filepath=args.val_manifest, batch_size=args.batch_size,
#                           normalize=True, augment=False)
train_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.train_manifest, normalize=True,
                                   augment=False)
test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.val_manifest, normalize=True,
                                  augment=False)
# Presumably groups utterances of similar length into batches to reduce
# padding — TODO confirm against BucketingSampler's implementation.
train_sampler = BucketingSampler(train_dataset, batch_size=args.batch_size)
train_loader = DataLoader(train_dataset, batchify_fn=_batchify_fn, num_workers=args.num_workers,
                          batch_sampler=train_sampler)
# train_loader = DataLoader(train_dataset, batch_size=args.batch_size, batchify_fn=_batchify_fn,
#                           num_workers=args.num_workers, shuffle=True)
# NOTE(review): batch_size//4 evaluates to 0 when batch_size < 4 (the
# default is 1) — verify DataLoader accepts that or guard it.
test_loader = DataLoader(test_dataset, batch_size=args.batch_size//4, batchify_fn=_batchify_fn,
                         num_workers=args.num_workers)
###############################################################################
# Build the model
###############################################################################
model = Transducer(len(phone), 250, 3, args.dropout, bidirectional=args.bi)
# model = Transducer(input_size=320, vocab_size=len(phone), num_hidden=250, num_layers=3, dropout=args.dropout,
#                    blank=0, bidirectional=args.bi)
if args.init:
    # Resume from a full checkpoint (both networks).
    model.collect_params().load(args.init, context)
elif args.initam or args.initpm:
    model.initialize(mx.init.Uniform(0.1), ctx=context)
    # NOTE only use lstm layer
    if args.initam:
        model.collect_params('transducer0_rnnmodel0_lstm0').load(args.initam, context, True, True)
    if args.initpm:
        model.collect_params('transducer0_lstm0').load(args.initpm, context, True, True)
else:
    # Fresh start: uniform random initialization on all devices.
    model.initialize(mx.init.Uniform(0.1), ctx=context)
# trainer = gluon.Trainer(model.collect_params(), 'sgd',
#                         {'learning_rate': args.lr,
#                          'momentum': 0.9})
trainer = gluon.Trainer(model.collect_params(), 'adam',
                        {'learning_rate': args.lr,
                         'beta1': 0.4})
def evaluate(model):
    """Run *model* over the held-out set and return the mean loss per utterance."""
    total = 0.0
    for batch in tqdm(test_loader):
        xs, ys, xlen, ylen = batch
        # Shard each tensor of the batch across the configured GPUs.
        shards = [gutils.split_and_load(t, ctx_list=context)
                  for t in (xs, ys, xlen, ylen)]
        # Forward pass only — no autograd.record(), gradients are not needed.
        per_gpu = [model(x, y, xl, yl) for x, y, xl, yl in zip(*shards)]
        total += sum(float(l.sum().asscalar()) for l in per_gpu)
    return total / len(test_dataset)
def train():
    """Main training loop.

    Per epoch: optimize over the bucketed train loader, evaluate on the
    validation set, checkpoint on improvement, roll back and anneal the
    learning rate on regression (when --schedule is enabled).
    """
    best_model = None
    prev_loss = 1000  # sentinel "best" cv loss; any real first result beats it
    for epoch in range(args.begin_epoch, args.epochs + 1):
        print('Train the model for the %d th epoch with learning rate %.2e' % (epoch, trainer.learning_rate))
        logging.info('Train the model for the %d th epoch with learning rate %.2e' % (epoch, trainer.learning_rate))
        batch_losses = []  # summed loss of every batch, for the epoch average
        totl0 = 0          # running loss between log reports
        start_time = time.time()
        tic = time.time()
        for i, (xs, ys, xlen, ylen) in enumerate(train_loader, 1):
            # BUGFIX: inject Gaussian input noise BEFORE sharding to the GPUs.
            # The original added noise to `xs` after split_and_load had already
            # copied the data, so the perturbed tensor was never used.
            if args.noise > 0:
                xs = xs + mx.nd.normal(0, args.noise, xs.shape[-1], ctx=xs.context)
            gpu_xs = gutils.split_and_load(xs, ctx_list=context)
            gpu_ys = gutils.split_and_load(ys, ctx_list=context)
            gpu_xlens = gutils.split_and_load(xlen, ctx_list=context)
            gpu_ylens = gutils.split_and_load(ylen, ctx_list=context)
            with autograd.record():
                loss_gpus = [model(gpu_x, gpu_y, gpu_xlen, gpu_ylen)
                             for gpu_x, gpu_y, gpu_xlen, gpu_ylen in
                             zip(gpu_xs, gpu_ys, gpu_xlens, gpu_ylens)]
            for loss in loss_gpus:
                loss.backward()
            batch_losses.append(sum(float(loss.sum().asscalar()) for loss in loss_gpus))
            # Optional global-norm gradient clipping.
            if args.gradclip > 0:
                grads = [p.grad(context) for p in model.collect_params().values()]
                gluon.utils.clip_global_norm(grads, args.gradclip)
            trainer.step(args.batch_size)  # , ignore_stale_grad=True)
            totl0 += batch_losses[-1]
            if i % args.log_interval == 0:
                l0 = totl0 / args.batch_size / args.log_interval
                toc = time.time()
                print("Epoch [%d / %d][%d / %d] loss %.2f time %.2f s" % (epoch, args.epochs, i,
                                                                          len(train_loader), l0, toc - tic))
                logging.info('[Epoch %d Batch %d] loss %.2f' % (epoch, i, l0))
                totl0 = 0
                tic = time.time()
            # Periodic crash-safety checkpoint every --tmp batches.
            if i % args.tmp == 0:
                tmp_path = "{}/params_tmp_epoch{:03d}".format(args.out, epoch)
                model.collect_params().save(tmp_path)
        train_loss = sum(batch_losses) / len(train_dataset)
        val_l = evaluate(model)
        print('[Epoch %d] time cost %.2fs, train loss %.2f; cv loss %.2f; lr %.2e' % (
            epoch, time.time() - start_time, train_loss, val_l, trainer.learning_rate))
        logging.info('[Epoch %d] time cost %.2fs, train loss %.2f; cv loss %.2f; lr %.2e' % (
            epoch, time.time() - start_time, train_loss, val_l, trainer.learning_rate))
        if val_l < prev_loss:
            # Improvement: save a new best checkpoint.
            prev_loss = val_l
            best_model = '{}/params_epoch{:03d}_tr{:.2f}_cv{:.2f}'.format(args.out, epoch, train_loss, val_l)
            model.collect_params().save(best_model)
            annealed = False
        else:
            # Regression: keep the rejected params for inspection, roll back
            # to the best checkpoint, and anneal the learning rate hard.
            model.collect_params().save('{}/params_epoch{:03d}_tr{:.2f}_cv{:.2f}_rejected'.format(args.out, epoch, train_loss, val_l))
            # BUGFIX: guard against the first epoch being rejected, in which
            # case there is no best checkpoint to load yet.
            if best_model is not None:
                model.collect_params().load(best_model, context)
            if args.schedule:
                trainer.set_learning_rate(trainer.learning_rate / 5)
            annealed = True
        # Gentle decay after every accepted epoch when scheduling is on.
        if args.schedule and not annealed:
            trainer.set_learning_rate(trainer.learning_rate / 2)
# Script entry point.
if __name__ == '__main__':
    train()
```
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
With regards,
Apache Git Services