Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2017/11/11 02:24:30 UTC

[GitHub] thirdwing closed pull request #8121: [R] Initializer fix and adjustments to RNN API

URL: https://github.com/apache/incubator-mxnet/pull/8121

This is a pull request merged from a forked repository. Because GitHub hides the original diff of a fork-based pull request once it is merged, it is reproduced below for the sake of provenance:

diff --git a/R-package/R/gru.R b/R-package/R/gru.R
deleted file mode 100644
index d2ffd9a414..0000000000
--- a/R-package/R/gru.R
+++ /dev/null
@@ -1,355 +0,0 @@
-# gru cell symbol
-gru <- function(num.hidden, indata, prev.state, param, seqidx, layeridx, dropout=0) {
-    if (dropout > 0)
-        indata <- mx.symbol.Dropout(data=indata, p=dropout)
-    i2h <- mx.symbol.FullyConnected(data=indata,
-                                    weight=param$gates.i2h.weight,
-                                    bias=param$gates.i2h.bias,
-                                    num.hidden=num.hidden * 2,
-                                    name=paste0("t", seqidx, ".l", layeridx, ".gates.i2h"))
-    h2h <- mx.symbol.FullyConnected(data=prev.state$h,
-                                    weight=param$gates.h2h.weight,
-                                    bias=param$gates.h2h.bias,
-                                    num.hidden=num.hidden * 2,
-                                    name=paste0("t", seqidx, ".l", layeridx, ".gates.h2h"))
-    gates <- i2h + h2h
-    slice.gates <- mx.symbol.SliceChannel(gates, num.outputs=2,
-                                          name=paste0("t", seqidx, ".l", layeridx, ".slice"))
-    update.gate <- mx.symbol.Activation(slice.gates[[1]], act.type="sigmoid")
-    reset.gate <- mx.symbol.Activation(slice.gates[[2]], act.type="sigmoid")
-
-    htrans.i2h <- mx.symbol.FullyConnected(data=indata,
-                                           weight=param$trans.i2h.weight,
-                                           bias=param$trans.i2h.bias,
-                                           num.hidden=num.hidden,
-                                           name=paste0("t", seqidx, ".l", layeridx, ".trans.i2h"))
-    h.after.reset <- prev.state$h * reset.gate
-    htrans.h2h <- mx.symbol.FullyConnected(data=h.after.reset,
-                                           weight=param$trans.h2h.weight,
-                                           bias=param$trans.h2h.bias,
-                                           num.hidden=num.hidden,
-                                           name=paste0("t", seqidx, ".l", layeridx, ".trans.h2h"))
-    h.trans <- htrans.i2h + htrans.h2h
-    h.trans.active <- mx.symbol.Activation(h.trans, act.type="tanh")
-    next.h <- prev.state$h + update.gate * (h.trans.active - prev.state$h)
-    return (list(h=next.h))
-}
-
-# unrolled gru network
-gru.unroll <- function(num.gru.layer, seq.len, input.size,
-                       num.hidden, num.embed, num.label, dropout=0) {
-    embed.weight <- mx.symbol.Variable("embed.weight")
-    cls.weight <- mx.symbol.Variable("cls.weight")
-    cls.bias <- mx.symbol.Variable("cls.bias")
-    param.cells <- lapply(1:num.gru.layer, function(i) {
-        cell <- list(gates.i2h.weight = mx.symbol.Variable(paste0("l", i, ".gates.i2h.weight")),
-                     gates.i2h.bias = mx.symbol.Variable(paste0("l", i, ".gates.i2h.bias")),
-                     gates.h2h.weight = mx.symbol.Variable(paste0("l", i, ".gates.h2h.weight")),
-                     gates.h2h.bias = mx.symbol.Variable(paste0("l", i, ".gates.h2h.bias")),
-                     trans.i2h.weight = mx.symbol.Variable(paste0("l", i, ".trans.i2h.weight")),
-                     trans.i2h.bias = mx.symbol.Variable(paste0("l", i, ".trans.i2h.bias")),
-                     trans.h2h.weight = mx.symbol.Variable(paste0("l", i, ".trans.h2h.weight")),
-                     trans.h2h.bias = mx.symbol.Variable(paste0("l", i, ".trans.h2h.bias")))
-        return (cell)
-    })
-    last.states <- lapply(1:num.gru.layer, function(i) {
-        state <- list(h=mx.symbol.Variable(paste0("l", i, ".init.h")))
-        return (state)
-    })
-
-    # embeding layer
-    label <- mx.symbol.Variable("label")
-    data <- mx.symbol.Variable("data")
-    embed <- mx.symbol.Embedding(data=data, input.dim=input.size,
-                                 weight=embed.weight, output.dim=num.embed, name='embed')
-    wordvec <- mx.symbol.SliceChannel(data=embed, num.outputs=seq.len, squeeze.axis=1)
-
-    last.hidden <- list()
-    for (seqidx in 1:seq.len) {
-        hidden <- wordvec[[seqidx]]
-        # stack GRU
-        for (i in 1:num.gru.layer) {
-            dp <- ifelse(i==1, 0, dropout)
-            next.state <- gru(num.hidden, indata=hidden,
-                              prev.state=last.states[[i]],
-                              param=param.cells[[i]],
-                              seqidx=seqidx, layeridx=i, 
-                              dropout=dp)
-            hidden <- next.state$h
-            last.states[[i]] <- next.state
-        }
-        # decoder
-        if (dropout > 0)
-            hidden <- mx.symbol.Dropout(data=hidden, p=dropout)
-        last.hidden <- c(last.hidden, hidden)
-    }
-    last.hidden$dim <- 0
-    last.hidden$num.args <- seq.len
-    concat <-mxnet:::mx.varg.symbol.Concat(last.hidden)
-    fc <- mx.symbol.FullyConnected(data=concat,
-                                   weight=cls.weight,
-                                   bias=cls.bias,
-                                   num.hidden=num.label)
-
-    label <- mx.symbol.transpose(data=label)
-    label <- mx.symbol.Reshape(data=label, target.shape=c(0))
-
-    loss.all <- mx.symbol.SoftmaxOutput(data=fc, label=label, name="sm")
-    return (loss.all)
-}
-
-# gru inference model symbol
-gru.inference.symbol <- function(num.gru.layer, seq.len, input.size,
-                                 num.hidden, num.embed, num.label, dropout=0) {
-    seqidx <- 1
-    embed.weight <- mx.symbol.Variable("embed.weight")
-    cls.weight <- mx.symbol.Variable("cls.weight")
-    cls.bias <- mx.symbol.Variable("cls.bias")
-
-    param.cells <- lapply(1:num.gru.layer, function(i) {
-        cell <- list(gates.i2h.weight = mx.symbol.Variable(paste0("l", i, ".gates.i2h.weight")),
-                     gates.i2h.bias = mx.symbol.Variable(paste0("l", i, ".gates.i2h.bias")),
-                     gates.h2h.weight = mx.symbol.Variable(paste0("l", i, ".gates.h2h.weight")),
-                     gates.h2h.bias = mx.symbol.Variable(paste0("l", i, ".gates.h2h.bias")),
-                     trans.i2h.weight = mx.symbol.Variable(paste0("l", i, ".trans.i2h.weight")),
-                     trans.i2h.bias = mx.symbol.Variable(paste0("l", i, ".trans.i2h.bias")),
-                     trans.h2h.weight = mx.symbol.Variable(paste0("l", i, ".trans.h2h.weight")),
-                     trans.h2h.bias = mx.symbol.Variable(paste0("l", i, ".trans.h2h.bias")))
-        return (cell)
-    })
-    last.states <- lapply(1:num.gru.layer, function(i) {
-        state <- list(h=mx.symbol.Variable(paste0("l", i, ".init.h")))
-        return (state)
-    })
-
-    # embeding layer
-    data <- mx.symbol.Variable("data")
-    hidden <- mx.symbol.Embedding(data=data, input_dim=input.size,
-                                  weight=embed.weight, output_dim=num.embed, name="embed")
-
-    # stack GRU
-    for (i in 1:num.gru.layer) {
-        dp <- ifelse(i==1, 0, dropout)
-        next.state <- gru(num.hidden, indata=hidden,
-                          prev.state=last.states[[i]],
-                          param=param.cells[[i]],
-                          seqidx=seqidx, layeridx=i, 
-                          dropout=dp)
-        hidden <- next.state$h
-        last.states[[i]] <- next.state
-    }
-    # decoder
-    if (dropout > 0)
-        hidden <- mx.symbol.Dropout(data=hidden, p=dropout)
-
-    fc <- mx.symbol.FullyConnected(data=hidden, num_hidden=num.label,
-                                   weight=cls.weight, bias=cls.bias, name='pred')
-    sm <- mx.symbol.SoftmaxOutput(data=fc, name='sm')
-    unpack.h <- lapply(1:num.gru.layer, function(i) {
-        state <- last.states[[i]]
-        state.h <- mx.symbol.BlockGrad(state$h, name=paste0("l", i, ".last.h"))
-        return (state.h)
-    })
-
-    list.all <- c(sm, unpack.h)
-    return (mx.symbol.Group(list.all))
-}
-
-#' Training GRU Unrolled Model
-#'
-#' @param train.data mx.io.DataIter or list(data=R.array, label=R.array)
-#'      The Training set.
-#' @param eval.data mx.io.DataIter or list(data=R.array, label=R.array), optional
-#'      The validation set used for validation evaluation during the progress.
-#' @param num.gru.layer integer
-#'      The number of the layer of gru.
-#' @param seq.len integer
-#'      The length of the input sequence.
-#' @param num.hidden integer
-#'      The number of hidden nodes.
-#' @param num.embed integer
-#'      The output dim of embedding.
-#' @param num.label  integer
-#'      The number of labels.
-#' @param batch.size integer
-#'      The batch size used for R array training.
-#' @param input.size integer
-#'       The input dim of one-hot encoding of embedding
-#' @param ctx mx.context, optional
-#'      The device used to perform training.
-#' @param num.round integer, default=10
-#'      The number of iterations over training data to train the model.
-#' @param update.period integer, default=1
-#'      The number of iterations to update parameters during training period.
-#' @param initializer initializer object. default=mx.init.uniform(0.01)
-#'      The initialization scheme for parameters.
-#' @param dropout float, default=0
-#'      A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer.
-#' @param optimizer string, default="sgd"
-#'      The optimization method.
-#' @param ... other parameters passing to \code{mx.gru}/.
-#' @return model A trained gru unrolled model.
-#'
-#' @export
-mx.gru <- function( train.data, eval.data=NULL,
-                    num.gru.layer, seq.len,
-                    num.hidden, num.embed, num.label,
-                    batch.size, input.size,
-                    ctx=mx.ctx.default(),
-                    num.round=10, update.period=1,
-                    initializer=mx.init.uniform(0.01),
-                    dropout=0, optimizer='sgd',
-                    ...) {
-    # check data and change data into iterator
-    train.data <- check.data(train.data, batch.size, TRUE)
-    eval.data <- check.data(eval.data, batch.size, FALSE)
-
-    # get unrolled gru symbol
-    rnn.sym <- gru.unroll( num.gru.layer=num.gru.layer,
-                           num.hidden=num.hidden,
-                           seq.len=seq.len,
-                           input.size=input.size,
-                           num.embed=num.embed,
-                           num.label=num.label,
-                           dropout=dropout)
-
-    init.states.name <- lapply(1:num.gru.layer, function(i) {
-        state.h <- paste0("l", i, ".init.h")
-        return (state.h)
-    })
-
-    # set up gru model
-    model <- setup.rnn.model(rnn.sym=rnn.sym,
-                             ctx=ctx,
-                             num.rnn.layer=num.gru.layer,
-                             seq.len=seq.len,
-                             num.hidden=num.hidden,
-                             num.embed=num.embed,
-                             num.label=num.label,
-                             batch.size=batch.size,
-                             input.size=input.size,
-                             init.states.name=init.states.name,
-                             initializer=initializer,
-                             dropout=dropout)
-
-    # train gru model
-    model <- train.rnn( model, train.data, eval.data,
-                        num.round=num.round,
-                        update.period=update.period,
-                        ctx=ctx,
-                        init.states.name=init.states.name,
-                        ...)
-    # change model into MXFeedForwardModel
-    model <- list(symbol=model$symbol, arg.params=model$rnn.exec$ref.arg.arrays, aux.params=model$rnn.exec$ref.aux.arrays)
-    return(structure(model, class="MXFeedForwardModel"))
-}
-
-#' Create a GRU Inference Model
-#'
-#' @param num.gru.layer integer
-#'      The number of the layer of gru.
-#' @param input.size integer
-#'       The input dim of one-hot encoding of embedding
-#' @param num.hidden integer
-#'      The number of hidden nodes.
-#' @param num.embed integer
-#'      The output dim of embedding.
-#' @param num.label  integer
-#'      The number of labels.
-#' @param batch.size integer, default=1
-#'      The batch size used for R array training.
-#' @param arg.params list
-#'      The batch size used for R array training.
-#' @param ctx mx.context, optional
-#'      Model parameter, list of name to NDArray of net's weights.
-#' @param dropout float, default=0
-#'      A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer.
-#' @return model list(rnn.exec=integer, symbol=mxnet symbol, num.rnn.layer=integer, num.hidden=integer, seq.len=integer, batch.size=integer, num.embed=integer) 
-#'      A gru inference model.
-#'
-#' @export
-mx.gru.inference <- function(num.gru.layer,
-                             input.size,
-                             num.hidden,
-                             num.embed,
-                             num.label,
-                             batch.size=1,
-                             arg.params,
-                             ctx=mx.cpu(),
-                             dropout=0.) {
-    sym <- gru.inference.symbol(num.gru.layer=num.gru.layer,
-                                 input.size=input.size,
-                                 num.hidden=num.hidden,
-                                 num.embed=num.embed,
-                                 num.label=num.label,
-                                 dropout=dropout)
-
-    init.states.name <- lapply(1:num.gru.layer, function(i) {
-        state.h <- paste0("l", i, ".init.h")
-        return (state.h)
-    })
-
-    seq.len <- 1
-    # set up gru model
-    model <- setup.rnn.model(rnn.sym=sym,
-                             ctx=ctx,
-                             num.rnn.layer=num.gru.layer,
-                             seq.len=seq.len,
-                             num.hidden=num.hidden,
-                             num.embed=num.embed,
-                             num.label=num.label,
-                             batch.size=batch.size,
-                             input.size=input.size,
-                             init.states.name=init.states.name,
-                             initializer=mx.init.uniform(0.01),
-                             dropout=dropout)
-    arg.names <- names(model$rnn.exec$ref.arg.arrays)
-    for (k in names(arg.params)) {
-        if ((k %in% arg.names) && is.param.name(k) ) {
-            rnn.input <- list()
-            rnn.input[[k]] <- arg.params[[k]]
-            mx.exec.update.arg.arrays(model$rnn.exec, rnn.input, match.name=TRUE)
-        }
-    }
-    init.states <- list()
-    for (i in 1:num.gru.layer) {
-        init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0
-    }
-    mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE)
-
-    return (model)
-}
-
-#' Using forward function to predict in gru inference model
-#'
-#' @param model gru model
-#'      A gru inference model
-#' @param input.data, array.matrix
-#'      The input data for forward function
-#' @param new.seq boolean, default=FALSE
-#'      Whether the input is the start of a new sequence
-#'
-#' @return result A list(prob=prob, model=model) containing the result probability of each label and the model.
-#'
-#' @export
-mx.gru.forward <- function(model, input.data, new.seq=FALSE) {
-    if (new.seq == TRUE) {
-        init.states <- list()
-        for (i in 1:model$num.rnn.layer) {
-            init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0
-        }
-        mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE)
-    }
-    dim(input.data) <- c(model$batch.size)
-    data <- list(data=mx.nd.array(input.data))
-    mx.exec.update.arg.arrays(model$rnn.exec, data, match.name=TRUE)
-    mx.exec.forward(model$rnn.exec, is.train=FALSE)
-    init.states <- list()
-    for (i in 1:model$num.rnn.layer) {
-        init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.outputs[[paste0("l", i, ".last.h_output")]]
-    }
-    mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE)
-    prob <- model$rnn.exec$ref.outputs[["sm_output"]]
-    return (list(prob=prob, model=model))
-}
-
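
The standalone GRU API deleted above (mx.gru, mx.gru.inference, mx.gru.forward) is superseded by the generic rnn.graph interface defined later in this diff. As a minimal sketch, the equivalent seq-to-one GRU classifier could be built as follows, assuming the rnn.graph signature shown below in rnn.graph.R (all sizes here are illustrative):

    library(mxnet)

    # One-layer GRU over embedded token sequences with a softmax head,
    # roughly what gru.unroll() used to assemble by hand.
    sym <- rnn.graph(num.rnn.layer = 1,
                     input.size = 10000,       # vocabulary size (illustrative)
                     num.embed = 64,           # embedding width (illustrative)
                     num.hidden = 128,
                     num.decode = 2,           # number of output classes (was num.label)
                     dropout = 0.2,
                     loss_output = "softmax",
                     config = "seq-to-one",
                     cell.type = "gru")
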
diff --git a/R-package/R/initializer.R b/R-package/R/initializer.R
index 7a1ffb2b18..9f5e75be91 100644
--- a/R-package/R/initializer.R
+++ b/R-package/R/initializer.R
@@ -4,11 +4,11 @@
 #' @param shape the shape of the array to be generated.
 #'
 mx.init.internal.default <- function(name, shape, ctx, allow.unknown=FALSE) {
-  if (endsWith(name, "bias")) return (mx.nd.zeros(shape, ctx))
-  if (endsWith(name, "gamma")) return (mx.nd.ones(shape, ctx))
-  if (endsWith(name, "beta")) return (mx.nd.zeros(shape, ctx))
-  if (endsWith(name, "moving_mean")) return (mx.nd.zeros(shape, ctx))
-  if (endsWith(name, "moving_var")) return (mx.nd.ones(shape, ctx))
+  if (endsWith(name, "bias")) return (mx.nd.zeros(shape))
+  if (endsWith(name, "gamma")) return (mx.nd.ones(shape))
+  if (endsWith(name, "beta")) return (mx.nd.zeros(shape))
+  if (endsWith(name, "moving_mean")) return (mx.nd.zeros(shape))
+  if (endsWith(name, "moving_var")) return (mx.nd.ones(shape))
   if (allow.unknown) return(NULL)
   stop(paste("Unkown initialization pattern for ", name))
 }
@@ -21,9 +21,9 @@ mx.init.internal.default <- function(name, shape, ctx, allow.unknown=FALSE) {
 mx.init.uniform <- function(scale) {
   function(name, shape, ctx, allow.unknown=FALSE) {
     if (!endsWith(name, "weight")) {
-      return (mx.init.internal.default(name, shape, ctx, allow.unknown))
+      return (mx.init.internal.default(name = name, shape = shape, allow.unknown = allow.unknown))
     }
-    return (mx.runif(shape, -scale, scale, ctx))
+    return (mx.nd.random.uniform(low = -scale, high = scale, shape = shape))
   }
 }
 
@@ -35,9 +35,9 @@ mx.init.uniform <- function(scale) {
 mx.init.normal <- function(sd) {
   function(name, shape, ctx, allow.unknown=FALSE) {
     if (!endsWith(name, "weight")) {
-      return (mx.init.internal.default(name, shape, ctx, allow.unknown))
+      return (mx.init.internal.default(name = name, shape = shape, allow.unknown = allow.unknown))
     }
-    return (mx.rnorm(shape, 0, sd, ctx))
+    return (mx.nd.random.normal(loc = 0, scale = sd, shape = shape))
   }
 }
 
@@ -56,9 +56,9 @@ mx.init.Xavier <- function(rnd_type = "uniform", factor_type = "avg",
                            magnitude = 3){
   function(name, shape, ctx, allow.unknown = FALSE){
     if (!endsWith(name, "weight")) {
-      return (mx.init.internal.default(name, shape, ctx, allow.unknown))
+      return (mx.init.internal.default(name = name, shape = shape, allow.unknown = allow.unknown))
     }
-
+    
     fan_out = shape[length(shape)]
     fan_in  = prod(shape[-length(shape)])
     factor_val  = 1
@@ -71,13 +71,13 @@ mx.init.Xavier <- function(rnd_type = "uniform", factor_type = "avg",
     } else {
       stop("Not supported factor type. See usage of function mx.init.Xavier")
     }
-
+    
     scale = sqrt(magnitude / factor_val)
-
+    
     if (rnd_type == "uniform"){
-      return(mx.runif(shape, -scale, scale, ctx))
+      return(mx.nd.random.uniform(low = -scale, high = scale, shape = shape))
     } else if (rnd_type == "gaussian"){
-      return(mx.rnorm(shape, 0, scale, ctx))
+      return(mx.nd.random.normal(loc = 0, scale = scale, shape = shape))
     } else {
       stop("Not supported random type. See usage of function mx.init.Xavier")
     }
@@ -92,7 +92,7 @@ mx.init.Xavier <- function(rnd_type = "uniform", factor_type = "avg",
 #' @param ctx mx.context The context of the weights
 #' @param skip.unknown Whether skip the unknown weight types
 #' @export
-mx.init.create <- function(initializer, shape.array, ctx, skip.unknown=TRUE) {
+mx.init.create <- function(initializer, shape.array, ctx=NULL, skip.unknown=TRUE) {
   if (length(shape.array) == 0) return(list())
   names = names(shape.array)
   ret <- lapply(1 : length(names), function(i) {
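
With this change, the built-in initializers generate arrays through the context-free mx.nd.random.* and mx.nd.zeros/mx.nd.ones operators, so no device context is needed at initialization time and the ctx argument of mx.init.create now defaults to NULL. A minimal sketch of the resulting behaviour (layer names and shapes are illustrative):

    library(mxnet)

    init <- mx.init.uniform(0.01)

    # Weights are drawn uniformly from [-0.01, 0.01]; no context required.
    w <- init("fc1.weight", shape = c(64, 128), ctx = NULL)

    # Non-weight parameters fall through to the internal default,
    # e.g. names ending in "bias" initialize to zeros.
    b <- init("fc1.bias", shape = c(128), ctx = NULL)

    # mx.init.Xavier follows the same context-free path.
    xavier <- mx.init.Xavier(rnd_type = "gaussian", factor_type = "avg", magnitude = 3)
    w2 <- xavier("conv1.weight", shape = c(3, 3, 16, 32), ctx = NULL)
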
diff --git a/R-package/R/lstm.R b/R-package/R/lstm.R
deleted file mode 100644
index 622388993c..0000000000
--- a/R-package/R/lstm.R
+++ /dev/null
@@ -1,388 +0,0 @@
-# lstm cell symbol
-lstm <- function(num.hidden, indata, prev.state, param, seqidx, layeridx, dropout=0) {
-    if (dropout > 0)
-        indata <- mx.symbol.Dropout(data=indata, p=dropout)
-    i2h <- mx.symbol.FullyConnected(data=indata,
-                                    weight=param$i2h.weight,
-                                    bias=param$i2h.bias,
-                                    num.hidden=num.hidden * 4,
-                                    name=paste0("t", seqidx, ".l", layeridx, ".i2h"))
-    h2h <- mx.symbol.FullyConnected(data=prev.state$h,
-                                    weight=param$h2h.weight,
-                                    bias=param$h2h.bias,
-                                    num.hidden=num.hidden * 4,
-                                    name=paste0("t", seqidx, ".l", layeridx, ".h2h"))
-    gates <- i2h + h2h
-    slice.gates <- mx.symbol.SliceChannel(gates, num.outputs=4,
-                                          name=paste0("t", seqidx, ".l", layeridx, ".slice"))
-
-    in.gate <- mx.symbol.Activation(slice.gates[[1]], act.type="sigmoid")
-    in.transform <- mx.symbol.Activation(slice.gates[[2]], act.type="tanh")
-    forget.gate <- mx.symbol.Activation(slice.gates[[3]], act.type="sigmoid")
-    out.gate <- mx.symbol.Activation(slice.gates[[4]], act.type="sigmoid")
-    next.c <- (forget.gate * prev.state$c) + (in.gate * in.transform)
-    next.h <- out.gate * mx.symbol.Activation(next.c, act.type="tanh")
-
-    return (list(c=next.c, h=next.h))
-}
-
-# unrolled lstm network
-lstm.unroll <- function(num.lstm.layer, seq.len, input.size,
-                        num.hidden, num.embed, num.label, dropout=0.) {
-
-    embed.weight <- mx.symbol.Variable("embed.weight")
-    cls.weight <- mx.symbol.Variable("cls.weight")
-    cls.bias <- mx.symbol.Variable("cls.bias")
-
-    param.cells <- lapply(1:num.lstm.layer, function(i) {
-        cell <- list(i2h.weight = mx.symbol.Variable(paste0("l", i, ".i2h.weight")),
-                     i2h.bias = mx.symbol.Variable(paste0("l", i, ".i2h.bias")),
-                     h2h.weight = mx.symbol.Variable(paste0("l", i, ".h2h.weight")),
-                     h2h.bias = mx.symbol.Variable(paste0("l", i, ".h2h.bias")))
-        return (cell)
-    })
-    last.states <- lapply(1:num.lstm.layer, function(i) {
-        state <- list(c=mx.symbol.Variable(paste0("l", i, ".init.c")),
-                      h=mx.symbol.Variable(paste0("l", i, ".init.h")))
-        return (state)
-    })
-
-    # embeding layer
-    label <- mx.symbol.Variable("label")
-    data <- mx.symbol.Variable("data")
-    embed <- mx.symbol.Embedding(data=data, input_dim=input.size,
-                                 weight=embed.weight, output_dim=num.embed, name="embed")
-    wordvec <- mx.symbol.SliceChannel(data=embed, num_outputs=seq.len, squeeze_axis=1)
-
-    last.hidden <- list()
-    for (seqidx in 1:seq.len) {
-        hidden <- wordvec[[seqidx]]
-        # stack lstm
-        for (i in 1:num.lstm.layer) {
-            dp <- ifelse(i==1, 0, dropout)
-            next.state <- lstm(num.hidden, indata=hidden,
-                               prev.state=last.states[[i]],
-                               param=param.cells[[i]],
-                               seqidx=seqidx, layeridx=i,
-                               dropout=dp)
-            hidden <- next.state$h
-            last.states[[i]] <- next.state
-        }
-        # decoder
-        if (dropout > 0)
-            hidden <- mx.symbol.Dropout(data=hidden, p=dropout)
-        last.hidden <- c(last.hidden, hidden)
-    }
-    last.hidden$dim <- 0
-    last.hidden$num.args <- seq.len
-    concat <-mxnet:::mx.varg.symbol.Concat(last.hidden)
-    fc <- mx.symbol.FullyConnected(data=concat,
-                                   weight=cls.weight,
-                                   bias=cls.bias,
-                                   num.hidden=num.label)
-
-    label <- mx.symbol.transpose(data=label)
-    label <- mx.symbol.Reshape(data=label, target.shape=c(0))
-
-    loss.all <- mx.symbol.SoftmaxOutput(data=fc, label=label, name="sm")
-    return (loss.all)
-}
-
-# lstm inference model symbol
-lstm.inference.symbol <- function(num.lstm.layer, input.size,
-                                  num.hidden, num.embed, num.label, dropout=0.) {
-    seqidx <- 0
-    embed.weight <- mx.symbol.Variable("embed.weight")
-    cls.weight <- mx.symbol.Variable("cls.weight")
-    cls.bias <- mx.symbol.Variable("cls.bias")
-
-    param.cells <- lapply(1:num.lstm.layer, function(i) {
-        cell <- list(i2h.weight = mx.symbol.Variable(paste0("l", i, ".i2h.weight")),
-                     i2h.bias = mx.symbol.Variable(paste0("l", i, ".i2h.bias")),
-                     h2h.weight = mx.symbol.Variable(paste0("l", i, ".h2h.weight")),
-                     h2h.bias = mx.symbol.Variable(paste0("l", i, ".h2h.bias")))
-        return (cell)
-    })
-    last.states <- lapply(1:num.lstm.layer, function(i) {
-        state <- list(c=mx.symbol.Variable(paste0("l", i, ".init.c")),
-                      h=mx.symbol.Variable(paste0("l", i, ".init.h")))
-        return (state)
-    })
-
-    # embeding layer
-    data <- mx.symbol.Variable("data")
-    hidden <- mx.symbol.Embedding(data=data, input_dim=input.size,
-                                  weight=embed.weight, output_dim=num.embed, name="embed")
-
-    # stack lstm
-    for (i in 1:num.lstm.layer) {
-        dp <- ifelse(i==1, 0, dropout)
-        next.state <- lstm(num.hidden, indata=hidden,
-                           prev.state=last.states[[i]],
-                           param=param.cells[[i]],
-                           seqidx=seqidx, layeridx=i,
-                           dropout=dp)
-        hidden <- next.state$h
-        last.states[[i]] <- next.state
-    }
-    # decoder
-    if (dropout > 0)
-        hidden <- mx.symbol.Dropout(data=hidden, p=dropout)
-
-    fc <- mx.symbol.FullyConnected(data=hidden, num_hidden=num.label,
-                                   weight=cls.weight, bias=cls.bias, name='pred')
-    sm <- mx.symbol.SoftmaxOutput(data=fc, name='sm')
-    unpack.c <- lapply(1:num.lstm.layer, function(i) {
-        state <- last.states[[i]]
-        state.c <- mx.symbol.BlockGrad(state$c, name=paste0("l", i, ".last.c"))
-        return (state.c)
-    })
-    unpack.h <- lapply(1:num.lstm.layer, function(i) {
-        state <- last.states[[i]]
-        state.h <- mx.symbol.BlockGrad(state$h, name=paste0("l", i, ".last.h"))
-        return (state.h)
-    })
-
-    list.all <- c(sm, unpack.c, unpack.h)
-    return (mx.symbol.Group(list.all))
-}
-
-
-
-#' Training LSTM Unrolled Model
-#'
-#' @param train.data mx.io.DataIter or list(data=R.array, label=R.array)
-#'      The Training set.
-#' @param eval.data mx.io.DataIter or list(data=R.array, label=R.array), optional
-#'      The validation set used for validation evaluation during the progress.
-#' @param num.lstm.layer integer
-#'      The number of the layer of lstm.
-#' @param seq.len integer
-#'      The length of the input sequence.
-#' @param num.hidden integer
-#'      The number of hidden nodes.
-#' @param num.embed integer
-#'      The output dim of embedding.
-#' @param num.label  integer
-#'      The number of labels.
-#' @param batch.size integer
-#'      The batch size used for R array training.
-#' @param input.size integer
-#'       The input dim of one-hot encoding of embedding
-#' @param ctx mx.context, optional
-#'      The device used to perform training.
-#' @param num.round integer, default=10
-#'      The number of iterations over training data to train the model.
-#' @param update.period integer, default=1
-#'      The number of iterations to update parameters during training period.
-#' @param initializer initializer object. default=mx.init.uniform(0.01)
-#'      The initialization scheme for parameters.
-#' @param dropout float, default=0
-#'      A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer.
-#' @param optimizer string, default="sgd"
-#'      The optimization method.
-#' @param epoch.end.callback function, optional
-#'     The callback when iteration ends.
-#' @param batch.end.callback function, optional
-#'     The callback when one mini-batch iteration ends.
-#' @param ... other parameters passing to \code{mx.lstm}/.
-#' @return model A trained lstm unrolled model.
-#'
-#' @export
-mx.lstm <- function(train.data, eval.data=NULL,
-                    num.lstm.layer, seq.len,
-                    num.hidden, num.embed, num.label,
-                    batch.size, input.size,
-                    ctx=mx.ctx.default(),
-                    num.round=10, update.period=1,
-                    initializer=mx.init.uniform(0.01),
-                    dropout=0, optimizer='sgd',
-                    epoch.end.callback=NULL, batch.end.callback=NULL,
-                    model,
-                    arg.params,
-                    ...) {
-    # check data and change data into iterator
-    train.data <- check.data(train.data, batch.size, TRUE)
-    eval.data <- check.data(eval.data, batch.size, FALSE)
-    
-    
-
-    # get unrolled lstm symbol
-    if(missing(model)){
-        rnn.sym <- lstm.unroll(num.lstm.layer=num.lstm.layer,
-                           num.hidden=num.hidden,
-                           seq.len=seq.len,
-                           input.size=input.size,
-                           num.embed=num.embed,
-                           num.label=num.label,
-                           dropout=dropout)
-    } else {
-      rnn.sym=model$symbol
-    }
-
-    init.states.c <- lapply(1:num.lstm.layer, function(i) {
-        state.c <- paste0("l", i, ".init.c")
-        return (state.c)
-    })
-    init.states.h <- lapply(1:num.lstm.layer, function(i) {
-        state.h <- paste0("l", i, ".init.h")
-        return (state.h)
-    })
-    init.states.name <- c(init.states.c, init.states.h)
-
-    # set up lstm model
-    model <- setup.rnn.model(rnn.sym=rnn.sym,
-                             ctx=ctx,
-                             num.rnn.layer=num.lstm.layer,
-                             seq.len=seq.len,
-                             num.hidden=num.hidden,
-                             num.embed=num.embed,
-                             num.label=num.label,
-                             batch.size=batch.size,
-                             input.size=input.size,
-                             init.states.name=init.states.name,
-                             initializer=initializer,
-                             dropout=dropout)
-    # restore states
-    if (!missing(arg.params)){
-      arg.names <- names(model$rnn.exec$ref.arg.arrays)
-      for (k in names(arg.params)) {
-        if ((k %in% arg.names) && is.param.name(k) ) {
-          rnn.input <- list()
-          rnn.input[[k]] <- arg.params[[k]]
-          mx.exec.update.arg.arrays(model$rnn.exec, rnn.input, match.name=TRUE)
-        }
-      }
-    }
-
-    # train lstm model
-    model <- train.rnn( model, train.data, eval.data,
-                        num.round=num.round,
-                        update.period=update.period,
-                        ctx=ctx,
-                        init.states.name=init.states.name,
-                        epoch.end.callback=epoch.end.callback, 
-                        batch.end.callback=batch.end.callback,
-                        ...)
-    # change model into MXFeedForwardModel
-    model <- list(symbol=model$symbol, arg.params=model$rnn.exec$ref.arg.arrays, aux.params=model$rnn.exec$ref.aux.arrays)
-    return(structure(model, class="MXFeedForwardModel"))
-}
-
-
-#' Create a LSTM Inference Model
-#'
-#' @param num.lstm.layer integer
-#'      The number of the layer of lstm.
-#' @param input.size integer
-#'       The input dim of one-hot encoding of embedding
-#' @param num.hidden integer
-#'      The number of hidden nodes.
-#' @param num.embed integer
-#'      The output dim of embedding.
-#' @param num.label  integer
-#'      The number of labels.
-#' @param batch.size integer, default=1
-#'      The batch size used for R array training.
-#' @param arg.params list
-#'      The batch size used for R array training.
-#' @param ctx mx.context, optional
-#'      Model parameter, list of name to NDArray of net's weights.
-#' @param dropout float, default=0
-#'      A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer.
-#' @return model list(rnn.exec=integer, symbol=mxnet symbol, num.rnn.layer=integer, num.hidden=integer, seq.len=integer, batch.size=integer, num.embed=integer) 
-#'      A lstm inference model.
-#'
-#' @export
-mx.lstm.inference <- function(num.lstm.layer,
-                              input.size,
-                              num.hidden,
-                              num.embed,
-                              num.label,
-                              batch.size=1,
-                              arg.params,
-                              ctx=mx.cpu(),
-                              dropout=0.) {
-    sym <- lstm.inference.symbol(num.lstm.layer=num.lstm.layer,
-                                 input.size=input.size,
-                                 num.hidden=num.hidden,
-                                 num.embed=num.embed,
-                                 num.label=num.label,
-                                 dropout=dropout)
-
-    init.states.c <- lapply(1:num.lstm.layer, function(i) {
-        state.c <- paste0("l", i, ".init.c")
-        return (state.c)
-    })
-    init.states.h <- lapply(1:num.lstm.layer, function(i) {
-        state.h <- paste0("l", i, ".init.h")
-        return (state.h)
-    })
-    init.states.name <- c(init.states.c, init.states.h)
-
-    seq.len <- 1
-    # set up lstm model
-    model <- setup.rnn.model(rnn.sym=sym,
-                             ctx=ctx,
-                             num.rnn.layer=num.lstm.layer,
-                             seq.len=seq.len,
-                             num.hidden=num.hidden,
-                             num.embed=num.embed,
-                             num.label=num.label,
-                             batch.size=batch.size,
-                             input.size=input.size,
-                             init.states.name=init.states.name,
-                             initializer=mx.init.uniform(0.01),
-                             dropout=dropout)
-    arg.names <- names(model$rnn.exec$ref.arg.arrays)
-    for (k in names(arg.params)) {
-        if ((k %in% arg.names) && is.param.name(k) ) {
-            rnn.input <- list()
-            rnn.input[[k]] <- arg.params[[k]]
-            mx.exec.update.arg.arrays(model$rnn.exec, rnn.input, match.name=TRUE)
-        }
-    }
-    init.states <- list()
-    for (i in 1:num.lstm.layer) {
-        init.states[[paste0("l", i, ".init.c")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.c")]]*0
-        init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0
-    }
-    mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE)
-
-    return (model)
-}
-
-#' Using forward function to predict in lstm inference model
-#'
-#' @param model lstm model
-#'      A Lstm inference model
-#' @param input.data, array.matrix
-#'      The input data for forward function
-#' @param new.seq boolean, default=FALSE
-#'      Whether the input is the start of a new sequence
-#'
-#' @return result A list(prob=prob, model=model) containing the result probability of each label and the model.
-#'
-#' @export
-mx.lstm.forward <- function(model, input.data, new.seq=FALSE) {
-    if (new.seq == TRUE) {
-        init.states <- list()
-        for (i in 1:model$num.rnn.layer) {
-            init.states[[paste0("l", i, ".init.c")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.c")]]*0
-            init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0
-        }
-        mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE)
-    }
-    dim(input.data) <- c(model$batch.size)
-    data <- list(data=mx.nd.array(input.data))
-    mx.exec.update.arg.arrays(model$rnn.exec, data, match.name=TRUE)
-    mx.exec.forward(model$rnn.exec, is.train=FALSE)
-    init.states <- list()
-    for (i in 1:model$num.rnn.layer) {
-        init.states[[paste0("l", i, ".init.c")]] <- model$rnn.exec$ref.outputs[[paste0("l", i, ".last.c_output")]]
-        init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.outputs[[paste0("l", i, ".last.h_output")]]
-    }
-    mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE)
-    prob <- model$rnn.exec$ref.outputs[["sm_output"]]
-    return (list(prob=prob, model=model))
-}
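
The hand-rolled LSTM above is likewise replaced: rnn.graph delegates the whole recurrence to the fused mx.symbol.RNN operator instead of composing per-gate FullyConnected and Activation symbols. A minimal sketch of how that kernel is invoked for a two-layer LSTM, mirroring the call in rnn.graph.R below (state size and dropout are illustrative):

    library(mxnet)

    data   <- mx.symbol.Variable("data")
    weight <- mx.symbol.Variable("rnn.params.weight")
    state  <- mx.symbol.Variable("rnn.state")
    cell   <- mx.symbol.Variable("rnn.state.cell")  # LSTM only: the cell state

    # One fused op covers all layers, gates and time steps.
    rnn <- mx.symbol.RNN(data = data, state = state, state_cell = cell,
                         parameters = weight, state.size = 128,
                         num.layers = 2, bidirectional = FALSE,
                         mode = "lstm", state.outputs = FALSE, p = 0.2,
                         name = "lstm_2_layer")
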
diff --git a/R-package/R/rnn.graph.R b/R-package/R/rnn.graph.R
index 11e5ef563c..2c099f0802 100644
--- a/R-package/R/rnn.graph.R
+++ b/R-package/R/rnn.graph.R
@@ -5,32 +5,33 @@
 #' @param cell.type Type of RNN cell: either gru or lstm
 #' @param num.rnn.layer int, number of stacked layers
 #' @param num.hidden int, size of the state in each RNN layer
-#' @param num.embed  int, dimension of the embedding vectors
-#' @param num.label int, number of categories in labels
-#' @param input.size int, number of levels in the data
+#' @param num.embed int, default = NULL (no embedding); the dimension of the embedding vectors
+#' @param num.decode int, number of output variables in the decoding layer
+#' @param input.size int, number of levels in the data - only used for embedding
 #' @param dropout
 #' 
 #' @export
 rnn.graph <- function(num.rnn.layer, 
-                      input.size,
-                      num.embed, 
+                      input.size = NULL,
+                      num.embed = NULL, 
                       num.hidden,
-                      num.label,
+                      num.decode,
                       dropout = 0,
                       ignore_label = -1,
+                      loss_output = NULL, 
                       config,
                       cell.type,
                       masking = F,
                       output_last_state = F) {
   
   # define input arguments
-  label <- mx.symbol.Variable("label")
   data <- mx.symbol.Variable("data")
+  label <- mx.symbol.Variable("label")
   seq.mask <- mx.symbol.Variable("seq.mask")
   
-  embed.weight <- mx.symbol.Variable("embed.weight")
-  rnn.params.weight <- mx.symbol.Variable("rnn.params.weight")
+  if (!is.null(num.embed)) embed.weight <- mx.symbol.Variable("embed.weight")
   
+  rnn.params.weight <- mx.symbol.Variable("rnn.params.weight")
   rnn.state <- mx.symbol.Variable("rnn.state")
   
   if (cell.type == "lstm") {
@@ -40,15 +41,17 @@ rnn.graph <- function(num.rnn.layer,
   cls.weight <- mx.symbol.Variable("cls.weight")
   cls.bias <- mx.symbol.Variable("cls.bias")
   
-  embed <- mx.symbol.Embedding(data=data, input_dim=input.size,
-                               weight=embed.weight, output_dim=num.embed, name="embed")
+  if (!is.null(num.embed)){
+    data <- mx.symbol.Embedding(data=data, input_dim=input.size,
+                                weight=embed.weight, output_dim=num.embed, name="embed")
+  }
   
   # RNN cells
   if (cell.type == "lstm") {
-    rnn <- mx.symbol.RNN(data=embed, state=rnn.state, state_cell = rnn.state.cell, parameters=rnn.params.weight, state.size=num.hidden, num.layers=num.rnn.layer, bidirectional=F, mode=cell.type, state.outputs=output_last_state, p=dropout, name=paste(cell.type, num.rnn.layer, "layer", sep="_"))
+    rnn <- mx.symbol.RNN(data=data, state=rnn.state, state_cell = rnn.state.cell, parameters=rnn.params.weight, state.size=num.hidden, num.layers=num.rnn.layer, bidirectional=F, mode=cell.type, state.outputs=output_last_state, p=dropout, name=paste(cell.type, num.rnn.layer, "layer", sep="_"))
     
   } else {
-    rnn <- mx.symbol.RNN(data=embed, state=rnn.state, parameters=rnn.params.weight, state.size=num.hidden, num.layers=num.rnn.layer, bidirectional=F, mode=cell.type, state.outputs=output_last_state, p=dropout, name=paste(cell.type, num.rnn.layer, "layer", sep="_"))
+    rnn <- mx.symbol.RNN(data=data, state=rnn.state, parameters=rnn.params.weight, state.size=num.hidden, num.layers=num.rnn.layer, bidirectional=F, mode=cell.type, state.outputs=output_last_state, p=dropout, name=paste(cell.type, num.rnn.layer, "layer", sep="_"))
   }
   
   # Decode
@@ -57,30 +60,44 @@ rnn.graph <- function(num.rnn.layer,
     if (masking) mask <- mx.symbol.SequenceLast(data=rnn[[1]], use.sequence.length = T, sequence_length = seq.mask, name = "mask") else
       mask <- mx.symbol.SequenceLast(data=rnn[[1]], use.sequence.length = F, name = "mask")
     
-    fc <- mx.symbol.FullyConnected(data=mask,
-                                   weight=cls.weight,
-                                   bias=cls.bias,
-                                   num.hidden=num.label,
-                                   name = "decode")
+    decode <- mx.symbol.FullyConnected(data=mask,
+                                       weight=cls.weight,
+                                       bias=cls.bias,
+                                       num.hidden=num.decode,
+                                       name = "decode")
     
-    loss <- mx.symbol.SoftmaxOutput(data=fc, label=label, use_ignore = !ignore_label == -1, ignore_label = ignore_label, name = "loss")
+    if (!is.null(loss_output)) {
+      loss <- switch(loss_output,
+                     softmax = mx.symbol.SoftmaxOutput(data=decode, label=label, use_ignore = !ignore_label == -1, ignore_label = ignore_label, name = "loss"),
+                     linear = mx.symbol.LinearRegressionOutput(data=decode, label=label, name = "loss"),
+                     logistic = mx.symbol.LogisticRegressionOutput(data=decode, label=label, name = "loss"),
+                     MAE = mx.symbol.MAERegressionOutput(data=decode, label=label, name = "loss")
+      )
+    } else loss <- decode
     
   } else if (config=="one-to-one"){
     
     if (masking) mask <- mx.symbol.SequenceMask(data = rnn[[1]], use.sequence.length = T, sequence_length = seq.mask, value = 0, name = "mask") else
       mask <- mx.symbol.identity(data = rnn[[1]], name = "mask")
-
-    reshape = mx.symbol.reshape(mask, shape=c(num.hidden, -1))
+    
+    reshape = mx.symbol.reshape(mask, shape=c(num.hidden, -1))
     
     decode <- mx.symbol.FullyConnected(data=reshape,
                                        weight=cls.weight,
                                        bias=cls.bias,
-                                       num.hidden=num.label,
+                                       num.hidden=num.decode,
                                        name = "decode")
     
     label <- mx.symbol.reshape(data=label, shape=c(-1), name = "label_reshape")
-    loss <- mx.symbol.SoftmaxOutput(data=decode, label=label, use_ignore = !ignore_label == -1, ignore_label = ignore_label, name = "loss")
     
+    if (!is.null(loss_output)) {
+      loss <- switch(loss_output,
+                     softmax = mx.symbol.SoftmaxOutput(data=decode, label=label, use_ignore = !ignore_label == -1, ignore_label = ignore_label, name = "loss"),
+                     linear = mx.symbol.LinearRegressionOutput(data=decode, label=label, name = "loss"),
+                     logistic = mx.symbol.LogisticRegressionOutput(data=decode, label=label, name = "loss"),
+                     MAE = mx.symbol.MAERegressionOutput(data=decode, label=label, name = "loss")
+      )
+    } else loss <- decode
   }
   return(loss)
 }
@@ -176,21 +193,24 @@ gru.cell <- function(num.hidden, indata, prev.state, param, seqidx, layeridx, dr
 #' unroll representation of RNN running on non CUDA device - under development
 #' 
 #' @export
-rnn.unroll <- function(num.rnn.layer, 
-                       seq.len, 
-                       input.size,
-                       num.embed, 
-                       num.hidden,
-                       num.label,
-                       dropout,
-                       ignore_label,
-                       init.state=NULL,
-                       config,
-                       cell.type="lstm", 
-                       masking = F, 
-                       output_last_state=F) {
-  
-  embed.weight <- mx.symbol.Variable("embed.weight")
+rnn.graph.unroll <- function(num.rnn.layer, 
+                             seq.len, 
+                             input.size = NULL,
+                             num.embed = NULL, 
+                             num.hidden,
+                             num.decode,
+                             dropout = 0,
+                             ignore_label = -1,
+                             loss_output = NULL, 
+                             init.state = NULL,
+                             config,
+                             cell.type = "lstm", 
+                             masking = F, 
+                             output_last_state = F) {
+  
+  
+  if (!is.null(num.embed)) embed.weight <- mx.symbol.Variable("embed.weight")
+  
   cls.weight <- mx.symbol.Variable("cls.weight")
   cls.bias <- mx.symbol.Variable("cls.bias")
   
@@ -215,19 +235,22 @@ rnn.unroll <- function(num.rnn.layer,
   })
   
   # embedding layer
-  label <- mx.symbol.Variable("label")
   data <- mx.symbol.Variable("data")
+  label <- mx.symbol.Variable("label")
+  seq.mask <- mx.symbol.Variable("seq.mask")
   
-  embed <- mx.symbol.Embedding(data = data, input_dim = input.size,
-                               weight=embed.weight, output_dim = num.embed, name = "embed")
+  if (!is.null(num.embed)) {
+    data <- mx.symbol.Embedding(data = data, input_dim = input.size,
+                                weight=embed.weight, output_dim = num.embed, name = "embed")
+  }
   
-  embed <- mx.symbol.split(data = embed, axis = 0, num.outputs = seq.len, squeeze_axis = T)
+  data <- mx.symbol.split(data = data, axis = 0, num.outputs = seq.len, squeeze_axis = T)
   
   last.hidden <- list()
   last.states <- list()
   
   for (seqidx in 1:seq.len) {
-    hidden <- embed[[seqidx]]
+    hidden <- data[[seqidx]]
     
     for (i in 1:num.rnn.layer) {
       
@@ -250,34 +273,57 @@ rnn.unroll <- function(num.rnn.layer,
       last.states[[i]] <- next.state
     }
     
-    # Decoding
-    if (config=="one-to-one"){
-      last.hidden <- c(last.hidden, hidden)
-    }
+    # Aggregate outputs from each timestep
+    last.hidden <- c(last.hidden, hidden)
   }
   
+  # concat hidden units - concat seq.len blocks of dimension num.hidden x batch.size
+  concat <- mx.symbol.concat(data = last.hidden, num.args = seq.len, dim = 0, name = "concat")
+  concat <- mx.symbol.reshape(data = concat, shape = c(num.hidden, -1, seq.len), name = "rnn_reshape")
+  
   if (config=="seq-to-one"){
-    fc <- mx.symbol.FullyConnected(data = hidden,
-                                   weight = cls.weight,
-                                   bias = cls.bias,
-                                   num.hidden = num.label)
     
-    loss <- mx.symbol.SoftmaxOutput(data = fc, name="sm", label=label, use_ignore = !ignore_label == -1, ignore_label = ignore_label)
+    if (masking) mask <- mx.symbol.SequenceLast(data=concat, use.sequence.length = T, sequence_length = seq.mask, name = "mask") else
+      mask <- mx.symbol.SequenceLast(data=concat, use.sequence.length = F, name = "mask")
+    
+    decode <- mx.symbol.FullyConnected(data = mask,
+                                       weight = cls.weight,
+                                       bias = cls.bias,
+                                       num.hidden = num.decode,
+                                       name = "decode")
+    
+    if (!is.null(loss_output)) {
+      loss <- switch(loss_output,
+                     softmax = mx.symbol.SoftmaxOutput(data=decode, label=label, use_ignore = !ignore_label == -1, ignore_label = ignore_label, name = "loss"),
+                     linear = mx.symbol.LinearRegressionOutput(data=decode, label=label, name = "loss"),
+                     logistic = mx.symbol.LogisticRegressionOutput(data=decode, label=label, name = "loss"),
+                     MAE = mx.symbol.MAERegressionOutput(data=decode, label=label, name = "loss")
+      )
+    } else loss <- decode
     
   } else if (config=="one-to-one"){
     
-    # concat hidden units - concat seq.len blocks of dimension num.hidden x batch.size
-    concat <- mx.symbol.concat(data = last.hidden, num.args = seq.len, dim = 0, name = "concat")
+    if (masking) mask <- mx.symbol.SequenceMask(data = concat, use.sequence.length = T, sequence_length = seq.mask, value = 0, name = "mask") else
+      mask <- mx.symbol.identity(data = concat, name = "mask")
+    
+    mask = mx.symbol.reshape(mask, shape=c(num.hidden, -1))
     
-    decode <- mx.symbol.FullyConnected(data = concat,
+    decode <- mx.symbol.FullyConnected(data = mask,
                                        weight = cls.weight,
                                        bias = cls.bias,
-                                       num.hidden = num.label,
+                                       num.hidden = num.decode,
                                        name = "decode")
     
     label <- mx.symbol.reshape(data = label, shape = -1, name = "label_reshape")
-    loss <- mx.symbol.SoftmaxOutput(data = decode, name="sm", label = label, use_ignore = !ignore_label == -1, ignore_label = ignore_label)
     
+    if (!is.null(loss_output)) {
+      loss <- switch(loss_output,
+                     softmax = mx.symbol.SoftmaxOutput(data=decode, label=label, use_ignore = !ignore_label == -1, ignore_label = ignore_label, name = "loss"),
+                     linear = mx.symbol.LinearRegressionOutput(data=decode, label=label, name = "loss"),
+                     logistic = mx.symbol.LogisticRegressionOutput(data=decode, label=label, name = "loss"),
+                     MAE = mx.symbol.MAERegressionOutput(data=decode, label=label, name = "loss")
+      )
+    } else loss <- decode
   }
   return(loss)
 }
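
The reworked graph builders also gain a loss_output argument: "softmax", "linear", "logistic" and "MAE" attach the corresponding output layer, while NULL returns the raw decode symbol so that a custom loss can be plugged in. A minimal sketch of a one-to-one regression unroll using the rnn.graph.unroll signature above (sizes are illustrative; num.embed is left NULL, i.e. no embedding, so inputs feed the RNN directly):

    library(mxnet)

    sym <- rnn.graph.unroll(num.rnn.layer = 1,
                            seq.len = 20,
                            num.hidden = 50,
                            num.decode = 1,          # one regressed value per time step
                            loss_output = "linear",  # LinearRegressionOutput head
                            config = "one-to-one",
                            cell.type = "lstm",
                            masking = FALSE)
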
diff --git a/R-package/R/viz.graph.R b/R-package/R/viz.graph.R
index aef90ad1c3..6d13de0af1 100644
--- a/R-package/R/viz.graph.R
+++ b/R-package/R/viz.graph.R
@@ -123,11 +123,14 @@ graph.viz <- function(symbol, shape=NULL, direction="TD", type="graph", graph.wi
     stringsAsFactors=F)
   edges_df$from<- id_dic[as.character(edges_df$from)]
   
-  nodes_df_new<- create_node_df(n = nrow(nodes_df), label=nodes_df$label, shape=nodes_df$shape, type="base", penwidth=2, color=nodes_df$color, style="filled", fillcolor=adjustcolor(nodes_df$color, alpha.f = 1))
-  edge_df_new<- create_edge_df(from = edges_df$from, to=edges_df$to, color="black")
+  nodes_df_new<- create_node_df(n = nrow(nodes_df), label=nodes_df$label, shape=nodes_df$shape, type="base", penwidth=2, color=nodes_df$color, style="filled", 
+                                fillcolor=adjustcolor(nodes_df$color, alpha.f = 1), fontcolor = "black")
+  edge_df_new<- create_edge_df(from = edges_df$from, to=edges_df$to, color="black", fontcolor = "black")
   
   if (!is.null(shape)){
-    edges_labels_raw<- symbol$get.internals()$infer.shape(list(data=shape))$out.shapes
+    if (is.list(shape)) {
+      edges_labels_raw<- symbol$get.internals()$infer.shape(shape)$out.shapes
+    } else edges_labels_raw<- symbol$get.internals()$infer.shape(list(data=shape))$out.shapes
     if (!is.null(edges_labels_raw)){
       edge_label_str <- function(x) paste0(x, collapse="X")
       edges_labels_raw<- sapply(edges_labels_raw, edge_label_str)
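
Finally, graph.viz now accepts shape either as a single shape vector (applied to the "data" input, as before) or as a named list covering several inputs, which the masked RNN graphs above need since they take both data and seq.mask. A minimal sketch (the symbol, input names and shapes are illustrative):

    # 'sym' is any mxnet symbol, e.g. one built with rnn.graph above.

    # Single input, unchanged behaviour:
    graph.viz(symbol = sym, shape = c(20, 64))

    # Several named inputs, enabled by this change:
    graph.viz(symbol = sym, shape = list(data = c(20, 64), seq.mask = c(64)))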


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services