Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2017/12/21 15:26:29 UTC

[GitHub] thirdwing closed pull request #9022: R RNN API fixes and Optimizer clip gradient on NDArray

URL: https://github.com/apache/incubator-mxnet/pull/9022

This is a PR merged from a forked repository. As GitHub hides the original
diff on merge, it is displayed below for the sake of provenance:

diff --git a/R-package/R/model.rnn.R b/R-package/R/model.rnn.R
index 78a125ed51..3bf1d96dce 100644
--- a/R-package/R/model.rnn.R
+++ b/R-package/R/model.rnn.R
@@ -67,8 +67,8 @@ mx.model.train.buckets <- function(symbol, ctx, train.data, eval.data,
         train.execs <- lapply(seq_len(ndevice), function(i) {
           s <- slices[[i]]
           mx.symbol.bind(symbol = symbol[[names(train.data$bucketID)]], 
-                                 arg.arrays = c(s, train.execs[[i]]$arg.arrays[arg.params.names])[arg.update.idx],
-                                 aux.arrays = train.execs[[i]]$aux.arrays, ctx = ctx[[i]], grad.req = grad.req)
+                         arg.arrays = c(s, train.execs[[i]]$arg.arrays[arg.params.names])[arg.update.idx],
+                         aux.arrays = train.execs[[i]]$aux.arrays, ctx = ctx[[i]], grad.req = grad.req)
         })
       } else {
         for (i in seq_len(ndevice)) {
diff --git a/R-package/R/mx.io.bucket.iter.R b/R-package/R/mx.io.bucket.iter.R
index 22ac1fae6e..4d07ec218b 100644
--- a/R-package/R/mx.io.bucket.iter.R
+++ b/R-package/R/mx.io.bucket.iter.R
@@ -22,7 +22,7 @@ BucketIter <- setRefClass("BucketIter", fields = c("buckets", "bucket.names", "b
                               buckets_nb <- length(bucket.names)
                               buckets_id <- seq_len(buckets_nb)
                               buckets.size <- sapply(.self$buckets, function(x) {
-                                dim(x$data)[length(dim(x$data)) - 1]
+                                tail(dim(x$data), 1)
                               })
                               .self$batch.per.bucket <- ceiling(buckets.size/.self$batch.size)
                               .self$last.batch.pad <- .self$batch.size - buckets.size %% .self$batch.size
@@ -36,23 +36,23 @@ BucketIter <- setRefClass("BucketIter", fields = c("buckets", "bucket.names", "b
                               
                               if (.self$shuffle) {
                                 set.seed(.self$seed)
-                                bucket_plan_names <- sample(rep.int(names(.self$batch.per.bucket), times = .self$batch.per.bucket))
+                                bucket_plan_names <- sample(rep(names(.self$batch.per.bucket), times = .self$batch.per.bucket))
                                 .self$bucket.plan <- ave(bucket_plan_names == bucket_plan_names, bucket_plan_names, 
                                                          FUN = cumsum)
                                 names(.self$bucket.plan) <- bucket_plan_names
-                                ### Return first BucketID at reset for initialization of the model
+                                # Return first BucketID at reset for initialization of the model
                                 .self$bucketID <- .self$bucket.plan[1]
                                 
                                 .self$buckets <- lapply(.self$buckets, function(x) {
-                                  shuffle_id <- sample.int(dim(x$data)[length(dim(x$data)) - 1])
+                                  shuffle_id <- sample(tail(dim(x$data), 1))
                                   if (length(dim(x$label)) == 0) {
-                                    list(data = x$data[shuffle_id, ], label = x$label[shuffle_id])
+                                    list(data = x$data[, shuffle_id], label = x$label[shuffle_id])
                                   } else {
-                                    list(data = x$data[shuffle_id, ], label = x$label[shuffle_id, ])
+                                    list(data = x$data[, shuffle_id], label = x$label[, shuffle_id])
                                   }
                                 })
                               } else {
-                                bucket_plan_names <- rep.int(names(.self$batch.per.bucket), times = .self$batch.per.bucket)
+                                bucket_plan_names <- rep(names(.self$batch.per.bucket), times = .self$batch.per.bucket)
                                 .self$bucket.plan <- ave(bucket_plan_names == bucket_plan_names, bucket_plan_names, 
                                                          FUN = cumsum)
                                 names(.self$bucket.plan) <- bucket_plan_names
@@ -60,25 +60,25 @@ BucketIter <- setRefClass("BucketIter", fields = c("buckets", "bucket.names", "b
                             }, iter.next = function() {
                               .self$batch <- .self$batch + 1
                               .self$bucketID <- .self$bucket.plan[batch]
-                              return(.self$batch < .self$batch.per.epoch)
+                              return(.self$batch <= .self$batch.per.epoch)
                             }, value = function() {
                               # bucketID is a named integer: the integer indicates the batch id for the given
-                              # bucket (used to fetch appropriate samples within the bucket) the name is the a
+                              # bucket (used to fetch appropriate samples within the bucket) the name is a
                               # character containing the sequence length of the bucket (used to unroll the rnn
                               # to appropriate sequence length)
                               idx <- (.self$bucketID - 1) * (.self$batch.size) + seq_len(batch.size)
                               
-                              ### reuse first idx for padding
+                              # Reuse first idx for padding
                               if (bucketID == .self$batch.per.bucket[names(.self$bucketID)] & !.self$last.batch.pad[names(.self$bucketID)] == 0) {
                                 idx <- c(idx[seq_len(.self$batch.size - .self$last.batch.pad[names(.self$bucketID)])], seq_len(.self$last.batch.pad[names(.self$bucketID)]))
                               }
                               
-                              data <- .self$buckets[[names(.self$bucketID)]]$data[idx, , drop = FALSE]
-                              seq.mask <- as.integer(names(bucketID)) - apply(data==.self$data.mask.element, 1, sum)
+                              data <- .self$buckets[[names(.self$bucketID)]]$data[, idx, drop = F]
+                              seq.mask <- as.integer(names(bucketID)) - apply(data==.self$data.mask.element, 2, sum)
                               if (length(dim(.self$buckets[[names(.self$bucketID)]]$label)) == 0) {
                                 label <- .self$buckets[[names(.self$bucketID)]]$label[idx]
                               } else {
-                                label <- .self$buckets[[names(.self$bucketID)]]$label[idx, , drop = FALSE]
+                                label <- .self$buckets[[names(.self$bucketID)]]$label[, idx, drop = F]
                               }
                               return(list(data = mx.nd.array(data), seq.mask = mx.nd.array(seq.mask), 
                                           label = mx.nd.array(label)))
@@ -103,4 +103,4 @@ mx.io.bucket.iter <- function(buckets, batch.size, data.mask.element = 0, shuffl
                               seed = 123) {
   return(BucketIter$new(buckets = buckets, batch.size = batch.size, data.mask.element = data.mask.element, 
                         shuffle = shuffle, seed = seed))
-}
+}
\ No newline at end of file
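
The indexing changes above assume a new bucket layout: samples now sit in the
last dimension of each bucket's data array (one padded sequence per column),
so tail(dim(x$data), 1) counts samples and column subsetting shuffles them.
A minimal base-R sketch of that layout (values illustrative, not taken from
the PR):

    # dim(data) = c(seq_len, n_samples): one padded sequence per column
    bucket <- list(data = matrix(1:12, nrow = 3, ncol = 4),
                   label = c(0, 1, 0, 1))
    n_samples <- tail(dim(bucket$data), 1)  # 4: samples in the last dimension
    shuffle_id <- sample(n_samples)         # permute sample indices
    shuffled <- list(data = bucket$data[, shuffle_id, drop = FALSE],
                     label = bucket$label[shuffle_id])
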
diff --git a/R-package/R/optimizer.R b/R-package/R/optimizer.R
index 253f031ba4..ff88531706 100644
--- a/R-package/R/optimizer.R
+++ b/R-package/R/optimizer.R
@@ -43,11 +43,7 @@ mx.opt.sgd <- function(learning.rate,
     grad <- grad * rescale.grad
     if (!is.null(clip_gradient)){
       if(clip_gradient >= 0){
-          grad_ctx <- ctx(grad)
-          grad <- as.array(grad)
-          grad <- pmax(grad, -1 * clip_gradient)
-          grad <- pmin(grad, clip_gradient)
-          grad <- mx.nd.array(grad, grad_ctx)
+        grad <- mx.nd.clip(grad, -clip_gradient, clip_gradient)
       } else {
         stop("Error: clip_gradient should be positive number.")
       }
@@ -125,11 +121,7 @@ mx.opt.rmsprop <- function(learning.rate=0.002,
     grad <- grad * rescale.grad
     if (!is.null(clip_gradient)){
       if(clip_gradient >= 0){
-          grad_ctx <- ctx(grad)
-          grad <- as.array(grad)
-          grad <- pmax(grad, -1 * clip_gradient)
-          grad <- pmin(grad, clip_gradient)
-          grad <- mx.nd.array(grad, grad_ctx)
+        grad <- mx.nd.clip(grad, -clip_gradient, clip_gradient)
       } else {
         stop("Error: clip_gradient should be positive number.")
       }
@@ -225,11 +217,7 @@ mx.opt.adam <- function(learning.rate=0.001,
     grad <- grad * rescale.grad
     if (!is.null(clip_gradient)){
       if(clip_gradient >= 0){
-          grad_ctx <- ctx(grad)
-          grad <- as.array(grad)
-          grad <- pmax(grad, -1 * clip_gradient)
-          grad <- pmin(grad, clip_gradient)
-          grad <- mx.nd.array(grad, grad_ctx)
+        grad <- mx.nd.clip(grad, -clip_gradient, clip_gradient)
       } else {
         stop("Error: clip_gradient should be positive number.")
       }
@@ -309,11 +297,7 @@ mx.opt.adagrad <- function(learning.rate=0.05,
     grad <- grad * rescale.grad
     if (!is.null(clip_gradient)){
       if(clip_gradient >= 0){
-          grad_ctx <- ctx(grad)
-          grad <- as.array(grad)
-          grad <- pmax(grad, -1 * clip_gradient)
-          grad <- pmin(grad, clip_gradient)
-          grad <- mx.nd.array(grad, grad_ctx)
+        grad <- mx.nd.clip(grad, -clip_gradient, clip_gradient)
       } else {
         stop("Error: clip_gradient should be positive number.")
       }
@@ -363,11 +347,7 @@ mx.opt.adadelta <- function(rho=0.90,
     grad <- grad * rescale.grad
     if (!is.null(clip_gradient)){
       if(clip_gradient >= 0){
-          grad_ctx <- ctx(grad)
-          grad <- as.array(grad)
-          grad <- pmax(grad, -1 * clip_gradient)
-          grad <- pmin(grad, clip_gradient)
-          grad <- mx.nd.array(grad, grad_ctx)
+        grad <- mx.nd.clip(grad, -clip_gradient, clip_gradient)
       } else {
         stop("Error: clip_gradient should be positive number.")
       }
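
All five optimizers (sgd, rmsprop, adam, adagrad, adadelta) receive the same
change above: element-wise clipping to [-clip_gradient, clip_gradient] is now
done directly on the NDArray instead of round-tripping through an R array
with pmax/pmin. A small sketch of the equivalence (illustrative values,
assuming the mxnet package is loaded):

    library(mxnet)
    grad <- mx.nd.array(c(-5, -0.5, 0.5, 5))
    old <- mx.nd.array(pmin(pmax(as.array(grad), -1), 1))  # former R round-trip
    new <- mx.nd.clip(grad, -1, 1)                         # stays an NDArray
    as.array(new)  # -1.0 -0.5  0.5  1.0
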
diff --git a/R-package/R/rnn.graph.R b/R-package/R/rnn.graph.R
index 5197882000..171508cfce 100644
--- a/R-package/R/rnn.graph.R
+++ b/R-package/R/rnn.graph.R
@@ -1,127 +1,114 @@
-# 
+
 #' Generate a RNN symbolic model - requires CUDA
 #' 
 #' @param config Either seq-to-one or one-to-one
-#' @param cell.type Type of RNN cell: either gru or lstm
-#' @param num.rnn.layer int, number of stacked layers
-#' @param num.hidden int, size of the state in each RNN layer
-#' @param num.embed  int, default = NULL - no embedding. Dimension of the embedding vectors
-#' @param num.decode int, number of output variables in the decoding layer
-#' @param input.size int, number of levels in the data - only used for embedding
+#' @param cell_type Type of RNN cell: either gru or lstm
+#' @param num_rnn_layer int, number of stacked layers
+#' @param num_hidden int, size of the state in each RNN layer
+#' @param num_embed  int, default = NULL - no embedding. Dimension of the embedding vectors
+#' @param num_decode int, number of output variables in the decoding layer
+#' @param input_size int, number of levels in the data - only used for embedding
 #' @param dropout
 #' 
 #' @export
-rnn.graph <- function(num.rnn.layer, 
-                      input.size = NULL,
-                      num.embed = NULL, 
-                      num.hidden,
-                      num.decode,
-                      dropout = 0,
-                      ignore_label = -1,
-                      loss_output = NULL, 
-                      config,
-                      cell.type,
-                      masking = FALSE,
-                      output_last_state = FALSE) {
-  
-  # define input arguments
+rnn.graph <- function (num_rnn_layer, input_size = NULL, num_embed = NULL, 
+                       num_hidden, num_decode, dropout = 0, ignore_label = -1, bidirectional = F, 
+                       loss_output = NULL, config, cell_type, masking = F, output_last_state = F,
+                       rnn.state = NULL, rnn.state.cell = NULL, prefix = "") {
+  
   data <- mx.symbol.Variable("data")
   label <- mx.symbol.Variable("label")
   seq.mask <- mx.symbol.Variable("seq.mask")
-  
-  if (!is.null(num.embed)) embed.weight <- mx.symbol.Variable("embed.weight")
-  
+  if (!is.null(num_embed)) 
+    embed.weight <- mx.symbol.Variable("embed.weight")
   rnn.params.weight <- mx.symbol.Variable("rnn.params.weight")
-  rnn.state <- mx.symbol.Variable("rnn.state")
   
-  if (cell.type == "lstm") {
+  if (is.null(rnn.state)) rnn.state <- mx.symbol.Variable("rnn.state")
+  if (cell_type == "lstm" & is.null(rnn.state.cell)) {
     rnn.state.cell <- mx.symbol.Variable("rnn.state.cell")
   }
   
   cls.weight <- mx.symbol.Variable("cls.weight")
   cls.bias <- mx.symbol.Variable("cls.bias")
-  
-  if (!is.null(num.embed)){
-    data <- mx.symbol.Embedding(data=data, input_dim=input.size,
-                                weight=embed.weight, output_dim=num.embed, name="embed")
+  if (!is.null(num_embed)) {
+    data <- mx.symbol.Embedding(data = data, input_dim = input_size, 
+                                weight = embed.weight, output_dim = num_embed, name = "embed")
   }
   
-  # RNN cells
-  if (cell.type == "lstm") {
-    rnn <- mx.symbol.RNN(data=data, state=rnn.state, state_cell = rnn.state.cell, parameters=rnn.params.weight, state.size=num.hidden, num.layers=num.rnn.layer, bidirectional=FALSE, mode=cell.type, state.outputs=output_last_state, p=dropout, name=paste(cell.type, num.rnn.layer, "layer", sep="_"))
-    
+  data = mx.symbol.swapaxes(data = data, dim1 = 0, dim2 = 1, name = paste0(prefix, "swap_pre"))
+  
+  if (cell_type == "lstm") {
+    rnn <- mx.symbol.RNN(data = data, state = rnn.state, 
+                         state_cell = rnn.state.cell, parameters = rnn.params.weight, 
+                         state.size = num_hidden, num.layers = num_rnn_layer, 
+                         bidirectional = bidirectional, mode = cell_type, state.outputs = output_last_state, 
+                         p = dropout, name = paste0(prefix, "RNN"))
   } else {
-    rnn <- mx.symbol.RNN(data=data, state=rnn.state, parameters=rnn.params.weight, state.size=num.hidden, num.layers=num.rnn.layer, bidirectional=FALSE, mode=cell.type, state.outputs=output_last_state, p=dropout, name=paste(cell.type, num.rnn.layer, "layer", sep="_"))
+    rnn <- mx.symbol.RNN(data = data, state = rnn.state, 
+                         parameters = rnn.params.weight, state.size = num_hidden, 
+                         num.layers = num_rnn_layer, bidirectional = bidirectional, mode = cell_type, 
+                         state.outputs = output_last_state, p = dropout, 
+                         name = paste0(prefix, "RNN"))
   }
   
-  # Decode
-  if (config=="seq-to-one") {
-    
-    if (masking) mask <- mx.symbol.SequenceLast(data=rnn[[1]], use.sequence.length = TRUE, sequence_length = seq.mask, name = "mask") else
-      mask <- mx.symbol.SequenceLast(data=rnn[[1]], use.sequence.length = FALSE, name = "mask")
-    
-    decode <- mx.symbol.FullyConnected(data=mask,
-                                       weight=cls.weight,
-                                       bias=cls.bias,
-                                       num.hidden=num.decode,
-                                       name = "decode")
+  if (config == "seq-to-one") {
+    if (masking) mask <- mx.symbol.SequenceLast(data = rnn[[1]], use.sequence.length = T, sequence_length = seq.mask, name = "mask") else
+      mask <- mx.symbol.SequenceLast(data = rnn[[1]], use.sequence.length = F, name = "mask")
     
     if (!is.null(loss_output)) {
-      loss <- switch(loss_output,
-                     softmax = mx.symbol.SoftmaxOutput(data=decode, label=label, use_ignore = !ignore_label == -1, ignore_label = ignore_label, name = "loss"),
-                     linear = mx.symbol.LinearRegressionOutput(data=decode, label=label, name = "loss"),
-                     logictic = mx.symbol.LogisticRegressionOutput(data=decode, label=label, name = "loss"),
-                     MAE = mx.symbol.MAERegressionOutput(data=decode, label=label, name = "loss")
-      )
-    } else loss <- decode
-    
-  } else if (config=="one-to-one"){
+      decode <- mx.symbol.FullyConnected(data = mask, weight = cls.weight, bias = cls.bias, num_hidden = num_decode, name = "decode")
+      out <- switch(loss_output, softmax = mx.symbol.SoftmaxOutput(data = decode, label = label, use_ignore = !ignore_label == -1, ignore_label = ignore_label, name = "loss"), 
+                    linear = mx.symbol.LinearRegressionOutput(data = decode, label = label, name = "loss"), 
+                    logistic = mx.symbol.LogisticRegressionOutput(data = decode, label = label, name = "loss"), 
+                    MAE = mx.symbol.MAERegressionOutput(data = decode, label = label, name = "loss"))
+    }
+    else out <- mask
+  }
+  
+  else if (config == "one-to-one") {
     
-    if (masking) mask <- mx.symbol.SequenceMask(data = rnn[[1]], use.sequence.length = TRUE, sequence_length = seq.mask, value = 0, name = "mask") else
+    if (masking) mask <- mx.symbol.SequenceMask(data = rnn[[1]], use.sequence.length = T, sequence_length = seq.mask, value = 0, name = "mask") else
       mask <- mx.symbol.identity(data = rnn[[1]], name = "mask")
     
-    mask = mx.symbol.reshape(mask, shape=c(num.hidden, -1))
-    
-    decode <- mx.symbol.FullyConnected(data=reshape,
-                                       weight=cls.weight,
-                                       bias=cls.bias,
-                                       num.hidden=num.decode,
-                                       name = "decode")
-    
-    label <- mx.symbol.reshape(data=label, shape=c(-1), name = "label_reshape")
+    mask = mx.symbol.swapaxes(data = mask, dim1 = 0, dim2 = 1, name = paste0(prefix, "swap_post"))
     
     if (!is.null(loss_output)) {
-      loss <- switch(loss_output,
-                     softmax = mx.symbol.SoftmaxOutput(data=decode, label=label, use_ignore = !ignore_label == -1, ignore_label = ignore_label, name = "loss"),
-                     linear = mx.symbol.LinearRegressionOutput(data=decode, label=label, name = "loss"),
-                     logictic = mx.symbol.LogisticRegressionOutput(data=decode, label=label, name = "loss"),
-                     MAE = mx.symbol.MAERegressionOutput(data=decode, label=label, name = "loss")
-      )
-    } else loss <- decode
+      
+      mask <- mx.symbol.reshape(data = mask, shape = c(0, -1), reverse = TRUE)
+      label <- mx.symbol.reshape(data = label, shape = c(-1))
+      
+      decode <- mx.symbol.FullyConnected(data = mask, weight = cls.weight, bias = cls.bias, num_hidden = num_decode, 
+                                         flatten = TRUE, name = paste0(prefix, "decode"))
+      
+      out <- switch(loss_output, softmax = mx.symbol.SoftmaxOutput(data = decode, label = label, use_ignore = !ignore_label == -1, ignore_label = ignore_label, name = "loss"), 
+                    linear = mx.symbol.LinearRegressionOutput(data = decode, label = label, name = "loss"), 
+                    logistic = mx.symbol.LogisticRegressionOutput(data = decode, label = label, name = "loss"), 
+                    MAE = mx.symbol.MAERegressionOutput(data = decode, label = label, name = "loss"))
+    } else out <- mask
   }
-  return(loss)
+  return(out)
 }
 
-
 # LSTM cell symbol
-lstm.cell <- function(num.hidden, indata, prev.state, param, seqidx, layeridx, dropout = 0) {
-  i2h <- mx.symbol.FullyConnected(data = indata, weight = param$i2h.weight, bias = param$i2h.bias, 
-                                  num.hidden = num.hidden * 4, name = paste0("t", seqidx, ".l", layeridx, ".i2h"))
+lstm.cell <- function(num_hidden, indata, prev.state, param, seqidx, layeridx, dropout = 0, prefix = "") {
   
-  if (dropout > 0) 
-    i2h <- mx.symbol.Dropout(data = i2h, p = dropout)
+  if (dropout > 0 && layeridx > 1) 
+    indata <- mx.symbol.Dropout(data = indata, p = dropout)
+  
+  i2h <- mx.symbol.FullyConnected(data = indata, weight = param$i2h.weight, bias = param$i2h.bias, 
+                                  num_hidden = num_hidden * 4, name = paste0(prefix, "t", seqidx, ".l", layeridx, ".i2h"))
   
   if (!is.null(prev.state)) {
     h2h <- mx.symbol.FullyConnected(data = prev.state$h, weight = param$h2h.weight, 
-                                    bias = param$h2h.bias, num.hidden = num.hidden * 4, 
-                                    name = paste0("t", seqidx, ".l", layeridx, ".h2h"))
+                                    bias = param$h2h.bias, num_hidden = num_hidden * 4, 
+                                    name = paste0(prefix, "t", seqidx, ".l", layeridx, ".h2h"))
     gates <- i2h + h2h
   } else {
     gates <- i2h
   }
   
-  split.gates <- mx.symbol.split(gates, num.outputs = 4, axis = 1, squeeze.axis = FALSE, 
-                                 name = paste0("t", seqidx, ".l", layeridx, ".slice"))
+  split.gates <- mx.symbol.split(gates, num.outputs = 4, axis = 1, squeeze.axis = F, 
+                                 name = paste0(prefix, "t", seqidx, ".l", layeridx, ".slice"))
   
   in.gate <- mx.symbol.Activation(split.gates[[1]], act.type = "sigmoid")
   in.transform <- mx.symbol.Activation(split.gates[[2]], act.type = "tanh")
@@ -136,42 +123,48 @@ lstm.cell <- function(num.hidden, indata, prev.state, param, seqidx, layeridx, d
   
   next.h <- out.gate * mx.symbol.Activation(next.c, act.type = "tanh")
   
-  return(list(c = next.c, h = next.h))
+  return(list(h = next.h, c = next.c))
 }
 
+
 # GRU cell symbol
-gru.cell <- function(num.hidden, indata, prev.state, param, seqidx, layeridx, dropout = 0) {
-  i2h <- mx.symbol.FullyConnected(data = indata, weight = param$gates.i2h.weight, 
-                                  bias = param$gates.i2h.bias, num.hidden = num.hidden * 2, 
-                                  name = paste0("t", seqidx, ".l", layeridx, ".gates.i2h"))
+gru.cell <- function(num_hidden, indata, prev.state, param, seqidx, layeridx, dropout = 0, prefix)
+{
+  if (dropout > 0 && layeridx > 1) 
+    indata <- mx.symbol.Dropout(data = indata, p = dropout)
   
-  if (dropout > 0) 
-    i2h <- mx.symbol.Dropout(data = i2h, p = dropout)
+  i2h <- mx.symbol.FullyConnected(data = indata, weight = param$gates.i2h.weight, 
+                                  bias = param$gates.i2h.bias, num_hidden = num_hidden * 2, 
+                                  name = paste0(prefix, "t", seqidx, ".l", layeridx, ".gates.i2h"))
   
   if (!is.null(prev.state)) {
     h2h <- mx.symbol.FullyConnected(data = prev.state$h, weight = param$gates.h2h.weight, 
-                                    bias = param$gates.h2h.bias, num.hidden = num.hidden * 2, 
-                                    name = paste0("t", seqidx, ".l", layeridx, ".gates.h2h"))
+                                    bias = param$gates.h2h.bias, num_hidden = num_hidden * 2, 
+                                    name = paste0(prefix, "t", seqidx, ".l", layeridx, ".gates.h2h"))
     gates <- i2h + h2h
   } else {
     gates <- i2h
   }
-  
-  split.gates <- mx.symbol.split(gates, num.outputs = 2, axis = 1, squeeze.axis = FALSE, 
-                                 name = paste0("t", seqidx, ".l", layeridx, ".split"))
+
+  split.gates <- mx.symbol.split(gates, num.outputs = 2, axis = 1, squeeze.axis = F, 
+                                 name = paste0(prefix, "t", seqidx, ".l", layeridx, ".split"))
   
   update.gate <- mx.symbol.Activation(split.gates[[1]], act.type = "sigmoid")
   reset.gate <- mx.symbol.Activation(split.gates[[2]], act.type = "sigmoid")
   
   htrans.i2h <- mx.symbol.FullyConnected(data = indata, weight = param$trans.i2h.weight, 
-                                         bias = param$trans.i2h.bias, num.hidden = num.hidden, 
-                                         name = paste0("t", seqidx, ".l", layeridx, ".trans.i2h"))
-
-  h.after.reset <- reset.gate * (if (is.null(prev.state)) 0 else prev.state$h)
-
-  htrans.h2h <- mx.symbol.FullyConnected(data = h.after.reset, weight = param$trans.h2h.weight,
-                                         bias = param$trans.h2h.bias, num.hidden = num.hidden,
-                                         name = paste0("t", seqidx, ".l", layeridx, ".trans.h2h"))
+                                         bias = param$trans.i2h.bias, num_hidden = num_hidden, 
+                                         name = paste0(prefix, "t", seqidx, ".l", layeridx, ".trans.i2h"))
+  
+  if (is.null(prev.state)) {
+    h.after.reset <- reset.gate * 0
+  } else {
+    h.after.reset <- prev.state$h * reset.gate
+  }
+  
+  htrans.h2h <- mx.symbol.FullyConnected(data = h.after.reset, weight = param$trans.h2h.weight, 
+                                         bias = param$trans.h2h.bias, num_hidden = num_hidden, 
+                                         name = paste0(prefix, "t", seqidx, ".l", layeridx, ".trans.h2h"))
   
   h.trans <- htrans.i2h + htrans.h2h
   h.trans.active <- mx.symbol.Activation(h.trans, act.type = "tanh")
@@ -185,86 +178,116 @@ gru.cell <- function(num.hidden, indata, prev.state, param, seqidx, layeridx, dr
   return(list(h = next.h))
 }
 
-# 
-#' unroll representation of RNN running on non CUDA device - under development
+
+#' Unrolled representation of an RNN running on a non-CUDA device
+#' 
+#' @param config Either seq-to-one or one-to-one
+#' @param cell_type Type of RNN cell: either gru or lstm
+#' @param num_rnn_layer int, number of stacked layers
+#' @param seq_len int, number of time steps to unroll
+#' @param num_hidden int, size of the state in each RNN layer
+#' @param num_embed  int, default = NULL - no embedding. Dimension of the embedding vectors
+#' @param num_decode int, number of output variables in the decoding layer
+#' @param input_size int, number of levels in the data - only used for embedding
+#' @param dropout float, dropout probability to apply, default = 0
 #' 
 #' @export
-rnn.graph.unroll <- function(num.rnn.layer, 
-                             seq.len, 
-                             input.size = NULL,
-                             num.embed = NULL, 
-                             num.hidden,
-                             num.decode,
+rnn.graph.unroll <- function(num_rnn_layer, 
+                             seq_len, 
+                             input_size = NULL,
+                             num_embed = NULL, 
+                             num_hidden,
+                             num_decode,
                              dropout = 0,
                              ignore_label = -1,
                              loss_output = NULL, 
                              init.state = NULL,
                              config,
-                             cell.type = "lstm", 
-                             masking = FALSE, 
-                             output_last_state = FALSE) {
-  
-  
-  if (!is.null(num.embed)) embed.weight <- mx.symbol.Variable("embed.weight")
+                             cell_type = "lstm", 
+                             masking = F, 
+                             output_last_state = F,
+                             prefix = "",
+                             data_name = "data",
+                             label_name = "label") {
+  
+  if (!is.null(num_embed)) embed.weight <- mx.symbol.Variable(paste0(prefix, "embed.weight"))
+  
+  # Initial state
+  if (is.null(init.state) & output_last_state) {
+    init.state <- lapply(1:num_rnn_layer, function(i) {
+      if (cell_type=="lstm") {
+        state <- list(h = mx.symbol.Variable(paste0("init_", prefix, i, "_h")),
+                      c = mx.symbol.Variable(paste0("init_", prefix, i, "_c")))
+      } else if (cell_type=="gru") {
+        state <- list(h = mx.symbol.Variable(paste0("init_", prefix, i, "_h")))
+      }
+      return (state)
+    })
+  }
   
-  cls.weight <- mx.symbol.Variable("cls.weight")
-  cls.bias <- mx.symbol.Variable("cls.bias")
+  cls.weight <- mx.symbol.Variable(paste0(prefix, "cls.weight"))
+  cls.bias <- mx.symbol.Variable(paste0(prefix, "cls.bias"))
   
-  param.cells <- lapply(seq_len(num.rnn.layer), function(i) {
+  param.cells <- lapply(1:num_rnn_layer, function(i) {
     
-    if (cell.type=="lstm"){
-      cell <- list(i2h.weight = mx.symbol.Variable(paste0("l", i, ".i2h.weight")),
-                   i2h.bias = mx.symbol.Variable(paste0("l", i, ".i2h.bias")),
-                   h2h.weight = mx.symbol.Variable(paste0("l", i, ".h2h.weight")),
-                   h2h.bias = mx.symbol.Variable(paste0("l", i, ".h2h.bias")))
-    } else if (cell.type=="gru"){
-      cell <- list(gates.i2h.weight = mx.symbol.Variable(paste0("l", i, ".gates.i2h.weight")),
-                   gates.i2h.bias = mx.symbol.Variable(paste0("l", i, ".gates.i2h.bias")),
-                   gates.h2h.weight = mx.symbol.Variable(paste0("l", i, ".gates.h2h.weight")),
-                   gates.h2h.bias = mx.symbol.Variable(paste0("l", i, ".gates.h2h.bias")),
-                   trans.i2h.weight = mx.symbol.Variable(paste0("l", i, ".trans.i2h.weight")),
-                   trans.i2h.bias = mx.symbol.Variable(paste0("l", i, ".trans.i2h.bias")),
-                   trans.h2h.weight = mx.symbol.Variable(paste0("l", i, ".trans.h2h.weight")),
-                   trans.h2h.bias = mx.symbol.Variable(paste0("l", i, ".trans.h2h.bias")))
+    if (cell_type=="lstm") {
+      cell <- list(i2h.weight = mx.symbol.Variable(paste0(prefix, "l", i, ".i2h.weight")),
+                   i2h.bias = mx.symbol.Variable(paste0(prefix, "l", i, ".i2h.bias")),
+                   h2h.weight = mx.symbol.Variable(paste0(prefix, "l", i, ".h2h.weight")),
+                   h2h.bias = mx.symbol.Variable(paste0(prefix, "l", i, ".h2h.bias")))
+    } else if (cell_type=="gru") {
+      cell <- list(gates.i2h.weight = mx.symbol.Variable(paste0(prefix, "l", i, ".gates.i2h.weight")),
+                   gates.i2h.bias = mx.symbol.Variable(paste0(prefix, "l", i, ".gates.i2h.bias")),
+                   gates.h2h.weight = mx.symbol.Variable(paste0(prefix, "l", i, ".gates.h2h.weight")),
+                   gates.h2h.bias = mx.symbol.Variable(paste0(prefix, "l", i, ".gates.h2h.bias")),
+                   trans.i2h.weight = mx.symbol.Variable(paste0(prefix, "l", i, ".trans.i2h.weight")),
+                   trans.i2h.bias = mx.symbol.Variable(paste0(prefix, "l", i, ".trans.i2h.bias")),
+                   trans.h2h.weight = mx.symbol.Variable(paste0(prefix, "l", i, ".trans.h2h.weight")),
+                   trans.h2h.bias = mx.symbol.Variable(paste0(prefix, "l", i, ".trans.h2h.bias")))
     }
     return (cell)
   })
   
   # embeding layer
-  data <- mx.symbol.Variable("data")
-  label <- mx.symbol.Variable("label")
-  seq.mask <- mx.symbol.Variable("seq.mask")
+  data <- mx.symbol.Variable(data_name)
+  label <- mx.symbol.Variable(label_name)
+  seq.mask <- mx.symbol.Variable(paste0(prefix, "seq.mask"))
   
-  if (!is.null(num.embed)) {
-    data <- mx.symbol.Embedding(data = data, input_dim = input.size,
-                                weight=embed.weight, output_dim = num.embed, name = "embed")
+  data = mx.symbol.swapaxes(data = data, dim1 = 0, dim2 = 1, name = paste0(prefix, "swap_pre"))
+  
+  if (!is.null(num_embed)) {
+    data <- mx.symbol.Embedding(data = data, input_dim = input_size,
+                                weight=embed.weight, output_dim = num_embed, name = paste0(prefix, "embed"))
   }
   
-  data <- mx.symbol.split(data = data, axis = 0, num.outputs = seq.len, squeeze_axis = TRUE)
+  data <- mx.symbol.split(data = data, axis = 0, num.outputs = seq_len, squeeze_axis = T)
   
   last.hidden <- list()
   last.states <- list()
   
-  for (seqidx in seq_len(seq.len)) {
+  for (seqidx in 1:seq_len) {
     hidden <- data[[seqidx]]
     
-    for (i in seq_len(num.rnn.layer)) {
+    for (i in 1:num_rnn_layer) {
       
-      if (seqidx==1) prev.state<- init.state[[i]] else prev.state <- last.states[[i]]
+      if (seqidx==1) prev.state <- init.state[[i]] else 
+        prev.state <- last.states[[i]]
       
-      if (cell.type=="lstm") {
+      if (cell_type=="lstm") {
         cell.symbol <- lstm.cell
-      } else if (cell.type=="gru"){
+      } else if (cell_type=="gru"){
         cell.symbol <- gru.cell
       }
       
-      next.state <- cell.symbol(num.hidden = num.hidden, 
+      next.state <- cell.symbol(num_hidden = num_hidden, 
                                 indata = hidden,
                                 prev.state = prev.state,
                                 param = param.cells[[i]],
                                 seqidx = seqidx, 
                                 layeridx = i,
-                                dropout = dropout)
+                                dropout = dropout,
+                                prefix = prefix)
+      
       hidden <- next.state$h
       last.states[[i]] <- next.state
     }
@@ -273,53 +296,61 @@ rnn.graph.unroll <- function(num.rnn.layer,
     last.hidden <- c(last.hidden, hidden)
   }
   
-  # concat hidden units - concat seq.len blocks of dimension num.hidden x batch.size
-  concat <- mx.symbol.concat(data = last.hidden, num.args = seq.len, dim = 0, name = "concat")
-  concat <- mx.symbol.reshape(data = concat, shape = c(num.hidden, -1, seq.len), name = "rnn_reshape")
+  if (output_last_state) {
+    out.states = mx.symbol.Group(unlist(last.states))
+  }
   
-  if (config=="seq-to-one"){
-    
-    if (masking) mask <- mx.symbol.SequenceLast(data=concat, use.sequence.length = T, sequence_length = seq.mask, name = "mask") else
-      mask <- mx.symbol.SequenceLast(data=concat, use.sequence.length = F, name = "mask")
+  # concat hidden units - concat seq_len blocks of dimension num_hidden x batch.size
+  concat <- mx.symbol.concat(data = last.hidden, num.args = seq_len, dim = 0, name = paste0(prefix, "concat"))
+  concat <- mx.symbol.reshape(data = concat, shape = c(num_hidden, -1, seq_len), name = paste0(prefix, "rnn_reshape"))
+  
+  if (config=="seq-to-one") {
     
-    decode <- mx.symbol.FullyConnected(data = mask,
-                                       weight = cls.weight,
-                                       bias = cls.bias,
-                                       num.hidden = num.decode,
-                                       name = "decode")
+    if (masking) mask <- mx.symbol.SequenceLast(data=concat, use.sequence.length = T, sequence_length = seq.mask, name = paste0(prefix, "mask")) else
+      mask <- mx.symbol.SequenceLast(data=concat, use.sequence.length = F, name = paste0(prefix, "mask"))
     
     if (!is.null(loss_output)) {
-      loss <- switch(loss_output,
-                     softmax = mx.symbol.SoftmaxOutput(data=decode, label=label, use_ignore = !ignore_label == -1, ignore_label = ignore_label, name = "loss"),
-                     linear = mx.symbol.LinearRegressionOutput(data=decode, label=label, name = "loss"),
-                     logictic = mx.symbol.LogisticRegressionOutput(data=decode, label=label, name = "loss"),
-                     MAE = mx.symbol.MAERegressionOutput(data=decode, label=label, name = "loss")
+      
+      decode <- mx.symbol.FullyConnected(data = mask,
+                                         weight = cls.weight,
+                                         bias = cls.bias,
+                                         num_hidden = num_decode,
+                                         name = paste0(prefix, "decode"))
+      
+      out <- switch(loss_output,
+                    softmax = mx.symbol.SoftmaxOutput(data=decode, label=label, use_ignore = !ignore_label == -1, ignore_label = ignore_label, name = paste0(prefix, "loss")),
+                    linear = mx.symbol.LinearRegressionOutput(data=decode, label=label, name = paste0(prefix, "loss")),
+                    logistic = mx.symbol.LogisticRegressionOutput(data=decode, label=label, name = paste0(prefix, "loss")),
+                    MAE = mx.symbol.MAERegressionOutput(data=decode, label=label, name = paste0(prefix, "loss"))
       )
-    } else loss <- decode
+    } else out <- mask
     
   } else if (config=="one-to-one"){
     
-    if (masking) mask <- mx.symbol.SequenceMask(data = concat, use.sequence.length = T, sequence_length = seq.mask, value = 0, name = "mask") else
-      mask <- mx.symbol.identity(data = concat, name = "mask")
-    
-    mask = mx.symbol.reshape(mask, shape=c(num.hidden, -1))
-    
-    decode <- mx.symbol.FullyConnected(data = mask,
-                                       weight = cls.weight,
-                                       bias = cls.bias,
-                                       num.hidden = num.decode,
-                                       name = "decode")
+    if (masking) mask <- mx.symbol.SequenceMask(data = concat, use.sequence.length = T, sequence_length = seq.mask, value = 0, name = paste0(prefix, "mask")) else
+      mask <- mx.symbol.identity(data = concat, name = paste0(prefix, "mask"))
     
-    label <- mx.symbol.reshape(data = label, shape = -1, name = "label_reshape")
+    mask = mx.symbol.swapaxes(data = mask, dim1 = 0, dim2 = 1, name = paste0(prefix, "swap_post"))
     
     if (!is.null(loss_output)) {
-      loss <- switch(loss_output,
-                     softmax = mx.symbol.SoftmaxOutput(data=decode, label=label, use_ignore = !ignore_label == -1, ignore_label = ignore_label, name = "loss"),
-                     linear = mx.symbol.LinearRegressionOutput(data=decode, label=label, name = "loss"),
-                     logictic = mx.symbol.LogisticRegressionOutput(data=decode, label=label, name = "loss"),
-                     MAE = mx.symbol.MAERegressionOutput(data=decode, label=label, name = "loss")
+      
+      mask <- mx.symbol.reshape(data = mask, shape = c(0, -1), reverse = TRUE)
+      label <- mx.symbol.reshape(data = label, shape = c(-1))
+      
+      decode <- mx.symbol.FullyConnected(data = mask, weight = cls.weight, bias = cls.bias, num_hidden = num_decode, 
+                                         flatten = T, name = paste0(prefix, "decode"))
+      
+      out <- switch(loss_output,
+                    softmax = mx.symbol.SoftmaxOutput(data=decode, label=label, use_ignore = !ignore_label == -1, ignore_label = ignore_label, 
+                                                      name = paste0(prefix, "loss")),
+                    linear = mx.symbol.LinearRegressionOutput(data=decode, label=label, name = paste0(prefix, "loss")),
+                    logistic = mx.symbol.LogisticRegressionOutput(data=decode, label=label, name = paste0(prefix, "loss")),
+                    MAE = mx.symbol.MAERegressionOutput(data=decode, label=label, name = paste0(prefix, "loss"))
       )
-    } else loss <- decode
+    } else out <- mask
   }
-  return(loss)
+  
+  if (output_last_state) {
+    return(mx.symbol.Group(c(out, out.states)))
+  } else return(out)
 }
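
In summary, rnn.graph and rnn.graph.unroll now take underscored argument
names (num.rnn.layer becomes num_rnn_layer, cell.type becomes cell_type, and
so on), swap data to time-major before the RNN, and accept bidirectional,
prefix and explicit initial-state arguments. A hedged usage sketch, mirroring
the classification example further down in this diff (argument values
illustrative):

    library(mxnet)
    sym <- rnn.graph(config = "seq-to-one", cell_type = "lstm",
                     num_rnn_layer = 1, input_size = 100, num_embed = 8,
                     num_hidden = 16, num_decode = 2, dropout = 0.5,
                     ignore_label = -1, loss_output = "softmax",
                     masking = TRUE, output_last_state = FALSE)
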
diff --git a/R-package/R/rnn.infer.R b/R-package/R/rnn.infer.R
index a22bae0eb3..588056f2eb 100644
--- a/R-package/R/rnn.infer.R
+++ b/R-package/R/rnn.infer.R
@@ -1,12 +1,12 @@
-# 
+
 #' Inference of RNN model
 #'
 #' @param infer.data Data iterator created by mx.io.bucket.iter
 #' @param model Model used for inference
-#' @param ctx The element to mask
+#' @param ctx mx.context, the device on which to run inference. Default is mx.cpu()
 #'
 #' @export
-mx.infer.buckets <- function(infer.data, model, ctx = mx.cpu()) {
+mx.infer.rnn <- function(infer.data, model, ctx = mx.cpu()) {
   
   ### Initialise the iterator
   infer.data$reset()
@@ -52,7 +52,7 @@ mx.infer.buckets <- function(infer.data, model, ctx = mx.cpu()) {
   arg.params.fix <- arguments.ini[arg.params.fix.names]
   
   # Grad request
-  grad.req <- rep.int("null", length(arguments))
+  grad.req <- rep("null", length(arguments))
   
   # Arg array order
   update_names <- c(input.names, arg.params.fix.names, arg.params.names)
@@ -88,11 +88,22 @@ mx.infer.buckets <- function(infer.data, model, ctx = mx.cpu()) {
 }
 
 
-
-### inference for one-to-one models
-mx.infer.buckets.one <- function(infer.data, 
-                                 symbol, arg.params, aux.params, input.params = NULL, 
-                                 ctx = mx.cpu()) {
+#' Inference for one-to-one fusedRNN (CUDA) models
+#'
+#' @param infer.data Data iterator created by mx.io.bucket.iter
+#' @param symbol Symbol used for inference
+#' @param arg.params Model argument parameters
+#' @param aux.params Model auxiliary parameters
+#' @param input.params Optional list of parameters fed as inputs, such as initial RNN states. Default is NULL
+#' @param ctx mx.context, the device on which to run inference. Default is mx.cpu()
+#'
+#' @export
+mx.infer.rnn.one <- function(infer.data, 
+                             symbol, 
+                             arg.params, 
+                             aux.params, 
+                             input.params = NULL, 
+                             ctx = mx.cpu()) {
   
   ### Initialise the iterator
   infer.data$reset()
@@ -138,7 +149,7 @@ mx.infer.buckets.one <- function(infer.data,
   aux.params <- aux.params
   
   # Grad request
-  grad.req <- rep.int("null", length(arguments))
+  grad.req <- rep("null", length(arguments))
   
   # Arg array order
   update_names <- c(input.names, arg.params.fix.names, arg.params.names)
@@ -146,8 +157,8 @@ mx.infer.buckets.one <- function(infer.data,
   
   # Initial binding
   execs <- mx.symbol.bind(symbol = symbol, 
-                                  arg.arrays = c(dlist, arg.params.fix, arg.params)[arg_update_idx], 
-                                  aux.arrays = aux.params, ctx = ctx[[1]], grad.req = grad.req)
+                          arg.arrays = c(dlist, arg.params.fix, arg.params)[arg_update_idx], 
+                          aux.arrays = aux.params, ctx = ctx[[1]], grad.req = grad.req)
   
   # Initial input shapes - need to be adapted for multi-devices - divide highest
   # dimension by device nb
@@ -159,8 +170,8 @@ mx.infer.buckets.one <- function(infer.data,
     dlist <- infer.data$value()[input.names]
     
     execs <- mx.symbol.bind(symbol = symbol, 
-                                    arg.arrays = c(dlist, execs$arg.arrays[arg.params.fix.names], execs$arg.arrays[arg.params.names])[arg_update_idx],
-                                    aux.arrays = execs$aux.arrays, ctx = ctx[[1]], grad.req = grad.req)
+                            arg.arrays = c(dlist, execs$arg.arrays[arg.params.fix.names], execs$arg.arrays[arg.params.names])[arg_update_idx],
+                            aux.arrays = execs$aux.arrays, ctx = ctx[[1]], grad.req = grad.req)
     
     mx.exec.forward(execs, is.train = FALSE)
     
@@ -175,3 +186,85 @@ mx.infer.buckets.one <- function(infer.data,
   infer.data$reset()
   return(out)
 }
+
+
+#' Inference for one-to-one unroll models
+#'
+#' @param infer.data NDArray containing the input data
+#' @param symbol Symbol used for inference
+#' @param num_hidden int, size of the state in each RNN layer
+#' @param arg.params Model argument parameters
+#' @param aux.params Model auxiliary parameters
+#' @param init_states Optional list of initial RNN states. Default is NULL (zero-initialised states)
+#' @param ctx mx.context, the device on which to run inference. Default is mx.cpu()
+#'
+#' @export
+mx.infer.rnn.one.unroll <- function(infer.data, 
+                                    symbol,
+                                    num_hidden, 
+                                    arg.params, 
+                                    aux.params, 
+                                    init_states = NULL, 
+                                    ctx = mx.cpu()) {
+  
+  if (is.null(ctx)) 
+    ctx <- mx.ctx.default()
+  if (is.mx.context(ctx)) {
+    ctx <- list(ctx)
+  }
+  
+  if (!is.list(ctx)) 
+    stop("ctx must be mx.context or list of mx.context")
+  
+  ndevice <- length(ctx)
+  
+  arguments <- symbol$arguments
+  input.names <- intersect(c("data", "label"), arguments)
+  
+  input.shape <- list("data" = dim(infer.data), "label" = dim(infer.data))
+  
+  # init_state_shapes
+  init_states_names <- arguments[startsWith(arguments, "init_")]
+  init_states_shapes = lapply(init_states_names, function(x) c(num_hidden, tail(input.shape[[1]], 1)))
+  names(init_states_shapes) <- init_states_names
+  
+  shapes <- symbol$infer.shape(c(input.shape, init_states_shapes))
+  
+  # initialize all arguments with zeros
+  arguments.ini <- lapply(shapes$arg.shapes, function(shape) {
+    mx.nd.zeros(shape = shape, ctx = mx.cpu())
+  })
+  
+  dlist <- list("data" = infer.data, "label" = infer.data)
+  
+  if (is.null(init_states)) {
+    init_states <- arguments.ini[init_states_names]
+  } else {
+    names(init_states) <- init_states_names
+  }
+  
+  # remove potential duplicate arguments - if inference on CUDA RNN symbol
+  arg.params <- arg.params[setdiff(names(arg.params), c(input.names, init_states_names))]
+  arg.params.names <- names(arg.params)
+  
+  # Aux params
+  aux.params <- aux.params
+  
+  # Grad request
+  grad.req <- rep("null", length(arguments))
+  
+  # Arg array order
+  update_names <- c(input.names, init_states_names, arg.params.names)
+  arg_update_idx <- match(arguments, update_names)
+  
+  # Bind to exec
+  execs <- mxnet:::mx.symbol.bind(symbol = symbol,
+                                  arg.arrays = c(dlist, init_states, arg.params)[arg_update_idx],
+                                  aux.arrays = aux.params, ctx = ctx[[1]], grad.req = grad.req)
+  
+  mx.exec.forward(execs, is.train = FALSE)
+  
+  out <- lapply(execs$ref.outputs, function(out) mx.nd.copyto(out, mx.cpu()))
+  
+  return(out)
+}
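
This file renames the inference entry points: mx.infer.buckets becomes
mx.infer.rnn, mx.infer.buckets.one becomes the exported mx.infer.rnn.one, and
mx.infer.rnn.one.unroll is added for unrolled symbols. A minimal call sketch
(iterator and model names illustrative):

    pred <- mx.infer.rnn(infer.data = eval.data, model = model, ctx = mx.cpu())
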
diff --git a/example/rnn/bucket_R/aclImdb_lstm_classification.R b/example/rnn/bucket_R/aclImdb_lstm_classification.R
index bb5eaacf26..6c982c5cb4 100644
--- a/example/rnn/bucket_R/aclImdb_lstm_classification.R
+++ b/example/rnn/bucket_R/aclImdb_lstm_classification.R
@@ -1,10 +1,7 @@
 require("mxnet")
 
-source("mx.io.bucket.iter.R")
-source("rnn.train.R")
-
-corpus_bucketed_train <- readRDS(file = "corpus_bucketed_train_100_200_300_500_800_left.rds")
-corpus_bucketed_test <- readRDS(file = "corpus_bucketed_test_100_200_300_500_800_left.rds")
+corpus_bucketed_train <- readRDS(file = "data/corpus_bucketed_train.rds")
+corpus_bucketed_test <- readRDS(file = "data/corpus_bucketed_test.rds")
 
 vocab <- length(corpus_bucketed_test$dic)
 
@@ -23,20 +20,28 @@ mx.set.seed(0)
 optimizer <- mx.opt.create("adadelta", rho = 0.92, epsilon = 1e-06, wd = 2e-04, clip_gradient = NULL, 
   rescale.grad = 1/batch.size)
 
-model_sentiment_lstm <- mx.rnn.buckets(train.data = train.data, begin.round = 1, 
-  num.round = num.round, ctx = mx.cpu(), metric = mx.metric.accuracy, optimizer = optimizer, 
-  num.rnn.layer = 2, num.embed = 16, num.hidden = 24, num.label = 2, input.size = vocab, 
-  initializer = mx.init.Xavier(rnd_type = "gaussian", factor_type = "in", magnitude = 2), 
-  dropout = 0.25, config = "seq-to-one", batch.end.callback = mx.callback.log.train.metric(period = 50), 
-  verbose = TRUE)
+bucket_list <- unique(c(train.data$bucket.names, eval.data$bucket.names))
 
-mx.model.save(model_sentiment_lstm, prefix = "model_sentiment_lstm", iteration = num.round)
+symbol_buckets <- sapply(bucket_list, function(seq) {
+  rnn.graph(config = "seq-to-one", cell_type = "lstm", 
+            num_rnn_layer = 1, num_embed = 2, num_hidden = 6, 
+            num_decode = 2, input_size = vocab, dropout = 0.5, 
+            ignore_label = -1, loss_output = "softmax",
+            output_last_state = F, masking = T)
+})
 
-source("rnn.infer.R")
+model_sentiment_lstm <- mx.model.buckets(symbol = symbol_buckets,
+                          train.data = train.data, eval.data = eval.data,
+                          num.round = num.round, ctx = devices, verbose = FALSE,
+                          metric = mx.metric.accuracy, optimizer = optimizer,  
+                          initializer = initializer,
+                          batch.end.callback = NULL, 
+                          epoch.end.callback = epoch.end.callback)
 
+mx.model.save(model_sentiment_lstm, prefix = "model_sentiment_lstm", iteration = num.round)
 model <- mx.model.load("model_sentiment_lstm", iteration = num.round)
 
-pred <- mx.rnn.infer.buckets(infer_iter = eval.data, model, "seq-to-one", ctx = mx.cpu())
+pred <- mx.infer.rnn(infer.data = eval.data, model = model, ctx = mx.cpu())
 
 ypred <- max.col(t(as.array(pred)), tie = "first") - 1
 
diff --git a/example/rnn/bucket_R/data_preprocessing.R b/example/rnn/bucket_R/data_preprocessing.R
deleted file mode 100644
index c91e3fb5eb..0000000000
--- a/example/rnn/bucket_R/data_preprocessing.R
+++ /dev/null
@@ -1,170 +0,0 @@
-# download the IMDB dataset
-if (!file.exists("aclImdb_v1.tar.gz")) {
-  download.file("http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 
-    "aclImdb_v1.tar.gz")
-  untar("aclImdb_v1.tar.gz")
-}
-
-# install required packages
-list.of.packages <- c("readr", "dplyr", "stringr", "stringi")
-new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[, "Package"])]
-if (length(new.packages)) install.packages(new.packages)
-
-require("readr")
-require("dplyr")
-require("stringr")
-require("stringi")
-
-negative_train_list <- list.files("./aclImdb/train/neg/", full.names = T)
-positive_train_list <- list.files("./aclImdb/train/pos/", full.names = T)
-
-negative_test_list <- list.files("./aclImdb/test/neg/", full.names = T)
-positive_test_list <- list.files("./aclImdb/test/pos/", full.names = T)
-
-file_import <- function(file_list) {
-  import <- sapply(file_list, read_file)
-  return(import)
-}
-
-negative_train_raw <- file_import(negative_train_list)
-positive_train_raw <- file_import(positive_train_list)
-
-negative_test_raw <- file_import(negative_test_list)
-positive_test_raw <- file_import(positive_test_list)
-
-train_raw <- c(negative_train_raw, positive_train_raw)
-test_raw <- c(negative_test_raw, positive_test_raw)
-
-saveRDS(train_raw, file = "train_raw.rds")
-saveRDS(test_raw, file = "test_raw.rds")
-
-################################################################ Pre-process a corpus composed of a vector of sequences Build a dictionnary
-################################################################ removing too rare words
-text_pre_process <- function(corpus, count_threshold = 10, dic = NULL) {
-  raw_vec <- corpus
-  raw_vec <- stri_enc_toascii(str = raw_vec)
-  
-  ### remove non-printable characters
-  raw_vec <- str_replace_all(string = raw_vec, pattern = "[^[:print:]]", replacement = "")
-  raw_vec <- str_to_lower(string = raw_vec)
-  raw_vec <- str_replace_all(string = raw_vec, pattern = "_", replacement = " ")
-  raw_vec <- str_replace_all(string = raw_vec, pattern = "\\bbr\\b", replacement = "")
-  raw_vec <- str_replace_all(string = raw_vec, pattern = "\\s+", replacement = " ")
-  raw_vec <- str_trim(string = raw_vec)
-  
-  ### Split raw sequence vectors into lists of word vectors (one list element per
-  ### sequence)
-  word_vec_list <- stri_split_boundaries(raw_vec, type = "word", skip_word_none = T, 
-    skip_word_number = F, simplify = F)
-  
-  ### Build vocabulary
-  if (is.null(dic)) {
-    word_vec_unlist <- unlist(word_vec_list)
-    word_vec_table <- sort(table(word_vec_unlist), decreasing = T)
-    word_cutoff <- which.max(word_vec_table < count_threshold)
-    word_keep <- names(word_vec_table)[1:(word_cutoff - 1)]
-    stopwords <- c(letters, "an", "the", "br")
-    word_keep <- setdiff(word_keep, stopwords)
-  } else word_keep <- names(dic)[!dic == 0]
-  
-  ### Clean the sentences to keep only the curated list of words
-  word_vec_list <- lapply(word_vec_list, function(x) x[x %in% word_keep])
-  
-  # sentence_vec<- stri_split_boundaries(raw_vec, type='sentence', simplify = T)
-  word_vec_length <- lapply(word_vec_list, length) %>% unlist()
-  
-  ### Build dictionnary
-  dic <- 1:length(word_keep)
-  names(dic) <- word_keep
-  dic <- c(`?` = 0, dic)
-  
-  ### reverse dictionnary
-  rev_dic <- names(dic)
-  names(rev_dic) <- dic
-  
-  return(list(word_vec_list = word_vec_list, dic = dic, rev_dic = rev_dic))
-}
-
-################################################################ 
-make_bucket_data <- function(word_vec_list, labels, dic, seq_len = c(225), right_pad = T) {
-  ### Trunc sequence to max bucket length
-  word_vec_list <- lapply(word_vec_list, head, n = max(seq_len))
-  
-  word_vec_length <- lapply(word_vec_list, length) %>% unlist()
-  bucketID <- cut(word_vec_length, breaks = c(0, seq_len, Inf), include.lowest = T, 
-    labels = F)
-  # table(bucketID)
-  
-  ### Right or Left side Padding Pad sequences to their bucket length with
-  ### dictionnary 0-label
-  word_vec_list_pad <- lapply(1:length(word_vec_list), function(x) {
-    length(word_vec_list[[x]]) <- seq_len[bucketID[x]]
-    word_vec_list[[x]][is.na(word_vec_list[[x]])] <- names(dic[1])
-    if (right_pad == F) 
-      word_vec_list[[x]] <- rev(word_vec_list[[x]])
-    return(word_vec_list[[x]])
-  })
-  
-  ### Assign sequences to buckets and unroll them in order to be reshaped into arrays
-  unrolled_arrays <- lapply(1:length(seq_len), function(x) unlist(word_vec_list_pad[bucketID == 
-    x]))
-  
-  ### Assign labels to their buckets
-  bucketed_labels <- lapply(1:length(seq_len), function(x) labels[bucketID == x])
-  names(bucketed_labels) <- as.character(seq_len)
-  
-  ### Assign the dictionnary to each bucket terms
-  unrolled_arrays_dic <- lapply(1:length(seq_len), function(x) dic[unrolled_arrays[[x]]])
-  
-  # length(splitted_arrays_dic[[1]]) Reshape into arrays having each sequence into
-  # a column
-  features_arrays <- lapply(1:length(seq_len), function(x) array(unrolled_arrays_dic[[x]], 
-    dim = c(seq_len[x], length(unrolled_arrays_dic[[x]])/seq_len[x])))
-  
-  features <- lapply(1:length(seq_len), function(x) features_arrays[[x]][1:seq_len[x], 
-    ])
-  names(features) <- as.character(seq_len)
-  
-  ### Combine data and labels into buckets
-  buckets <- lapply(1:length(seq_len), function(x) c(list(data = features[[x]]), 
-    list(label = bucketed_labels[[x]])))
-  names(buckets) <- as.character(seq_len)
-  
-  ### reverse dictionnary
-  rev_dic <- names(dic)
-  names(rev_dic) <- dic
-  
-  return(list(buckets = buckets, dic = dic, rev_dic = rev_dic))
-}
-
-
-corpus_preprocessed_train <- text_pre_process(corpus = train_raw, count_threshold = 10, 
-  dic = NULL)
-
-# length(corpus_preprocessed_train$dic)
-
-corpus_preprocessed_test <- text_pre_process(corpus = test_raw, dic = corpus_preprocessed_train$dic)
-
-saveRDS(corpus_preprocessed_train, file = "corpus_preprocessed_train_10.rds")
-saveRDS(corpus_preprocessed_test, file = "corpus_preprocessed_test_10.rds")
-
-corpus_preprocessed_train <- readRDS(file = "corpus_preprocessed_train_10.rds")
-corpus_preprocessed_test <- readRDS(file = "corpus_preprocessed_test_10.rds")
-
-
-corpus_bucketed_train <- make_bucket_data(word_vec_list = corpus_preprocessed_train$word_vec_list, 
-  labels = rep(0:1, each = 12500), dic = corpus_preprocessed_train$dic, seq_len = c(100, 
-    200, 300, 500, 800), right_pad = F)
-
-# lapply(corpus_bucketed_train$buckets, function(x) length(x[[2]]))
-
-
-corpus_bucketed_test <- make_bucket_data(word_vec_list = corpus_preprocessed_test$word_vec_list, 
-  labels = rep(0:1, each = 12500), dic = corpus_preprocessed_test$dic, seq_len = c(100, 
-    200, 300, 500, 800), right_pad = F)
-
-# lapply(corpus_bucketed_test$buckets, function(x) length(x[[2]]))
-
-
-saveRDS(corpus_bucketed_train, file = "corpus_bucketed_train_100_200_300_500_800_left.rds")
-saveRDS(corpus_bucketed_test, file = "corpus_bucketed_test_100_200_300_500_800_left.rds")
diff --git a/example/rnn/bucket_R/data_preprocessing_seq_to_one.R b/example/rnn/bucket_R/data_preprocessing_seq_to_one.R
index 11c0a0ce4a..2ecb6ae8c4 100644
--- a/example/rnn/bucket_R/data_preprocessing_seq_to_one.R
+++ b/example/rnn/bucket_R/data_preprocessing_seq_to_one.R
@@ -2,7 +2,7 @@
 if (!file.exists("data/aclImdb_v1.tar.gz")) {
   download.file("http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 
                 "data/aclImdb_v1.tar.gz")
-  untar("data/aclImdb_v1.tar.gz")
+  untar("data/aclImdb_v1.tar.gz", exdir = "data/")
 }
 
 # install required packages
@@ -114,8 +114,8 @@ make_bucket_data <- function(word_vec_list, labels, dic, seq_len = c(225), right
   
   # Reshape into arrays having each sequence into a row
   features <- lapply(1:length(seq_len), function(x) {
-    t(array(unrolled_arrays_dic[[x]], 
-          dim = c(seq_len[x], length(unrolled_arrays_dic[[x]])/seq_len[x])))
+    array(unrolled_arrays_dic[[x]], 
+          dim = c(seq_len[x], length(unrolled_arrays_dic[[x]])/seq_len[x]))
   })
   
   names(features) <- as.character(seq_len)
@@ -141,7 +141,6 @@ corpus_preprocessed_test <- text_pre_process(corpus = test_raw, dic = corpus_pre
 seq_length_dist <- unlist(lapply(corpus_preprocessed_train$word_vec_list, length))
 quantile(seq_length_dist, 0:20/20)
 
-
 # Save bucketed corpus
 corpus_bucketed_train <- make_bucket_data(word_vec_list = corpus_preprocessed_train$word_vec_list, 
                                           labels = rep(0:1, each = 12500), 
@@ -158,7 +157,6 @@ corpus_bucketed_test <- make_bucket_data(word_vec_list = corpus_preprocessed_tes
 saveRDS(corpus_bucketed_train, file = "data/corpus_bucketed_train.rds")
 saveRDS(corpus_bucketed_test, file = "data/corpus_bucketed_test.rds")
 
-
 # Save non bucketed corpus
 corpus_single_train <- make_bucket_data(word_vec_list = corpus_preprocessed_train$word_vec_list, 
                                           labels = rep(0:1, each = 12500), 
diff --git a/example/rnn/bucket_R/gru.cell.R b/example/rnn/bucket_R/gru.cell.R
deleted file mode 100644
index 5932cdf17e..0000000000
--- a/example/rnn/bucket_R/gru.cell.R
+++ /dev/null
@@ -1,54 +0,0 @@
-# GRU cell symbol
-gru.cell <- function(num.hidden, indata, prev.state, param, seqidx, layeridx, dropout = 0, 
-  data_masking) {
-  i2h <- mx.symbol.FullyConnected(data = indata, weight = param$gates.i2h.weight, 
-    bias = param$gates.i2h.bias, num.hidden = num.hidden * 2, name = paste0("t", 
-      seqidx, ".l", layeridx, ".gates.i2h"))
-  
-  if (dropout > 0) 
-    i2h <- mx.symbol.Dropout(data = i2h, p = dropout)
-  
-  if (!is.null(prev.state)) {
-    h2h <- mx.symbol.FullyConnected(data = prev.state$h, weight = param$gates.h2h.weight, 
-      bias = param$gates.h2h.bias, num.hidden = num.hidden * 2, name = paste0("t", 
-        seqidx, ".l", layeridx, ".gates.h2h"))
-    gates <- i2h + h2h
-  } else {
-    gates <- i2h
-  }
-  
-  split.gates <- mx.symbol.split(gates, num.outputs = 2, axis = 1, squeeze.axis = F, 
-    name = paste0("t", seqidx, ".l", layeridx, ".split"))
-  
-  update.gate <- mx.symbol.Activation(split.gates[[1]], act.type = "sigmoid")
-  reset.gate <- mx.symbol.Activation(split.gates[[2]], act.type = "sigmoid")
-  
-  htrans.i2h <- mx.symbol.FullyConnected(data = indata, weight = param$trans.i2h.weight, 
-    bias = param$trans.i2h.bias, num.hidden = num.hidden, name = paste0("t", 
-      seqidx, ".l", layeridx, ".trans.i2h"))
-  
-  if (is.null(prev.state)) {
-    h.after.reset <- reset.gate * 0
-  } else {
-    h.after.reset <- prev.state$h * reset.gate
-  }
-  
-  htrans.h2h <- mx.symbol.FullyConnected(data = h.after.reset, weight = param$trans.h2h.weight, 
-    bias = param$trans.h2h.bias, num.hidden = num.hidden, name = paste0("t", 
-      seqidx, ".l", layeridx, ".trans.h2h"))
-  
-  h.trans <- htrans.i2h + htrans.h2h
-  h.trans.active <- mx.symbol.Activation(h.trans, act.type = "tanh")
-  
-  if (is.null(prev.state)) {
-    next.h <- update.gate * h.trans.active
-  } else {
-    next.h <- prev.state$h + update.gate * (h.trans.active - prev.state$h)
-  }
-  
-  ### Add a mask - using the mask_array approach
-  data_mask_expand <- mx.symbol.Reshape(data = data_masking, shape = c(1, -2))
-  next.h <- mx.symbol.broadcast_mul(lhs = next.h, rhs = data_mask_expand)
-  
-  return(list(h = next.h))
-}
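
For reference, the deleted gru.cell implements the standard GRU recurrence;
in the code's notation, update.gate is z_t, reset.gate is r_t and
h.trans.active is the candidate state:

  z_t = \sigma(W_z x_t + U_z h_{t-1})
  r_t = \sigma(W_r x_t + U_r h_{t-1})
  \tilde{h}_t = \tanh(W_h x_t + U_h (r_t \odot h_{t-1}))
  h_t = h_{t-1} + z_t \odot (\tilde{h}_t - h_{t-1})

The last line is algebraically (1 - z_t) \odot h_{t-1} + z_t \odot \tilde{h}_t,
which is exactly what next.h computes; the final broadcast_mul against the
mask then zeroes h_t at padded time steps.
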
diff --git a/example/rnn/bucket_R/lstm.cell.R b/example/rnn/bucket_R/lstm.cell.R
deleted file mode 100644
index 3c7b0e456d..0000000000
--- a/example/rnn/bucket_R/lstm.cell.R
+++ /dev/null
@@ -1,41 +0,0 @@
-# LSTM cell symbol
-lstm.cell <- function(num.hidden, indata, prev.state, param, seqidx, layeridx, dropout = 0, 
-  data_masking) {
-  i2h <- mx.symbol.FullyConnected(data = indata, weight = param$i2h.weight, bias = param$i2h.bias, 
-    num.hidden = num.hidden * 4, name = paste0("t", seqidx, ".l", layeridx, ".i2h"))
-  
-  if (dropout > 0) 
-    i2h <- mx.symbol.Dropout(data = i2h, p = dropout)
-  
-  if (!is.null(prev.state)) {
-    h2h <- mx.symbol.FullyConnected(data = prev.state$h, weight = param$h2h.weight, 
-      bias = param$h2h.bias, num.hidden = num.hidden * 4, name = paste0("t", 
-        seqidx, ".l", layeridx, ".h2h"))
-    gates <- i2h + h2h
-  } else {
-    gates <- i2h
-  }
-  
-  split.gates <- mx.symbol.split(gates, num.outputs = 4, axis = 1, squeeze.axis = F, 
-    name = paste0("t", seqidx, ".l", layeridx, ".slice"))
-  
-  in.gate <- mx.symbol.Activation(split.gates[[1]], act.type = "sigmoid")
-  in.transform <- mx.symbol.Activation(split.gates[[2]], act.type = "tanh")
-  forget.gate <- mx.symbol.Activation(split.gates[[3]], act.type = "sigmoid")
-  out.gate <- mx.symbol.Activation(split.gates[[4]], act.type = "sigmoid")
-  
-  if (is.null(prev.state)) {
-    next.c <- in.gate * in.transform
-  } else {
-    next.c <- (forget.gate * prev.state$c) + (in.gate * in.transform)
-  }
-  
-  next.h <- out.gate * mx.symbol.Activation(next.c, act.type = "tanh")
-  
-  ### Add a mask - using the mask_array approach
-  data_mask_expand <- mx.symbol.Reshape(data = data_masking, shape = c(1, -2))
-  next.c <- mx.symbol.broadcast_mul(lhs = next.c, rhs = data_mask_expand)
-  next.h <- mx.symbol.broadcast_mul(lhs = next.h, rhs = data_mask_expand)
-  
-  return(list(c = next.c, h = next.h))
-}
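
Likewise, the deleted lstm.cell is the standard LSTM; in.gate, in.transform,
forget.gate and out.gate are the four slices of the fused i2h/h2h projection:

  i_t = \sigma(W_i x_t + U_i h_{t-1})
  g_t = \tanh(W_g x_t + U_g h_{t-1})
  f_t = \sigma(W_f x_t + U_f h_{t-1})
  o_t = \sigma(W_o x_t + U_o h_{t-1})
  c_t = f_t \odot c_{t-1} + i_t \odot g_t
  h_t = o_t \odot \tanh(c_t)

Both c_t and h_t are then zeroed at padded time steps by the broadcast_mul
against the sequence mask.
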
diff --git a/example/rnn/bucket_R/mx.io.bucket.iter.R b/example/rnn/bucket_R/mx.io.bucket.iter.R
deleted file mode 100644
index 61f87957ed..0000000000
--- a/example/rnn/bucket_R/mx.io.bucket.iter.R
+++ /dev/null
@@ -1,92 +0,0 @@
-BucketIter <- setRefClass("BucketIter", fields = c("buckets", "bucket.names", "batch.size", 
-  "data.mask.element", "shuffle", "bucket.plan", "bucketID", "epoch", "batch", 
-  "batch.per.epoch", "seed"), contains = "Rcpp_MXArrayDataIter", methods = list(initialize = function(buckets, 
-  batch.size, data.mask.element = 0, shuffle = FALSE, seed = 123) {
-  .self$buckets <- buckets
-  .self$bucket.names <- names(.self$buckets)
-  .self$batch.size <- batch.size
-  .self$data.mask.element <- data.mask.element
-  .self$epoch <- 0
-  .self$batch <- 0
-  .self$shuffle <- shuffle
-  .self$batch.per.epoch <- 0
-  .self$bucket.plan <- NULL
-  .self$bucketID <- NULL
-  .self$seed <- seed
-  .self
-}, reset = function() {
-  buckets_nb <- length(bucket.names)
-  buckets_id <- 1:buckets_nb
-  buckets_size <- sapply(.self$buckets, function(x) {
-    dim(x$data)[length(dim(x$data))]
-  })
-  batch_per_bucket <- floor(buckets_size/.self$batch.size)
-  # Number of batches per epoch given the batch_size
-  .self$batch.per.epoch <- sum(batch_per_bucket)
-  .self$epoch <- .self$epoch + 1
-  .self$batch <- 0
-  
-  if (.self$shuffle) {
-    set.seed(.self$seed)
-    bucket_plan_names <- sample(rep(names(batch_per_bucket), times = batch_per_bucket))
-    .self$bucket.plan <- ave(bucket_plan_names == bucket_plan_names, bucket_plan_names, 
-      FUN = cumsum)
-    names(.self$bucket.plan) <- bucket_plan_names
-    ### Return first BucketID at reset for initialization of the model
-    .self$bucketID <- .self$bucket.plan[1]
-    
-    .self$buckets <- lapply(.self$buckets, function(x) {
-      shuffle_id <- sample(ncol(x$data))
-      if (length(dim(x$label)) == 0) {
-        list(data = x$data[, shuffle_id], label = x$label[shuffle_id])
-      } else {
-        list(data = x$data[, shuffle_id], label = x$label[, shuffle_id])
-      }
-    })
-  } else {
-    bucket_plan_names <- rep(names(batch_per_bucket), times = batch_per_bucket)
-    .self$bucket.plan <- ave(bucket_plan_names == bucket_plan_names, bucket_plan_names, 
-      FUN = cumsum)
-    names(.self$bucket.plan) <- bucket_plan_names
-  }
-}, iter.next = function() {
-  .self$batch <- .self$batch + 1
-  .self$bucketID <- .self$bucket.plan[batch]
-  if (.self$batch > .self$batch.per.epoch) {
-    return(FALSE)
-  } else {
-    return(TRUE)
-  }
-}, value = function() {
-  # bucketID is a named integer. The integer is the batch id within the given
-  # bucket (used to fetch the appropriate samples within that bucket); the name
-  # is a character string holding the sequence length of the bucket (used to
-  # unroll the RNN to the appropriate sequence length).
-  idx <- (.self$bucketID - 1) * (.self$batch.size) + (1:batch.size)
-  data <- .self$buckets[[names(.self$bucketID)]]$data[, idx, drop = F]
-  data_mask_array <- (!data == 0)
-  if (length(dim(.self$buckets[[names(.self$bucketID)]]$label)) == 0) {
-    label <- .self$buckets[[names(.self$bucketID)]]$label[idx]
-  } else {
-    label <- .self$buckets[[names(.self$bucketID)]]$label[, idx, drop = F]
-  }
-  return(list(data = mx.nd.array(data), data.mask.array = mx.nd.array(data_mask_array), 
-    label = mx.nd.array(label)))
-}, finalize = function() {
-}))
-
-# 
-#' Create Bucket Iter
-#'
-#' @param buckets A named list of buckets, each holding data and label arrays
-#' @param batch.size The batch size used to pack the array.
-#' @param data.mask.element The element to mask
-#' @param shuffle Whether to shuffle the data
-#' @param seed The random seed
-#'
-#' @export
-mx.io.bucket.iter <- function(buckets, batch.size, data.mask.element = 0, shuffle = FALSE, 
-  seed = 123) {
-  return(BucketIter$new(buckets = buckets, batch.size = batch.size, data.mask.element = data.mask.element, 
-    shuffle = shuffle, seed = seed))
-}
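
The deleted iterator keeps the same interface as the version retained in
R-package/R/mx.io.bucket.iter.R, so a short usage sketch still applies. The
toy buckets below are hypothetical: one padded sequence per column, with 0 as
the mask element.

library(mxnet)

buckets <- list(
  "3" = list(data = matrix(c(1, 2, 3, 4, 5, 0), nrow = 3), label = c(0, 1)),
  "5" = list(data = matrix(c(1, 2, 3, 4, 5, 6, 7, 8, 0, 0), nrow = 5), label = c(1, 0))
)

iter <- mx.io.bucket.iter(buckets, batch.size = 2, shuffle = TRUE)
iter$reset()
while (iter$iter.next()) {
  batch <- iter$value()        # list(data, data.mask.array, label), all mx.nd.array
  print(names(iter$bucketID))  # sequence length of the bucket being served
}
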
diff --git a/example/rnn/bucket_R/rnn.R b/example/rnn/bucket_R/rnn.R
deleted file mode 100644
index ea02b959a7..0000000000
--- a/example/rnn/bucket_R/rnn.R
+++ /dev/null
@@ -1,208 +0,0 @@
-library(mxnet)
-
-source("lstm.cell.R")
-source("gru.cell.R")
-
-# unrolled RNN network
-rnn.unroll <- function(num.rnn.layer, seq.len, input.size, num.embed, num.hidden, 
-  num.label, dropout = 0, ignore_label = 0, init.state = NULL, config, cell.type = "lstm", 
-  output_last_state = F) {
-  embed.weight <- mx.symbol.Variable("embed.weight")
-  cls.weight <- mx.symbol.Variable("cls.weight")
-  cls.bias <- mx.symbol.Variable("cls.bias")
-  
-  param.cells <- lapply(1:num.rnn.layer, function(i) {
-    if (cell.type == "lstm") {
-      cell <- list(i2h.weight = mx.symbol.Variable(paste0("l", i, ".i2h.weight")), 
-        i2h.bias = mx.symbol.Variable(paste0("l", i, ".i2h.bias")), h2h.weight = mx.symbol.Variable(paste0("l", 
-          i, ".h2h.weight")), h2h.bias = mx.symbol.Variable(paste0("l", i, 
-          ".h2h.bias")))
-    } else if (cell.type == "gru") {
-      cell <- list(gates.i2h.weight = mx.symbol.Variable(paste0("l", i, ".gates.i2h.weight")), 
-        gates.i2h.bias = mx.symbol.Variable(paste0("l", i, ".gates.i2h.bias")), 
-        gates.h2h.weight = mx.symbol.Variable(paste0("l", i, ".gates.h2h.weight")), 
-        gates.h2h.bias = mx.symbol.Variable(paste0("l", i, ".gates.h2h.bias")), 
-        trans.i2h.weight = mx.symbol.Variable(paste0("l", i, ".trans.i2h.weight")), 
-        trans.i2h.bias = mx.symbol.Variable(paste0("l", i, ".trans.i2h.bias")), 
-        trans.h2h.weight = mx.symbol.Variable(paste0("l", i, ".trans.h2h.weight")), 
-        trans.h2h.bias = mx.symbol.Variable(paste0("l", i, ".trans.h2h.bias")))
-    }
-    return(cell)
-  })
-  
-  # embedding layer
-  label <- mx.symbol.Variable("label")
-  data <- mx.symbol.Variable("data")
-  data_mask_array <- mx.symbol.Variable("data.mask.array")
-  data_mask_array <- mx.symbol.stop_gradient(data_mask_array, name = "data.mask.array")
-  
-  embed <- mx.symbol.Embedding(data = data, input_dim = input.size, weight = embed.weight, 
-    output_dim = num.embed, name = "embed")
-  
-  wordvec <- mx.symbol.split(data = embed, axis = 1, num.outputs = seq.len, squeeze_axis = T)
-  data_mask_split <- mx.symbol.split(data = data_mask_array, axis = 1, num.outputs = seq.len, 
-    squeeze_axis = T)
-  
-  last.hidden <- list()
-  last.states <- list()
-  decode <- list()
-  softmax <- list()
-  fc <- list()
-  
-  for (seqidx in 1:seq.len) {
-    hidden <- wordvec[[seqidx]]
-    
-    for (i in 1:num.rnn.layer) {
-      if (seqidx == 1) {
-        prev.state <- init.state[[i]]
-      } else {
-        prev.state <- last.states[[i]]
-      }
-      
-      if (cell.type == "lstm") {
-        cell.symbol <- lstm.cell
-      } else if (cell.type == "gru") {
-        cell.symbol <- gru.cell
-      }
-      
-      next.state <- cell.symbol(num.hidden = num.hidden, indata = hidden, prev.state = prev.state, 
-        param = param.cells[[i]], seqidx = seqidx, layeridx = i, dropout = dropout, 
-        data_masking = data_mask_split[[seqidx]])
-      hidden <- next.state$h
-      # if (dropout > 0) hidden <- mx.symbol.Dropout(data=hidden, p=dropout)
-      last.states[[i]] <- next.state
-    }
-    
-    # Decoding
-    if (config == "one-to-one") {
-      last.hidden <- c(last.hidden, hidden)
-    }
-  }
-  
-  if (config == "seq-to-one") {
-    fc <- mx.symbol.FullyConnected(data = hidden, weight = cls.weight, bias = cls.bias, 
-      num.hidden = num.label)
-    
-    loss <- mx.symbol.SoftmaxOutput(data = fc, name = "sm", label = label, ignore_label = ignore_label)
-    
-  } else if (config == "one-to-one") {
-    last.hidden_expand <- lapply(last.hidden, function(i) mx.symbol.expand_dims(i, 
-      axis = 1))
-    concat <- mx.symbol.concat(last.hidden_expand, num.args = seq.len, dim = 1)
-    reshape <- mx.symbol.Reshape(concat, shape = c(num.hidden, -1))
-    
-    fc <- mx.symbol.FullyConnected(data = reshape, weight = cls.weight, bias = cls.bias, 
-      num.hidden = num.label)
-    
-    label <- mx.symbol.reshape(data = label, shape = c(-1))
-    loss <- mx.symbol.SoftmaxOutput(data = fc, name = "sm", label = label, ignore_label = ignore_label)
-    
-  }
-  
-  if (output_last_state) {
-    group <- mx.symbol.Group(c(unlist(last.states), loss))
-    return(group)
-  } else {
-    return(loss)
-  }
-}
-
-########################################### mx.rnn.buckets
-mx.rnn.buckets <- function(train.data, eval.data = NULL, num.rnn.layer, num.hidden, 
-  num.embed, num.label, input.size, ctx = NULL, num.round = 1, initializer = mx.init.uniform(0.01), 
-  dropout = 0, config = "one-to-one", optimizer = "sgd", batch.end.callback = NULL, 
-  epoch.end.callback = NULL, begin.round = 1, metric = mx.metric.rmse, cell.type = "lstm", 
-  kvstore = "local", verbose = FALSE) {
-  
-  if (!train.data$iter.next()) {
-    train.data$reset()
-    if (!train.data$iter.next()) 
-      stop("Empty train.data")
-  }
-  
-  if (!is.null(eval.data)) {
-    if (!eval.data$iter.next()) {
-      eval.data$reset()
-      if (!eval.data$iter.next()) 
-        stop("Empty eval.data")
-    }
-  }
-  
-  if (is.null(ctx)) 
-    ctx <- mx.ctx.default()
-  if (is.mx.context(ctx)) {
-    ctx <- list(ctx)
-  }
-  if (!is.list(ctx)) 
-    stop("ctx must be mx.context or list of mx.context")
-  if (is.character(optimizer)) {
-    # Rescale gradients by 1/batch.size, taken directly from the iterator
-    batchsize <- train.data$batch.size
-    optimizer <- mx.opt.create(optimizer, rescale.grad = (1/batchsize))
-  }
-  
-  # get unrolled lstm symbol
-  sym_list <- sapply(train.data$bucket.names, function(x) {
-    rnn.unroll(num.rnn.layer = num.rnn.layer, num.hidden = num.hidden, seq.len = as.integer(x), 
-      input.size = input.size, num.embed = num.embed, num.label = num.label, 
-      dropout = dropout, cell.type = cell.type, config = config)
-  }, simplify = F, USE.NAMES = T)
-  
-  # setup lstm model
-  symbol <- sym_list[[names(train.data$bucketID)]]
-  
-  arg.names <- symbol$arguments
-  input.names <- c("data", "data.mask.array")
-  input.shape <- sapply(input.names, function(n) {
-    dim(train.data$value()[[n]])
-  }, simplify = FALSE)
-  output.names <- "label"
-  output.shape <- sapply(output.names, function(n) {
-    dim(train.data$value()[[n]])
-  }, simplify = FALSE)
-  
-  params <- mx.model.init.params(symbol, input.shape, output.shape, initializer, 
-    mx.cpu())
-  
-  kvstore <- mxnet:::mx.model.create.kvstore(kvstore, params$arg.params, length(ctx), 
-    verbose = verbose)
-  
-  ### Execute training - rnn.model.R
-  model <- mx.model.train.rnn.buckets(sym_list = sym_list, input.shape = input.shape, 
-    output.shape = output.shape, arg.params = params$arg.params, aux.params = params$aux.params, 
-    optimizer = optimizer, train.data = train.data, eval.data = eval.data, verbose = verbose, 
-    begin.round = begin.round, end.round = num.round, metric = metric, ctx = ctx, 
-    batch.end.callback = batch.end.callback, epoch.end.callback = epoch.end.callback, 
-    kvstore = kvstore)
-  
-  return(model)
-}
-
-
-# get the argument name of data and label
-mx.model.check.arguments <- function(symbol) {
-  data <- NULL
-  label <- NULL
-  for (nm in arguments(symbol)) {
-    if (mx.util.str.endswith(nm, "data")) {
-      if (!is.null(data)) {
-        stop("Multiple fields contains suffix data")
-      } else {
-        data <- nm
-      }
-    }
-    if (mx.util.str.endswith(nm, "label")) {
-      if (!is.null(label)) {
-        stop("Multiple fields contains suffix label")
-      } else {
-        label <- nm
-      }
-    }
-  }
-  return(c(data, label))
-}
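
A hypothetical end-to-end training call for the deleted helper, reusing the
toy iterator from the sketch above; every hyper-parameter is a placeholder,
not a recommendation. The optimizer is created explicitly with mx.opt.create
so that gradients are rescaled by 1/batch.size:

optimizer <- mx.opt.create("sgd", learning.rate = 0.1, momentum = 0.9,
                           rescale.grad = 1/2)  # batch.size = 2 above

model <- mx.rnn.buckets(train.data = iter, num.rnn.layer = 1, num.hidden = 20,
                        num.embed = 10, num.label = 2, input.size = 10,
                        ctx = list(mx.cpu()), num.round = 1,
                        config = "seq-to-one", cell.type = "lstm",
                        optimizer = optimizer, metric = mx.metric.accuracy)
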
diff --git a/example/rnn/bucket_R/rnn.infer.R b/example/rnn/bucket_R/rnn.infer.R
deleted file mode 100644
index 41488aac89..0000000000
--- a/example/rnn/bucket_R/rnn.infer.R
+++ /dev/null
@@ -1,79 +0,0 @@
-library(mxnet)
-
-source("rnn.R")
-
-mx.rnn.infer.buckets <- function(infer_iter, model, config, ctx = mx.cpu(), output_last_state = FALSE, 
-  init.state = NULL, cell.type = "lstm") {
-  ### Infer parameters from model
-  if (cell.type == "lstm") {
-    num.rnn.layer <- round((length(model$arg.params) - 3)/4)
-    num.hidden <- dim(model$arg.params$l1.h2h.weight)[1]
-  } else if (cell.type == "gru") {
-    num.rnn.layer <- round((length(model$arg.params) - 3)/8)
-    num.hidden <- dim(model$arg.params$l1.gates.h2h.weight)[1]
-  }
-  
-  input.size <- dim(model$arg.params$embed.weight)[2]
-  num.embed <- dim(model$arg.params$embed.weight)[1]
-  num.label <- dim(model$arg.params$cls.bias)
-  
-  ### Initialise the iterator
-  infer_iter$reset()
-  infer_iter$iter.next()
-  batch_size <- infer_iter$batch.size
-  
-  # get unrolled lstm symbol
-  sym_list <- sapply(infer_iter$bucket.names, function(x) {
-    rnn.unroll(num.rnn.layer = num.rnn.layer, num.hidden = num.hidden, seq.len = as.integer(x), 
-      input.size = input.size, num.embed = num.embed, num.label = num.label, 
-      config = config, dropout = 0, init.state = init.state, cell.type = cell.type, 
-      output_last_state = output_last_state)
-  }, simplify = F, USE.NAMES = T)
-  
-  symbol <- sym_list[[names(infer_iter$bucketID)]]
-  
-  input.shape <- lapply(infer_iter$value(), dim)
-  input.shape <- input.shape[names(input.shape) %in% arguments(symbol)]
-  
-  infer_shapes <- symbol$infer.shape(input.shape)
-  arg.params <- model$arg.params
-  aux.params <- model$aux.params
-  
-  input.names <- names(input.shape)
-  arg.names <- names(arg.params)
-  
-  # Grad request
-  grad_req <- rep("null", length(symbol$arguments))
-  
-  # Arg array order
-  update_names <- c(input.names, arg.names)
-  arg_update_idx <- match(symbol$arguments, update_names)
-  
-  # Initial input shapes - for multi-device training these would need to be
-  # adapted by dividing the highest dimension by the number of devices
-  s <- sapply(input.shape, function(shape) {
-    mx.nd.zeros(shape = shape, ctx = mx.cpu())
-  })
-  
-  train.execs <- mxnet:::mx.symbol.bind(symbol = symbol, arg.arrays = c(s, arg.params)[arg_update_idx], 
-    aux.arrays = aux.params, ctx = ctx, grad.req = grad_req)
-  
-  packer <- mxnet:::mx.nd.arraypacker()
-  infer_iter$reset()
-  while (infer_iter$iter.next()) {
-    # Get input data slice
-    dlist <- infer_iter$value()[input.names]
-    
-    symbol <- sym_list[[names(infer_iter$bucketID)]]
-    
-    texec <- mxnet:::mx.symbol.bind(symbol = symbol, arg.arrays = c(dlist, train.execs$arg.arrays[arg.names])[arg_update_idx], 
-      aux.arrays = train.execs$aux.arrays, ctx = ctx, grad.req = grad_req)
-    
-    mx.exec.forward(texec, is.train = FALSE)
-    
-    out.preds <- mx.nd.copyto(texec$ref.outputs[[1]], mx.cpu())
-    packer$push(out.preds)
-  }
-  infer_iter$reset()
-  return(packer$get())
-}
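
The parameter inference at the top of this file relies on a simple count: an
LSTM layer contributes four arrays (i2h/h2h weight and bias) and a GRU layer
eight, on top of the three shared arrays embed.weight, cls.weight and
cls.bias, hence (length(arg.params) - 3) / 4 or / 8. A hypothetical call,
continuing the sketches above:

preds <- mx.rnn.infer.buckets(infer_iter = iter, model = model,
                              config = "seq-to-one", ctx = mx.cpu(),
                              cell.type = "lstm")
pred_label <- max.col(t(as.array(preds))) - 1  # highest softmax row, 0-based labels
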
diff --git a/example/rnn/bucket_R/rnn.train.R b/example/rnn/bucket_R/rnn.train.R
deleted file mode 100644
index b833b2b1d3..0000000000
--- a/example/rnn/bucket_R/rnn.train.R
+++ /dev/null
@@ -1,206 +0,0 @@
-library(mxnet)
-
-source("rnn.R")
-
-# Internal function to do multiple device training on RNN
-mx.model.train.rnn.buckets <- function(ctx, sym_list, arg.params, aux.params, input.shape, 
-  output.shape, begin.round, end.round, optimizer, train.data, eval.data, metric, 
-  epoch.end.callback, batch.end.callback, kvstore, verbose = TRUE) {
-  symbol <- sym_list[[names(train.data$bucketID)]]
-  
-  input.names <- names(input.shape)
-  output.names <- names(output.shape)
-  arg.names <- names(arg.params)
-  
-  ndevice <- length(ctx)
-  if (verbose) 
-    message(paste0("Start training with ", ndevice, " devices"))
-  input_slice <- mxnet:::mx.model.slice.shape(input.shape, ndevice)
-  output_slice <- mxnet:::mx.model.slice.shape(output.shape, ndevice)
-  
-  
-  # Grad request
-  grad_req <- rep("write", length(symbol$arguments))
-  # grad_null_idx <- match(c(input.names, output.names), symbol$arguments)
-  grad_null_idx <- match(input.names, symbol$arguments)
-  grad_req[grad_null_idx] <- "null"
-  
-  # Arg array order
-  update_names <- c(input.names, output.names, arg.names)
-  arg_update_idx <- match(symbol$arguments, update_names)
-  
-  train.execs <- lapply(1:ndevice, function(i) {
-    s <- sapply(append(input_slice[[i]]$shape, output_slice[[i]]$shape), function(shape) {
-      mx.nd.zeros(shape = shape, ctx = mx.cpu())
-    })
-    mxnet:::mx.symbol.bind(symbol = symbol, arg.arrays = c(s, arg.params)[arg_update_idx], 
-      aux.arrays = aux.params, ctx = mx.cpu(), grad.req = grad_req)
-  })
-  
-  # KVStore-related setup
-  params.index <- as.integer(mxnet:::mx.util.filter.null(lapply(1:length(train.execs[[1]]$ref.grad.arrays), 
-    function(k) {
-      if (!is.null(train.execs[[1]]$ref.grad.arrays[[k]])) k else NULL
-    })))
-  update.on.kvstore <- FALSE
-  if (!is.null(kvstore) && kvstore$update.on.kvstore) {
-    update.on.kvstore <- TRUE
-    kvstore$set.optimizer(optimizer)
-  } else {
-    updaters <- lapply(1:ndevice, function(i) {
-      mx.opt.get.updater(optimizer, train.execs[[i]]$ref.arg.arrays)
-    })
-  }
-  
-  if (!is.null(kvstore)) {
-    kvstore$init(params.index, train.execs[[1]]$ref.arg.arrays[params.index])
-  }
-  
-  for (iteration in begin.round:end.round) {
-    nbatch <- 0
-    if (!is.null(metric)) {
-      train.metric <- metric$init()
-    }
-    train.data$reset()
-    while (train.data$iter.next()) {
-      dlist <- train.data$value()  #[input.names]
-      symbol <- sym_list[[names(train.data$bucketID)]]
-      slices <- lapply(1:ndevice, function(i) {
-        s <- input_slice[[i]]
-        ret <- sapply(names(dlist), function(n) {
-          mxnet:::mx.nd.slice(dlist[[n]], s$begin, s$end)
-        })
-        return(ret)
-      })
-      
-      train.execs <- lapply(1:ndevice, function(i) {
-        s <- slices[[i]]
-        mxnet:::mx.symbol.bind(symbol = symbol, arg.arrays = c(s, train.execs[[i]]$arg.arrays[arg.names])[arg_update_idx], 
-          aux.arrays = train.execs[[i]]$aux.arrays, ctx = ctx[[i]], grad.req = grad_req)
-      })
-      
-      for (texec in train.execs) {
-        mx.exec.forward(texec, is.train = TRUE)
-      }
-      
-      out.preds <- lapply(train.execs, function(texec) {
-        mx.nd.copyto(texec$ref.outputs[[1]], mx.cpu())
-      })
-      
-      for (texec in train.execs) {
-        mx.exec.backward(texec)
-      }
-      
-      if (!is.null(kvstore)) {
-        # push the gradient
-        kvstore$push(params.index, lapply(train.execs, function(texec) {
-          texec$ref.grad.arrays[params.index]
-        }), -params.index)
-      }
-      if (update.on.kvstore) {
-        # pull back weight
-        kvstore$pull(params.index, lapply(train.execs, function(texec) {
-          texec$ref.arg.arrays[params.index]
-        }), -params.index)
-      } else {
-        # pull back gradient sums
-        if (!is.null(kvstore)) {
-          kvstore$pull(params.index, lapply(train.execs, function(texec) {
-          texec$ref.grad.arrays[params.index]
-          }), -params.index)
-        }
-        arg.blocks <- lapply(1:ndevice, function(i) {
-          updaters[[i]](train.execs[[i]]$ref.arg.arrays, train.execs[[i]]$ref.grad.arrays)
-        })
-        for (i in 1:ndevice) {
-          mx.exec.update.arg.arrays(train.execs[[i]], arg.blocks[[i]], skip.null = TRUE)
-        }
-      }
-      
-      # Update the evaluation metrics
-      if (!is.null(metric)) {
-        # train.metric <- metric$update(dlist$label, out.preds, train.metric)
-        for (i in 1:ndevice) {
-          train.metric <- metric$update(slices[[i]][[length(slices[[i]])]], 
-          out.preds[[i]], train.metric)
-        }
-      }
-      
-      nbatch <- nbatch + 1
-      
-      if (!is.null(batch.end.callback)) {
-        batch.end.callback(iteration, nbatch, environment())
-      }
-    }
-    
-    if (!is.null(metric)) {
-      result <- metric$get(train.metric)
-      if (verbose) 
-        message(paste0("[", iteration, "] Train-", result$name, "=", result$value))
-    }
-    
-    if (!is.null(eval.data)) {
-      if (!is.null(metric)) {
-        eval.metric <- metric$init()
-      }
-      eval.data$reset()
-      while (eval.data$iter.next()) {
-        # Get input data slice
-        dlist <- eval.data$value()  #[input.names]
-        symbol <- sym_list[[names(eval.data$bucketID)]]
-        slices <- lapply(1:ndevice, function(i) {
-          s <- input_slice[[i]]
-          ret <- sapply(names(dlist), function(n) {
-          mxnet:::mx.nd.slice(dlist[[n]], s$begin, s$end)
-          })
-          return(ret)
-        })
-        
-        
-        train.execs <- lapply(1:ndevice, function(i) {
-          s <- slices[[i]]
-          mxnet:::mx.symbol.bind(symbol = symbol, arg.arrays = c(s, train.execs[[i]]$arg.arrays[arg.names])[arg_update_idx], 
-          aux.arrays = train.execs[[i]]$aux.arrays, ctx = ctx[[i]], grad.req = grad_req)
-        })
-        
-        for (texec in train.execs) {
-          mx.exec.forward(texec, is.train = FALSE)
-        }
-        
-        # copy outputs to CPU
-        out.preds <- lapply(train.execs, function(texec) {
-          mx.nd.copyto(texec$ref.outputs[[1]], mx.cpu())
-        })
-        
-        if (!is.null(metric)) {
-          for (i in 1:ndevice) {
-          eval.metric <- metric$update(slices[[i]][[length(slices[[i]])]], 
-            out.preds[[i]], eval.metric)
-          }
-        }
-      }
-      
-      if (!is.null(metric)) {
-        result <- metric$get(eval.metric)
-        if (verbose) {
-          message(paste0("[", iteration, "] Validation-", result$name, "=", 
-          result$value))
-        }
-      }
-    } else {
-      eval.metric <- NULL
-    }
-    # get the model out
-    model <- mxnet:::mx.model.extract.model(symbol, train.execs)
-    
-    epoch_continue <- TRUE
-    if (!is.null(epoch.end.callback)) {
-      epoch_continue <- epoch.end.callback(iteration, 0, environment(), verbose = verbose)
-    }
-    
-    if (!epoch_continue) {
-      break
-    }
-  }
-  return(model)
-}


 
