Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2017/12/21 15:26:29 UTC
[GitHub] thirdwing closed pull request #9022: R RNN API fixes and Optimizer clip gradient on NDArray
URL: https://github.com/apache/incubator-mxnet/pull/9022
This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:
diff --git a/R-package/R/model.rnn.R b/R-package/R/model.rnn.R
index 78a125ed51..3bf1d96dce 100644
--- a/R-package/R/model.rnn.R
+++ b/R-package/R/model.rnn.R
@@ -67,8 +67,8 @@ mx.model.train.buckets <- function(symbol, ctx, train.data, eval.data,
train.execs <- lapply(seq_len(ndevice), function(i) {
s <- slices[[i]]
mx.symbol.bind(symbol = symbol[[names(train.data$bucketID)]],
- arg.arrays = c(s, train.execs[[i]]$arg.arrays[arg.params.names])[arg.update.idx],
- aux.arrays = train.execs[[i]]$aux.arrays, ctx = ctx[[i]], grad.req = grad.req)
+ arg.arrays = c(s, train.execs[[i]]$arg.arrays[arg.params.names])[arg.update.idx],
+ aux.arrays = train.execs[[i]]$aux.arrays, ctx = ctx[[i]], grad.req = grad.req)
})
} else {
for (i in seq_len(ndevice)) {
diff --git a/R-package/R/mx.io.bucket.iter.R b/R-package/R/mx.io.bucket.iter.R
index 22ac1fae6e..4d07ec218b 100644
--- a/R-package/R/mx.io.bucket.iter.R
+++ b/R-package/R/mx.io.bucket.iter.R
@@ -22,7 +22,7 @@ BucketIter <- setRefClass("BucketIter", fields = c("buckets", "bucket.names", "b
buckets_nb <- length(bucket.names)
buckets_id <- seq_len(buckets_nb)
buckets.size <- sapply(.self$buckets, function(x) {
- dim(x$data)[length(dim(x$data)) - 1]
+ tail(dim(x$data), 1)
})
.self$batch.per.bucket <- ceiling(buckets.size/.self$batch.size)
.self$last.batch.pad <- .self$batch.size - buckets.size %% .self$batch.size
@@ -36,23 +36,23 @@ BucketIter <- setRefClass("BucketIter", fields = c("buckets", "bucket.names", "b
if (.self$shuffle) {
set.seed(.self$seed)
- bucket_plan_names <- sample(rep.int(names(.self$batch.per.bucket), times = .self$batch.per.bucket))
+ bucket_plan_names <- sample(rep(names(.self$batch.per.bucket), times = .self$batch.per.bucket))
.self$bucket.plan <- ave(bucket_plan_names == bucket_plan_names, bucket_plan_names,
FUN = cumsum)
names(.self$bucket.plan) <- bucket_plan_names
- ### Return first BucketID at reset for initialization of the model
+ # Return first BucketID at reset for initialization of the model
.self$bucketID <- .self$bucket.plan[1]
.self$buckets <- lapply(.self$buckets, function(x) {
- shuffle_id <- sample.int(dim(x$data)[length(dim(x$data)) - 1])
+ shuffle_id <- sample(tail(dim(x$data), 1))
if (length(dim(x$label)) == 0) {
- list(data = x$data[shuffle_id, ], label = x$label[shuffle_id])
+ list(data = x$data[, shuffle_id], label = x$label[shuffle_id])
} else {
- list(data = x$data[shuffle_id, ], label = x$label[shuffle_id, ])
+ list(data = x$data[, shuffle_id], label = x$label[, shuffle_id])
}
})
} else {
- bucket_plan_names <- rep.int(names(.self$batch.per.bucket), times = .self$batch.per.bucket)
+ bucket_plan_names <- rep(names(.self$batch.per.bucket), times = .self$batch.per.bucket)
.self$bucket.plan <- ave(bucket_plan_names == bucket_plan_names, bucket_plan_names,
FUN = cumsum)
names(.self$bucket.plan) <- bucket_plan_names
@@ -60,25 +60,25 @@ BucketIter <- setRefClass("BucketIter", fields = c("buckets", "bucket.names", "b
}, iter.next = function() {
.self$batch <- .self$batch + 1
.self$bucketID <- .self$bucket.plan[batch]
- return(.self$batch < .self$batch.per.epoch)
+ return(.self$batch <= .self$batch.per.epoch)
}, value = function() {
# bucketID is a named integer: the integer indicates the batch id for the given
- # bucket (used to fetch appropriate samples within the bucket) the name is the a
+ # bucket (used to fetch appropriate samples within the bucket) the name is a
# character containing the sequence length of the bucket (used to unroll the rnn
# to appropriate sequence length)
idx <- (.self$bucketID - 1) * (.self$batch.size) + seq_len(batch.size)
- ### reuse first idx for padding
+ # Reuse first idx for padding
if (bucketID == .self$batch.per.bucket[names(.self$bucketID)] & !.self$last.batch.pad[names(.self$bucketID)] == 0) {
idx <- c(idx[seq_len(.self$batch.size - .self$last.batch.pad[names(.self$bucketID)])], seq_len(.self$last.batch.pad[names(.self$bucketID)]))
}
- data <- .self$buckets[[names(.self$bucketID)]]$data[idx, , drop = FALSE]
- seq.mask <- as.integer(names(bucketID)) - apply(data==.self$data.mask.element, 1, sum)
+ data <- .self$buckets[[names(.self$bucketID)]]$data[, idx, drop = F]
+ seq.mask <- as.integer(names(bucketID)) - apply(data==.self$data.mask.element, 2, sum)
if (length(dim(.self$buckets[[names(.self$bucketID)]]$label)) == 0) {
label <- .self$buckets[[names(.self$bucketID)]]$label[idx]
} else {
- label <- .self$buckets[[names(.self$bucketID)]]$label[idx, , drop = FALSE]
+ label <- .self$buckets[[names(.self$bucketID)]]$label[, idx, drop = F]
}
return(list(data = mx.nd.array(data), seq.mask = mx.nd.array(seq.mask),
label = mx.nd.array(label)))
@@ -103,4 +103,4 @@ mx.io.bucket.iter <- function(buckets, batch.size, data.mask.element = 0, shuffl
seed = 123) {
return(BucketIter$new(buckets = buckets, batch.size = batch.size, data.mask.element = data.mask.element,
shuffle = shuffle, seed = seed))
-}
+}
\ No newline at end of file
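
Editor's sketch, not part of the PR: the iterator changes above store each
bucket sequence-major, i.e. dim(data) == c(seq_len, n_samples), so the
sample count is now the last dimension and batches are sliced column-wise.
The shapes below are made up for illustration.

seq_len <- 4
n_samples <- 10
data <- matrix(sample(0:5, seq_len * n_samples, replace = TRUE),
               nrow = seq_len, ncol = n_samples)

n <- tail(dim(data), 1)       # sample count, replacing dim(data)[length(dim(data)) - 1]
shuffle_id <- sample(n)       # shuffle samples, not time steps
data <- data[, shuffle_id]    # column-wise, as in x$data[, shuffle_id]

idx <- 1:2                    # first batch with batch.size = 2
batch <- data[, idx, drop = FALSE]
# valid (non-padded) length per sample, counted down each column as in value()
seq.mask <- seq_len - apply(batch == 0, 2, sum)
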
diff --git a/R-package/R/optimizer.R b/R-package/R/optimizer.R
index 253f031ba4..ff88531706 100644
--- a/R-package/R/optimizer.R
+++ b/R-package/R/optimizer.R
@@ -43,11 +43,7 @@ mx.opt.sgd <- function(learning.rate,
grad <- grad * rescale.grad
if (!is.null(clip_gradient)){
if(clip_gradient >= 0){
- grad_ctx <- ctx(grad)
- grad <- as.array(grad)
- grad <- pmax(grad, -1 * clip_gradient)
- grad <- pmin(grad, clip_gradient)
- grad <- mx.nd.array(grad, grad_ctx)
+ grad <- mx.nd.clip(grad, -clip_gradient, clip_gradient)
} else {
stop("Error: clip_gradient should be positive number.")
}
@@ -125,11 +121,7 @@ mx.opt.rmsprop <- function(learning.rate=0.002,
grad <- grad * rescale.grad
if (!is.null(clip_gradient)){
if(clip_gradient >= 0){
- grad_ctx <- ctx(grad)
- grad <- as.array(grad)
- grad <- pmax(grad, -1 * clip_gradient)
- grad <- pmin(grad, clip_gradient)
- grad <- mx.nd.array(grad, grad_ctx)
+ grad <- mx.nd.clip(grad, -clip_gradient, clip_gradient)
} else {
stop("Error: clip_gradient should be positive number.")
}
@@ -225,11 +217,7 @@ mx.opt.adam <- function(learning.rate=0.001,
grad <- grad * rescale.grad
if (!is.null(clip_gradient)){
if(clip_gradient >= 0){
- grad_ctx <- ctx(grad)
- grad <- as.array(grad)
- grad <- pmax(grad, -1 * clip_gradient)
- grad <- pmin(grad, clip_gradient)
- grad <- mx.nd.array(grad, grad_ctx)
+ grad <- mx.nd.clip(grad, -clip_gradient, clip_gradient)
} else {
stop("Error: clip_gradient should be positive number.")
}
@@ -309,11 +297,7 @@ mx.opt.adagrad <- function(learning.rate=0.05,
grad <- grad * rescale.grad
if (!is.null(clip_gradient)){
if(clip_gradient >= 0){
- grad_ctx <- ctx(grad)
- grad <- as.array(grad)
- grad <- pmax(grad, -1 * clip_gradient)
- grad <- pmin(grad, clip_gradient)
- grad <- mx.nd.array(grad, grad_ctx)
+ grad <- mx.nd.clip(grad, -clip_gradient, clip_gradient)
} else {
stop("Error: clip_gradient should be positive number.")
}
@@ -363,11 +347,7 @@ mx.opt.adadelta <- function(rho=0.90,
grad <- grad * rescale.grad
if (!is.null(clip_gradient)){
if(clip_gradient >= 0){
- grad_ctx <- ctx(grad)
- grad <- as.array(grad)
- grad <- pmax(grad, -1 * clip_gradient)
- grad <- pmin(grad, clip_gradient)
- grad <- mx.nd.array(grad, grad_ctx)
+ grad <- mx.nd.clip(grad, -clip_gradient, clip_gradient)
} else {
stop("Error: clip_gradient should be positive number.")
}
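
Editor's sketch, not part of the PR: every optimizer above now clips
gradients directly on the NDArray with mx.nd.clip instead of copying the
gradient into R with as.array, clipping with pmax/pmin, and copying it
back to the device. Both paths produce the same values; the new one avoids
the host/device round trip. Assumes the mxnet package is loaded.

library(mxnet)
grad <- mx.nd.array(array(c(-3, -0.5, 0.5, 3), dim = c(4, 1)))
clip_gradient <- 1

# old path: host round trip through an R array
old <- mx.nd.array(pmin(pmax(as.array(grad), -clip_gradient), clip_gradient))

# new path: clip on the NDArray itself
new <- mx.nd.clip(grad, -clip_gradient, clip_gradient)

all.equal(as.array(old), as.array(new))  # TRUE
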
diff --git a/R-package/R/rnn.graph.R b/R-package/R/rnn.graph.R
index 5197882000..171508cfce 100644
--- a/R-package/R/rnn.graph.R
+++ b/R-package/R/rnn.graph.R
@@ -1,127 +1,114 @@
-#
+
#' Generate a RNN symbolic model - requires CUDA
#'
#' @param config Either seq-to-one or one-to-one
-#' @param cell.type Type of RNN cell: either gru or lstm
-#' @param num.rnn.layer int, number of stacked layers
-#' @param num.hidden int, size of the state in each RNN layer
-#' @param num.embed int, default = NULL - no embedding. Dimension of the embedding vectors
-#' @param num.decode int, number of output variables in the decoding layer
-#' @param input.size int, number of levels in the data - only used for embedding
+#' @param cell_type Type of RNN cell: either gru or lstm
+#' @param num_rnn_layer int, number of stacked layers
+#' @param num_hidden int, size of the state in each RNN layer
+#' @param num_embed int, default = NULL - no embedding. Dimension of the embedding vectors
+#' @param num_decode int, number of output variables in the decoding layer
+#' @param input_size int, number of levels in the data - only used for embedding
#' @param dropout
#'
#' @export
-rnn.graph <- function(num.rnn.layer,
- input.size = NULL,
- num.embed = NULL,
- num.hidden,
- num.decode,
- dropout = 0,
- ignore_label = -1,
- loss_output = NULL,
- config,
- cell.type,
- masking = FALSE,
- output_last_state = FALSE) {
-
- # define input arguments
+rnn.graph <- function (num_rnn_layer, input_size = NULL, num_embed = NULL,
+ num_hidden, num_decode, dropout = 0, ignore_label = -1, bidirectional = F,
+ loss_output = NULL, config, cell_type, masking = F, output_last_state = F,
+ rnn.state = NULL, rnn.state.cell = NULL, prefix = "") {
+
data <- mx.symbol.Variable("data")
label <- mx.symbol.Variable("label")
seq.mask <- mx.symbol.Variable("seq.mask")
-
- if (!is.null(num.embed)) embed.weight <- mx.symbol.Variable("embed.weight")
-
+ if (!is.null(num_embed))
+ embed.weight <- mx.symbol.Variable("embed.weight")
rnn.params.weight <- mx.symbol.Variable("rnn.params.weight")
- rnn.state <- mx.symbol.Variable("rnn.state")
- if (cell.type == "lstm") {
+ if (is.null(rnn.state)) rnn.state <- mx.symbol.Variable("rnn.state")
+ if (cell_type == "lstm" & is.null(rnn.state.cell)) {
rnn.state.cell <- mx.symbol.Variable("rnn.state.cell")
}
cls.weight <- mx.symbol.Variable("cls.weight")
cls.bias <- mx.symbol.Variable("cls.bias")
-
- if (!is.null(num.embed)){
- data <- mx.symbol.Embedding(data=data, input_dim=input.size,
- weight=embed.weight, output_dim=num.embed, name="embed")
+ if (!is.null(num_embed)) {
+ data <- mx.symbol.Embedding(data = data, input_dim = input_size,
+ weight = embed.weight, output_dim = num_embed, name = "embed")
}
- # RNN cells
- if (cell.type == "lstm") {
- rnn <- mx.symbol.RNN(data=data, state=rnn.state, state_cell = rnn.state.cell, parameters=rnn.params.weight, state.size=num.hidden, num.layers=num.rnn.layer, bidirectional=FALSE, mode=cell.type, state.outputs=output_last_state, p=dropout, name=paste(cell.type, num.rnn.layer, "layer", sep="_"))
-
+ data = mx.symbol.swapaxes(data = data, dim1 = 0, dim2 = 1, name = paste0(prefix, "swap_pre"))
+
+ if (cell_type == "lstm") {
+ rnn <- mx.symbol.RNN(data = data, state = rnn.state,
+ state_cell = rnn.state.cell, parameters = rnn.params.weight,
+ state.size = num_hidden, num.layers = num_rnn_layer,
+ bidirectional = bidirectional, mode = cell_type, state.outputs = output_last_state,
+ p = dropout, name = paste0(prefix, "RNN"))
} else {
- rnn <- mx.symbol.RNN(data=data, state=rnn.state, parameters=rnn.params.weight, state.size=num.hidden, num.layers=num.rnn.layer, bidirectional=FALSE, mode=cell.type, state.outputs=output_last_state, p=dropout, name=paste(cell.type, num.rnn.layer, "layer", sep="_"))
+ rnn <- mx.symbol.RNN(data = data, state = rnn.state,
+ parameters = rnn.params.weight, state.size = num_hidden,
+ num.layers = num_rnn_layer, bidirectional = bidirectional, mode = cell_type,
+ state.outputs = output_last_state, p = dropout,
+ name = paste0(prefix, "RNN"))
}
- # Decode
- if (config=="seq-to-one") {
-
- if (masking) mask <- mx.symbol.SequenceLast(data=rnn[[1]], use.sequence.length = TRUE, sequence_length = seq.mask, name = "mask") else
- mask <- mx.symbol.SequenceLast(data=rnn[[1]], use.sequence.length = FALSE, name = "mask")
-
- decode <- mx.symbol.FullyConnected(data=mask,
- weight=cls.weight,
- bias=cls.bias,
- num.hidden=num.decode,
- name = "decode")
+ if (config == "seq-to-one") {
+ if (masking) mask <- mx.symbol.SequenceLast(data = rnn[[1]], use.sequence.length = T, sequence_length = seq.mask, name = "mask") else
+ mask <- mx.symbol.SequenceLast(data = rnn[[1]], use.sequence.length = F, name = "mask")
if (!is.null(loss_output)) {
- loss <- switch(loss_output,
- softmax = mx.symbol.SoftmaxOutput(data=decode, label=label, use_ignore = !ignore_label == -1, ignore_label = ignore_label, name = "loss"),
- linear = mx.symbol.LinearRegressionOutput(data=decode, label=label, name = "loss"),
- logictic = mx.symbol.LogisticRegressionOutput(data=decode, label=label, name = "loss"),
- MAE = mx.symbol.MAERegressionOutput(data=decode, label=label, name = "loss")
- )
- } else loss <- decode
-
- } else if (config=="one-to-one"){
+ decode <- mx.symbol.FullyConnected(data = mask, weight = cls.weight, bias = cls.bias, num_hidden = num_decode, name = "decode")
+ out <- switch(loss_output, softmax = mx.symbol.SoftmaxOutput(data = decode, label = label, use_ignore = !ignore_label == -1, ignore_label = ignore_label, name = "loss"),
+ linear = mx.symbol.LinearRegressionOutput(data = decode, label = label, name = "loss"),
+ logistic = mx.symbol.LogisticRegressionOutput(data = decode, label = label, name = "loss"),
+ MAE = mx.symbol.MAERegressionOutput(data = decode, label = label, name = "loss"))
+ }
+ else out <- mask
+ }
+
+ else if (config == "one-to-one") {
- if (masking) mask <- mx.symbol.SequenceMask(data = rnn[[1]], use.sequence.length = TRUE, sequence_length = seq.mask, value = 0, name = "mask") else
+ if (masking) mask <- mx.symbol.SequenceMask(data = rnn[[1]], use.sequence.length = T, sequence_length = seq.mask, value = 0, name = "mask") else
mask <- mx.symbol.identity(data = rnn[[1]], name = "mask")
- mask = mx.symbol.reshape(mask, shape=c(num.hidden, -1))
-
- decode <- mx.symbol.FullyConnected(data=reshape,
- weight=cls.weight,
- bias=cls.bias,
- num.hidden=num.decode,
- name = "decode")
-
- label <- mx.symbol.reshape(data=label, shape=c(-1), name = "label_reshape")
+ mask = mx.symbol.swapaxes(data = mask, dim1 = 0, dim2 = 1, name = paste0(prefix, "swap_post"))
if (!is.null(loss_output)) {
- loss <- switch(loss_output,
- softmax = mx.symbol.SoftmaxOutput(data=decode, label=label, use_ignore = !ignore_label == -1, ignore_label = ignore_label, name = "loss"),
- linear = mx.symbol.LinearRegressionOutput(data=decode, label=label, name = "loss"),
- logictic = mx.symbol.LogisticRegressionOutput(data=decode, label=label, name = "loss"),
- MAE = mx.symbol.MAERegressionOutput(data=decode, label=label, name = "loss")
- )
- } else loss <- decode
+
+ mask <- mx.symbol.reshape(data = mask, shape = c(0, -1), reverse = TRUE)
+ label <- mx.symbol.reshape(data = label, shape = c(-1))
+
+ decode <- mx.symbol.FullyConnected(data = mask, weight = cls.weight, bias = cls.bias, num_hidden = num_decode,
+ flatten = TRUE, name = paste0(prefix, "decode"))
+
+ out <- switch(loss_output, softmax = mx.symbol.SoftmaxOutput(data = decode, label = label, use_ignore = !ignore_label == -1, ignore_label = ignore_label, name = "loss"),
+ linear = mx.symbol.LinearRegressionOutput(data = decode, label = label, name = "loss"),
+ logistic = mx.symbol.LogisticRegressionOutput(data = decode, label = label, name = "loss"),
+ MAE = mx.symbol.MAERegressionOutput(data = decode, label = label, name = "loss"))
+ } else out <- mask
}
- return(loss)
+ return(out)
}
-
# LSTM cell symbol
-lstm.cell <- function(num.hidden, indata, prev.state, param, seqidx, layeridx, dropout = 0) {
- i2h <- mx.symbol.FullyConnected(data = indata, weight = param$i2h.weight, bias = param$i2h.bias,
- num.hidden = num.hidden * 4, name = paste0("t", seqidx, ".l", layeridx, ".i2h"))
+lstm.cell <- function(num_hidden, indata, prev.state, param, seqidx, layeridx, dropout = 0, prefix = "") {
- if (dropout > 0)
- i2h <- mx.symbol.Dropout(data = i2h, p = dropout)
+ if (dropout > 0 && layeridx > 1)
+ indata <- mx.symbol.Dropout(data = indata, p = dropout)
+
+ i2h <- mx.symbol.FullyConnected(data = indata, weight = param$i2h.weight, bias = param$i2h.bias,
+ num_hidden = num_hidden * 4, name = paste0(prefix, "t", seqidx, ".l", layeridx, ".i2h"))
if (!is.null(prev.state)) {
h2h <- mx.symbol.FullyConnected(data = prev.state$h, weight = param$h2h.weight,
- bias = param$h2h.bias, num.hidden = num.hidden * 4,
- name = paste0("t", seqidx, ".l", layeridx, ".h2h"))
+ bias = param$h2h.bias, num_hidden = num_hidden * 4,
+ name = paste0(prefix, "t", seqidx, ".l", layeridx, ".h2h"))
gates <- i2h + h2h
} else {
gates <- i2h
}
- split.gates <- mx.symbol.split(gates, num.outputs = 4, axis = 1, squeeze.axis = FALSE,
- name = paste0("t", seqidx, ".l", layeridx, ".slice"))
+ split.gates <- mx.symbol.split(gates, num.outputs = 4, axis = 1, squeeze.axis = F,
+ name = paste0(prefix, "t", seqidx, ".l", layeridx, ".slice"))
in.gate <- mx.symbol.Activation(split.gates[[1]], act.type = "sigmoid")
in.transform <- mx.symbol.Activation(split.gates[[2]], act.type = "tanh")
@@ -136,42 +123,48 @@ lstm.cell <- function(num.hidden, indata, prev.state, param, seqidx, layeridx, d
next.h <- out.gate * mx.symbol.Activation(next.c, act.type = "tanh")
- return(list(c = next.c, h = next.h))
+ return(list(h = next.h, c = next.c))
}
+
# GRU cell symbol
-gru.cell <- function(num.hidden, indata, prev.state, param, seqidx, layeridx, dropout = 0) {
- i2h <- mx.symbol.FullyConnected(data = indata, weight = param$gates.i2h.weight,
- bias = param$gates.i2h.bias, num.hidden = num.hidden * 2,
- name = paste0("t", seqidx, ".l", layeridx, ".gates.i2h"))
+gru.cell <- function(num_hidden, indata, prev.state, param, seqidx, layeridx, dropout = 0, prefix)
+{
+ if (dropout > 0 && layeridx > 1)
+ indata <- mx.symbol.Dropout(data = indata, p = dropout)
- if (dropout > 0)
- i2h <- mx.symbol.Dropout(data = i2h, p = dropout)
+ i2h <- mx.symbol.FullyConnected(data = indata, weight = param$gates.i2h.weight,
+ bias = param$gates.i2h.bias, num_hidden = num_hidden * 2,
+ name = paste0(prefix, "t", seqidx, ".l", layeridx, ".gates.i2h"))
if (!is.null(prev.state)) {
h2h <- mx.symbol.FullyConnected(data = prev.state$h, weight = param$gates.h2h.weight,
- bias = param$gates.h2h.bias, num.hidden = num.hidden * 2,
- name = paste0("t", seqidx, ".l", layeridx, ".gates.h2h"))
+ bias = param$gates.h2h.bias, num_hidden = num_hidden * 2,
+ name = paste0(prefix, "t", seqidx, ".l", layeridx, ".gates.h2h"))
gates <- i2h + h2h
} else {
gates <- i2h
}
-
- split.gates <- mx.symbol.split(gates, num.outputs = 2, axis = 1, squeeze.axis = FALSE,
- name = paste0("t", seqidx, ".l", layeridx, ".split"))
+
+ split.gates <- mx.symbol.split(gates, num.outputs = 2, axis = 1, squeeze.axis = F,
+ name = paste0(prefix, "t", seqidx, ".l", layeridx, ".split"))
update.gate <- mx.symbol.Activation(split.gates[[1]], act.type = "sigmoid")
reset.gate <- mx.symbol.Activation(split.gates[[2]], act.type = "sigmoid")
htrans.i2h <- mx.symbol.FullyConnected(data = indata, weight = param$trans.i2h.weight,
- bias = param$trans.i2h.bias, num.hidden = num.hidden,
- name = paste0("t", seqidx, ".l", layeridx, ".trans.i2h"))
-
- h.after.reset <- reset.gate * (if (is.null(prev.state)) 0 else prev.state$h)
-
- htrans.h2h <- mx.symbol.FullyConnected(data = h.after.reset, weight = param$trans.h2h.weight,
- bias = param$trans.h2h.bias, num.hidden = num.hidden,
- name = paste0("t", seqidx, ".l", layeridx, ".trans.h2h"))
+ bias = param$trans.i2h.bias, num_hidden = num_hidden,
+ name = paste0(prefix, "t", seqidx, ".l", layeridx, ".trans.i2h"))
+
+ if (is.null(prev.state)) {
+ h.after.reset <- reset.gate * 0
+ } else {
+ h.after.reset <- prev.state$h * reset.gate
+ }
+
+ htrans.h2h <- mx.symbol.FullyConnected(data = h.after.reset, weight = param$trans.h2h.weight,
+ bias = param$trans.h2h.bias, num_hidden = num_hidden,
+ name = paste0(prefix, "t", seqidx, ".l", layeridx, ".trans.h2h"))
h.trans <- htrans.i2h + htrans.h2h
h.trans.active <- mx.symbol.Activation(h.trans, act.type = "tanh")
@@ -185,86 +178,116 @@ gru.cell <- function(num.hidden, indata, prev.state, param, seqidx, layeridx, dr
return(list(h = next.h))
}
-#
-#' unroll representation of RNN running on non CUDA device - under development
+
+#' unroll representation of RNN running on non CUDA device
+#'
+#' @param config Either seq-to-one or one-to-one
+#' @param cell_type Type of RNN cell: either gru or lstm
+#' @param num_rnn_layer int, number of stacked layers
+#' @param seq_len int, number of time steps to unroll
+#' @param num_hidden int, size of the state in each RNN layer
+#' @param num_embed int, default = NULL - no embedding. Dimension of the embedding vectors
+#' @param num_decode int, number of output variables in the decoding layer
+#' @param input_size int, number of levels in the data - only used for embedding
+#' @param dropout
#'
#' @export
-rnn.graph.unroll <- function(num.rnn.layer,
- seq.len,
- input.size = NULL,
- num.embed = NULL,
- num.hidden,
- num.decode,
+rnn.graph.unroll <- function(num_rnn_layer,
+ seq_len,
+ input_size = NULL,
+ num_embed = NULL,
+ num_hidden,
+ num_decode,
dropout = 0,
ignore_label = -1,
loss_output = NULL,
init.state = NULL,
config,
- cell.type = "lstm",
- masking = FALSE,
- output_last_state = FALSE) {
-
-
- if (!is.null(num.embed)) embed.weight <- mx.symbol.Variable("embed.weight")
+ cell_type = "lstm",
+ masking = F,
+ output_last_state = F,
+ prefix = "",
+ data_name = "data",
+ label_name = "label") {
+
+ if (!is.null(num_embed)) embed.weight <- mx.symbol.Variable(paste0(prefix, "embed.weight"))
+
+ # Initial state
+ if (is.null(init.state) & output_last_state) {
+ init.state <- lapply(1:num_rnn_layer, function(i) {
+ if (cell_type=="lstm") {
+ state <- list(h = mx.symbol.Variable(paste0("init_", prefix, i, "_h")),
+ c = mx.symbol.Variable(paste0("init_", prefix, i, "_c")))
+ } else if (cell_type=="gru") {
+ state <- list(h = mx.symbol.Variable(paste0("init_", prefix, i, "_h")))
+ }
+ return (state)
+ })
+ }
- cls.weight <- mx.symbol.Variable("cls.weight")
- cls.bias <- mx.symbol.Variable("cls.bias")
+ cls.weight <- mx.symbol.Variable(paste0(prefix, "cls.weight"))
+ cls.bias <- mx.symbol.Variable(paste0(prefix, "cls.bias"))
- param.cells <- lapply(seq_len(num.rnn.layer), function(i) {
+ param.cells <- lapply(1:num_rnn_layer, function(i) {
- if (cell.type=="lstm"){
- cell <- list(i2h.weight = mx.symbol.Variable(paste0("l", i, ".i2h.weight")),
- i2h.bias = mx.symbol.Variable(paste0("l", i, ".i2h.bias")),
- h2h.weight = mx.symbol.Variable(paste0("l", i, ".h2h.weight")),
- h2h.bias = mx.symbol.Variable(paste0("l", i, ".h2h.bias")))
- } else if (cell.type=="gru"){
- cell <- list(gates.i2h.weight = mx.symbol.Variable(paste0("l", i, ".gates.i2h.weight")),
- gates.i2h.bias = mx.symbol.Variable(paste0("l", i, ".gates.i2h.bias")),
- gates.h2h.weight = mx.symbol.Variable(paste0("l", i, ".gates.h2h.weight")),
- gates.h2h.bias = mx.symbol.Variable(paste0("l", i, ".gates.h2h.bias")),
- trans.i2h.weight = mx.symbol.Variable(paste0("l", i, ".trans.i2h.weight")),
- trans.i2h.bias = mx.symbol.Variable(paste0("l", i, ".trans.i2h.bias")),
- trans.h2h.weight = mx.symbol.Variable(paste0("l", i, ".trans.h2h.weight")),
- trans.h2h.bias = mx.symbol.Variable(paste0("l", i, ".trans.h2h.bias")))
+ if (cell_type=="lstm") {
+ cell <- list(i2h.weight = mx.symbol.Variable(paste0(prefix, "l", i, ".i2h.weight")),
+ i2h.bias = mx.symbol.Variable(paste0(prefix, "l", i, ".i2h.bias")),
+ h2h.weight = mx.symbol.Variable(paste0(prefix, "l", i, ".h2h.weight")),
+ h2h.bias = mx.symbol.Variable(paste0(prefix, "l", i, ".h2h.bias")))
+ } else if (cell_type=="gru") {
+ cell <- list(gates.i2h.weight = mx.symbol.Variable(paste0(prefix, "l", i, ".gates.i2h.weight")),
+ gates.i2h.bias = mx.symbol.Variable(paste0(prefix, "l", i, ".gates.i2h.bias")),
+ gates.h2h.weight = mx.symbol.Variable(paste0(prefix, "l", i, ".gates.h2h.weight")),
+ gates.h2h.bias = mx.symbol.Variable(paste0(prefix, "l", i, ".gates.h2h.bias")),
+ trans.i2h.weight = mx.symbol.Variable(paste0(prefix, "l", i, ".trans.i2h.weight")),
+ trans.i2h.bias = mx.symbol.Variable(paste0(prefix, "l", i, ".trans.i2h.bias")),
+ trans.h2h.weight = mx.symbol.Variable(paste0(prefix, "l", i, ".trans.h2h.weight")),
+ trans.h2h.bias = mx.symbol.Variable(paste0(prefix, "l", i, ".trans.h2h.bias")))
}
return (cell)
})
# embeding layer
- data <- mx.symbol.Variable("data")
- label <- mx.symbol.Variable("label")
- seq.mask <- mx.symbol.Variable("seq.mask")
+ data <- mx.symbol.Variable(data_name)
+ label <- mx.symbol.Variable(label_name)
+ seq.mask <- mx.symbol.Variable(paste0(prefix, "seq.mask"))
- if (!is.null(num.embed)) {
- data <- mx.symbol.Embedding(data = data, input_dim = input.size,
- weight=embed.weight, output_dim = num.embed, name = "embed")
+ data = mx.symbol.swapaxes(data = data, dim1 = 0, dim2 = 1, name = paste0(prefix, "swap_pre"))
+
+ if (!is.null(num_embed)) {
+ data <- mx.symbol.Embedding(data = data, input_dim = input_size,
+ weight=embed.weight, output_dim = num_embed, name = paste0(prefix, "embed"))
}
- data <- mx.symbol.split(data = data, axis = 0, num.outputs = seq.len, squeeze_axis = TRUE)
+ data <- mx.symbol.split(data = data, axis = 0, num.outputs = seq_len, squeeze_axis = T)
last.hidden <- list()
last.states <- list()
- for (seqidx in seq_len(seq.len)) {
+ for (seqidx in 1:seq_len) {
hidden <- data[[seqidx]]
- for (i in seq_len(num.rnn.layer)) {
+ for (i in 1:num_rnn_layer) {
- if (seqidx==1) prev.state<- init.state[[i]] else prev.state <- last.states[[i]]
+ if (seqidx==1) prev.state <- init.state[[i]] else
+ prev.state <- last.states[[i]]
- if (cell.type=="lstm") {
+ if (cell_type=="lstm") {
cell.symbol <- lstm.cell
- } else if (cell.type=="gru"){
+ } else if (cell_type=="gru"){
cell.symbol <- gru.cell
}
- next.state <- cell.symbol(num.hidden = num.hidden,
+ next.state <- cell.symbol(num_hidden = num_hidden,
indata = hidden,
prev.state = prev.state,
param = param.cells[[i]],
seqidx = seqidx,
layeridx = i,
- dropout = dropout)
+ dropout = dropout,
+ prefix = prefix)
+
hidden <- next.state$h
last.states[[i]] <- next.state
}
@@ -273,53 +296,61 @@ rnn.graph.unroll <- function(num.rnn.layer,
last.hidden <- c(last.hidden, hidden)
}
- # concat hidden units - concat seq.len blocks of dimension num.hidden x batch.size
- concat <- mx.symbol.concat(data = last.hidden, num.args = seq.len, dim = 0, name = "concat")
- concat <- mx.symbol.reshape(data = concat, shape = c(num.hidden, -1, seq.len), name = "rnn_reshape")
+ if (output_last_state) {
+ out.states = mx.symbol.Group(unlist(last.states))
+ }
- if (config=="seq-to-one"){
-
- if (masking) mask <- mx.symbol.SequenceLast(data=concat, use.sequence.length = T, sequence_length = seq.mask, name = "mask") else
- mask <- mx.symbol.SequenceLast(data=concat, use.sequence.length = F, name = "mask")
+ # concat hidden units - concat seq_len blocks of dimension num_hidden x batch.size
+ concat <- mx.symbol.concat(data = last.hidden, num.args = seq_len, dim = 0, name = paste0(prefix, "concat"))
+ concat <- mx.symbol.reshape(data = concat, shape = c(num_hidden, -1, seq_len), name = paste0(prefix, "rnn_reshape"))
+
+ if (config=="seq-to-one") {
- decode <- mx.symbol.FullyConnected(data = mask,
- weight = cls.weight,
- bias = cls.bias,
- num.hidden = num.decode,
- name = "decode")
+ if (masking) mask <- mx.symbol.SequenceLast(data=concat, use.sequence.length = T, sequence_length = seq.mask, name = paste0(prefix, "mask")) else
+ mask <- mx.symbol.SequenceLast(data=concat, use.sequence.length = F, name = paste0(prefix, "mask"))
if (!is.null(loss_output)) {
- loss <- switch(loss_output,
- softmax = mx.symbol.SoftmaxOutput(data=decode, label=label, use_ignore = !ignore_label == -1, ignore_label = ignore_label, name = "loss"),
- linear = mx.symbol.LinearRegressionOutput(data=decode, label=label, name = "loss"),
- logictic = mx.symbol.LogisticRegressionOutput(data=decode, label=label, name = "loss"),
- MAE = mx.symbol.MAERegressionOutput(data=decode, label=label, name = "loss")
+
+ decode <- mx.symbol.FullyConnected(data = mask,
+ weight = cls.weight,
+ bias = cls.bias,
+ num_hidden = num_decode,
+ name = paste0(prefix, "decode"))
+
+ out <- switch(loss_output,
+ softmax = mx.symbol.SoftmaxOutput(data=decode, label=label, use_ignore = !ignore_label == -1, ignore_label = ignore_label, name = paste0(prefix, "loss")),
+ linear = mx.symbol.LinearRegressionOutput(data=decode, label=label, name = paste0(prefix, "loss")),
+ logistic = mx.symbol.LogisticRegressionOutput(data=decode, label=label, name = paste0(prefix, "loss")),
+ MAE = mx.symbol.MAERegressionOutput(data=decode, label=label, name = paste0(prefix, "loss"))
)
- } else loss <- decode
+ } else out <- mask
} else if (config=="one-to-one"){
- if (masking) mask <- mx.symbol.SequenceMask(data = concat, use.sequence.length = T, sequence_length = seq.mask, value = 0, name = "mask") else
- mask <- mx.symbol.identity(data = concat, name = "mask")
-
- mask = mx.symbol.reshape(mask, shape=c(num.hidden, -1))
-
- decode <- mx.symbol.FullyConnected(data = mask,
- weight = cls.weight,
- bias = cls.bias,
- num.hidden = num.decode,
- name = "decode")
+ if (masking) mask <- mx.symbol.SequenceMask(data = concat, use.sequence.length = T, sequence_length = seq.mask, value = 0, name = paste0(prefix, "mask")) else
+ mask <- mx.symbol.identity(data = concat, name = paste0(prefix, "mask"))
- label <- mx.symbol.reshape(data = label, shape = -1, name = "label_reshape")
+ mask = mx.symbol.swapaxes(data = mask, dim1 = 0, dim2 = 1, name = paste0(prefix, "swap_post"))
if (!is.null(loss_output)) {
- loss <- switch(loss_output,
- softmax = mx.symbol.SoftmaxOutput(data=decode, label=label, use_ignore = !ignore_label == -1, ignore_label = ignore_label, name = "loss"),
- linear = mx.symbol.LinearRegressionOutput(data=decode, label=label, name = "loss"),
- logictic = mx.symbol.LogisticRegressionOutput(data=decode, label=label, name = "loss"),
- MAE = mx.symbol.MAERegressionOutput(data=decode, label=label, name = "loss")
+
+ mask <- mx.symbol.reshape(data = mask, shape = c(0, -1), reverse = TRUE)
+ label <- mx.symbol.reshape(data = label, shape = c(-1))
+
+ decode <- mx.symbol.FullyConnected(data = mask, weight = cls.weight, bias = cls.bias, num_hidden = num_decode,
+ flatten = T, name = paste0(prefix, "decode"))
+
+ out <- switch(loss_output,
+ softmax = mx.symbol.SoftmaxOutput(data=decode, label=label, use_ignore = !ignore_label == -1, ignore_label = ignore_label,
+ name = paste0(prefix, "loss")),
+ linear = mx.symbol.LinearRegressionOutput(data=decode, label=label, name = paste0(prefix, "loss")),
+ logistic = mx.symbol.LogisticRegressionOutput(data=decode, label=label, name = paste0(prefix, "loss")),
+ MAE = mx.symbol.MAERegressionOutput(data=decode, label=label, name = paste0(prefix, "loss"))
)
- } else loss <- decode
+ } else out <- mask
}
- return(loss)
+
+ if (output_last_state) {
+ return(mx.symbol.Group(c(out, out.states)))
+ } else return(out)
}
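
Editor's sketch, not part of the PR: the rewritten rnn.graph above takes
snake_case arguments (cell_type, num_rnn_layer, num_hidden, ...) in place
of the old dotted names. A call mirroring the updated aclImdb example
further down; the sizes are illustrative only.

library(mxnet)
sym <- rnn.graph(config = "seq-to-one",
                 cell_type = "lstm",
                 num_rnn_layer = 1,
                 num_embed = 2,           # embedding dimension
                 num_hidden = 6,          # state size per layer
                 num_decode = 2,          # two output classes
                 input_size = 100,        # vocabulary size (illustrative)
                 dropout = 0.5,
                 ignore_label = -1,
                 loss_output = "softmax",
                 output_last_state = FALSE,
                 masking = TRUE)
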
diff --git a/R-package/R/rnn.infer.R b/R-package/R/rnn.infer.R
index a22bae0eb3..588056f2eb 100644
--- a/R-package/R/rnn.infer.R
+++ b/R-package/R/rnn.infer.R
@@ -1,12 +1,12 @@
-#
+
#' Inference of RNN model
#'
#' @param infer.data Data iterator created by mx.io.bucket.iter
#' @param model Model used for inference
-#' @param ctx The element to mask
+#' @param ctx
#'
#' @export
-mx.infer.buckets <- function(infer.data, model, ctx = mx.cpu()) {
+mx.infer.rnn <- function(infer.data, model, ctx = mx.cpu()) {
### Initialise the iterator
infer.data$reset()
@@ -52,7 +52,7 @@ mx.infer.buckets <- function(infer.data, model, ctx = mx.cpu()) {
arg.params.fix <- arguments.ini[arg.params.fix.names]
# Grad request
- grad.req <- rep.int("null", length(arguments))
+ grad.req <- rep("null", length(arguments))
# Arg array order
update_names <- c(input.names, arg.params.fix.names, arg.params.names)
@@ -88,11 +88,22 @@ mx.infer.buckets <- function(infer.data, model, ctx = mx.cpu()) {
}
-
-### inference for one-to-one models
-mx.infer.buckets.one <- function(infer.data,
- symbol, arg.params, aux.params, input.params = NULL,
- ctx = mx.cpu()) {
+#' Inference for one-to-one fusedRNN (CUDA) models
+#'
+#' @param infer.data Data iterator created by mx.io.bucket.iter
+#' @param symbol Symbol used for inference
+#' @param arg.params
+#' @param aux.params
+#' @param input.params
+#' @param ctx
+#'
+#' @export
+mx.infer.rnn.one <- function(infer.data,
+ symbol,
+ arg.params,
+ aux.params,
+ input.params = NULL,
+ ctx = mx.cpu()) {
### Initialise the iterator
infer.data$reset()
@@ -138,7 +149,7 @@ mx.infer.buckets.one <- function(infer.data,
aux.params <- aux.params
# Grad request
- grad.req <- rep.int("null", length(arguments))
+ grad.req <- rep("null", length(arguments))
# Arg array order
update_names <- c(input.names, arg.params.fix.names, arg.params.names)
@@ -146,8 +157,8 @@ mx.infer.buckets.one <- function(infer.data,
# Initial binding
execs <- mx.symbol.bind(symbol = symbol,
- arg.arrays = c(dlist, arg.params.fix, arg.params)[arg_update_idx],
- aux.arrays = aux.params, ctx = ctx[[1]], grad.req = grad.req)
+ arg.arrays = c(dlist, arg.params.fix, arg.params)[arg_update_idx],
+ aux.arrays = aux.params, ctx = ctx[[1]], grad.req = grad.req)
# Initial input shapes - need to be adapted for multi-devices - divide highest
# dimension by device nb
@@ -159,8 +170,8 @@ mx.infer.buckets.one <- function(infer.data,
dlist <- infer.data$value()[input.names]
execs <- mx.symbol.bind(symbol = symbol,
- arg.arrays = c(dlist, execs$arg.arrays[arg.params.fix.names], execs$arg.arrays[arg.params.names])[arg_update_idx],
- aux.arrays = execs$aux.arrays, ctx = ctx[[1]], grad.req = grad.req)
+ arg.arrays = c(dlist, execs$arg.arrays[arg.params.fix.names], execs$arg.arrays[arg.params.names])[arg_update_idx],
+ aux.arrays = execs$aux.arrays, ctx = ctx[[1]], grad.req = grad.req)
mx.exec.forward(execs, is.train = FALSE)
@@ -175,3 +186,85 @@ mx.infer.buckets.one <- function(infer.data,
infer.data$reset()
return(out)
}
+
+
+#' Inference for one-to-one unroll models
+#'
+#' @param infer.data NDArray
+#' @param symbol Model used for inference
+#' @param num_hidden
+#' @param arg.params
+#' @param aux.params
+#' @param init_states
+#' @param ctx
+#'
+#' @export
+mx.infer.rnn.one.unroll <- function(infer.data,
+ symbol,
+ num_hidden,
+ arg.params,
+ aux.params,
+ init_states = NULL,
+ ctx = mx.cpu()) {
+
+ if (is.null(ctx))
+ ctx <- mx.ctx.default()
+ if (is.mx.context(ctx)) {
+ ctx <- list(ctx)
+ }
+
+ if (!is.list(ctx))
+ stop("ctx must be mx.context or list of mx.context")
+
+ ndevice <- length(ctx)
+
+ arguments <- symbol$arguments
+ input.names <- intersect(c("data", "label"), arguments)
+
+ input.shape <- list("data" = dim(infer.data), "label" = dim(infer.data))
+
+ # init_state_shapes
+ init_states_names <- arguments[startsWith(arguments, "init_")]
+ init_states_shapes = lapply(init_states_names, function(x) c(num_hidden, tail(input.shape[[1]], 1)))
+ names(init_states_shapes) <- init_states_names
+
+ shapes <- symbol$infer.shape(c(input.shape, init_states_shapes))
+
+ # initialize all arguments with zeros
+ arguments.ini <- lapply(shapes$arg.shapes, function(shape) {
+ mx.nd.zeros(shape = shape, ctx = mx.cpu())
+ })
+
+ dlist <- list("data" = infer.data, "label" = infer.data)
+
+ if (is.null(init_states)) {
+ init_states <- arguments.ini[init_states_names]
+ } else {
+ names(init_states) <- init_states_names
+ }
+
+ # remove potential duplicates arguments - if inference on CUDA RNN symbol
+ arg.params <- arg.params[setdiff(names(arg.params), c(input.names, init_states_names))]
+ arg.params.names <- names(arg.params)
+
+ # Aux params
+ aux.params <- aux.params
+
+ # Grad request
+ grad.req <- rep("null", length(arguments))
+
+ # Arg array order
+ update_names <- c(input.names, init_states_names, arg.params.names)
+ arg_update_idx <- match(arguments, update_names)
+
+ # Bind to exec
+ execs <- mxnet:::mx.symbol.bind(symbol = symbol,
+ arg.arrays = c(dlist, init_states, arg.params)[arg_update_idx],
+ aux.arrays = aux.params, ctx = ctx[[1]], grad.req = grad.req)
+
+ mx.exec.forward(execs, is.train = FALSE)
+
+ out <- lapply(execs$ref.outputs, function(out) mx.nd.copyto(out, mx.cpu()))
+
+ return(out)
+}
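
Editor's sketch, not part of the PR: the newly exported
mx.infer.rnn.one.unroll above binds an unrolled symbol once, zero-fills
any missing init_ states, and runs a single forward pass. The names and
shapes below are assumptions: sym is taken to be a one-to-one symbol built
with rnn.graph.unroll and output_last_state = TRUE, and model to hold
trained parameters.

seq_len <- 4
infer.data <- mx.nd.array(matrix(1, nrow = seq_len, ncol = 1))  # one dummy sequence

out <- mx.infer.rnn.one.unroll(infer.data = infer.data,
                               symbol = sym,
                               num_hidden = 6,
                               arg.params = model$arg.params,
                               aux.params = model$aux.params,
                               init_states = NULL,   # zero-initialized states
                               ctx = mx.cpu())

# With output_last_state = TRUE the first output is the decoded prediction
# and the remaining outputs are the last RNN states, which can be fed back
# through init_states for step-by-step generation.
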
diff --git a/example/rnn/bucket_R/aclImdb_lstm_classification.R b/example/rnn/bucket_R/aclImdb_lstm_classification.R
index bb5eaacf26..6c982c5cb4 100644
--- a/example/rnn/bucket_R/aclImdb_lstm_classification.R
+++ b/example/rnn/bucket_R/aclImdb_lstm_classification.R
@@ -1,10 +1,7 @@
require("mxnet")
-source("mx.io.bucket.iter.R")
-source("rnn.train.R")
-
-corpus_bucketed_train <- readRDS(file = "corpus_bucketed_train_100_200_300_500_800_left.rds")
-corpus_bucketed_test <- readRDS(file = "corpus_bucketed_test_100_200_300_500_800_left.rds")
+corpus_bucketed_train <- readRDS(file = "data/corpus_bucketed_train.rds")
+corpus_bucketed_test <- readRDS(file = "data/corpus_bucketed_test.rds")
vocab <- length(corpus_bucketed_test$dic)
@@ -23,20 +20,28 @@ mx.set.seed(0)
optimizer <- mx.opt.create("adadelta", rho = 0.92, epsilon = 1e-06, wd = 2e-04, clip_gradient = NULL,
rescale.grad = 1/batch.size)
-model_sentiment_lstm <- mx.rnn.buckets(train.data = train.data, begin.round = 1,
- num.round = num.round, ctx = mx.cpu(), metric = mx.metric.accuracy, optimizer = optimizer,
- num.rnn.layer = 2, num.embed = 16, num.hidden = 24, num.label = 2, input.size = vocab,
- initializer = mx.init.Xavier(rnd_type = "gaussian", factor_type = "in", magnitude = 2),
- dropout = 0.25, config = "seq-to-one", batch.end.callback = mx.callback.log.train.metric(period = 50),
- verbose = TRUE)
+bucket_list <- unique(c(train.data$bucket.names, eval.data$bucket.names))
-mx.model.save(model_sentiment_lstm, prefix = "model_sentiment_lstm", iteration = num.round)
+symbol_buckets <- sapply(bucket_list, function(seq) {
+ rnn.graph(config = "seq-to-one", cell_type = "lstm",
+ num_rnn_layer = 1, num_embed = 2, num_hidden = 6,
+ num_decode = 2, input_size = vocab, dropout = 0.5,
+ ignore_label = -1, loss_output = "softmax",
+ output_last_state = F, masking = T)
+})
-source("rnn.infer.R")
+model_sentiment_lstm <- mx.model.buckets(symbol = symbol_buckets,
+ train.data = train.data, eval.data = eval.data,
+ num.round = num.round, ctx = devices, verbose = FALSE,
+ metric = mx.metric.accuracy, optimizer = optimizer,
+ initializer = initializer,
+ batch.end.callback = NULL,
+ epoch.end.callback = epoch.end.callback)
+mx.model.save(model_sentiment_lstm, prefix = "model_sentiment_lstm", iteration = num.round)
model <- mx.model.load("model_sentiment_lstm", iteration = num.round)
-pred <- mx.rnn.infer.buckets(infer_iter = eval.data, model, "seq-to-one", ctx = mx.cpu())
+pred <- mx.infer.rnn(infer.data = eval.data, model = model, ctx = mx.cpu())
ypred <- max.col(t(as.array(pred)), tie = "first") - 1
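
Editor's sketch, not part of the PR: scoring the predictions from the
updated example. Pulling labels straight out of the buckets assumes
eval.data was created with shuffle = FALSE, so prediction order matches
bucket order.

label <- unlist(lapply(corpus_bucketed_test$buckets, function(x) x$label))
acc <- mean(label == ypred)
acc
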
diff --git a/example/rnn/bucket_R/data_preprocessing.R b/example/rnn/bucket_R/data_preprocessing.R
deleted file mode 100644
index c91e3fb5eb..0000000000
--- a/example/rnn/bucket_R/data_preprocessing.R
+++ /dev/null
@@ -1,170 +0,0 @@
-# download the IMDB dataset
-if (!file.exists("aclImdb_v1.tar.gz")) {
- download.file("http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
- "aclImdb_v1.tar.gz")
- untar("aclImdb_v1.tar.gz")
-}
-
-# install required packages
-list.of.packages <- c("readr", "dplyr", "stringr", "stringi")
-new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[, "Package"])]
-if (length(new.packages)) install.packages(new.packages)
-
-require("readr")
-require("dplyr")
-require("stringr")
-require("stringi")
-
-negative_train_list <- list.files("./aclImdb/train/neg/", full.names = T)
-positive_train_list <- list.files("./aclImdb/train/pos/", full.names = T)
-
-negative_test_list <- list.files("./aclImdb/test/neg/", full.names = T)
-positive_test_list <- list.files("./aclImdb/test/pos/", full.names = T)
-
-file_import <- function(file_list) {
- import <- sapply(file_list, read_file)
- return(import)
-}
-
-negative_train_raw <- file_import(negative_train_list)
-positive_train_raw <- file_import(positive_train_list)
-
-negative_test_raw <- file_import(negative_test_list)
-positive_test_raw <- file_import(positive_test_list)
-
-train_raw <- c(negative_train_raw, positive_train_raw)
-test_raw <- c(negative_test_raw, positive_test_raw)
-
-saveRDS(train_raw, file = "train_raw.rds")
-saveRDS(test_raw, file = "test_raw.rds")
-
-################################################################ Pre-process a corpus composed of a vector of sequences Build a dictionnary
-################################################################ removing too rare words
-text_pre_process <- function(corpus, count_threshold = 10, dic = NULL) {
- raw_vec <- corpus
- raw_vec <- stri_enc_toascii(str = raw_vec)
-
- ### remove non-printable characters
- raw_vec <- str_replace_all(string = raw_vec, pattern = "[^[:print:]]", replacement = "")
- raw_vec <- str_to_lower(string = raw_vec)
- raw_vec <- str_replace_all(string = raw_vec, pattern = "_", replacement = " ")
- raw_vec <- str_replace_all(string = raw_vec, pattern = "\\bbr\\b", replacement = "")
- raw_vec <- str_replace_all(string = raw_vec, pattern = "\\s+", replacement = " ")
- raw_vec <- str_trim(string = raw_vec)
-
- ### Split raw sequence vectors into lists of word vectors (one list element per
- ### sequence)
- word_vec_list <- stri_split_boundaries(raw_vec, type = "word", skip_word_none = T,
- skip_word_number = F, simplify = F)
-
- ### Build vocabulary
- if (is.null(dic)) {
- word_vec_unlist <- unlist(word_vec_list)
- word_vec_table <- sort(table(word_vec_unlist), decreasing = T)
- word_cutoff <- which.max(word_vec_table < count_threshold)
- word_keep <- names(word_vec_table)[1:(word_cutoff - 1)]
- stopwords <- c(letters, "an", "the", "br")
- word_keep <- setdiff(word_keep, stopwords)
- } else word_keep <- names(dic)[!dic == 0]
-
- ### Clean the sentences to keep only the curated list of words
- word_vec_list <- lapply(word_vec_list, function(x) x[x %in% word_keep])
-
- # sentence_vec<- stri_split_boundaries(raw_vec, type='sentence', simplify = T)
- word_vec_length <- lapply(word_vec_list, length) %>% unlist()
-
- ### Build dictionnary
- dic <- 1:length(word_keep)
- names(dic) <- word_keep
- dic <- c(`?` = 0, dic)
-
- ### reverse dictionnary
- rev_dic <- names(dic)
- names(rev_dic) <- dic
-
- return(list(word_vec_list = word_vec_list, dic = dic, rev_dic = rev_dic))
-}
-
-################################################################
-make_bucket_data <- function(word_vec_list, labels, dic, seq_len = c(225), right_pad = T) {
- ### Trunc sequence to max bucket length
- word_vec_list <- lapply(word_vec_list, head, n = max(seq_len))
-
- word_vec_length <- lapply(word_vec_list, length) %>% unlist()
- bucketID <- cut(word_vec_length, breaks = c(0, seq_len, Inf), include.lowest = T,
- labels = F)
- # table(bucketID)
-
- ### Right or Left side Padding Pad sequences to their bucket length with
- ### dictionnary 0-label
- word_vec_list_pad <- lapply(1:length(word_vec_list), function(x) {
- length(word_vec_list[[x]]) <- seq_len[bucketID[x]]
- word_vec_list[[x]][is.na(word_vec_list[[x]])] <- names(dic[1])
- if (right_pad == F)
- word_vec_list[[x]] <- rev(word_vec_list[[x]])
- return(word_vec_list[[x]])
- })
-
- ### Assign sequences to buckets and unroll them in order to be reshaped into arrays
- unrolled_arrays <- lapply(1:length(seq_len), function(x) unlist(word_vec_list_pad[bucketID ==
- x]))
-
- ### Assign labels to their buckets
- bucketed_labels <- lapply(1:length(seq_len), function(x) labels[bucketID == x])
- names(bucketed_labels) <- as.character(seq_len)
-
- ### Assign the dictionnary to each bucket terms
- unrolled_arrays_dic <- lapply(1:length(seq_len), function(x) dic[unrolled_arrays[[x]]])
-
- # length(splitted_arrays_dic[[1]]) Reshape into arrays having each sequence into
- # a column
- features_arrays <- lapply(1:length(seq_len), function(x) array(unrolled_arrays_dic[[x]],
- dim = c(seq_len[x], length(unrolled_arrays_dic[[x]])/seq_len[x])))
-
- features <- lapply(1:length(seq_len), function(x) features_arrays[[x]][1:seq_len[x],
- ])
- names(features) <- as.character(seq_len)
-
- ### Combine data and labels into buckets
- buckets <- lapply(1:length(seq_len), function(x) c(list(data = features[[x]]),
- list(label = bucketed_labels[[x]])))
- names(buckets) <- as.character(seq_len)
-
- ### reverse dictionnary
- rev_dic <- names(dic)
- names(rev_dic) <- dic
-
- return(list(buckets = buckets, dic = dic, rev_dic = rev_dic))
-}
-
-
-corpus_preprocessed_train <- text_pre_process(corpus = train_raw, count_threshold = 10,
- dic = NULL)
-
-# length(corpus_preprocessed_train$dic)
-
-corpus_preprocessed_test <- text_pre_process(corpus = test_raw, dic = corpus_preprocessed_train$dic)
-
-saveRDS(corpus_preprocessed_train, file = "corpus_preprocessed_train_10.rds")
-saveRDS(corpus_preprocessed_test, file = "corpus_preprocessed_test_10.rds")
-
-corpus_preprocessed_train <- readRDS(file = "corpus_preprocessed_train_10.rds")
-corpus_preprocessed_test <- readRDS(file = "corpus_preprocessed_test_10.rds")
-
-
-corpus_bucketed_train <- make_bucket_data(word_vec_list = corpus_preprocessed_train$word_vec_list,
- labels = rep(0:1, each = 12500), dic = corpus_preprocessed_train$dic, seq_len = c(100,
- 200, 300, 500, 800), right_pad = F)
-
-# lapply(corpus_bucketed_train$buckets, function(x) length(x[[2]]))
-
-
-corpus_bucketed_test <- make_bucket_data(word_vec_list = corpus_preprocessed_test$word_vec_list,
- labels = rep(0:1, each = 12500), dic = corpus_preprocessed_test$dic, seq_len = c(100,
- 200, 300, 500, 800), right_pad = F)
-
-# lapply(corpus_bucketed_test$buckets, function(x) length(x[[2]]))
-
-
-saveRDS(corpus_bucketed_train, file = "corpus_bucketed_train_100_200_300_500_800_left.rds")
-saveRDS(corpus_bucketed_test, file = "corpus_bucketed_test_100_200_300_500_800_left.rds")
diff --git a/example/rnn/bucket_R/data_preprocessing_seq_to_one.R b/example/rnn/bucket_R/data_preprocessing_seq_to_one.R
index 11c0a0ce4a..2ecb6ae8c4 100644
--- a/example/rnn/bucket_R/data_preprocessing_seq_to_one.R
+++ b/example/rnn/bucket_R/data_preprocessing_seq_to_one.R
@@ -2,7 +2,7 @@
if (!file.exists("data/aclImdb_v1.tar.gz")) {
download.file("http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
"data/aclImdb_v1.tar.gz")
- untar("data/aclImdb_v1.tar.gz")
+ untar("data/aclImdb_v1.tar.gz", exdir = "data/")
}
# install required packages
@@ -114,8 +114,8 @@ make_bucket_data <- function(word_vec_list, labels, dic, seq_len = c(225), right
# Reshape into arrays having each sequence into a row
features <- lapply(1:length(seq_len), function(x) {
- t(array(unrolled_arrays_dic[[x]],
- dim = c(seq_len[x], length(unrolled_arrays_dic[[x]])/seq_len[x])))
+ array(unrolled_arrays_dic[[x]],
+ dim = c(seq_len[x], length(unrolled_arrays_dic[[x]])/seq_len[x]))
})
names(features) <- as.character(seq_len)
@@ -141,7 +141,6 @@ corpus_preprocessed_test <- text_pre_process(corpus = test_raw, dic = corpus_pre
seq_length_dist <- unlist(lapply(corpus_preprocessed_train$word_vec_list, length))
quantile(seq_length_dist, 0:20/20)
-
# Save bucketed corpus
corpus_bucketed_train <- make_bucket_data(word_vec_list = corpus_preprocessed_train$word_vec_list,
labels = rep(0:1, each = 12500),
@@ -158,7 +157,6 @@ corpus_bucketed_test <- make_bucket_data(word_vec_list = corpus_preprocessed_tes
saveRDS(corpus_bucketed_train, file = "data/corpus_bucketed_train.rds")
saveRDS(corpus_bucketed_test, file = "data/corpus_bucketed_test.rds")
-
# Save non bucketed corpus
corpus_single_train <- make_bucket_data(word_vec_list = corpus_preprocessed_train$word_vec_list,
labels = rep(0:1, each = 12500),
diff --git a/example/rnn/bucket_R/gru.cell.R b/example/rnn/bucket_R/gru.cell.R
deleted file mode 100644
index 5932cdf17e..0000000000
--- a/example/rnn/bucket_R/gru.cell.R
+++ /dev/null
@@ -1,54 +0,0 @@
-# GRU cell symbol
-gru.cell <- function(num.hidden, indata, prev.state, param, seqidx, layeridx, dropout = 0,
- data_masking) {
- i2h <- mx.symbol.FullyConnected(data = indata, weight = param$gates.i2h.weight,
- bias = param$gates.i2h.bias, num.hidden = num.hidden * 2, name = paste0("t",
- seqidx, ".l", layeridx, ".gates.i2h"))
-
- if (dropout > 0)
- i2h <- mx.symbol.Dropout(data = i2h, p = dropout)
-
- if (!is.null(prev.state)) {
- h2h <- mx.symbol.FullyConnected(data = prev.state$h, weight = param$gates.h2h.weight,
- bias = param$gates.h2h.bias, num.hidden = num.hidden * 2, name = paste0("t",
- seqidx, ".l", layeridx, ".gates.h2h"))
- gates <- i2h + h2h
- } else {
- gates <- i2h
- }
-
- split.gates <- mx.symbol.split(gates, num.outputs = 2, axis = 1, squeeze.axis = F,
- name = paste0("t", seqidx, ".l", layeridx, ".split"))
-
- update.gate <- mx.symbol.Activation(split.gates[[1]], act.type = "sigmoid")
- reset.gate <- mx.symbol.Activation(split.gates[[2]], act.type = "sigmoid")
-
- htrans.i2h <- mx.symbol.FullyConnected(data = indata, weight = param$trans.i2h.weight,
- bias = param$trans.i2h.bias, num.hidden = num.hidden, name = paste0("t",
- seqidx, ".l", layeridx, ".trans.i2h"))
-
- if (is.null(prev.state)) {
- h.after.reset <- reset.gate * 0
- } else {
- h.after.reset <- prev.state$h * reset.gate
- }
-
- htrans.h2h <- mx.symbol.FullyConnected(data = h.after.reset, weight = param$trans.h2h.weight,
- bias = param$trans.h2h.bias, num.hidden = num.hidden, name = paste0("t",
- seqidx, ".l", layeridx, ".trans.h2h"))
-
- h.trans <- htrans.i2h + htrans.h2h
- h.trans.active <- mx.symbol.Activation(h.trans, act.type = "tanh")
-
- if (is.null(prev.state)) {
- next.h <- update.gate * h.trans.active
- } else {
- next.h <- prev.state$h + update.gate * (h.trans.active - prev.state$h)
- }
-
- ### Add a mask - using the mask_array approach
- data_mask_expand <- mx.symbol.Reshape(data = data_masking, shape = c(1, -2))
- next.h <- mx.symbol.broadcast_mul(lhs = next.h, rhs = data_mask_expand)
-
- return(list(h = next.h))
-}
diff --git a/example/rnn/bucket_R/lstm.cell.R b/example/rnn/bucket_R/lstm.cell.R
deleted file mode 100644
index 3c7b0e456d..0000000000
--- a/example/rnn/bucket_R/lstm.cell.R
+++ /dev/null
@@ -1,41 +0,0 @@
-# LSTM cell symbol
-lstm.cell <- function(num.hidden, indata, prev.state, param, seqidx, layeridx, dropout = 0,
- data_masking) {
- i2h <- mx.symbol.FullyConnected(data = indata, weight = param$i2h.weight, bias = param$i2h.bias,
- num.hidden = num.hidden * 4, name = paste0("t", seqidx, ".l", layeridx, ".i2h"))
-
- if (dropout > 0)
- i2h <- mx.symbol.Dropout(data = i2h, p = dropout)
-
- if (!is.null(prev.state)) {
- h2h <- mx.symbol.FullyConnected(data = prev.state$h, weight = param$h2h.weight,
- bias = param$h2h.bias, num.hidden = num.hidden * 4, name = paste0("t",
- seqidx, ".l", layeridx, ".h2h"))
- gates <- i2h + h2h
- } else {
- gates <- i2h
- }
-
- split.gates <- mx.symbol.split(gates, num.outputs = 4, axis = 1, squeeze.axis = F,
- name = paste0("t", seqidx, ".l", layeridx, ".slice"))
-
- in.gate <- mx.symbol.Activation(split.gates[[1]], act.type = "sigmoid")
- in.transform <- mx.symbol.Activation(split.gates[[2]], act.type = "tanh")
- forget.gate <- mx.symbol.Activation(split.gates[[3]], act.type = "sigmoid")
- out.gate <- mx.symbol.Activation(split.gates[[4]], act.type = "sigmoid")
-
- if (is.null(prev.state)) {
- next.c <- in.gate * in.transform
- } else {
- next.c <- (forget.gate * prev.state$c) + (in.gate * in.transform)
- }
-
- next.h <- out.gate * mx.symbol.Activation(next.c, act.type = "tanh")
-
- ### Add a mask - using the mask_array approach
- data_mask_expand <- mx.symbol.Reshape(data = data_masking, shape = c(1, -2))
- next.c <- mx.symbol.broadcast_mul(lhs = next.c, rhs = data_mask_expand)
- next.h <- mx.symbol.broadcast_mul(lhs = next.h, rhs = data_mask_expand)
-
- return(list(c = next.c, h = next.h))
-}
diff --git a/example/rnn/bucket_R/mx.io.bucket.iter.R b/example/rnn/bucket_R/mx.io.bucket.iter.R
deleted file mode 100644
index 61f87957ed..0000000000
--- a/example/rnn/bucket_R/mx.io.bucket.iter.R
+++ /dev/null
@@ -1,92 +0,0 @@
-BucketIter <- setRefClass("BucketIter", fields = c("buckets", "bucket.names", "batch.size",
- "data.mask.element", "shuffle", "bucket.plan", "bucketID", "epoch", "batch",
- "batch.per.epoch", "seed"), contains = "Rcpp_MXArrayDataIter", methods = list(initialize = function(buckets,
- batch.size, data.mask.element = 0, shuffle = FALSE, seed = 123) {
- .self$buckets <- buckets
- .self$bucket.names <- names(.self$buckets)
- .self$batch.size <- batch.size
- .self$data.mask.element <- data.mask.element
- .self$epoch <- 0
- .self$batch <- 0
- .self$shuffle <- shuffle
- .self$batch.per.epoch <- 0
- .self$bucket.plan <- NULL
- .self$bucketID <- NULL
- .self$seed <- seed
- .self
-}, reset = function() {
- buckets_nb <- length(bucket.names)
- buckets_id <- 1:buckets_nb
- buckets_size <- sapply(.self$buckets, function(x) {
- dim(x$data)[length(dim(x$data))]
- })
- batch_per_bucket <- floor(buckets_size/.self$batch.size)
- # Number of batches per epoch given the batch_size
- .self$batch.per.epoch <- sum(batch_per_bucket)
- .self$epoch <- .self$epoch + 1
- .self$batch <- 0
-
- if (.self$shuffle) {
- set.seed(.self$seed)
- bucket_plan_names <- sample(rep(names(batch_per_bucket), times = batch_per_bucket))
- .self$bucket.plan <- ave(bucket_plan_names == bucket_plan_names, bucket_plan_names,
- FUN = cumsum)
- names(.self$bucket.plan) <- bucket_plan_names
- ### Return first BucketID at reset for initialization of the model
- .self$bucketID <- .self$bucket.plan[1]
-
- .self$buckets <- lapply(.self$buckets, function(x) {
- shuffle_id <- sample(ncol(x$data))
- if (length(dim(x$label)) == 0) {
- list(data = x$data[, shuffle_id], label = x$label[shuffle_id])
- } else {
- list(data = x$data[, shuffle_id], label = x$label[, shuffle_id])
- }
- })
- } else {
- bucket_plan_names <- rep(names(batch_per_bucket), times = batch_per_bucket)
- .self$bucket.plan <- ave(bucket_plan_names == bucket_plan_names, bucket_plan_names,
- FUN = cumsum)
- names(.self$bucket.plan) <- bucket_plan_names
- }
-}, iter.next = function() {
- .self$batch <- .self$batch + 1
- .self$bucketID <- .self$bucket.plan[batch]
- if (.self$batch > .self$batch.per.epoch) {
- return(FALSE)
- } else {
- return(TRUE)
- }
-}, value = function() {
- # bucketID is a named integer: the integer indicates the batch id for the given
- # bucket (used to fetch appropriate samples within the bucket) the name is the a
- # character containing the sequence length of the bucket (used to unroll the rnn
- # to appropriate sequence length)
- idx <- (.self$bucketID - 1) * (.self$batch.size) + (1:batch.size)
- data <- .self$buckets[[names(.self$bucketID)]]$data[, idx, drop = F]
- data_mask_array <- (!data == 0)
- if (length(dim(.self$buckets[[names(.self$bucketID)]]$label)) == 0) {
- label <- .self$buckets[[names(.self$bucketID)]]$label[idx]
- } else {
- label <- .self$buckets[[names(.self$bucketID)]]$label[, idx, drop = F]
- }
- return(list(data = mx.nd.array(data), data.mask.array = mx.nd.array(data_mask_array),
- label = mx.nd.array(label)))
-}, finalize = function() {
-}))
-
-#
-#' Create Bucket Iter
-#'
-#' @param buckets The data array.
-#' @param batch.size The batch size used to pack the array.
-#' @param data.mask.element The element to mask
-#' @param shuffle Whether shuffle the data
-#' @param seed The random seed
-#'
-#' @export
-mx.io.bucket.iter <- function(buckets, batch.size, data.mask.element = 0, shuffle = FALSE,
- seed = 123) {
- return(BucketIter$new(buckets = buckets, batch.size = batch.size, data.mask.element = data.mask.element,
- shuffle = shuffle, seed = seed))
-}
diff --git a/example/rnn/bucket_R/rnn.R b/example/rnn/bucket_R/rnn.R
deleted file mode 100644
index ea02b959a7..0000000000
--- a/example/rnn/bucket_R/rnn.R
+++ /dev/null
@@ -1,208 +0,0 @@
-library(mxnet)
-
-source("lstm.cell.R")
-source("gru.cell.R")
-
-# unrolled RNN network
-rnn.unroll <- function(num.rnn.layer, seq.len, input.size, num.embed, num.hidden,
- num.label, dropout = 0, ignore_label = 0, init.state = NULL, config, cell.type = "lstm",
- output_last_state = F) {
- embed.weight <- mx.symbol.Variable("embed.weight")
- cls.weight <- mx.symbol.Variable("cls.weight")
- cls.bias <- mx.symbol.Variable("cls.bias")
-
- param.cells <- lapply(1:num.rnn.layer, function(i) {
- if (cell.type == "lstm") {
- cell <- list(i2h.weight = mx.symbol.Variable(paste0("l", i, ".i2h.weight")),
- i2h.bias = mx.symbol.Variable(paste0("l", i, ".i2h.bias")), h2h.weight = mx.symbol.Variable(paste0("l",
- i, ".h2h.weight")), h2h.bias = mx.symbol.Variable(paste0("l", i,
- ".h2h.bias")))
- } else if (cell.type == "gru") {
- cell <- list(gates.i2h.weight = mx.symbol.Variable(paste0("l", i, ".gates.i2h.weight")),
- gates.i2h.bias = mx.symbol.Variable(paste0("l", i, ".gates.i2h.bias")),
- gates.h2h.weight = mx.symbol.Variable(paste0("l", i, ".gates.h2h.weight")),
- gates.h2h.bias = mx.symbol.Variable(paste0("l", i, ".gates.h2h.bias")),
- trans.i2h.weight = mx.symbol.Variable(paste0("l", i, ".trans.i2h.weight")),
- trans.i2h.bias = mx.symbol.Variable(paste0("l", i, ".trans.i2h.bias")),
- trans.h2h.weight = mx.symbol.Variable(paste0("l", i, ".trans.h2h.weight")),
- trans.h2h.bias = mx.symbol.Variable(paste0("l", i, ".trans.h2h.bias")))
- }
- return(cell)
- })
-
- # embedding layer
- label <- mx.symbol.Variable("label")
- data <- mx.symbol.Variable("data")
- data_mask_array <- mx.symbol.Variable("data.mask.array")
- data_mask_array <- mx.symbol.stop_gradient(data_mask_array, name = "data.mask.array")
-
- embed <- mx.symbol.Embedding(data = data, input_dim = input.size, weight = embed.weight,
- output_dim = num.embed, name = "embed")
-
- wordvec <- mx.symbol.split(data = embed, axis = 1, num.outputs = seq.len, squeeze_axis = T)
- data_mask_split <- mx.symbol.split(data = data_mask_array, axis = 1, num.outputs = seq.len,
- squeeze_axis = T)
-
- last.hidden <- list()
- last.states <- list()
- decode <- list()
- softmax <- list()
- fc <- list()
-
- for (seqidx in 1:seq.len) {
- hidden <- wordvec[[seqidx]]
-
- for (i in 1:num.rnn.layer) {
- if (seqidx == 1) {
- prev.state <- init.state[[i]]
- } else {
- prev.state <- last.states[[i]]
- }
-
- if (cell.type == "lstm") {
- cell.symbol <- lstm.cell
- } else if (cell.type == "gru") {
- cell.symbol <- gru.cell
- }
-
- next.state <- cell.symbol(num.hidden = num.hidden, indata = hidden, prev.state = prev.state,
- param = param.cells[[i]], seqidx = seqidx, layeridx = i, dropout = dropout,
- data_masking = data_mask_split[[seqidx]])
- hidden <- next.state$h
- # if (dropout > 0) hidden <- mx.symbol.Dropout(data=hidden, p=dropout)
- last.states[[i]] <- next.state
- }
-
- # Decoding
- if (config == "one-to-one") {
- last.hidden <- c(last.hidden, hidden)
- }
- }
-
- if (config == "seq-to-one") {
- fc <- mx.symbol.FullyConnected(data = hidden, weight = cls.weight, bias = cls.bias,
- num.hidden = num.label)
-
- loss <- mx.symbol.SoftmaxOutput(data = fc, name = "sm", label = label, ignore_label = ignore_label)
-
- } else if (config == "one-to-one") {
- last.hidden_expand <- lapply(last.hidden, function(i) mx.symbol.expand_dims(i,
- axis = 1))
- concat <- mx.symbol.concat(last.hidden_expand, num.args = seq.len, dim = 1)
- reshape <- mx.symbol.Reshape(concat, shape = c(num.hidden, -1))
-
- fc <- mx.symbol.FullyConnected(data = reshape, weight = cls.weight, bias = cls.bias,
- num.hidden = num.label)
-
- label <- mx.symbol.reshape(data = label, shape = c(-1))
- loss <- mx.symbol.SoftmaxOutput(data = fc, name = "sm", label = label, ignore_label = ignore_label)
-
- }
-
- if (output_last_state) {
- group <- mx.symbol.Group(c(unlist(last.states), loss))
- return(group)
- } else {
- return(loss)
- }
-}
-
-########################################### mx.rnn.buckets
-mx.rnn.buckets <- function(train.data, eval.data = NULL, num.rnn.layer, num.hidden,
- num.embed, num.label, input.size, ctx = NULL, num.round = 1, initializer = mx.init.uniform(0.01),
- dropout = 0, config = "one-to-one", optimizer = "sgd", batch.end.callback = NULL,
- epoch.end.callback = NULL, begin.round = 1, metric = mx.metric.rmse, cell.type = "lstm",
- kvstore = "local", verbose = FALSE) {
-
- if (!train.data$iter.next()) {
- train.data$reset()
- if (!train.data$iter.next())
- stop("Empty train.data")
- }
-
- if (!is.null(eval.data)) {
- if (!eval.data$iter.next()) {
- eval.data$reset()
- if (!eval.data$iter.next())
- stop("Empty eval.data")
- }
- }
-
- if (is.null(ctx))
- ctx <- mx.ctx.default()
- if (is.mx.context(ctx)) {
- ctx <- list(ctx)
- }
- if (!is.list(ctx))
- stop("ctx must be mx.context or list of mx.context")
- if (is.character(optimizer)) {
- # rescale gradients by the batch size, taken from the training iterator
- batchsize <- train.data$batch.size
- optimizer <- mx.opt.create(optimizer, rescale.grad = (1/batchsize))
- }
-
- # get one unrolled RNN symbol per bucket (lstm or gru, per cell.type)
- sym_list <- sapply(train.data$bucket.names, function(x) {
- rnn.unroll(num.rnn.layer = num.rnn.layer, num.hidden = num.hidden, seq.len = as.integer(x),
- input.size = input.size, num.embed = num.embed, num.label = num.label,
- dropout = dropout, cell.type = cell.type, config = config)
- }, simplify = F, USE.NAMES = T)
-
- # set up the model using the current bucket's symbol
- symbol <- sym_list[[names(train.data$bucketID)]]
-
- arg.names <- symbol$arguments
- input.names <- c("data", "data.mask.array")
- input.shape <- sapply(input.names, function(n) {
- dim(train.data$value()[[n]])
- }, simplify = FALSE)
- output.names <- "label"
- output.shape <- sapply(output.names, function(n) {
- dim(train.data$value()[[n]])
- }, simplify = FALSE)
-
- params <- mx.model.init.params(symbol, input.shape, output.shape, initializer,
- mx.cpu())
-
- kvstore <- mxnet:::mx.model.create.kvstore(kvstore, params$arg.params, length(ctx),
- verbose = verbose)
-
- ### Execute training - rnn.model.R
- model <- mx.model.train.rnn.buckets(sym_list = sym_list, input.shape = input.shape,
- output.shape = output.shape, arg.params = params$arg.params, aux.params = params$aux.params,
- optimizer = optimizer, train.data = train.data, eval.data = eval.data, verbose = verbose,
- begin.round = begin.round, end.round = num.round, metric = metric, ctx = ctx,
- batch.end.callback = batch.end.callback, epoch.end.callback = epoch.end.callback,
- kvstore = kvstore)
-
- return(model)
-}
-
-
-# get the argument name of data and label
-mx.model.check.arguments <- function(symbol) {
- data <- NULL
- label <- NULL
- for (nm in arguments(symbol)) {
- if (mx.util.str.endswith(nm, "data")) {
- if (!is.null(data)) {
- stop("Multiple fields contains suffix data")
- } else {
- data <- nm
- }
- }
- if (mx.util.str.endswith(nm, "label")) {
- if (!is.null(label)) {
- stop("Multiple fields contains suffix label")
- } else {
- label <- nm
- }
- }
- }
- return(c(data, label))
-}
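
A hedged sketch of invoking mx.rnn.buckets on such an iterator; the
hyper-parameter values and the train_iter name are illustrative assumptions,
not taken from this PR:

    # train_iter: a mx.io.bucket.iter over integer-encoded, 0-padded sequences
    model <- mx.rnn.buckets(train.data = train_iter,
                            num.rnn.layer = 1,
                            num.hidden = 64,
                            num.embed = 32,
                            num.label = 2,
                            input.size = 10000,  # vocabulary size
                            ctx = mx.cpu(),
                            num.round = 5,
                            config = "seq-to-one",
                            cell.type = "lstm",
                            metric = mx.metric.accuracy)
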
diff --git a/example/rnn/bucket_R/rnn.infer.R b/example/rnn/bucket_R/rnn.infer.R
deleted file mode 100644
index 41488aac89..0000000000
--- a/example/rnn/bucket_R/rnn.infer.R
+++ /dev/null
@@ -1,79 +0,0 @@
-library(mxnet)
-
-source("rnn.R")
-
-mx.rnn.infer.buckets <- function(infer_iter, model, config, ctx = mx.cpu(), output_last_state = FALSE,
- init.state = NULL, cell.type = "lstm") {
- ### Infer parameters from model
- if (cell.type == "lstm") {
- num.rnn.layer <- round((length(model$arg.params) - 3)/4)
- num.hidden <- dim(model$arg.params$l1.h2h.weight)[1]
- } else if (cell.type == "gru") {
- num.rnn.layer <- round((length(model$arg.params) - 3)/8)
- num.hidden <- dim(model$arg.params$l1.gates.h2h.weight)[1]
- }
-
- input.size <- dim(model$arg.params$embed.weight)[2]
- num.embed <- dim(model$arg.params$embed.weight)[1]
- num.label <- dim(model$arg.params$cls.bias)
-
- ### Initialise the iterator
- infer_iter$reset()
- infer_iter$iter.next()
- batch_size <- infer_iter$batch.size
-
- # get unrolled lstm symbol
- sym_list <- sapply(infer_iter$bucket.names, function(x) {
- rnn.unroll(num.rnn.layer = num.rnn.layer, num.hidden = num.hidden, seq.len = as.integer(x),
- input.size = input.size, num.embed = num.embed, num.label = num.label,
- config = config, dropout = 0, init.state = init.state, cell.type = cell.type,
- output_last_state = output_last_state)
- }, simplify = F, USE.NAMES = T)
-
- symbol <- sym_list[[names(infer_iter$bucketID)]]
-
- input.shape <- lapply(infer_iter$value(), dim)
- input.shape <- input.shape[names(input.shape) %in% arguments(symbol)]
-
- infer_shapes <- symbol$infer.shape(input.shape)
- arg.params <- model$arg.params
- aux.params <- model$aux.params
-
- input.names <- names(input.shape)
- arg.names <- names(arg.params)
-
- # Grad request
- grad_req <- rep("null", length(symbol$arguments))
-
- # Arg array order
- update_names <- c(input.names, arg.names)
- arg_update_idx <- match(symbol$arguments, update_names)
-
- # Initial input arrays - to support multiple devices, the highest dimension
- # would need to be divided by the number of devices
- s <- sapply(input.shape, function(shape) {
- mx.nd.zeros(shape = shape, ctx = mx.cpu())
- })
-
- # bind once with zero-filled inputs; this executor holds the model parameters
- base.exec <- mxnet:::mx.symbol.bind(symbol = symbol, arg.arrays = c(s, arg.params)[arg_update_idx],
- aux.arrays = aux.params, ctx = ctx, grad.req = grad_req)
-
- packer <- mxnet:::mx.nd.arraypacker()
- infer_iter$reset()
- while (infer_iter$iter.next()) {
- # Get input data slice
- dlist <- infer_iter$value()[input.names]
-
- symbol <- sym_list[[names(infer_iter$bucketID)]]
-
- texec <- mxnet:::mx.symbol.bind(symbol = symbol, arg.arrays = c(dlist, base.exec$arg.arrays[arg.names])[arg_update_idx],
- aux.arrays = base.exec$aux.arrays, ctx = ctx, grad.req = grad_req)
-
- mx.exec.forward(texec, is.train = FALSE)
-
- out.preds <- mx.nd.copyto(texec$ref.outputs[[1]], mx.cpu())
- packer$push(out.preds)
- }
- infer_iter$reset()
- return(packer$get())
-}
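
For completeness, a sketch of running inference with the function above;
test_iter and model are assumed to come from the training example earlier,
and the output shape comment assumes the seq-to-one configuration:

    preds <- mx.rnn.infer.buckets(infer_iter = test_iter,
                                  model = model,
                                  config = "seq-to-one",
                                  ctx = mx.cpu())
    dim(preds)  # num.label x total number of samples
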
diff --git a/example/rnn/bucket_R/rnn.train.R b/example/rnn/bucket_R/rnn.train.R
deleted file mode 100644
index b833b2b1d3..0000000000
--- a/example/rnn/bucket_R/rnn.train.R
+++ /dev/null
@@ -1,206 +0,0 @@
-library(mxnet)
-
-source("rnn.R")
-
- # Internal function for multi-device training of bucketed RNNs
-mx.model.train.rnn.buckets <- function(ctx, sym_list, arg.params, aux.params, input.shape,
- output.shape, begin.round, end.round, optimizer, train.data, eval.data, metric,
- epoch.end.callback, batch.end.callback, kvstore, verbose = TRUE) {
- symbol <- sym_list[[names(train.data$bucketID)]]
-
- input.names <- names(input.shape)
- output.names <- names(output.shape)
- arg.names <- names(arg.params)
-
- ndevice <- length(ctx)
- if (verbose)
- message(paste0("Start training with ", ndevice, " devices"))
- input_slice <- mxnet:::mx.model.slice.shape(input.shape, ndevice)
- output_slice <- mxnet:::mx.model.slice.shape(output.shape, ndevice)
-
-
- # Grad request
- grad_req <- rep("write", length(symbol$arguments))
- # grad_null_idx <- match(c(input.names, output.names), symbol$arguments)
- grad_null_idx <- match(input.names, symbol$arguments)
- grad_req[grad_null_idx] <- "null"
-
- # Arg array order
- update_names <- c(input.names, output.names, arg.names)
- arg_update_idx <- match(symbol$arguments, update_names)
-
- train.execs <- lapply(1:ndevice, function(i) {
- s <- sapply(append(input_slice[[i]]$shape, output_slice[[i]]$shape), function(shape) {
- mx.nd.zeros(shape = shape, ctx = mx.cpu())
- })
- mxnet:::mx.symbol.bind(symbol = symbol, arg.arrays = c(s, arg.params)[arg_update_idx],
- aux.arrays = aux.params, ctx = mx.cpu(), grad.req = grad_req)
- })
-
- # KVStore-related setup
- params.index <- as.integer(mxnet:::mx.util.filter.null(lapply(1:length(train.execs[[1]]$ref.grad.arrays),
- function(k) {
- if (!is.null(train.execs[[1]]$ref.grad.arrays[[k]])) k else NULL
- })))
- update.on.kvstore <- FALSE
- if (!is.null(kvstore) && kvstore$update.on.kvstore) {
- update.on.kvstore <- TRUE
- kvstore$set.optimizer(optimizer)
- } else {
- updaters <- lapply(1:ndevice, function(i) {
- mx.opt.get.updater(optimizer, train.execs[[i]]$ref.arg.arrays)
- })
- }
-
- if (!is.null(kvstore)) {
- kvstore$init(params.index, train.execs[[1]]$ref.arg.arrays[params.index])
- }
-
- for (iteration in begin.round:end.round) {
- nbatch <- 0
- if (!is.null(metric)) {
- train.metric <- metric$init()
- }
- train.data$reset()
- while (train.data$iter.next()) {
- dlist <- train.data$value() #[input.names]
- symbol <- sym_list[[names(train.data$bucketID)]]
- slices <- lapply(1:ndevice, function(i) {
- s <- input_slice[[i]]
- ret <- sapply(names(dlist), function(n) {
- mxnet:::mx.nd.slice(dlist[[n]], s$begin, s$end)
- })
- return(ret)
- })
-
- train.execs <- lapply(1:ndevice, function(i) {
- s <- slices[[i]]
- mxnet:::mx.symbol.bind(symbol = symbol, arg.arrays = c(s, train.execs[[i]]$arg.arrays[arg.names])[arg_update_idx],
- aux.arrays = train.execs[[i]]$aux.arrays, ctx = ctx[[i]], grad.req = grad_req)
- })
-
- for (texec in train.execs) {
- mx.exec.forward(texec, is.train = TRUE)
- }
-
- out.preds <- lapply(train.execs, function(texec) {
- mx.nd.copyto(texec$ref.outputs[[1]], mx.cpu())
- })
-
- for (texec in train.execs) {
- mx.exec.backward(texec)
- }
-
- if (!is.null(kvstore)) {
- # push the gradient
- kvstore$push(params.index, lapply(train.execs, function(texec) {
- texec$ref.grad.arrays[params.index]
- }), -params.index)
- }
- if (update.on.kvstore) {
- # pull back weight
- kvstore$pull(params.index, lapply(train.execs, function(texec) {
- texec$ref.arg.arrays[params.index]
- }), -params.index)
- } else {
- # pull back gradient sums
- if (!is.null(kvstore)) {
- kvstore$pull(params.index, lapply(train.execs, function(texec) {
- texec$ref.grad.arrays[params.index]
- }), -params.index)
- }
- arg.blocks <- lapply(1:ndevice, function(i) {
- updaters[[i]](train.execs[[i]]$ref.arg.arrays, train.execs[[i]]$ref.grad.arrays)
- })
- for (i in 1:ndevice) {
- mx.exec.update.arg.arrays(train.execs[[i]], arg.blocks[[i]], skip.null = TRUE)
- }
- }
-
- # Update the evaluation metrics
- if (!is.null(metric)) {
- # train.metric <- metric$update(dlist$label, out.preds, train.metric)
- for (i in 1:ndevice) {
- train.metric <- metric$update(slices[[i]][[length(slices[[i]])]],
- out.preds[[i]], train.metric)
- }
- }
-
- nbatch <- nbatch + 1
-
- if (!is.null(batch.end.callback)) {
- batch.end.callback(iteration, nbatch, environment())
- }
- }
-
- if (!is.null(metric)) {
- result <- metric$get(train.metric)
- if (verbose)
- message(paste0("[", iteration, "] Train-", result$name, "=", result$value))
- }
-
- if (!is.null(eval.data)) {
- if (!is.null(metric)) {
- eval.metric <- metric$init()
- }
- eval.data$reset()
- while (eval.data$iter.next()) {
- # Get input data slice
- dlist <- eval.data$value() #[input.names]
- symbol <- sym_list[[names(eval.data$bucketID)]]
- slices <- lapply(1:ndevice, function(i) {
- s <- input_slice[[i]]
- ret <- sapply(names(dlist), function(n) {
- mxnet:::mx.nd.slice(dlist[[n]], s$begin, s$end)
- })
- return(ret)
- })
-
-
- train.execs <- lapply(1:ndevice, function(i) {
- s <- slices[[i]]
- mxnet:::mx.symbol.bind(symbol = symbol, arg.arrays = c(s, train.execs[[i]]$arg.arrays[arg.names])[arg_update_idx],
- aux.arrays = train.execs[[i]]$aux.arrays, ctx = ctx[[i]], grad.req = grad_req)
- })
-
- for (texec in train.execs) {
- mx.exec.forward(texec, is.train = FALSE)
- }
-
- # copy outputs to CPU
- out.preds <- lapply(train.execs, function(texec) {
- mx.nd.copyto(texec$ref.outputs[[1]], mx.cpu())
- })
-
- if (!is.null(metric)) {
- for (i in 1:ndevice) {
- eval.metric <- metric$update(slices[[i]][[length(slices[[i]])]],
- out.preds[[i]], eval.metric)
- }
- }
- }
-
- if (!is.null(metric)) {
- result <- metric$get(eval.metric)
- if (verbose) {
- message(paste0("[", iteration, "] Validation-", result$name, "=",
- result$value))
- }
- }
- } else {
- eval.metric <- NULL
- }
- # get the model out
- model <- mxnet:::mx.model.extract.model(symbol, train.execs)
-
- epoch_continue <- TRUE
- if (!is.null(epoch.end.callback)) {
- epoch_continue <- epoch.end.callback(iteration, 0, environment(), verbose = verbose)
- }
-
- if (!epoch_continue) {
- break
- }
- }
- return(model)
-}
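
As a usage note, the batch.end.callback and epoch.end.callback hooks are
invoked above as batch.end.callback(iteration, nbatch, environment()) and
epoch.end.callback(iteration, 0, environment(), verbose = verbose); a minimal
sketch of compatible custom callbacks (the logging period is an arbitrary
choice):

    # print progress every 50 batches
    batch_cb <- function(iteration, nbatch, env) {
      if (nbatch %% 50 == 0)
        message(paste0("Epoch ", iteration, ", batch ", nbatch))
    }

    # returning FALSE from the epoch callback stops training early
    epoch_cb <- function(iteration, nbatch, env, verbose = TRUE) {
      if (verbose) message(paste0("Finished epoch ", iteration))
      return(TRUE)
    }
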
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
With regards,
Apache Git Services