Posted to commits@mxnet.apache.org by sk...@apache.org on 2018/09/28 18:58:49 UTC
[incubator-mxnet] branch master updated: [MXNET-833] [R] Char-level
RNN tutorial fix (#12670)
This is an automated email from the ASF dual-hosted git repository.
skm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
new ca81535 [MXNET-833] [R] Char-level RNN tutorial fix (#12670)
ca81535 is described below
commit ca81535ee75e964a1a6e0905567e6872f86db830
Author: Anirudh <an...@gmail.com>
AuthorDate: Fri Sep 28 11:58:37 2018 -0700
[MXNET-833] [R] Char-level RNN tutorial fix (#12670)
* char RNN tutorial
* nit fixes
---
R-package/vignettes/CharRnnModel.Rmd | 465 ++++++++++++++-------------
docs/tutorials/r/charRnnModel.md | 593 +++++++++++++++++------------------
2 files changed, 529 insertions(+), 529 deletions(-)
diff --git a/R-package/vignettes/CharRnnModel.Rmd b/R-package/vignettes/CharRnnModel.Rmd
index 9d3fd5c..3c302bb 100644
--- a/R-package/vignettes/CharRnnModel.Rmd
+++ b/R-package/vignettes/CharRnnModel.Rmd
@@ -1,32 +1,28 @@
-# Char RNN Example
+# Character-level Language Model using RNN
-## Load Data
+This tutorial demonstrates how to build a character-level language model with a recurrent neural network (RNN) using the MXNet R package. You will need the following R packages to run this tutorial:
+ - readr
+ - stringr
+ - stringi
+ - mxnet
-First of all, load in the data and preprocess it.
+We will use the [tinyshakespeare](https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare) dataset to build this model.
-```{r}
-require(mxnet)
-```
-Set basic network parameters.
-
-```{r}
-batch.size = 32
-seq.len = 32
-num.hidden = 16
-num.embed = 16
-num.lstm.layer = 1
-num.round = 1
-learning.rate= 0.1
-wd=0.00001
-clip_gradient=1
-update.period = 1
+```R
+library("readr")
+library("stringr")
+library("stringi")
+library("mxnet")
```
-Download the data.
+## Preprocess and prepare the data
+
+Download the data:
+
-```{r}
+```R
download.data <- function(data_dir) {
dir.create(data_dir, showWarnings = FALSE)
if (!file.exists(paste0(data_dir,'input.txt'))) {
@@ -36,237 +32,262 @@ download.data <- function(data_dir) {
}
```
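+
+The function above only defines the download; it is not called anywhere else in this tutorial. A minimal usage sketch (an assumption on top of the tutorial text, and it relies on `wget` being installed, since the function passes `method='wget'`) that places `input.txt` in the working directory, where the later `make_data` call expects it:
+
+```R
+# Download the tinyshakespeare text into the current working directory as ./input.txt
+download.data("./")
+```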
-Make dictionary from text.
-
-```{r}
-make.dict <- function(text, max.vocab=10000) {
- text <- strsplit(text, '')
- dic <- list()
- idx <- 1
- for (c in text[[1]]) {
- if (!(c %in% names(dic))) {
- dic[[c]] <- idx
- idx <- idx + 1
- }
- }
- if (length(dic) == max.vocab - 1)
- dic[["UNKNOWN"]] <- idx
- cat(paste0("Total unique char: ", length(dic), "\n"))
- return (dic)
-}
-```
-
-Transfer text into data feature.
-
-```{r}
-make.data <- function(file.path, seq.len=32, max.vocab=10000, dic=NULL) {
- fi <- file(file.path, "r")
- text <- paste(readLines(fi), collapse="\n")
- close(fi)
-
- if (is.null(dic))
- dic <- make.dict(text, max.vocab)
- lookup.table <- list()
- for (c in names(dic)) {
- idx <- dic[[c]]
- lookup.table[[idx]] <- c
- }
-
- char.lst <- strsplit(text, '')[[1]]
- num.seq <- as.integer(length(char.lst) / seq.len)
- char.lst <- char.lst[1:(num.seq * seq.len)]
- data <- array(0, dim=c(seq.len, num.seq))
- idx <- 1
- for (i in 1:num.seq) {
- for (j in 1:seq.len) {
- if (char.lst[idx] %in% names(dic))
- data[j, i] <- dic[[ char.lst[idx] ]]-1
- else {
- data[j, i] <- dic[["UNKNOWN"]]-1
- }
- idx <- idx + 1
- }
- }
- return (list(data=data, dic=dic, lookup.table=lookup.table))
+Next, we transform the text into feature vectors to feed into the RNN model. The `make_data` function reads the dataset, converts it to ASCII, strips non-printable characters, splits it into individual characters, and groups them into sequences of length `seq.len`.
+
+
+```R
+make_data <- function(path, seq.len = 32, dic=NULL) {
+
+ text_vec <- read_file(file = path)
+ text_vec <- stri_enc_toascii(str = text_vec)
+ text_vec <- str_replace_all(string = text_vec, pattern = "[^[:print:]]", replacement = "")
+ text_vec <- strsplit(text_vec, '') %>% unlist
+
+ if (is.null(dic)) {
+ char_keep <- sort(unique(text_vec))
+ } else char_keep <- names(dic)[!dic == 0]
+
+ # Remove terms not part of dictionary
+ text_vec <- text_vec[text_vec %in% char_keep]
+
+ # Build dictionary
+ dic <- 1:length(char_keep)
+ names(dic) <- char_keep
+
+ # reverse dictionary
+ rev_dic <- names(dic)
+ names(rev_dic) <- dic
+
+ # Adjust by -1 to have a 1-lag for labels
+ num.seq <- (length(text_vec) - 1) %/% seq.len
+
+ features <- dic[text_vec[1:(seq.len * num.seq)]]
+ labels <- dic[text_vec[1:(seq.len * num.seq) + 1]]
+
+ features_array <- array(features, dim = c(seq.len, num.seq))
+ labels_array <- array(labels, dim = c(seq.len, num.seq))
+
+ return (list(features_array = features_array, labels_array = labels_array, dic = dic, rev_dic = rev_dic))
}
-```
-Move tail text.
-
-```{r}
-drop.tail <- function(X, batch.size) {
- shape <- dim(X)
- nstep <- as.integer(shape[2] / batch.size)
- return (X[, 1:(nstep * batch.size)])
-}
-```
-Get the label of X
-
-```{r}
-get.label <- function(X) {
- label <- array(0, dim=dim(X))
- d <- dim(X)[1]
- w <- dim(X)[2]
- for (i in 0:(w-1)) {
- for (j in 1:d) {
- label[i*d+j] <- X[(i*d+j)%%(w*d)+1]
- }
- }
- return (label)
-}
+seq.len <- 100
+data_prep <- make_data(path = "input.txt", seq.len = seq.len, dic = NULL)
```
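+
+It can help to sanity-check the shapes and the dictionary before building the iterators; a quick inspection sketch (the exact numbers depend on the downloaded text):
+
+```R
+dim(data_prep$features_array)  # seq.len rows x number of sequences
+length(data_prep$dic)          # vocabulary size (number of distinct characters kept)
+head(data_prep$dic)            # first few character -> integer id mappings
+```
+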
-Get training data and eval data
+Fetch the features and labels for training the model, and split the data into training and evaluation sets in a 9:1 ratio.
-```{r}
-download.data("./data/")
-ret <- make.data("./data/input.txt", seq.len=seq.len)
-X <- ret$data
-dic <- ret$dic
-lookup.table <- ret$lookup.table
+```R
+X <- data_prep$features_array
+Y <- data_prep$labels_array
+dic <- data_prep$dic
+rev_dic <- data_prep$rev_dic
vocab <- length(dic)
-shape <- dim(X)
+samples <- tail(dim(X), 1)
train.val.fraction <- 0.9
-size <- shape[2]
-X.train.data <- X[, 1:as.integer(size * train.val.fraction)]
-X.val.data <- X[, -(1:as.integer(size * train.val.fraction))]
-X.train.data <- drop.tail(X.train.data, batch.size)
-X.val.data <- drop.tail(X.val.data, batch.size)
+X.train.data <- X[, 1:as.integer(samples * train.val.fraction)]
+X.val.data <- X[, -(1:as.integer(samples * train.val.fraction))]
-X.train.label <- get.label(X.train.data)
-X.val.label <- get.label(X.val.data)
-
-X.train <- list(data=X.train.data, label=X.train.label)
-X.val <- list(data=X.val.data, label=X.val.label)
-```
+X.train.label <- Y[, 1:as.integer(samples * train.val.fraction)]
+X.val.label <- Y[, -(1:as.integer(samples * train.val.fraction))]
-## Training Model
-
-
-In `mxnet`, we have a function called `mx.lstm` so that users can build a general lstm model.
-
-```{r}
-model <- mx.lstm(X.train, X.val,
- ctx=mx.cpu(),
- num.round=num.round,
- update.period=update.period,
- num.lstm.layer=num.lstm.layer,
- seq.len=seq.len,
- num.hidden=num.hidden,
- num.embed=num.embed,
- num.label=vocab,
- batch.size=batch.size,
- input.size=vocab,
- initializer=mx.init.uniform(0.1),
- learning.rate=learning.rate,
- wd=wd,
- clip_gradient=clip_gradient)
+train_buckets <- list("100" = list(data = X.train.data, label = X.train.label))
+eval_buckets <- list("100" = list(data = X.val.data, label = X.val.label))
+train_buckets <- list(buckets = train_buckets, dic = dic, rev_dic = rev_dic)
+eval_buckets <- list(buckets = eval_buckets, dic = dic, rev_dic = rev_dic)
```
-## Inference from model
+Create iterators for the training and evaluation datasets.
-Some helper functions for random sample.
+```R
+vocab <- length(eval_buckets$dic)
-```{r}
-cdf <- function(weights) {
- total <- sum(weights)
- result <- c()
- cumsum <- 0
- for (w in weights) {
- cumsum <- cumsum+w
- result <- c(result, cumsum / total)
- }
- return (result)
-}
+batch.size <- 32
-search.val <- function(cdf, x) {
- l <- 1
- r <- length(cdf)
- while (l <= r) {
- m <- as.integer((l+r)/2)
- if (cdf[m] < x) {
- l <- m+1
- } else {
- r <- m-1
- }
- }
- return (l)
-}
+train.data <- mx.io.bucket.iter(buckets = train_buckets$buckets, batch.size = batch.size,
+ data.mask.element = 0, shuffle = TRUE)
-choice <- function(weights) {
- cdf.vals <- cdf(as.array(weights))
- x <- runif(1)
- idx <- search.val(cdf.vals, x)
- return (idx)
-}
+eval.data <- mx.io.bucket.iter(buckets = eval_buckets$buckets, batch.size = batch.size,
+ data.mask.element = 0, shuffle = FALSE)
```
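+
+MXNet-R data iterators generally expose `reset()`, `iter.next()` and `value()` methods; assuming the bucket iterator follows this interface, a batch can be inspected before training. This is only a sketch for verifying shapes:
+
+```R
+# Peek at one batch from the training iterator
+train.data$reset()
+if (train.data$iter.next()) {
+  batch <- train.data$value()
+  print(dim(batch$data))   # expected: seq.len x batch.size
+  print(dim(batch$label))
+}
+train.data$reset()
+```
+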
-We can use random output or fixed output by choosing largest probability.
-
-```{r}
-make.output <- function(prob, sample=FALSE) {
- if (!sample) {
- idx <- which.max(as.array(prob))
- }
- else {
- idx <- choice(prob)
- }
- return (idx)
-
+## Train the Model
+
+
+This model is a multi-layer RNN for sampling from a character-level language model. It uses a one-to-one configuration since, for each character, we want to predict the next one. For a sequence of length 100, there are also 100 labels, corresponding to the same sequence of characters offset by one position. The parameter `output_last_state` is set to TRUE so that the state of the RNN cells can be accessed when performing inference.
+
+
+```R
+rnn_graph_one_one <- rnn.graph(num_rnn_layer = 3,
+ num_hidden = 96,
+ input_size = vocab,
+ num_embed = 64,
+ num_decode = vocab,
+ dropout = 0.2,
+ ignore_label = 0,
+ cell_type = "lstm",
+ masking = F,
+ output_last_state = T,
+ loss_output = "softmax",
+ config = "one-to-one")
+
+graph.viz(rnn_graph_one_one, type = "graph", direction = "LR",
+ graph.height.px = 180, shape=c(100, 64))
+
+devices <- mx.cpu()
+
+initializer <- mx.init.Xavier(rnd_type = "gaussian", factor_type = "avg", magnitude = 3)
+
+optimizer <- mx.opt.create("adadelta", rho = 0.9, eps = 1e-5, wd = 1e-8,
+ clip_gradient = 5, rescale.grad = 1/batch.size)
+
+logger <- mx.metric.logger()
+epoch.end.callback <- mx.callback.log.train.metric(period = 1, logger = logger)
+batch.end.callback <- mx.callback.log.train.metric(period = 50)
+
+mx.metric.custom_nd <- function(name, feval) {
+ init <- function() {
+ c(0, 0)
+ }
+ update <- function(label, pred, state) {
+ m <- feval(label, pred)
+ state <- c(state[[1]] + 1, state[[2]] + m)
+ return(state)
+ }
+ get <- function(state) {
+ list(name = name, value = (state[[2]]/state[[1]]))
+ }
+ ret <- (list(init = init, update = update, get = get))
+ class(ret) <- "mx.metric"
+ return(ret)
}
-```
-In `mxnet`, we have a function called `mx.lstm.inference` so that users can build a inference from lstm model and then use function `mx.lstm.forward` to get forward output from the inference.
-
-```{r}
-infer.model <- mx.lstm.inference(num.lstm.layer=num.lstm.layer,
- input.size=vocab,
- num.hidden=num.hidden,
- num.embed=num.embed,
- num.label=vocab,
- arg.params=model$arg.params,
- ctx=mx.cpu())
+mx.metric.Perplexity <- mx.metric.custom_nd("Perplexity", function(label, pred) {
+ label <- mx.nd.reshape(label, shape = -1)
+ label_probs <- as.array(mx.nd.choose.element.0index(pred, label))
+ batch <- length(label_probs)
+ NLL <- -sum(log(pmax(1e-15, as.array(label_probs)))) / batch
+ Perplexity <- exp(NLL)
+ return(Perplexity)
+})
+
+model <- mx.model.buckets(symbol = rnn_graph_one_one,
+ train.data = train.data, eval.data = eval.data,
+ num.round = 20, ctx = devices, verbose = TRUE,
+ metric = mx.metric.Perplexity,
+ initializer = initializer, optimizer = optimizer,
+ batch.end.callback = NULL,
+ epoch.end.callback = epoch.end.callback)
+
+mx.model.save(model, prefix = "one_to_one_seq_model", iteration = 20)
```
-Generate a sequence of 75 chars using function `mx.lstm.forward`.
-
-```{r}
-start <- 'a'
-seq.len <- 75
-random.sample <- TRUE
-
-last.id <- dic[[start]]
-out <- "a"
-for (i in (1:(seq.len-1))) {
- input <- c(last.id-1)
- ret <- mx.lstm.forward(infer.model, input, FALSE)
- infer.model <- ret$model
- prob <- ret$prob
- last.id <- make.output(prob, random.sample)
- out <- paste0(out, lookup.table[[last.id]])
+ Start training with 1 devices
+ [1] Train-Perplexity=13.7040474322178
+ [1] Validation-Perplexity=7.94617194460922
+ [2] Train-Perplexity=6.57039815554525
+ [2] Validation-Perplexity=6.60806110658011
+ [3] Train-Perplexity=5.65360504501481
+ [3] Validation-Perplexity=6.18932770630876
+ [4] Train-Perplexity=5.32547285727298
+ [4] Validation-Perplexity=6.02198756798859
+ [5] Train-Perplexity=5.14373631472579
+ [5] Validation-Perplexity=5.8095658243407
+ [6] Train-Perplexity=5.03077673487379
+ [6] Validation-Perplexity=5.72582993567431
+ [7] Train-Perplexity=4.94453383291536
+ [7] Validation-Perplexity=5.6445258528126
+ [8] Train-Perplexity=4.88635290100261
+ [8] Validation-Perplexity=5.6730024536433
+ [9] Train-Perplexity=4.84205646230548
+ [9] Validation-Perplexity=5.50960780230982
+ [10] Train-Perplexity=4.80441673535513
+ [10] Validation-Perplexity=5.57002263750006
+ [11] Train-Perplexity=4.77763413242626
+ [11] Validation-Perplexity=5.55152143269169
+ [12] Train-Perplexity=4.74937775290777
+ [12] Validation-Perplexity=5.44968305351486
+ [13] Train-Perplexity=4.72824849541467
+ [13] Validation-Perplexity=5.50889348298234
+ [14] Train-Perplexity=4.70980846981694
+ [14] Validation-Perplexity=5.51473225859859
+ [15] Train-Perplexity=4.69685776886122
+ [15] Validation-Perplexity=5.45391985233811
+ [16] Train-Perplexity=4.67837107034824
+ [16] Validation-Perplexity=5.46636764997829
+ [17] Train-Perplexity=4.66866961934873
+ [17] Validation-Perplexity=5.44267086113492
+ [18] Train-Perplexity=4.65611469144194
+ [18] Validation-Perplexity=5.4290169469462
+ [19] Train-Perplexity=4.64614689879405
+ [19] Validation-Perplexity=5.44221549833917
+ [20] Train-Perplexity=4.63764001963654
+ [20] Validation-Perplexity=5.42114250842862
+
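+The epoch-end callback above also records these values in `logger`, so the learning curves can be plotted once training finishes; a minimal sketch, assuming the logger fields are populated as `logger$train` and `logger$eval`:
+
+```R
+# Plot training and validation perplexity per epoch
+plot(logger$train, type = "l", col = "red",
+     xlab = "epoch", ylab = "perplexity")
+lines(logger$eval, col = "blue")
+legend("topright", legend = c("train", "validation"),
+       col = c("red", "blue"), lty = 1)
+```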
+
+## Inference on the Model
+
+We now use the saved model to run inference, sampling text character by character to produce output that resembles the original training data.
+
+
+```R
+set.seed(0)
+model <- mx.model.load(prefix = "one_to_one_seq_model", iteration = 20)
+
+internals <- model$symbol$get.internals()
+sym_state <- internals$get.output(which(internals$outputs %in% "RNN_state"))
+sym_state_cell <- internals$get.output(which(internals$outputs %in% "RNN_state_cell"))
+sym_output <- internals$get.output(which(internals$outputs %in% "loss_output"))
+symbol <- mx.symbol.Group(sym_output, sym_state, sym_state_cell)
+
+infer_raw <- c("Thou ")
+infer_split <- dic[strsplit(infer_raw, '') %>% unlist]
+infer_length <- length(infer_split)
+
+infer.data <- mx.io.arrayiter(data = matrix(infer_split), label = matrix(infer_split),
+ batch.size = 1, shuffle = FALSE)
+
+infer <- mx.infer.rnn.one(infer.data = infer.data,
+ symbol = symbol,
+ arg.params = model$arg.params,
+ aux.params = model$aux.params,
+ input.params = NULL,
+ ctx = devices)
+
+pred_prob <- as.numeric(as.array(mx.nd.slice.axis(
+ infer$loss_output, axis=0, begin = infer_length-1, end = infer_length)))
+pred <- sample(length(pred_prob), prob = pred_prob, size = 1) - 1
+# Start the vector of sampled character ids with the first prediction
+predict <- pred
+
+for (i in 1:200) {
+
+ infer.data <- mx.io.arrayiter(data = as.matrix(pred), label = as.matrix(pred),
+ batch.size = 1, shuffle = FALSE)
+
+ infer <- mx.infer.rnn.one(infer.data = infer.data,
+ symbol = symbol,
+ arg.params = model$arg.params,
+ aux.params = model$aux.params,
+ input.params = list(rnn.state = infer[[2]],
+ rnn.state.cell = infer[[3]]),
+ ctx = devices)
+
+ pred_prob <- as.numeric(as.array(infer$loss_output))
+ pred <- sample(length(pred_prob), prob = pred_prob, size = 1, replace = T) - 1
+ predict <- c(predict, pred)
}
-message(out)
-```
-The result:
+predict_txt <- paste0(rev_dic[as.character(predict)], collapse = "")
+predict_txt_tot <- paste0(infer_raw, predict_txt, collapse = "")
+print(predict_txt_tot)
```
-ah not a drobl greens
-Settled asing lately sistering sounted to their hight
-```
-
-## Other RNN models
-
-In `mxnet`, other RNN models like custom RNN and gru is also provided.
-- For **custom RNN model**, you can replace `mx.lstm` with `mx.rnn` to train rnn model. Also, you can replace `mx.lstm.inference` and `mx.lstm.forward` with `mx.rnn.inference` and `mx.rnn.forward` to inference from rnn model and get forward result from the inference model.
-
-- For **GRU model**, you can replace `mx.lstm` with `mx.gru` to train gru model. Also, you can replace `mx.lstm.inference` and `mx.lstm.forward` with `mx.gru.inference` and `mx.gru.forward` to inference from gru model and get forward result from the inference model.
+ [1] "Thou NAknowledge thee my Comfort and his late she.FRIAR LAURENCE:Nothing a groats waterd forth. The lend he thank that;When she I am brother draw London: and not hear that know.BENVOLIO:How along, makes your "
-<!-- INSERT SOURCE DOWNLOAD BUTTONS -->
\ No newline at end of file
+<!-- INSERT SOURCE DOWNLOAD BUTTONS -->
diff --git a/docs/tutorials/r/charRnnModel.md b/docs/tutorials/r/charRnnModel.md
index 82e10a1..cb21e77 100644
--- a/docs/tutorials/r/charRnnModel.md
+++ b/docs/tutorials/r/charRnnModel.md
@@ -1,314 +1,293 @@
-Char RNN Example
-=============================================
-
-This tutorial shows how to use an LSTM model to build a char-level language model, and generate text from it. For demonstration purposes, we use a Shakespearean text. You can find the data on [GitHub](https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare).
-
-Load the Data
----------
-Load in the data and preprocess it:
-
- ```r
- require(mxnet)
- ```
-
- ```
- ## Loading required package: mxnet
- ```
-
- ```
- ## Loading required package: methods
- ```
-Set the basic network parameters:
-
- ```r
- batch.size = 32
- seq.len = 32
- num.hidden = 16
- num.embed = 16
- num.lstm.layer = 1
- num.round = 1
- learning.rate= 0.1
- wd=0.00001
- clip_gradient=1
- update.period = 1
- ```
-Download the data:
- ```r
- download.data <- function(data_dir) {
- dir.create(data_dir, showWarnings = FALSE)
- if (!file.exists(paste0(data_dir,'input.txt'))) {
- download.file(url='https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tinyshakespeare/input.txt',
- destfile=paste0(data_dir,'input.txt'), method='wget')
- }
- }
- ```
-Make a dictionary from the text:
-
- ```r
- make.dict <- function(text, max.vocab=10000) {
- text <- strsplit(text, '')
- dic <- list()
- idx <- 1
- for (c in text[[1]]) {
- if (!(c %in% names(dic))) {
- dic[[c]] <- idx
- idx <- idx + 1
- }
- }
- if (length(dic) == max.vocab - 1)
- dic[["UNKNOWN"]] <- idx
- cat(paste0("Total unique char: ", length(dic), "\n"))
- return (dic)
- }
- ```
-Transfer the text into a data feature:
-
- ```r
- make.data <- function(file.path, seq.len=32, max.vocab=10000, dic=NULL) {
- fi <- file(file.path, "r")
- text <- paste(readLines(fi), collapse="\n")
- close(fi)
-
- if (is.null(dic))
- dic <- make.dict(text, max.vocab)
- lookup.table <- list()
- for (c in names(dic)) {
- idx <- dic[[c]]
- lookup.table[[idx]] <- c
- }
-
- char.lst <- strsplit(text, '')[[1]]
- num.seq <- as.integer(length(char.lst) / seq.len)
- char.lst <- char.lst[1:(num.seq * seq.len)]
- data <- array(0, dim=c(seq.len, num.seq))
- idx <- 1
- for (i in 1:num.seq) {
- for (j in 1:seq.len) {
- if (char.lst[idx] %in% names(dic))
- data[j, i] <- dic[[ char.lst[idx] ]]-1
- else {
- data[j, i] <- dic[["UNKNOWN"]]-1
- }
- idx <- idx + 1
- }
- }
- return (list(data=data, dic=dic, lookup.table=lookup.table))
- }
- ```
-Move the tail text:
-
- ```r
- drop.tail <- function(X, batch.size) {
- shape <- dim(X)
- nstep <- as.integer(shape[2] / batch.size)
- return (X[, 1:(nstep * batch.size)])
- }
- ```
-Get the label of X:
-
- ```r
- get.label <- function(X) {
- label <- array(0, dim=dim(X))
- d <- dim(X)[1]
- w <- dim(X)[2]
- for (i in 0:(w-1)) {
- for (j in 1:d) {
- label[i*d+j] <- X[(i*d+j)%%(w*d)+1]
- }
- }
- return (label)
- }
- ```
-Get the training data and evaluation data:
-
- ```r
- download.data("./data/")
- ret <- make.data("./data/input.txt", seq.len=seq.len)
- ```
-
- ```
- ## Total unique char: 65
- ```
-
- ```r
- X <- ret$data
- dic <- ret$dic
- lookup.table <- ret$lookup.table
-
- vocab <- length(dic)
-
- shape <- dim(X)
- train.val.fraction <- 0.9
- size <- shape[2]
-
- X.train.data <- X[, 1:as.integer(size * train.val.fraction)]
- X.val.data <- X[, -(1:as.integer(size * train.val.fraction))]
- X.train.data <- drop.tail(X.train.data, batch.size)
- X.val.data <- drop.tail(X.val.data, batch.size)
-
- X.train.label <- get.label(X.train.data)
- X.val.label <- get.label(X.val.data)
-
- X.train <- list(data=X.train.data, label=X.train.label)
- X.val <- list(data=X.val.data, label=X.val.label)
- ```
-
-Train the Model
---------------
-In `mxnet`, we have a function called `mx.lstm` so that users can build a general LSTM model:
-
-
- ```r
- model <- mx.lstm(X.train, X.val,
- ctx=mx.cpu(),
- num.round=num.round,
- update.period=update.period,
- num.lstm.layer=num.lstm.layer,
- seq.len=seq.len,
- num.hidden=num.hidden,
- num.embed=num.embed,
- num.label=vocab,
- batch.size=batch.size,
- input.size=vocab,
- initializer=mx.init.uniform(0.1),
- learning.rate=learning.rate,
- wd=wd,
- clip_gradient=clip_gradient)
- ```
-
- ```
- ## Epoch [31] Train: NLL=3.53787130224343, Perp=34.3936275728271
- ## Epoch [62] Train: NLL=3.43087958036949, Perp=30.903813186055
- ## Epoch [93] Train: NLL=3.39771238228587, Perp=29.8956319855751
- ## Epoch [124] Train: NLL=3.37581711716687, Perp=29.2481732041015
- ## Epoch [155] Train: NLL=3.34523331338447, Perp=28.3671933405139
- ## Epoch [186] Train: NLL=3.30756356274787, Perp=27.31848454823
- ## Epoch [217] Train: NLL=3.25642968403829, Perp=25.9566978956055
- ## Epoch [248] Train: NLL=3.19825967486207, Perp=24.4898727477925
- ## Epoch [279] Train: NLL=3.14013971549828, Perp=23.1070950525017
- ## Epoch [310] Train: NLL=3.08747601837462, Perp=21.9216781782189
- ## Epoch [341] Train: NLL=3.04015595674863, Perp=20.9085038031042
- ## Epoch [372] Train: NLL=2.99839339255659, Perp=20.0532932584534
- ## Epoch [403] Train: NLL=2.95940091012609, Perp=19.2864139984503
- ## Epoch [434] Train: NLL=2.92603311380224, Perp=18.6534872738302
- ## Epoch [465] Train: NLL=2.89482756896395, Perp=18.0803835531869
- ## Epoch [496] Train: NLL=2.86668230478397, Perp=17.5786009078994
- ## Epoch [527] Train: NLL=2.84089368534943, Perp=17.1310684830416
- ## Epoch [558] Train: NLL=2.81725862932279, Perp=16.7309220880514
- ## Epoch [589] Train: NLL=2.79518870141492, Perp=16.3657166956952
- ## Epoch [620] Train: NLL=2.77445683225304, Perp=16.0299176962855
- ## Epoch [651] Train: NLL=2.75490970113174, Perp=15.719621374694
- ## Epoch [682] Train: NLL=2.73697900634351, Perp=15.4402696117257
- ## Epoch [713] Train: NLL=2.72059739336781, Perp=15.1893935780915
- ## Epoch [744] Train: NLL=2.70462837571585, Perp=14.948760335793
- ## Epoch [775] Train: NLL=2.68909904683828, Perp=14.7184093476224
- ## Epoch [806] Train: NLL=2.67460054451836, Perp=14.5065539595711
- ## Epoch [837] Train: NLL=2.66078997776751, Perp=14.3075873113043
- ## Epoch [868] Train: NLL=2.6476781639279, Perp=14.1212134100373
- ## Epoch [899] Train: NLL=2.63529039846876, Perp=13.9473621677371
- ## Epoch [930] Train: NLL=2.62367693518974, Perp=13.7863219168709
- ## Epoch [961] Train: NLL=2.61238282674384, Perp=13.6314936713501
- ## Iter [1] Train: Time: 10301.6818172932 sec, NLL=2.60536539345356, Perp=13.5361704272949
- ## Iter [1] Val: NLL=2.26093848746227, Perp=9.59208699731232
- ```
-
-Build Inference from the Model
---------------------
-Use the helper function for random sample:
-
- ```r
- cdf <- function(weights) {
- total <- sum(weights)
- result <- c()
- cumsum <- 0
- for (w in weights) {
- cumsum <- cumsum+w
- result <- c(result, cumsum / total)
- }
- return (result)
- }
+# Character-level Language Model using RNN
- search.val <- function(cdf, x) {
- l <- 1
- r <- length(cdf)
- while (l <= r) {
- m <- as.integer((l+r)/2)
- if (cdf[m] < x) {
- l <- m+1
- } else {
- r <- m-1
- }
- }
- return (l)
- }
- choice <- function(weights) {
- cdf.vals <- cdf(as.array(weights))
- x <- runif(1)
- idx <- search.val(cdf.vals, x)
- return (idx)
- }
- ```
-Use random output or fixed output by choosing the greatest probability:
-
- ```r
- make.output <- function(prob, sample=FALSE) {
- if (!sample) {
- idx <- which.max(as.array(prob))
- }
- else {
- idx <- choice(prob)
- }
- return (idx)
+This tutorial demonstrates how to build a character-level language model with a recurrent neural network (RNN) using the MXNet R package. You will need the following R packages to run this tutorial:
+ - readr
+ - stringr
+ - stringi
+ - mxnet
- }
- ```
-
-In `mxnet`, we have a function called `mx.lstm.inference` so that users can build an inference from an LSTM model, and then use the `mx.lstm.forward` function to get forward output from the inference.
-
-Build an inference from the model:
-
- ```r
- infer.model <- mx.lstm.inference(num.lstm.layer=num.lstm.layer,
- input.size=vocab,
- num.hidden=num.hidden,
- num.embed=num.embed,
- num.label=vocab,
- arg.params=model$arg.params,
- ctx=mx.cpu())
- ```
-Generate a sequence of 75 characters using the `mx.lstm.forward` function:
-
- ```r
- start <- 'a'
- seq.len <- 75
- random.sample <- TRUE
-
- last.id <- dic[[start]]
- out <- "a"
- for (i in (1:(seq.len-1))) {
- input <- c(last.id-1)
- ret <- mx.lstm.forward(infer.model, input, FALSE)
- infer.model <- ret$model
- prob <- ret$prob
- last.id <- make.output(prob, random.sample)
- out <- paste0(out, lookup.table[[last.id]])
- }
- cat (paste0(out, "\n"))
- ```
-The result:
+We will use the [tinyshakespeare](https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare) dataset to build this model.
- ```
- ah not a drobl greens
- Settled asing lately sistering sounted to their hight
- ```
-Create Other RNN Models
-----------------
-In `mxnet`, other RNN models, like custom RNN and GRU, are also provided:
+```R
+library("readr")
+library("stringr")
+library("stringi")
+library("mxnet")
+```
-- For a custom RNN model, you can replace `mx.lstm` with `mx.rnn` to train an RNN model. You can replace `mx.lstm.inference` and `mx.lstm.forward` with `mx.rnn.inference` and `mx.rnn.forward` to build inference from an RNN model and get the forward result from the inference model.
-- For a GRU model, you can replace `mx.lstm` with `mx.gru` to train a GRU model. You can replace `mx.lstm.inference` and `mx.lstm.forward` with `mx.gru.inference` and `mx.gru.forward` to build inference from a GRU model and get the forward result from the inference model.
+## Preprocess and prepare the data
-## Next Steps
-* [MXNet tutorials index](http://mxnet.io/tutorials/index.html)
+Download the data:
+
+
+```R
+download.data <- function(data_dir) {
+ dir.create(data_dir, showWarnings = FALSE)
+ if (!file.exists(paste0(data_dir,'input.txt'))) {
+ download.file(url='https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tinyshakespeare/input.txt',
+ destfile=paste0(data_dir,'input.txt'), method='wget')
+ }
+}
+```
+
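+The function above only defines the download and is not called anywhere else in the tutorial; a minimal usage sketch (it assumes `wget` is available, since the function passes `method='wget'`) that places `input.txt` in the working directory, where the later `make_data` call expects it:
+
+```R
+# Download the tinyshakespeare text into the current working directory as ./input.txt
+download.data("./")
+```
+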
+Next, we transform the text into feature vectors to feed into the RNN model. The `make_data` function reads the dataset, converts it to ASCII, strips non-printable characters, splits it into individual characters, and groups them into sequences of length `seq.len`.
+
+
+```R
+make_data <- function(path, seq.len = 32, dic=NULL) {
+
+ text_vec <- read_file(file = path)
+ text_vec <- stri_enc_toascii(str = text_vec)
+ text_vec <- str_replace_all(string = text_vec, pattern = "[^[:print:]]", replacement = "")
+ text_vec <- strsplit(text_vec, '') %>% unlist
+
+ if (is.null(dic)) {
+ char_keep <- sort(unique(text_vec))
+ } else char_keep <- names(dic)[!dic == 0]
+
+ # Remove terms not part of dictionary
+ text_vec <- text_vec[text_vec %in% char_keep]
+
+ # Build dictionary
+ dic <- 1:length(char_keep)
+ names(dic) <- char_keep
+
+ # reverse dictionary
+ rev_dic <- names(dic)
+ names(rev_dic) <- dic
+
+ # Adjust by -1 to have a 1-lag for labels
+ num.seq <- (length(text_vec) - 1) %/% seq.len
+
+ features <- dic[text_vec[1:(seq.len * num.seq)]]
+ labels <- dic[text_vec[1:(seq.len*num.seq) + 1]]
+
+ features_array <- array(features, dim = c(seq.len, num.seq))
+ labels_array <- array(labels, dim = c(seq.len, num.seq))
+
+ return (list(features_array = features_array, labels_array = labels_array, dic = dic, rev_dic = rev_dic))
+}
+
+
+seq.len <- 100
+data_prep <- make_data(path = "input.txt", seq.len = seq.len, dic=NULL)
+```
+
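+As a quick sanity check before building the iterators, the shapes and the dictionary can be inspected (the exact numbers depend on the downloaded text):
+
+```R
+dim(data_prep$features_array)  # seq.len rows x number of sequences
+length(data_prep$dic)          # vocabulary size
+head(data_prep$dic)            # first few character -> integer id mappings
+```
+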
+Fetch the features and labels for training the model, and split the data into training and evaluation sets in a 9:1 ratio.
+
+
+```R
+X <- data_prep$features_array
+Y <- data_prep$labels_array
+dic <- data_prep$dic
+rev_dic <- data_prep$rev_dic
+vocab <- length(dic)
+
+samples <- tail(dim(X), 1)
+train.val.fraction <- 0.9
+
+X.train.data <- X[, 1:as.integer(samples * train.val.fraction)]
+X.val.data <- X[, -(1:as.integer(samples * train.val.fraction))]
+
+X.train.label <- Y[, 1:as.integer(samples * train.val.fraction)]
+X.val.label <- Y[, -(1:as.integer(samples * train.val.fraction))]
+
+train_buckets <- list("100" = list(data = X.train.data, label = X.train.label))
+eval_buckets <- list("100" = list(data = X.val.data, label = X.val.label))
+
+train_buckets <- list(buckets = train_buckets, dic = dic, rev_dic = rev_dic)
+eval_buckets <- list(buckets = eval_buckets, dic = dic, rev_dic = rev_dic)
+```
+
+Create iterators for the training and evaluation datasets.
+
+
+```R
+vocab <- length(eval_buckets$dic)
+
+batch.size <- 32
+
+train.data <- mx.io.bucket.iter(buckets = train_buckets$buckets, batch.size = batch.size,
+ data.mask.element = 0, shuffle = TRUE)
+
+eval.data <- mx.io.bucket.iter(buckets = eval_buckets$buckets, batch.size = batch.size,
+ data.mask.element = 0, shuffle = FALSE)
+```
+
+## Train the Model
+
+
+This model is a multi-layer RNN for sampling from a character-level language model. It uses a one-to-one configuration since, for each character, we want to predict the next one. For a sequence of length 100, there are also 100 labels, corresponding to the same sequence of characters offset by one position. The parameter `output_last_state` is set to TRUE so that the state of the RNN cells can be accessed when performing inference.
+
+
+```R
+rnn_graph_one_one <- rnn.graph(num_rnn_layer = 3,
+ num_hidden = 96,
+ input_size = vocab,
+ num_embed = 64,
+ num_decode = vocab,
+ dropout = 0.2,
+ ignore_label = 0,
+ cell_type = "lstm",
+ masking = F,
+ output_last_state = T,
+ loss_output = "softmax",
+ config = "one-to-one")
+
+graph.viz(rnn_graph_one_one, type = "graph", direction = "LR",
+ graph.height.px = 180, shape=c(100, 64))
+
+devices <- mx.cpu()
+
+initializer <- mx.init.Xavier(rnd_type = "gaussian", factor_type = "avg", magnitude = 3)
+
+optimizer <- mx.opt.create("adadelta", rho = 0.9, eps = 1e-5, wd = 1e-8,
+ clip_gradient = 5, rescale.grad = 1/batch.size)
+
+logger <- mx.metric.logger()
+epoch.end.callback <- mx.callback.log.train.metric(period = 1, logger = logger)
+batch.end.callback <- mx.callback.log.train.metric(period = 50)
+
+mx.metric.custom_nd <- function(name, feval) {
+ init <- function() {
+ c(0, 0)
+ }
+ update <- function(label, pred, state) {
+ m <- feval(label, pred)
+ state <- c(state[[1]] + 1, state[[2]] + m)
+ return(state)
+ }
+ get <- function(state) {
+ list(name=name, value = (state[[2]] / state[[1]]))
+ }
+ ret <- (list(init = init, update = update, get = get))
+ class(ret) <- "mx.metric"
+ return(ret)
+}
+
+mx.metric.Perplexity <- mx.metric.custom_nd("Perplexity", function(label, pred) {
+ label <- mx.nd.reshape(label, shape = -1)
+ label_probs <- as.array(mx.nd.choose.element.0index(pred, label))
+ batch <- length(label_probs)
+ NLL <- -sum(log(pmax(1e-15, as.array(label_probs)))) / batch
+ Perplexity <- exp(NLL)
+ return(Perplexity)
+})
+
+model <- mx.model.buckets(symbol = rnn_graph_one_one,
+ train.data = train.data, eval.data = eval.data,
+ num.round = 20, ctx = devices, verbose = TRUE,
+ metric = mx.metric.Perplexity,
+ initializer = initializer, optimizer = optimizer,
+ batch.end.callback = NULL,
+ epoch.end.callback = epoch.end.callback)
+
+mx.model.save(model, prefix = "one_to_one_seq_model", iteration = 20)
+```
+
+ Start training with 1 devices
+ [1] Train-Perplexity=13.7040474322178
+ [1] Validation-Perplexity=7.94617194460922
+ [2] Train-Perplexity=6.57039815554525
+ [2] Validation-Perplexity=6.60806110658011
+ [3] Train-Perplexity=5.65360504501481
+ [3] Validation-Perplexity=6.18932770630876
+ [4] Train-Perplexity=5.32547285727298
+ [4] Validation-Perplexity=6.02198756798859
+ [5] Train-Perplexity=5.14373631472579
+ [5] Validation-Perplexity=5.8095658243407
+ [6] Train-Perplexity=5.03077673487379
+ [6] Validation-Perplexity=5.72582993567431
+ [7] Train-Perplexity=4.94453383291536
+ [7] Validation-Perplexity=5.6445258528126
+ [8] Train-Perplexity=4.88635290100261
+ [8] Validation-Perplexity=5.6730024536433
+ [9] Train-Perplexity=4.84205646230548
+ [9] Validation-Perplexity=5.50960780230982
+ [10] Train-Perplexity=4.80441673535513
+ [10] Validation-Perplexity=5.57002263750006
+ [11] Train-Perplexity=4.77763413242626
+ [11] Validation-Perplexity=5.55152143269169
+ [12] Train-Perplexity=4.74937775290777
+ [12] Validation-Perplexity=5.44968305351486
+ [13] Train-Perplexity=4.72824849541467
+ [13] Validation-Perplexity=5.50889348298234
+ [14] Train-Perplexity=4.70980846981694
+ [14] Validation-Perplexity=5.51473225859859
+ [15] Train-Perplexity=4.69685776886122
+ [15] Validation-Perplexity=5.45391985233811
+ [16] Train-Perplexity=4.67837107034824
+ [16] Validation-Perplexity=5.46636764997829
+ [17] Train-Perplexity=4.66866961934873
+ [17] Validation-Perplexity=5.44267086113492
+ [18] Train-Perplexity=4.65611469144194
+ [18] Validation-Perplexity=5.4290169469462
+ [19] Train-Perplexity=4.64614689879405
+ [19] Validation-Perplexity=5.44221549833917
+ [20] Train-Perplexity=4.63764001963654
+ [20] Validation-Perplexity=5.42114250842862
+
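+The epoch-end callback also records these values in `logger`; a minimal sketch for plotting the learning curves, assuming the fields are populated as `logger$train` and `logger$eval`:
+
+```R
+plot(logger$train, type = "l", col = "red", xlab = "epoch", ylab = "perplexity")
+lines(logger$eval, col = "blue")
+legend("topright", legend = c("train", "validation"), col = c("red", "blue"), lty = 1)
+```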
+
+## Inference on the Model
+
+We now use the saved model to run inference, sampling text character by character to produce output that resembles the original training data.
+
+
+```R
+set.seed(0)
+model <- mx.model.load(prefix = "one_to_one_seq_model", iteration = 20)
+
+internals <- model$symbol$get.internals()
+sym_state <- internals$get.output(which(internals$outputs %in% "RNN_state"))
+sym_state_cell <- internals$get.output(which(internals$outputs %in% "RNN_state_cell"))
+sym_output <- internals$get.output(which(internals$outputs %in% "loss_output"))
+symbol <- mx.symbol.Group(sym_output, sym_state, sym_state_cell)
+
+infer_raw <- c("Thou ")
+infer_split <- dic[strsplit(infer_raw, '') %>% unlist]
+infer_length <- length(infer_split)
+
+infer.data <- mx.io.arrayiter(data = matrix(infer_split), label = matrix(infer_split),
+ batch.size = 1, shuffle = FALSE)
+
+infer <- mx.infer.rnn.one(infer.data = infer.data,
+ symbol = symbol,
+ arg.params = model$arg.params,
+ aux.params = model$aux.params,
+ input.params = NULL,
+ ctx = devices)
+
+pred_prob <- as.numeric(as.array(mx.nd.slice.axis(
+ infer$loss_output, axis = 0, begin = infer_length-1, end = infer_length)))
+pred <- sample(length(pred_prob), prob = pred_prob, size = 1) - 1
+# Start the vector of sampled character ids with the first prediction
+predict <- pred
+
+for (i in 1:200) {
+
+ infer.data <- mx.io.arrayiter(data = as.matrix(pred), label = as.matrix(pred),
+ batch.size = 1, shuffle = FALSE)
+
+ infer <- mx.infer.rnn.one(infer.data = infer.data,
+ symbol = symbol,
+ arg.params = model$arg.params,
+ aux.params = model$aux.params,
+ input.params = list(rnn.state = infer[[2]],
+ rnn.state.cell = infer[[3]]),
+ ctx = devices)
+
+ pred_prob <- as.numeric(as.array(infer$loss_output))
+ pred <- sample(length(pred_prob), prob = pred_prob, size = 1, replace = T) - 1
+ predict <- c(predict, pred)
+}
+
+predict_txt <- paste0(rev_dic[as.character(predict)], collapse = "")
+predict_txt_tot <- paste0(infer_raw, predict_txt, collapse = "")
+print(predict_txt_tot)
+```
+
+ [1] "Thou NAknowledge thee my Comfort and his late she.FRIAR LAURENCE:Nothing a groats waterd forth. The lend he thank that;When she I am brother draw London: and not hear that know.BENVOLIO:How along, makes your "
+
+
+<!-- INSERT SOURCE DOWNLOAD BUTTONS -->