Posted to commits@mxnet.apache.org by sk...@apache.org on 2018/09/28 18:58:49 UTC

[incubator-mxnet] branch master updated: [MXNET-833] [R] Char-level RNN tutorial fix (#12670)

This is an automated email from the ASF dual-hosted git repository.

skm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git


The following commit(s) were added to refs/heads/master by this push:
     new ca81535  [MXNET-833] [R] Char-level RNN tutorial fix (#12670)
ca81535 is described below

commit ca81535ee75e964a1a6e0905567e6872f86db830
Author: Anirudh <an...@gmail.com>
AuthorDate: Fri Sep 28 11:58:37 2018 -0700

    [MXNET-833] [R] Char-level RNN tutorial fix (#12670)
    
    * char RNN tutorial
    
    * nit fixes
---
 R-package/vignettes/CharRnnModel.Rmd | 465 ++++++++++++++-------------
 docs/tutorials/r/charRnnModel.md     | 593 +++++++++++++++++------------------
 2 files changed, 529 insertions(+), 529 deletions(-)

diff --git a/R-package/vignettes/CharRnnModel.Rmd b/R-package/vignettes/CharRnnModel.Rmd
index 9d3fd5c..3c302bb 100644
--- a/R-package/vignettes/CharRnnModel.Rmd
+++ b/R-package/vignettes/CharRnnModel.Rmd
@@ -1,32 +1,28 @@
-# Char RNN Example
 
+# Character-level Language Model using RNN
 
-## Load Data 
+This tutorial demonstrates how to build a character-level language model with an RNN using the MXNet R package. You will need the following R packages to run this tutorial:
+ - readr
+ - stringr
+ - stringi
+ - mxnet
 
-First of all, load in the data and preprocess it.
+We will use the [tinyshakespeare](https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare) dataset to build this model.
 
-```{r}
-require(mxnet)
-```
 
-Set basic network parameters.
-
-```{r}
-batch.size = 32
-seq.len = 32
-num.hidden = 16
-num.embed = 16
-num.lstm.layer = 1
-num.round = 1
-learning.rate= 0.1
-wd=0.00001
-clip_gradient=1
-update.period = 1
+```R
+library("readr")
+library("stringr")
+library("stringi")
+library("mxnet")
 ```
 
-Download the data.
+## Preprocess and prepare the data
+
+Download the data:
+
 
-```{r}
+```R
 download.data <- function(data_dir) {
     dir.create(data_dir, showWarnings = FALSE)
     if (!file.exists(paste0(data_dir,'input.txt'))) {
@@ -36,237 +32,262 @@ download.data <- function(data_dir) {
 }
 ```
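+
+Note that `download.data` is only defined here and not called later in the tutorial, while `make_data` below reads `input.txt` from the working directory. A minimal sketch of the call, assuming you want the file downloaded into the current directory:
+
+```R
+# Hypothetical invocation: fetch the tinyshakespeare corpus into the working
+# directory, producing ./input.txt for the make_data() step that follows.
+download.data(data_dir = "./")
+```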
 
-Make dictionary from text.
-
-```{r}
-make.dict <- function(text, max.vocab=10000) {
-    text <- strsplit(text, '')
-    dic <- list()
-    idx <- 1
-    for (c in text[[1]]) {
-        if (!(c %in% names(dic))) {
-            dic[[c]] <- idx
-            idx <- idx + 1
-        }
-    }
-    if (length(dic) == max.vocab - 1)
-        dic[["UNKNOWN"]] <- idx
-    cat(paste0("Total unique char: ", length(dic), "\n"))
-    return (dic)
-}
-```
-
-Transfer text into data feature.
-
-```{r}
-make.data <- function(file.path, seq.len=32, max.vocab=10000, dic=NULL) {
-    fi <- file(file.path, "r")
-    text <- paste(readLines(fi), collapse="\n")
-    close(fi)
-
-    if (is.null(dic))
-        dic <- make.dict(text, max.vocab)
-    lookup.table <- list()
-    for (c in names(dic)) {
-        idx <- dic[[c]]
-        lookup.table[[idx]] <- c 
-    }
-
-    char.lst <- strsplit(text, '')[[1]]
-    num.seq <- as.integer(length(char.lst) / seq.len)
-    char.lst <- char.lst[1:(num.seq * seq.len)]
-    data <- array(0, dim=c(seq.len, num.seq))
-    idx <- 1
-    for (i in 1:num.seq) {
-        for (j in 1:seq.len) {
-            if (char.lst[idx] %in% names(dic))
-                data[j, i] <- dic[[ char.lst[idx] ]]-1
-            else {
-                data[j, i] <- dic[["UNKNOWN"]]-1
-            }
-            idx <- idx + 1
-        }
-    }
-    return (list(data=data, dic=dic, lookup.table=lookup.table))
+Next, we transform the text into feature vectors that are fed into the RNN model. The `make_data` function reads the dataset, strips out any non-printable characters, splits it into individual characters, and groups the characters into sequences of length `seq.len`.
+
+
+```R
+make_data <- function(path, seq.len = 32, dic=NULL) {
+  
+  text_vec <- read_file(file = path)
+  text_vec <- stri_enc_toascii(str = text_vec)
+  text_vec <- str_replace_all(string = text_vec, pattern = "[^[:print:]]", replacement = "")
+  text_vec <- strsplit(text_vec, '') %>% unlist
+  
+  if (is.null(dic)) {
+    char_keep <- sort(unique(text_vec))
+  } else char_keep <- names(dic)[!dic == 0]
+  
+  # Remove terms not part of dictionary
+  text_vec <- text_vec[text_vec %in% char_keep]
+  
+  # Build dictionary
+  dic <- 1:length(char_keep)
+  names(dic) <- char_keep
+  
+  # reverse dictionary
+  rev_dic <- names(dic)
+  names(rev_dic) <- dic
+  
+  # Adjust by -1 to have a 1-lag for labels
+  num.seq <- (length(text_vec) - 1) %/% seq.len
+  
+  features <- dic[text_vec[1:(seq.len * num.seq)]]
+  labels <- dic[text_vec[1:(seq.len * num.seq) + 1]]
+  
+  features_array <- array(features, dim = c(seq.len, num.seq))
+  labels_array <- array(labels, dim = c(seq.len, num.seq))
+  
+  return (list(features_array = features_array, labels_array = labels_array, dic = dic, rev_dic = rev_dic))
 }
-```
 
-Move tail text.
-
-```{r}
-drop.tail <- function(X, batch.size) {
-    shape <- dim(X)
-    nstep <- as.integer(shape[2] / batch.size)
-    return (X[, 1:(nstep * batch.size)])
-}
-```
 
-Get the label of X
-
-```{r}
-get.label <- function(X) {
-    label <- array(0, dim=dim(X))
-    d <- dim(X)[1]
-    w <- dim(X)[2]
-    for (i in 0:(w-1)) {
-        for (j in 1:d) {
-            label[i*d+j] <- X[(i*d+j)%%(w*d)+1]
-        }
-    }
-    return (label)
-}
+seq.len <- 100
+data_prep <- make_data(path = "input.txt", seq.len = seq.len, dic = NULL)
 ```
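+
+As a quick, optional sanity check (a sketch assuming `input.txt` was downloaded as above), the feature and label arrays should share the same `seq.len` x `num.seq` shape:
+
+```R
+# Inspect the prepared data; not required for training.
+dim(data_prep$features_array)   # seq.len rows, num.seq columns
+dim(data_prep$labels_array)     # same shape as the features
+length(data_prep$dic)           # number of unique characters kept
+```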
 
-Get training data and eval data
+Fetch the features and labels for training the model, and split the data into training and evaluation sets in a 9:1 ratio.
 
-```{r}
-download.data("./data/")
-ret <- make.data("./data/input.txt", seq.len=seq.len)
-X <- ret$data
-dic <- ret$dic
-lookup.table <- ret$lookup.table
 
+```R
+X <- data_prep$features_array
+Y <- data_prep$labels_array
+dic <- data_prep$dic
+rev_dic <- data_prep$rev_dic
 vocab <- length(dic)
 
-shape <- dim(X)
+samples <- tail(dim(X), 1)
 train.val.fraction <- 0.9
-size <- shape[2]
 
-X.train.data <- X[, 1:as.integer(size * train.val.fraction)]
-X.val.data <- X[, -(1:as.integer(size * train.val.fraction))]
-X.train.data <- drop.tail(X.train.data, batch.size)
-X.val.data <- drop.tail(X.val.data, batch.size)
+X.train.data <- X[, 1:as.integer(samples * train.val.fraction)]
+X.val.data <- X[, -(1:as.integer(samples * train.val.fraction))]
 
-X.train.label <- get.label(X.train.data)
-X.val.label <- get.label(X.val.data)
-
-X.train <- list(data=X.train.data, label=X.train.label)
-X.val <- list(data=X.val.data, label=X.val.label)
-```
+X.train.label <- Y[, 1:as.integer(samples * train.val.fraction)]
+X.val.label <- Y[, -(1:as.integer(samples * train.val.fraction))]
 
-## Training Model
-
-
-In `mxnet`, we have a function called `mx.lstm` so that users can build a general lstm model. 
-
-```{r}
-model <- mx.lstm(X.train, X.val, 
-                 ctx=mx.cpu(),
-                 num.round=num.round, 
-                 update.period=update.period,
-                 num.lstm.layer=num.lstm.layer, 
-                 seq.len=seq.len,
-                 num.hidden=num.hidden, 
-                 num.embed=num.embed, 
-                 num.label=vocab,
-                 batch.size=batch.size, 
-                 input.size=vocab,
-                 initializer=mx.init.uniform(0.1), 
-                 learning.rate=learning.rate,
-                 wd=wd,
-                 clip_gradient=clip_gradient)
+train_buckets <- list("100" = list(data = X.train.data, label = X.train.label))
+eval_buckets <- list("100" = list(data = X.val.data, label = X.val.label))
 
+train_buckets <- list(buckets = train_buckets, dic = dic, rev_dic = rev_dic)
+eval_buckets <- list(buckets = eval_buckets, dic = dic, rev_dic = rev_dic)
 ```
 
-## Inference from model
+Create iterators for training and evaluation datasets.
 
 
-Some helper functions for random sample.
+```R
+vocab <- length(eval_buckets$dic)
 
-```{r}
-cdf <- function(weights) {
-    total <- sum(weights)
-    result <- c()
-    cumsum <- 0
-    for (w in weights) {
-        cumsum <- cumsum+w
-        result <- c(result, cumsum / total)
-    }
-    return (result)
-}
+batch.size <- 32
 
-search.val <- function(cdf, x) {
-    l <- 1
-    r <- length(cdf) 
-    while (l <= r) {
-        m <- as.integer((l+r)/2)
-        if (cdf[m] < x) {
-            l <- m+1
-        } else {
-            r <- m-1
-        }
-    }
-    return (l)
-}
+train.data <- mx.io.bucket.iter(buckets = train_buckets$buckets, batch.size = batch.size, 
+                                data.mask.element = 0, shuffle = TRUE)
 
-choice <- function(weights) {
-    cdf.vals <- cdf(as.array(weights))
-    x <- runif(1)
-    idx <- search.val(cdf.vals, x)
-    return (idx)
-}
+eval.data <- mx.io.bucket.iter(buckets = eval_buckets$buckets, batch.size = batch.size,
+                               data.mask.element = 0, shuffle = FALSE)
 ```
 
-We can use random output or fixed output by choosing largest probability.
-
-```{r}
-make.output <- function(prob, sample=FALSE) {
-    if (!sample) {
-        idx <- which.max(as.array(prob))
-    }
-    else {
-        idx <- choice(prob)
-    }
-    return (idx)
-
+## Train the Model
+
+
+This model is a multi-layer RNN for sampling from character-level language models. It uses a one-to-one configuration because, for each input character, we want to predict the next one. For a sequence of length 100 there are also 100 labels, corresponding to the same sequence of characters but offset by one position. The parameter `output_last_state` is set to `TRUE` in order to access the state of the RNN cells when performing inference.
+
+
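+As a toy illustration of this 1-character offset (hypothetical data, separate from the training pipeline):
+
+```R
+# For the string "hello", each label is simply the character that follows the feature.
+toy <- strsplit("hello", "")[[1]]
+toy_features <- toy[1:4]   # "h" "e" "l" "l"
+toy_labels   <- toy[2:5]   # "e" "l" "l" "o"
+```
+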
+```R
+rnn_graph_one_one <- rnn.graph(num_rnn_layer = 3, 
+                               num_hidden = 96,
+                               input_size = vocab,
+                               num_embed = 64, 
+                               num_decode = vocab,
+                               dropout = 0.2, 
+                               ignore_label = 0,
+                               cell_type = "lstm",
+                               masking = F,
+                               output_last_state = T,
+                               loss_output = "softmax",
+                               config = "one-to-one")
+
+graph.viz(rnn_graph_one_one, type = "graph", direction = "LR", 
+          graph.height.px = 180, shape=c(100, 64))
+
+devices <- mx.cpu()
+
+initializer <- mx.init.Xavier(rnd_type = "gaussian", factor_type = "avg", magnitude = 3)
+
+optimizer <- mx.opt.create("adadelta", rho = 0.9, eps = 1e-5, wd = 1e-8,
+                           clip_gradient = 5, rescale.grad = 1/batch.size)
+
+logger <- mx.metric.logger()
+epoch.end.callback <- mx.callback.log.train.metric(period = 1, logger = logger)
+batch.end.callback <- mx.callback.log.train.metric(period = 50)
+
+mx.metric.custom_nd <- function(name, feval) {
+  init <- function() {
+    c(0, 0)
+  }
+  update <- function(label, pred, state) {
+    m <- feval(label, pred)
+    state <- c(state[[1]] + 1, state[[2]] + m)
+    return(state)
+  }
+  get <- function(state) {
+    list(name = name, value = (state[[2]]/state[[1]]))
+  }
+  ret <- (list(init = init, update = update, get = get))
+  class(ret) <- "mx.metric"
+  return(ret)
 }
-```
 
-In `mxnet`, we have a function called `mx.lstm.inference` so that users can build a inference from lstm model and then use function `mx.lstm.forward` to get forward output from the inference.
-
-```{r}
-infer.model <- mx.lstm.inference(num.lstm.layer=num.lstm.layer,
-                                 input.size=vocab,
-                                 num.hidden=num.hidden,
-                                 num.embed=num.embed,
-                                 num.label=vocab,
-                                 arg.params=model$arg.params,
-                                 ctx=mx.cpu())
+mx.metric.Perplexity <- mx.metric.custom_nd("Perplexity", function(label, pred) {
+  label <- mx.nd.reshape(label, shape = -1)
+  label_probs <- as.array(mx.nd.choose.element.0index(pred, label))
+  batch <- length(label_probs)
+  NLL <- -sum(log(pmax(1e-15, as.array(label_probs)))) / batch
+  Perplexity <- exp(NLL)
+  return(Perplexity)
+})
+
+model <- mx.model.buckets(symbol = rnn_graph_one_one,
+                          train.data = train.data, eval.data = eval.data, 
+                          num.round = 20, ctx = devices, verbose = TRUE,
+                          metric = mx.metric.Perplexity, 
+                          initializer = initializer, optimizer = optimizer, 
+                          batch.end.callback = NULL, 
+                          epoch.end.callback = epoch.end.callback)
+
+mx.model.save(model, prefix = "one_to_one_seq_model", iteration = 20)
 ```
 
-Generate a sequence of 75 chars using function `mx.lstm.forward`.
-
-```{r}
-start <- 'a'
-seq.len <- 75
-random.sample <- TRUE
-
-last.id <- dic[[start]]
-out <- "a"
-for (i in (1:(seq.len-1))) {
-    input <- c(last.id-1)
-    ret <- mx.lstm.forward(infer.model, input, FALSE)
-    infer.model <- ret$model
-    prob <- ret$prob
-    last.id <- make.output(prob, random.sample)
-    out <- paste0(out, lookup.table[[last.id]])
+    Start training with 1 devices
+    [1] Train-Perplexity=13.7040474322178
+    [1] Validation-Perplexity=7.94617194460922
+    [2] Train-Perplexity=6.57039815554525
+    [2] Validation-Perplexity=6.60806110658011
+    [3] Train-Perplexity=5.65360504501481
+    [3] Validation-Perplexity=6.18932770630876
+    [4] Train-Perplexity=5.32547285727298
+    [4] Validation-Perplexity=6.02198756798859
+    [5] Train-Perplexity=5.14373631472579
+    [5] Validation-Perplexity=5.8095658243407
+    [6] Train-Perplexity=5.03077673487379
+    [6] Validation-Perplexity=5.72582993567431
+    [7] Train-Perplexity=4.94453383291536
+    [7] Validation-Perplexity=5.6445258528126
+    [8] Train-Perplexity=4.88635290100261
+    [8] Validation-Perplexity=5.6730024536433
+    [9] Train-Perplexity=4.84205646230548
+    [9] Validation-Perplexity=5.50960780230982
+    [10] Train-Perplexity=4.80441673535513
+    [10] Validation-Perplexity=5.57002263750006
+    [11] Train-Perplexity=4.77763413242626
+    [11] Validation-Perplexity=5.55152143269169
+    [12] Train-Perplexity=4.74937775290777
+    [12] Validation-Perplexity=5.44968305351486
+    [13] Train-Perplexity=4.72824849541467
+    [13] Validation-Perplexity=5.50889348298234
+    [14] Train-Perplexity=4.70980846981694
+    [14] Validation-Perplexity=5.51473225859859
+    [15] Train-Perplexity=4.69685776886122
+    [15] Validation-Perplexity=5.45391985233811
+    [16] Train-Perplexity=4.67837107034824
+    [16] Validation-Perplexity=5.46636764997829
+    [17] Train-Perplexity=4.66866961934873
+    [17] Validation-Perplexity=5.44267086113492
+    [18] Train-Perplexity=4.65611469144194
+    [18] Validation-Perplexity=5.4290169469462
+    [19] Train-Perplexity=4.64614689879405
+    [19] Validation-Perplexity=5.44221549833917
+    [20] Train-Perplexity=4.63764001963654
+    [20] Validation-Perplexity=5.42114250842862
+
+
+## Inference on the Model
+
+We now use the saved model for inference, sampling text character by character so that the output resembles the original training data.
+
+
+```R
+set.seed(0)
+model <- mx.model.load(prefix = "one_to_one_seq_model", iteration = 20)
+
+internals <- model$symbol$get.internals()
+sym_state <- internals$get.output(which(internals$outputs %in% "RNN_state"))
+sym_state_cell <- internals$get.output(which(internals$outputs %in% "RNN_state_cell"))
+sym_output <- internals$get.output(which(internals$outputs %in% "loss_output"))
+symbol <- mx.symbol.Group(sym_output, sym_state, sym_state_cell)
+
+infer_raw <- c("Thou ")
+infer_split <- dic[strsplit(infer_raw, '') %>% unlist]
+infer_length <- length(infer_split)
+
+infer.data <- mx.io.arrayiter(data = matrix(infer_split), label = matrix(infer_split),  
+                              batch.size = 1, shuffle = FALSE)
+
+infer <- mx.infer.rnn.one(infer.data = infer.data, 
+                          symbol = symbol,
+                          arg.params = model$arg.params,
+                          aux.params = model$aux.params,
+                          input.params = NULL, 
+                          ctx = devices)
+
+pred_prob <- as.numeric(as.array(mx.nd.slice.axis(
+    infer$loss_output, axis = 0, begin = infer_length - 1, end = infer_length)))
+pred <- sample(length(pred_prob), prob = pred_prob, size = 1) - 1
+predict <- numeric()  # initialize the vector that collects sampled character indices
+predict <- c(predict, pred)
+
+for (i in 1:200) {
+  
+  infer.data <- mx.io.arrayiter(data = as.matrix(pred), label = as.matrix(pred),  
+                                batch.size = 1, shuffle = FALSE)
+  
+  infer <- mx.infer.rnn.one(infer.data = infer.data, 
+                            symbol = symbol,
+                            arg.params = model$arg.params,
+                            aux.params = model$aux.params,
+                            input.params = list(rnn.state = infer[[2]], 
+                                                rnn.state.cell = infer[[3]]), 
+                            ctx = devices)
+  
+  pred_prob <- as.numeric(as.array(infer$loss_output))
+  pred <- sample(length(pred_prob), prob = pred_prob, size = 1, replace = T) - 1
+  predict <- c(predict, pred)
 }
-message(out)
-```
-The result:
 
+predict_txt <- paste0(rev_dic[as.character(predict)], collapse = "")
+predict_txt_tot <- paste0(infer_raw, predict_txt, collapse = "")
+print(predict_txt_tot)
 ```
-ah not a drobl greens
-Settled asing lately sistering sounted to their hight
-```
-
-## Other RNN models
-
-In `mxnet`, other RNN models like custom RNN and gru is also provided.
-- For **custom RNN model**, you can replace `mx.lstm` with `mx.rnn` to train rnn model. Also, you can replace `mx.lstm.inference` and `mx.lstm.forward` with `mx.rnn.inference` and `mx.rnn.forward` to inference from rnn model and get forward result from the inference model.
-
-- For **GRU model**, you can replace `mx.lstm` with `mx.gru` to train gru model. Also, you can replace `mx.lstm.inference` and `mx.lstm.forward` with `mx.gru.inference` and `mx.gru.forward` to inference from gru model and get forward result from the inference model.
 
+    [1] "Thou NAknowledge thee my Comfort and his late she.FRIAR LAURENCE:Nothing a groats waterd forth. The lend he thank that;When she I am brother draw London: and not hear that know.BENVOLIO:How along, makes your "
 
 
-<!-- INSERT SOURCE DOWNLOAD BUTTONS -->
\ No newline at end of file
+<!-- INSERT SOURCE DOWNLOAD BUTTONS -->
diff --git a/docs/tutorials/r/charRnnModel.md b/docs/tutorials/r/charRnnModel.md
index 82e10a1..cb21e77 100644
--- a/docs/tutorials/r/charRnnModel.md
+++ b/docs/tutorials/r/charRnnModel.md
@@ -1,314 +1,293 @@
-Char RNN Example
-=============================================
-
-This tutorial shows how to use an LSTM model to build a char-level language model, and generate text from it. For demonstration purposes, we use a Shakespearean text. You can find the data on [GitHub](https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare).
-
-Load the Data
----------
-Load in the data and preprocess it:
-
- ```r
-    require(mxnet)
- ```
-
- ```
-    ## Loading required package: mxnet
- ```
-
- ```
-    ## Loading required package: methods
- ```
-Set the basic network parameters:
-
- ```r
-    batch.size = 32
-    seq.len = 32
-    num.hidden = 16
-    num.embed = 16
-    num.lstm.layer = 1
-    num.round = 1
-    learning.rate= 0.1
-    wd=0.00001
-    clip_gradient=1
-    update.period = 1
- ```
-Download the data:
 
- ```r
-    download.data <- function(data_dir) {
-        dir.create(data_dir, showWarnings = FALSE)
-        if (!file.exists(paste0(data_dir,'input.txt'))) {
-            download.file(url='https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tinyshakespeare/input.txt',
-                          destfile=paste0(data_dir,'input.txt'), method='wget')
-        }
-    }
- ```
-Make a dictionary from the text:
-
- ```r
-    make.dict <- function(text, max.vocab=10000) {
-        text <- strsplit(text, '')
-        dic <- list()
-        idx <- 1
-        for (c in text[[1]]) {
-            if (!(c %in% names(dic))) {
-                dic[[c]] <- idx
-                idx <- idx + 1
-            }
-        }
-        if (length(dic) == max.vocab - 1)
-            dic[["UNKNOWN"]] <- idx
-        cat(paste0("Total unique char: ", length(dic), "\n"))
-        return (dic)
-     }
- ```
-Transfer the text into a data feature:
-
- ```r
-    make.data <- function(file.path, seq.len=32, max.vocab=10000, dic=NULL)      {
-        fi <- file(file.path, "r")
-        text <- paste(readLines(fi), collapse="\n")
-        close(fi)
-
-        if (is.null(dic))
-            dic <- make.dict(text, max.vocab)
-        lookup.table <- list()
-        for (c in names(dic)) {
-            idx <- dic[[c]]
-            lookup.table[[idx]] <- c
-         }
-
-        char.lst <- strsplit(text, '')[[1]]
-        num.seq <- as.integer(length(char.lst) / seq.len)
-        char.lst <- char.lst[1:(num.seq * seq.len)]
-        data <- array(0, dim=c(seq.len, num.seq))
-        idx <- 1
-        for (i in 1:num.seq) {
-             for (j in 1:seq.len) {
-                 if (char.lst[idx] %in% names(dic))
-                    data[j, i] <- dic[[ char.lst[idx] ]]-1
-                else {
-                    data[j, i] <- dic[["UNKNOWN"]]-1
-                }
-                idx <- idx + 1
-            }
-        }
-         return (list(data=data, dic=dic, lookup.table=lookup.table))
-    }
- ```
-Move the tail text:
-
- ```r
-    drop.tail <- function(X, batch.size) {
-        shape <- dim(X)
-        nstep <- as.integer(shape[2] / batch.size)
-        return (X[, 1:(nstep * batch.size)])
-    }
- ```
-Get the label of X:
-
- ```r
-    get.label <- function(X) {
-        label <- array(0, dim=dim(X))
-        d <- dim(X)[1]
-        w <- dim(X)[2]
-        for (i in 0:(w-1)) {
-            for (j in 1:d) {
-                label[i*d+j] <- X[(i*d+j)%%(w*d)+1]
-            }
-        }
-        return (label)
-    }
- ```
-Get the training data and evaluation data:
-
- ```r
-    download.data("./data/")
-    ret <- make.data("./data/input.txt", seq.len=seq.len)
- ```
-
- ```
-    ## Total unique char: 65
- ```
-
- ```r
-    X <- ret$data
-    dic <- ret$dic
-    lookup.table <- ret$lookup.table
-
-    vocab <- length(dic)
-
-    shape <- dim(X)
-    train.val.fraction <- 0.9
-    size <- shape[2]
-
-    X.train.data <- X[, 1:as.integer(size * train.val.fraction)]
-    X.val.data <- X[, -(1:as.integer(size * train.val.fraction))]
-    X.train.data <- drop.tail(X.train.data, batch.size)
-    X.val.data <- drop.tail(X.val.data, batch.size)
-
-    X.train.label <- get.label(X.train.data)
-    X.val.label <- get.label(X.val.data)
-
-    X.train <- list(data=X.train.data, label=X.train.label)
-    X.val <- list(data=X.val.data, label=X.val.label)
- ```
-
-Train the Model
---------------
-In `mxnet`, we have a function called `mx.lstm` so that users can build a general LSTM model:
-
-
- ```r
-    model <- mx.lstm(X.train, X.val,
-                     ctx=mx.cpu(),
-                     num.round=num.round,
-                     update.period=update.period,
-                     num.lstm.layer=num.lstm.layer,
-                     seq.len=seq.len,
-                     num.hidden=num.hidden,
-                     num.embed=num.embed,
-                     num.label=vocab,
-                     batch.size=batch.size,
-                     input.size=vocab,
-                     initializer=mx.init.uniform(0.1),
-                     learning.rate=learning.rate,
-                     wd=wd,
-                     clip_gradient=clip_gradient)
- ```
-
- ```
-    ## Epoch [31] Train: NLL=3.53787130224343, Perp=34.3936275728271
-    ## Epoch [62] Train: NLL=3.43087958036949, Perp=30.903813186055
-    ## Epoch [93] Train: NLL=3.39771238228587, Perp=29.8956319855751
-    ## Epoch [124] Train: NLL=3.37581711716687, Perp=29.2481732041015
-    ## Epoch [155] Train: NLL=3.34523331338447, Perp=28.3671933405139
-    ## Epoch [186] Train: NLL=3.30756356274787, Perp=27.31848454823
-    ## Epoch [217] Train: NLL=3.25642968403829, Perp=25.9566978956055
-    ## Epoch [248] Train: NLL=3.19825967486207, Perp=24.4898727477925
-    ## Epoch [279] Train: NLL=3.14013971549828, Perp=23.1070950525017
-    ## Epoch [310] Train: NLL=3.08747601837462, Perp=21.9216781782189
-    ## Epoch [341] Train: NLL=3.04015595674863, Perp=20.9085038031042
-    ## Epoch [372] Train: NLL=2.99839339255659, Perp=20.0532932584534
-    ## Epoch [403] Train: NLL=2.95940091012609, Perp=19.2864139984503
-    ## Epoch [434] Train: NLL=2.92603311380224, Perp=18.6534872738302
-    ## Epoch [465] Train: NLL=2.89482756896395, Perp=18.0803835531869
-    ## Epoch [496] Train: NLL=2.86668230478397, Perp=17.5786009078994
-    ## Epoch [527] Train: NLL=2.84089368534943, Perp=17.1310684830416
-    ## Epoch [558] Train: NLL=2.81725862932279, Perp=16.7309220880514
-    ## Epoch [589] Train: NLL=2.79518870141492, Perp=16.3657166956952
-    ## Epoch [620] Train: NLL=2.77445683225304, Perp=16.0299176962855
-    ## Epoch [651] Train: NLL=2.75490970113174, Perp=15.719621374694
-    ## Epoch [682] Train: NLL=2.73697900634351, Perp=15.4402696117257
-    ## Epoch [713] Train: NLL=2.72059739336781, Perp=15.1893935780915
-    ## Epoch [744] Train: NLL=2.70462837571585, Perp=14.948760335793
-    ## Epoch [775] Train: NLL=2.68909904683828, Perp=14.7184093476224
-    ## Epoch [806] Train: NLL=2.67460054451836, Perp=14.5065539595711
-    ## Epoch [837] Train: NLL=2.66078997776751, Perp=14.3075873113043
-    ## Epoch [868] Train: NLL=2.6476781639279, Perp=14.1212134100373
-    ## Epoch [899] Train: NLL=2.63529039846876, Perp=13.9473621677371
-    ## Epoch [930] Train: NLL=2.62367693518974, Perp=13.7863219168709
-    ## Epoch [961] Train: NLL=2.61238282674384, Perp=13.6314936713501
-    ## Iter [1] Train: Time: 10301.6818172932 sec, NLL=2.60536539345356, Perp=13.5361704272949
-    ## Iter [1] Val: NLL=2.26093848746227, Perp=9.59208699731232
- ```
-
-Build Inference from the Model
---------------------
-Use the helper function for random sample:
-
- ```r
-    cdf <- function(weights) {
-        total <- sum(weights)
-        result <- c()
-        cumsum <- 0
-        for (w in weights) {
-            cumsum <- cumsum+w
-            result <- c(result, cumsum / total)
-        }
-        return (result)
-    }
+# Character-level Language Model using RNN
 
-    search.val <- function(cdf, x) {
-        l <- 1
-        r <- length(cdf)
-        while (l <= r) {
-            m <- as.integer((l+r)/2)
-            if (cdf[m] < x) {
-                l <- m+1
-            } else {
-                r <- m-1
-            }
-        }
-        return (l)
-    }
-    choice <- function(weights) {
-        cdf.vals <- cdf(as.array(weights))
-        x <- runif(1)
-        idx <- search.val(cdf.vals, x)
-        return (idx)
-    }
- ```
-Use random output or fixed output by choosing the greatest probability:
-
- ```r
-    make.output <- function(prob, sample=FALSE) {
-        if (!sample) {
-            idx <- which.max(as.array(prob))
-        }
-        else {
-            idx <- choice(prob)
-        }
-        return (idx)
+This tutorial demonstrates how to build a character-level language model with an RNN using the MXNet R package. You will need the following R packages to run this tutorial:
+ - readr
+ - stringr
+ - stringi
+ - mxnet
 
-    }
- ```
-
-In `mxnet`, we have a function called `mx.lstm.inference` so that users can build an inference from an LSTM model, and then use the `mx.lstm.forward` function to get forward output from the inference.
-
-Build an inference from the model:
-
- ```r
-    infer.model <- mx.lstm.inference(num.lstm.layer=num.lstm.layer,
-                                     input.size=vocab,
-                                     num.hidden=num.hidden,
-                                     num.embed=num.embed,
-                                     num.label=vocab,
-                                     arg.params=model$arg.params,
-                                     ctx=mx.cpu())
- ```
-Generate a sequence of 75 characters using the  `mx.lstm.forward` function:
-
- ```r
-    start <- 'a'
-    seq.len <- 75
-    random.sample <- TRUE
-
-    last.id <- dic[[start]]
-    out <- "a"
-    for (i in (1:(seq.len-1))) {
-        input <- c(last.id-1)
-        ret <- mx.lstm.forward(infer.model, input, FALSE)
-        infer.model <- ret$model
-        prob <- ret$prob
-        last.id <- make.output(prob, random.sample)
-        out <- paste0(out, lookup.table[[last.id]])
-    }
-    cat (paste0(out, "\n"))
- ```
-The result:
+We will use the [tinyshakespeare](https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare) dataset to build this model.
 
- ```
-    ah not a drobl greens
-    Settled asing lately sistering sounted to their hight
- ```
 
-Create Other RNN Models
-----------------
-In `mxnet`, other RNN models, like custom RNN and GRU, are also provided:
+```R
+library("readr")
+library("stringr")
+library("stringi")
+library("mxnet")
+```
 
-- For a custom RNN model, you can replace `mx.lstm` with `mx.rnn` to train an RNN model. You can replace `mx.lstm.inference` and `mx.lstm.forward` with `mx.rnn.inference` and `mx.rnn.forward` to build inference from an RNN model and get the forward result from the inference model.
-- For a GRU model, you can replace `mx.lstm` with `mx.gru` to train a GRU model. You can replace `mx.lstm.inference` and `mx.lstm.forward` with `mx.gru.inference` and `mx.gru.forward` to build inference from a GRU model and get the forward result from the inference model.
+## Preprocess and prepare the data
 
-## Next Steps
-* [MXNet tutorials index](http://mxnet.io/tutorials/index.html)
+Download the data:
+
+
+```R
+download.data <- function(data_dir) {
+    dir.create(data_dir, showWarnings = FALSE)
+    if (!file.exists(paste0(data_dir,'input.txt'))) {
+        download.file(url='https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tinyshakespeare/input.txt',
+                      destfile=paste0(data_dir,'input.txt'), method='wget')
+    }
+}
+```
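+
+Note that `download.data` is only defined here and not called later in the tutorial, while `make_data` below reads `input.txt` from the working directory. A minimal sketch of the call, assuming you want the file downloaded into the current directory:
+
+```R
+# Hypothetical invocation: fetch the tinyshakespeare corpus into the working
+# directory, producing ./input.txt for the make_data() step that follows.
+download.data(data_dir = "./")
+```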
+
+Next, we transform the text into feature vectors that are fed into the RNN model. The `make_data` function reads the dataset, strips out any non-printable characters, splits it into individual characters, and groups the characters into sequences of length `seq.len`.
+
+
+```R
+make_data <- function(path, seq.len = 32, dic=NULL) {
+  
+  text_vec <- read_file(file = path)
+  text_vec <- stri_enc_toascii(str = text_vec)
+  text_vec <- str_replace_all(string = text_vec, pattern = "[^[:print:]]", replacement = "")
+  text_vec <- strsplit(text_vec, '') %>% unlist
+  
+  if (is.null(dic)) {
+    char_keep <- sort(unique(text_vec))
+  } else char_keep <- names(dic)[!dic == 0]
+  
+  # Remove terms not part of dictionary
+  text_vec <- text_vec[text_vec %in% char_keep]
+  
+  # Build dictionary
+  dic <- 1:length(char_keep)
+  names(dic) <- char_keep
+  
+  # reverse dictionary
+  rev_dic <- names(dic)
+  names(rev_dic) <- dic
+  
+  # Adjust by -1 to have a 1-lag for labels
+  num.seq <- (length(text_vec) - 1) %/% seq.len
+  
+  features <- dic[text_vec[1:(seq.len * num.seq)]] 
+  labels <- dic[text_vec[1:(seq.len * num.seq) + 1]]
+  
+  features_array <- array(features, dim = c(seq.len, num.seq))
+  labels_array <- array(labels, dim = c(seq.len, num.seq))
+  
+  return (list(features_array = features_array, labels_array = labels_array, dic = dic, rev_dic = rev_dic))
+}
+
+
+seq.len <- 100
+data_prep <- make_data(path = "input.txt", seq.len = seq.len, dic=NULL)
+```
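+
+As a quick, optional sanity check (a sketch assuming `input.txt` was downloaded as above), the feature and label arrays should share the same `seq.len` x `num.seq` shape:
+
+```R
+# Inspect the prepared data; not required for training.
+dim(data_prep$features_array)   # seq.len rows, num.seq columns
+dim(data_prep$labels_array)     # same shape as the features
+length(data_prep$dic)           # number of unique characters kept
+```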
+
+Fetch the features and labels for training the model, and split the data into training and evaluation sets in a 9:1 ratio.
+
+
+```R
+X <- data_prep$features_array
+Y <- data_prep$labels_array
+dic <- data_prep$dic
+rev_dic <- data_prep$rev_dic
+vocab <- length(dic)
+
+samples <- tail(dim(X), 1)
+train.val.fraction <- 0.9
+
+X.train.data <- X[, 1:as.integer(samples * train.val.fraction)]
+X.val.data <- X[, -(1:as.integer(samples * train.val.fraction))]
+
+X.train.label <- Y[, 1:as.integer(samples * train.val.fraction)]
+X.val.label <- Y[, -(1:as.integer(samples * train.val.fraction))]
+
+train_buckets <- list("100" = list(data = X.train.data, label = X.train.label))
+eval_buckets <- list("100" = list(data = X.val.data, label = X.val.label))
+
+train_buckets <- list(buckets = train_buckets, dic = dic, rev_dic = rev_dic)
+eval_buckets <- list(buckets = eval_buckets, dic = dic, rev_dic = rev_dic)
+```
+
+Create iterators for training and evaluation datasets.
+
+
+```R
+vocab <- length(eval_buckets$dic)
+
+batch.size <- 32
+
+train.data <- mx.io.bucket.iter(buckets = train_buckets$buckets, batch.size = batch.size, 
+                                data.mask.element = 0, shuffle = TRUE)
+
+eval.data <- mx.io.bucket.iter(buckets = eval_buckets$buckets, batch.size = batch.size,
+                               data.mask.element = 0, shuffle = FALSE)
+```
+
+## Train the Model
+
+
+This model is a multi-layer RNN for sampling from character-level language models. It uses a one-to-one configuration because, for each input character, we want to predict the next one. For a sequence of length 100 there are also 100 labels, corresponding to the same sequence of characters but offset by one position. The parameter `output_last_state` is set to `TRUE` in order to access the state of the RNN cells when performing inference.
+
+
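+As a toy illustration of this 1-character offset (hypothetical data, separate from the training pipeline):
+
+```R
+# For the string "hello", each label is simply the character that follows the feature.
+toy <- strsplit("hello", "")[[1]]
+toy_features <- toy[1:4]   # "h" "e" "l" "l"
+toy_labels   <- toy[2:5]   # "e" "l" "l" "o"
+```
+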
+```R
+rnn_graph_one_one <- rnn.graph(num_rnn_layer = 3, 
+                               num_hidden = 96,
+                               input_size = vocab,
+                               num_embed = 64, 
+                               num_decode = vocab,
+                               dropout = 0.2, 
+                               ignore_label = 0,
+                               cell_type = "lstm",
+                               masking = F,
+                               output_last_state = T,
+                               loss_output = "softmax",
+                               config = "one-to-one")
+
+graph.viz(rnn_graph_one_one, type = "graph", direction = "LR", 
+          graph.height.px = 180, shape=c(100, 64))
+
+devices <- mx.cpu()
+
+initializer <- mx.init.Xavier(rnd_type = "gaussian", factor_type = "avg", magnitude = 3)
+
+optimizer <- mx.opt.create("adadelta", rho = 0.9, eps = 1e-5, wd = 1e-8,
+                           clip_gradient = 5, rescale.grad = 1/batch.size)
+
+logger <- mx.metric.logger()
+epoch.end.callback <- mx.callback.log.train.metric(period = 1, logger = logger)
+batch.end.callback <- mx.callback.log.train.metric(period = 50)
+
+mx.metric.custom_nd <- function(name, feval) {
+  init <- function() {
+    c(0, 0)
+  }
+  update <- function(label, pred, state) {
+    m <- feval(label, pred)
+    state <- c(state[[1]] + 1, state[[2]] + m)
+    return(state)
+  }
+  get <- function(state) {
+    list(name=name, value = (state[[2]] / state[[1]]))
+  }
+  ret <- (list(init = init, update = update, get = get))
+  class(ret) <- "mx.metric"
+  return(ret)
+}
+
+mx.metric.Perplexity <- mx.metric.custom_nd("Perplexity", function(label, pred) {
+  label <- mx.nd.reshape(label, shape = -1)
+  label_probs <- as.array(mx.nd.choose.element.0index(pred, label))
+  batch <- length(label_probs)
+  NLL <- -sum(log(pmax(1e-15, as.array(label_probs)))) / batch
+  Perplexity <- exp(NLL)
+  return(Perplexity)
+})
+
+model <- mx.model.buckets(symbol = rnn_graph_one_one,
+                          train.data = train.data, eval.data = eval.data, 
+                          num.round = 20, ctx = devices, verbose = TRUE,
+                          metric = mx.metric.Perplexity, 
+                          initializer = initializer, optimizer = optimizer, 
+                          batch.end.callback = NULL, 
+                          epoch.end.callback = epoch.end.callback)
+
+mx.model.save(model, prefix = "one_to_one_seq_model", iteration = 20)
+```
+
+    Start training with 1 devices
+    [1] Train-Perplexity=13.7040474322178
+    [1] Validation-Perplexity=7.94617194460922
+    [2] Train-Perplexity=6.57039815554525
+    [2] Validation-Perplexity=6.60806110658011
+    [3] Train-Perplexity=5.65360504501481
+    [3] Validation-Perplexity=6.18932770630876
+    [4] Train-Perplexity=5.32547285727298
+    [4] Validation-Perplexity=6.02198756798859
+    [5] Train-Perplexity=5.14373631472579
+    [5] Validation-Perplexity=5.8095658243407
+    [6] Train-Perplexity=5.03077673487379
+    [6] Validation-Perplexity=5.72582993567431
+    [7] Train-Perplexity=4.94453383291536
+    [7] Validation-Perplexity=5.6445258528126
+    [8] Train-Perplexity=4.88635290100261
+    [8] Validation-Perplexity=5.6730024536433
+    [9] Train-Perplexity=4.84205646230548
+    [9] Validation-Perplexity=5.50960780230982
+    [10] Train-Perplexity=4.80441673535513
+    [10] Validation-Perplexity=5.57002263750006
+    [11] Train-Perplexity=4.77763413242626
+    [11] Validation-Perplexity=5.55152143269169
+    [12] Train-Perplexity=4.74937775290777
+    [12] Validation-Perplexity=5.44968305351486
+    [13] Train-Perplexity=4.72824849541467
+    [13] Validation-Perplexity=5.50889348298234
+    [14] Train-Perplexity=4.70980846981694
+    [14] Validation-Perplexity=5.51473225859859
+    [15] Train-Perplexity=4.69685776886122
+    [15] Validation-Perplexity=5.45391985233811
+    [16] Train-Perplexity=4.67837107034824
+    [16] Validation-Perplexity=5.46636764997829
+    [17] Train-Perplexity=4.66866961934873
+    [17] Validation-Perplexity=5.44267086113492
+    [18] Train-Perplexity=4.65611469144194
+    [18] Validation-Perplexity=5.4290169469462
+    [19] Train-Perplexity=4.64614689879405
+    [19] Validation-Perplexity=5.44221549833917
+    [20] Train-Perplexity=4.63764001963654
+    [20] Validation-Perplexity=5.42114250842862
+
+
+## Inference on the Model
+
+We now use the saved model for inference, sampling text character by character so that the output resembles the original training data.
+
+
+```R
+set.seed(0)
+model <- mx.model.load(prefix = "one_to_one_seq_model", iteration = 20)
+
+internals <- model$symbol$get.internals()
+sym_state <- internals$get.output(which(internals$outputs %in% "RNN_state"))
+sym_state_cell <- internals$get.output(which(internals$outputs %in% "RNN_state_cell"))
+sym_output <- internals$get.output(which(internals$outputs %in% "loss_output"))
+symbol <- mx.symbol.Group(sym_output, sym_state, sym_state_cell)
+
+infer_raw <- c("Thou ")
+infer_split <- dic[strsplit(infer_raw, '') %>% unlist]
+infer_length <- length(infer_split)
+
+infer.data <- mx.io.arrayiter(data = matrix(infer_split), label = matrix(infer_split),  
+                              batch.size = 1, shuffle = FALSE)
+
+infer <- mx.infer.rnn.one(infer.data = infer.data, 
+                          symbol = symbol,
+                          arg.params = model$arg.params,
+                          aux.params = model$aux.params,
+                          input.params = NULL, 
+                          ctx = devices)
+
+pred_prob <- as.numeric(as.array(mx.nd.slice.axis(
+    infer$loss_output, axis = 0, begin = infer_length-1, end = infer_length)))
+pred <- sample(length(pred_prob), prob = pred_prob, size = 1) - 1
+predict <- numeric()  # initialize the vector that collects sampled character indices
+predict <- c(predict, pred)
+
+for (i in 1:200) {
+  
+  infer.data <- mx.io.arrayiter(data = as.matrix(pred), label = as.matrix(pred),  
+                                batch.size = 1, shuffle = FALSE)
+  
+  infer <- mx.infer.rnn.one(infer.data = infer.data, 
+                            symbol = symbol,
+                            arg.params = model$arg.params,
+                            aux.params = model$aux.params,
+                            input.params = list(rnn.state = infer[[2]], 
+                                                rnn.state.cell = infer[[3]]), 
+                            ctx = devices)
+  
+  pred_prob <- as.numeric(as.array(infer$loss_output))
+  pred <- sample(length(pred_prob), prob = pred_prob, size = 1, replace = T) - 1
+  predict <- c(predict, pred)
+}
+
+predict_txt <- paste0(rev_dic[as.character(predict)], collapse = "")
+predict_txt_tot <- paste0(infer_raw, predict_txt, collapse = "")
+print(predict_txt_tot)
+```
+
+    [1] "Thou NAknowledge thee my Comfort and his late she.FRIAR LAURENCE:Nothing a groats waterd forth. The lend he thank that;When she I am brother draw London: and not hear that know.BENVOLIO:How along, makes your "
+
+
+<!-- INSERT SOURCE DOWNLOAD BUTTONS -->