Posted to commits@labs.apache.org by to...@apache.org on 2016/11/08 15:33:36 UTC

svn commit: r1768715 - in /labs/yay/trunk/core/src: main/java/org/apache/yay/ test/java/org/apache/yay/ test/resources/word2vec/

Author: tommaso
Date: Tue Nov  8 15:33:35 2016
New Revision: 1768715

URL: http://svn.apache.org/viewvc?rev=1768715&view=rev
Log:
cross-entropy loss fixes, StackedRNN BPTT fix

Modified:
    labs/yay/trunk/core/src/main/java/org/apache/yay/NNRunner.java
    labs/yay/trunk/core/src/main/java/org/apache/yay/RNN.java
    labs/yay/trunk/core/src/main/java/org/apache/yay/StackedRNN.java
    labs/yay/trunk/core/src/test/java/org/apache/yay/RNNCrossValidationTest.java
    labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt

Modified: labs/yay/trunk/core/src/main/java/org/apache/yay/NNRunner.java
URL: http://svn.apache.org/viewvc/labs/yay/trunk/core/src/main/java/org/apache/yay/NNRunner.java?rev=1768715&r1=1768714&r2=1768715&view=diff
==============================================================================
--- labs/yay/trunk/core/src/main/java/org/apache/yay/NNRunner.java (original)
+++ labs/yay/trunk/core/src/main/java/org/apache/yay/NNRunner.java Tue Nov  8 15:33:35 2016
@@ -58,19 +58,19 @@ public class NNRunner {
             } catch (IOException e) {
               throw new RuntimeException("could not read from path " + path);
             }
-          }
+          } // use chars or words
           if (args.length > 2 && args[2] != null) {
             useChars = Boolean.valueOf(args[2]);
-          }
+          } // no. of epochs
           if (args.length > 3 && args[3] != null) {
             epochs = Integer.valueOf(args[3]);
-          }
+          } // hidden layer size
           if (args.length > 4 && args[4] != null) {
             hiddenLayerSize = Integer.valueOf(args[4]);
-          }
+          } // unrolled sequence length
           if (args.length > 5 && args[5] != null) {
             seqLength = Integer.valueOf(args[5]);
-          }
+          } // learning rate
           if (args.length > 6 && args[6] != null) {
             learningRate = Float.valueOf(args[6]);
           }

Modified: labs/yay/trunk/core/src/main/java/org/apache/yay/RNN.java
URL: http://svn.apache.org/viewvc/labs/yay/trunk/core/src/main/java/org/apache/yay/RNN.java?rev=1768715&r1=1768714&r2=1768715&view=diff
==============================================================================
--- labs/yay/trunk/core/src/main/java/org/apache/yay/RNN.java (original)
+++ labs/yay/trunk/core/src/main/java/org/apache/yay/RNN.java Tue Nov  8 15:33:35 2016
@@ -240,7 +240,7 @@ public class RNN {
         ps = init(inputs.length(), pst);
       }
       ps.putRow(t, pst);
-      loss += -Transforms.log(ps.getRow(t).getRow(targets.getInt(t)), true).sumNumber().doubleValue(); // softmax (cross-entropy loss)
+      loss += -Math.log(pst.getDouble(targets.getInt(t))); // softmax (cross-entropy loss)
     }
 
     this.hPrev = hs.getRow(inputs.length() - 1);
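For reference, the cross-entropy term accumulated at each time step above is just the negative log of the softmax probability assigned to the target symbol, which is what the simplified line now computes. A minimal plain-Java sketch of that computation (the double arrays and the names probs/targets are illustrative stand-ins for the ND4J objects in this class):

    // Per-step cross-entropy loss for a softmax output, summed over the sequence.
    // probs[t][k] is the softmax probability of symbol k at step t (each row sums to 1);
    // targets[t] is the index of the true next symbol at step t.
    static double crossEntropyLoss(double[][] probs, int[] targets) {
      double loss = 0.0;
      for (int t = 0; t < targets.length; t++) {
        loss += -Math.log(probs[t][targets[t]]);
      }
      return loss;
    }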

Modified: labs/yay/trunk/core/src/main/java/org/apache/yay/StackedRNN.java
URL: http://svn.apache.org/viewvc/labs/yay/trunk/core/src/main/java/org/apache/yay/StackedRNN.java?rev=1768715&r1=1768714&r2=1768715&view=diff
==============================================================================
--- labs/yay/trunk/core/src/main/java/org/apache/yay/StackedRNN.java (original)
+++ labs/yay/trunk/core/src/main/java/org/apache/yay/StackedRNN.java Tue Nov  8 15:33:35 2016
@@ -54,6 +54,8 @@ public class StackedRNN extends RNN {
   private final INDArray bh2; // hidden2 bias
   private final INDArray by; // output bias
 
+  private final double reg = 1e-8;
+
   private INDArray hPrev = null; // memory state
   private INDArray hPrev2 = null; // memory state
 
@@ -137,25 +139,25 @@ public class StackedRNN extends RNN {
 
       // perform parameter update with Adagrad
       mWxh.addi(dWxh.mul(dWxh));
-      wxh.subi((dWxh.mul(learningRate)).div(Transforms.sqrt(mWxh.add(1e-8))));
+      wxh.subi((dWxh.mul(learningRate)).div(Transforms.sqrt(mWxh.add(reg))));
 
       mWhh.addi(dWhh.mul(dWhh));
-      whh.subi(dWhh.mul(learningRate).div(Transforms.sqrt(mWhh.add(1e-8))));
+      whh.subi(dWhh.mul(learningRate).div(Transforms.sqrt(mWhh.add(reg))));
 
       mWhh2.addi(dWhh2.mul(dWhh2));
-      whh2.subi(dWhh2.mul(learningRate).div(Transforms.sqrt(mWhh2.add(1e-8))));
+      whh2.subi(dWhh2.mul(learningRate).div(Transforms.sqrt(mWhh2.add(reg))));
 
       mbh2.addi(dbh2.mul(dbh2));
-      bh2.subi(dbh2.mul(learningRate).div(Transforms.sqrt(mbh2.add(1e-8))));
+      bh2.subi(dbh2.mul(learningRate).div(Transforms.sqrt(mbh2.add(reg))));
 
       mWh2y.addi(dWh2y.mul(dWh2y));
-      wh2y.subi(dWh2y.mul(learningRate).div(Transforms.sqrt(mWh2y.add(1e-8))));
+      wh2y.subi(dWh2y.mul(learningRate).div(Transforms.sqrt(mWh2y.add(reg))));
 
       mbh.addi(dbh.mul(dbh));
-      bh.subi(dbh.mul(learningRate).div(Transforms.sqrt(mbh.add(1e-8))));
+      bh.subi(dbh.mul(learningRate).div(Transforms.sqrt(mbh.add(reg))));
 
       mby.addi(dby.mul(dby));
-      by.subi(dby.mul(learningRate).div(Transforms.sqrt(mby.add(1e-8))));
+      by.subi(dby.mul(learningRate).div(Transforms.sqrt(mby.add(reg))));
 
       p += seqLength; // move data pointer
       n++; // iteration counter
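The block above is the standard Adagrad rule: each weight matrix keeps a running sum of squared gradients (the m* arrays), and its update is scaled by the inverse square root of that sum, with the small reg constant guarding against division by zero. A plain-Java sketch of the per-parameter update, assuming flattened parameter, gradient and memory arrays of equal length:

    // Adagrad update for one parameter array.
    // mem accumulates squared gradients across all updates seen so far.
    static void adagradUpdate(double[] param, double[] grad, double[] mem,
                              double learningRate, double eps) {
      for (int i = 0; i < param.length; i++) {
        mem[i] += grad[i] * grad[i];                                   // e.g. mWxh.addi(dWxh.mul(dWxh))
        param[i] -= learningRate * grad[i] / Math.sqrt(mem[i] + eps);  // e.g. wxh.subi(...div(sqrt(mWxh.add(reg))))
      }
    }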
@@ -176,11 +178,8 @@ public class StackedRNN extends RNN {
     INDArray ys = null;
     INDArray ps = null;
 
-    INDArray hs1 = Nd4j.create(hPrev.shape());
-    Nd4j.copy(hPrev, hs1);
-
-    INDArray hs12 = Nd4j.create(hPrev2.shape());
-    Nd4j.copy(hPrev2, hs12);
+    INDArray hs1 = hPrev.dup();
+    INDArray hs12 = hPrev2.dup();
 
     double loss = 0;
 
@@ -192,15 +191,13 @@ public class StackedRNN extends RNN {
       INDArray hsRow = t == 0 ? hs1 : hs.getRow(t - 1);
       INDArray xst = xs.getRow(t);
       INDArray hst = Transforms.tanh((wxh.mmul(xst.transpose())).add((whh.mmul(hsRow)).add(bh))); // hidden state
-//      INDArray hst = Transforms.relu((wxh.mmul(xs.getRow(t).transpose())).add((whh.mmul(hsRow)).add(bh))); // hidden state
       if (hs == null) {
         hs = init(inputs.length(), hst);
       }
       hs.putRow(t, hst);
 
       INDArray hs2Row = t == 0 ? hs12 : hs2.getRow(t - 1);
-      INDArray hst2 = Transforms.tanh((whh.mmul(hs.getRow(t))).add((whh2.mmul(hs2Row)).add(bh2))); // hidden state 2
-//      INDArray hst2 = Transforms.relu((whh.mmul(hs.getRow(t))).add((whh2.mmul(hs2Row)).add(bh2))); // hidden state 2
+      INDArray hst2 = Transforms.tanh((whh.mmul(hst)).add((whh2.mmul(hs2Row)).add(bh2))); // hidden state 2
       if (hs2 == null) {
         hs2 = init(inputs.length(), hst2);
       }
@@ -217,7 +214,9 @@ public class StackedRNN extends RNN {
         ps = init(inputs.length(), pst);
       }
       ps.putRow(t, pst);
-      loss += -Transforms.log(ps.getRow(t).getRow(targets.getInt(t)), true).sumNumber().doubleValue(); // softmax (cross-entropy loss)
+
+      int targetsInt = targets.getInt(t);
+      loss += -Math.log(pst.getDouble(targetsInt)); // softmax (cross-entropy loss)
     }
 
     // backward pass: compute gradients going backwards
@@ -225,36 +224,32 @@ public class StackedRNN extends RNN {
     INDArray dh2Next = Nd4j.zerosLike(hs2.getRow(0));
     for (int t = inputs.length() - 1; t >= 0; t--) {
 
-      INDArray dy = ps.getRow(t).dup(); // dy = np.copy(ps[t])
-      int targetsInt = targets.getInt(t);
-      INDArray dyRow = dy.getRow(targetsInt);
-      dy.putRow(targetsInt, dyRow.sub(1)); // backprop into y
+      INDArray dy = ps.getRow(t).dup();
+      dy.putRow(targets.getInt(t), dy.getRow(targets.getInt(t)).sub(1)); // backprop into y
 
       INDArray hs2t = hs2.getRow(t);
       INDArray hs2tm1 = t == 0 ? hs12 : hs2.getRow(t - 1);
 
-      dWh2y.addi(dy.mmul(hs2t.transpose())); // dWhy += np.dot(dy, hs[t].T)
-      dby.addi(dy); // dby += dy
+      dWh2y.addi(dy.mmul(hs2t.transpose()));
+      dby.addi(dy);
 
-      INDArray dh2 = wh2y.transpose().mmul(dy).add(dh2Next); // dh = np.dot(Why.T, dy) + dhnext # backprop into h2
+      INDArray dh2 = wh2y.transpose().mmul(dy).add(dh2Next); // backprop into h2
 
       INDArray dhraw2 = (Nd4j.ones(hs2t.shape()).sub(hs2t.mul(hs2t))).mul(dh2); //  backprop through tanh nonlinearity
-//      INDArray dhraw2 = Nd4j.getExecutioner().execAndReturn(new SetRange(hst2, 0, Double.MAX_VALUE)).mul(dh2); // backprop through relu nonlinearity
-      dbh2.addi(dhraw2); // dbh += dhraw
+      dbh2.addi(dhraw2);
       INDArray hst = hs.getRow(t);
-      dWhh.addi(dhraw2.mmul(hst.transpose())); // dWxh += np.dot(dhraw, xs[t].T)
-      dWhh2.addi(dhraw2.mmul(hs2tm1.transpose())); // dWhh += np.dot(dhraw, hs[t-1].T)
-      dh2Next = whh2.transpose().mmul(dhraw2); // dhnext = np.dot(Whh.T, dhraw)
+      dWhh.addi(dhraw2.mmul(hst.transpose()));
+      dWhh2.addi(dhraw2.mmul(hs2tm1.transpose()));
+      dh2Next = whh2.transpose().mmul(dhraw2);
 
-      INDArray dh = whh2.transpose().mmul(dh2).add(dhNext); // backprop into h
+      INDArray dh = whh2.transpose().mmul(dhraw2).add(dhNext); // backprop into h
       INDArray dhraw = (Nd4j.ones(hst.shape()).sub(hst.mul(hst))).mul(dh); // backprop through tanh nonlinearity
-//      INDArray dhraw = Nd4j.getExecutioner().execAndReturn(new SetRange(hst, 0, Double.MAX_VALUE)).mul(dh); // backprop through relu nonlinearity
       dbh.addi(dhraw);
 
-      dWxh.addi(dhraw.mmul(xs.getRow(t))); // dWxh += np.dot(dhraw, xs[t].T)
+      dWxh.addi(dhraw.mmul(xs.getRow(t)));
       INDArray hsRow = t == 0 ? hs1 : hs.getRow(t - 1);
-      dWhh.addi(dhraw.mmul(hsRow.transpose())); // dWhh += np.dot(dhraw, hs[t-1].T)
-      dhNext = whh.transpose().mmul(dhraw); // dhnext = np.dot(Whh.T, dhraw)
+      dWhh.addi(dhraw.mmul(hsRow.transpose()));
+      dhNext = whh.transpose().mmul(dhraw);
 
     }
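The dh line above is the BPTT fix noted in the log message: the gradient passed from the second hidden layer down into the first is now taken from dhraw2, i.e. after backpropagating through the layer-2 tanh, rather than from the pre-nonlinearity gradient dh2. A minimal plain-Java sketch of backprop through a tanh activation h = tanh(z), which is the step both dhraw and dhraw2 perform (array types and names are illustrative, not the ND4J code):

    // Given the stored activation h = tanh(z) and the gradient dh arriving from above,
    // return the gradient with respect to the pre-activation z.
    static double[] backpropTanh(double[] h, double[] dh) {
      double[] draw = new double[h.length];
      for (int i = 0; i < h.length; i++) {
        draw[i] = (1.0 - h[i] * h[i]) * dh[i]; // tanh'(z) = 1 - tanh(z)^2
      }
      return draw; // multiply by the transposed weight matrices to reach the inputs
    }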
 
@@ -262,13 +257,14 @@ public class StackedRNN extends RNN {
     this.hPrev2 = hs2.getRow(inputs.length() - 1);
 
     // clip exploding gradients
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dWxh, -5, 5));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dWhh, -5, 5));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dWhh2, -5, 5));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dWh2y, -5, 5));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dbh, -5, 5));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dbh2, -5, 5));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dby, -5, 5));
+    int clip = 5;
+    Nd4j.getExecutioner().execAndReturn(new SetRange(dWxh, -clip, clip));
+    Nd4j.getExecutioner().execAndReturn(new SetRange(dWhh, -clip, clip));
+    Nd4j.getExecutioner().execAndReturn(new SetRange(dWhh2, -clip, clip));
+    Nd4j.getExecutioner().execAndReturn(new SetRange(dWh2y, -clip, clip));
+    Nd4j.getExecutioner().execAndReturn(new SetRange(dbh, -clip, clip));
+    Nd4j.getExecutioner().execAndReturn(new SetRange(dbh2, -clip, clip));
+    Nd4j.getExecutioner().execAndReturn(new SetRange(dby, -clip, clip));
 
     return loss;
   }
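The SetRange calls above clamp every gradient entry to [-clip, clip] to keep exploding gradients in check. The same element-wise operation in plain Java, as a sketch:

    // Clamp each gradient entry to the range [-clip, clip].
    static void clipGradient(double[] grad, double clip) {
      for (int i = 0; i < grad.length; i++) {
        grad[i] = Math.max(-clip, Math.min(clip, grad[i]));
      }
    }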
@@ -289,9 +285,7 @@ public class StackedRNN extends RNN {
 
     for (int t = 0; t < sampleSize; t++) {
       h = Transforms.tanh((wxh.mmul(x)).add((whh.mmul(h)).add(bh)));
-//      INDArray h = Transforms.relu((wxh.mmul(x)).add((whh.mmul(hPrev)).add(bh)));
       h2 = Transforms.tanh((whh.mmul(h)).add((whh2.mmul(h2)).add(bh2)));
-//      INDArray h2 = Transforms.relu((whh.mmul(h)).add((whh2.mmul(hPrev2)).add(bh2)));
       INDArray y = (wh2y.mmul(h2)).add(by);
       INDArray pm = Nd4j.getExecutioner().execAndReturn(new SoftMax(y)).ravel();
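Each sampling step above ends with a softmax distribution pm over the vocabulary; the next input symbol is then drawn from that distribution (the draw and the one-hot encoding of the next input happen outside the lines shown here). A plain-Java sketch of sampling an index from such a distribution, assuming its entries sum to 1:

    // Draw an index from a discrete distribution by inverting the cumulative sum.
    static int sampleIndex(double[] pm, java.util.Random rng) {
      double r = rng.nextDouble();
      double cumulative = 0.0;
      for (int i = 0; i < pm.length; i++) {
        cumulative += pm[i];
        if (r < cumulative) {
          return i;
        }
      }
      return pm.length - 1; // guard against rounding error in the cumulative sum
    }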
 

Modified: labs/yay/trunk/core/src/test/java/org/apache/yay/RNNCrossValidationTest.java
URL: http://svn.apache.org/viewvc/labs/yay/trunk/core/src/test/java/org/apache/yay/RNNCrossValidationTest.java?rev=1768715&r1=1768714&r2=1768715&view=diff
==============================================================================
--- labs/yay/trunk/core/src/test/java/org/apache/yay/RNNCrossValidationTest.java (original)
+++ labs/yay/trunk/core/src/test/java/org/apache/yay/RNNCrossValidationTest.java Tue Nov  8 15:33:35 2016
@@ -41,7 +41,7 @@ public class RNNCrossValidationTest {
   private int hiddenLayerSize;
   private Random r = new Random();
   private String text;
-  private final int epochs = 5;
+  private final int epochs = 10;
   private List<String> words;
 
   public RNNCrossValidationTest(float learningRate, int seqLength, int hiddenLayerSize) {
@@ -61,12 +61,11 @@ public class RNNCrossValidationTest {
   @Parameterized.Parameters
   public static Collection<Object[]> data() {
     return Arrays.asList(new Object[][]{
-            {1e-1f, 50, 5},
-            {1e-1f, 50, 10},
             {1e-1f, 50, 15},
             {1e-1f, 50, 25},
             {1e-1f, 50, 50},
             {1e-1f, 50, 100},
+            {1e-1f, 50, 150},
     });
   }
 

Modified: labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt
URL: http://svn.apache.org/viewvc/labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt?rev=1768715&r1=1768714&r2=1768715&view=diff
==============================================================================
--- labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt (original)
+++ labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt Tue Nov  8 15:33:35 2016
@@ -95,4 +95,46 @@ In this paper , we propose a novel neura
 One RNN encodes a sequence of symbols into a fixedlength vector representation , and the other decodes the representation into another sequence of symbols .
 The encoder and decoder of the proposed model are jointly trained to maximize the conditional probability of a target sequence given a source sequence .
 The performance of a statistical machine translation system is empirically found to improve by using the conditional probabilities of phrase pairs computed by the RNN Encoder–Decoder as an additional feature in the existing log-linear model .
-Qualitatively, we show that the proposed model learns a semantically and syntactically meaningful representation of linguistic phrases .
\ No newline at end of file
+Qualitatively , we show that the proposed model learns a semantically and syntactically meaningful representation of linguistic phrases .
+Time series often have a temporal hierarchy , with information that is spread out over multiple time scales .
+Common recurrent neural networks , however, do not explicitly accommodate such a hierarchy , and most research on them has been focusing on training algorithms rather than on their basic architecture .
+In this paper we study the effect of a hierarchy of recurrent neural networks on processing time series .
+Here , each layer is a recurrent network which receives the hidden state of the previous layer as input .
+This architecture allows us to perform hierarchical processing on difficult temporal tasks , and more naturally capture the structure of time series .
+We show that they reach state-of-the-art performance for recurrent networks in character-level language modeling when trained with simple stochastic gradient descent .
+We also offer an analysis of the different emergent time scales .
+In this paper , we explore different ways to extend a recurrent neural network (RNN) to a deep RNN .
+We start by arguing that the concept of depth in an RNN is not as clear as it is in feedforward neural networks .
+By carefully analyzing and understanding the architecture of an RNN , however , we find three points of an RNN which may be made deeper ; (1) input-to-hidden function , (2) hidden-to-hidden transition and (3) hidden-to-output function .
+Based on this observation , we propose two novel architectures of a deep RNN which are orthogonal to an earlier attempt of stacking multiple recurrent layers to build a deep RNN (Schmidhuber , 1992; El Hihi and Bengio , 1996) .
+We provide an alternative interpretation of these deep RNNs using a novel framework based on neural operators .
+The proposed deep RNNs are empirically evaluated on the tasks of polyphonic music prediction and language modeling .
+The experimental result supports our claim that the proposed deep RNNs benefit from the depth and outperform the conventional , shallow RNNs.
+Reasoning and inference are central to human and artificial intelligence .
+Modeling inference in human language is notoriously challenging but is fundamental to natural language understanding and many applications .
+With the availability of large annotated data , neural network models have recently advanced the field significantly .
+In this paper , we present a new state-of-the-art result , achieving the accuracy of 88.3% on the standard benchmark , the Stanford Natural Language Inference dataset .
+This result is achieved first through our enhanced sequential encoding model , which outperforms the previous best model that employs more complicated network architectures , suggesting that the potential of sequential LSTM-based models have not been fully explored yet in previous work .
+We further show that by explicitly considering recursive architectures , we achieve additional improvement .
+Particularly , incorporating syntactic parse information contributes to our best result ; it improves the performance even when the parse information is added to an already very strong system .
+We present a neural architecture for sequence processing .
+The ByteNet is a stack of two dilated convolutional neural networks , one to encode the source sequence and one to decode the target sequence , where the target network unfolds dynamically to generate variable length outputs .
+The ByteNet has two core properties : it runs in time that is linear in the length of the sequences and it preserves the sequences’ temporal resolution .
+The ByteNet decoder attains state-of-the-art performance on character-level language modelling and outperforms the previous best results obtained with recurrent neural networks .
+The ByteNet also achieves a performance on raw character-level machine translation that approaches that of the best neural translation models that run in quadratic time .
+The implicit structure learnt by the ByteNet mirrors the expected alignments between the sequences .
+The Teacher Forcing algorithm trains recurrent networks by supplying observed sequence values as inputs during training and using the network’s own one-step-ahead predictions to do multi-step sampling .
+We introduce the Professor Forcing algorithm , which uses adversarial domain adaptation to encourage the dynamics of the recurrent network to be the same when training the network and when sampling from the network over multiple time steps .
+We apply Professor Forcing to language modeling , vocal synthesis on raw waveforms , handwriting generation , and image generation .
+Empirically we find that Professor Forcing acts as a regularizer , improving test likelihood on character level Penn Treebank and sequential MNIST .
+We also find that the model qualitatively improves samples , especially when sampling for a large number of time steps .
+This is supported by human evaluation of sample quality .
+Trade-offs between Professor Forcing and Scheduled Sampling are discussed .
+We produce T-SNEs showing that Professor Forcing successfully makes the dynamics of the network during training and sampling more similar .
+Most existing machine translation systems operate at the level of words , relying on explicit segmentation to extract tokens .
+We introduce a neural machine translation (NMT) model that maps a source character sequence to a target character sequence without any segmentation .
+We employ a character-level convolutional network with max-pooling at the encoder to reduce the length of source representation , allowing the model to be trained at a speed comparable to subword-level models while capturing local regularities .
+Our character-to-character model outperforms a recently proposed baseline with a subword-level encoder on WMT’15 DE-EN and CS-EN , and gives comparable performance on FI-EN and RU-EN .
+We then demonstrate that it is possible to share a single character-level encoder across multiple languages by training a model on a many-to-one translation task .
+In this multilingual setting , the character-level encoder significantly outperforms the subword-level encoder on all the language pairs .
+We observe that on CS-EN , FI-EN and RU-EN , the quality of the multilingual character-level translation even surpasses the models specifically trained on that language pair alone , both in terms of BLEU score and human judgment .
\ No newline at end of file



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@labs.apache.org
For additional commands, e-mail: commits-help@labs.apache.org