Posted to commits@labs.apache.org by to...@apache.org on 2016/03/11 15:35:40 UTC

svn commit: r1734572 - in /labs/yay/trunk/core/src: main/java/org/apache/yay/SkipGramNetwork.java test/java/org/apache/yay/SkipGramNetworkTest.java test/resources/word2vec/abstracts.txt test/resources/word2vec/sentences.txt

Author: tommaso
Date: Fri Mar 11 14:35:40 2016
New Revision: 1734572

URL: http://svn.apache.org/viewvc?rev=1734572&view=rev
Log:
skip-gram training using a configurable mini-batch size

Modified:
    labs/yay/trunk/core/src/main/java/org/apache/yay/SkipGramNetwork.java
    labs/yay/trunk/core/src/test/java/org/apache/yay/SkipGramNetworkTest.java
    labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt
    labs/yay/trunk/core/src/test/resources/word2vec/sentences.txt

Modified: labs/yay/trunk/core/src/main/java/org/apache/yay/SkipGramNetwork.java
URL: http://svn.apache.org/viewvc/labs/yay/trunk/core/src/main/java/org/apache/yay/SkipGramNetwork.java?rev=1734572&r1=1734571&r2=1734572&view=diff
==============================================================================
--- labs/yay/trunk/core/src/main/java/org/apache/yay/SkipGramNetwork.java (original)
+++ labs/yay/trunk/core/src/main/java/org/apache/yay/SkipGramNetwork.java Fri Mar 11 14:35:40 2016
@@ -48,7 +48,7 @@ import java.util.regex.Pattern;
 
 /**
  * A skip-gram neural network.
- * It learns its weights through backpropagation algorithm via batch gradient descent applied to a collection of
+ * It learns its weights through the backpropagation algorithm via (configurable) mini-batch gradient descent applied to a collection of
  * hot encoded training samples.
  */
 public class SkipGramNetwork {
@@ -132,19 +132,19 @@ public class SkipGramNetwork {
     double cc = 0;
     double wc = 0;
     int window = network.configuration.window;
+    List<String> vocabulary = network.getVocabulary();
+    Collection<Integer> exps = new LinkedList<>();
+    Collection<Integer> acts = new LinkedList<>();
     for (Sample sample : network.samples) {
-      Collection<Integer> exps = new ArrayList<>(window - 1);
-      Collection<Integer> acts = new ArrayList<>(window - 1);
       double[] inputs = sample.getInputs();
-      double[] actualOutputs = network.predictOutput(inputs);
-      double[] expectedOutputs = sample.getOutputs();
       int j = 0;
       for (int i = 0; i < window - 1; i++) {
-        int actualMax = getMaxIndex(actualOutputs, j, j + inputs.length - 1);
-        int expectedMax = getMaxIndex(expectedOutputs, j, j + inputs.length - 1);
-        exps.add(expectedMax % inputs.length);
-        acts.add(actualMax % inputs.length);
-        j += inputs.length;
+        int le = inputs.length;
+        int actualMax = getMaxIndex(network.predictOutput(inputs), j, j + le - 1);
+        int expectedMax = getMaxIndex(sample.getOutputs(), j, j + le - 1);
+        exps.add(expectedMax % le);
+        acts.add(actualMax % le);
+        j += le;
       }
       boolean c = true;
       for (Integer e : exps) {
@@ -152,7 +152,6 @@ public class SkipGramNetwork {
       }
       if (c) {
         cc++;
-        List<String> vocabulary = network.getVocabulary();
         String x = vocabulary.get(getMaxIndex(inputs, 0, inputs.length));
         StringBuilder y = new StringBuilder();
         for (int e : exps) {
@@ -165,7 +164,9 @@ public class SkipGramNetwork {
       } else {
         wc++;
       }
-
+      acts.clear();
+      exps.clear();
+      if (cc + wc > 2000) break;
     }
     return (cc / (wc + cc));
   }
@@ -210,10 +211,10 @@ public class SkipGramNetwork {
   }
 
 
-  // --- batch gradient descent ---
+  // --- mini batch gradient descent ---
 
   /**
-   * perform weights learning from the training examples using batch gradient descent algorithm
+   * perform weight learning from the training examples using the (configurable) mini-batch gradient descent algorithm
    *
    * @param samples the training examples
    * @return the final cost with the updated weights
@@ -225,14 +226,7 @@ public class SkipGramNetwork {
 
     double cost = Double.MAX_VALUE;
 
-    RealMatrix x = MatrixUtils.createRealMatrix(samples.length, samples[0].getInputs().length);
-    RealMatrix y = MatrixUtils.createRealMatrix(samples.length, samples[0].getOutputs().length);
-    int i = 0;
-    for (Sample sample : samples) {
-      x.setRow(i, ArrayUtils.addAll(sample.getInputs()));
-      y.setRow(i, ArrayUtils.addAll(sample.getOutputs()));
-      i++;
-    }
+    int j = 0;
 
     // momentum
     RealMatrix vb = MatrixUtils.createRealMatrix(biases[0].getRowDimension(), biases[0].getColumnDimension());
@@ -241,17 +235,25 @@ public class SkipGramNetwork {
     RealMatrix vw2 = MatrixUtils.createRealMatrix(weights[1].getRowDimension(), weights[1].getColumnDimension());
 
     long start = System.currentTimeMillis();
+    int c = 1;
     while (true) {
 
-      long time = (System.currentTimeMillis() - start) / 1000;
-      if (iterations % (1 + (configuration.maxIterations / 100)) == 0 || time % 300 < 2) {
-        if (time > 60) {
-          System.out.println("cost is " + cost + " after " + iterations + " iterations in " + (time / 60) + " minutes (" + ((double) iterations / time) + " ips)");
-        }
+      RealMatrix x = MatrixUtils.createRealMatrix(configuration.batchSize, samples[0].getInputs().length);
+      RealMatrix y = MatrixUtils.createRealMatrix(configuration.batchSize, samples[0].getOutputs().length);
+      int i = 0;
+      for (int k = j * configuration.batchSize; k < j * configuration.batchSize + configuration.batchSize; k++) {
+        Sample sample = samples[k % samples.length];
+        x.setRow(i, ArrayUtils.addAll(sample.getInputs()));
+        y.setRow(i, ArrayUtils.addAll(sample.getOutputs()));
+        i++;
       }
 
-      if (iterations % 100 == 0) {
-        System.out.println("accuracy: " + evaluate(this));
+      long time = (System.currentTimeMillis() - start) / 1000;
+      if (iterations % (1 + (configuration.maxIterations / 100)) == 0 || time % 300 == 0) {
+        if (time > 60 * c) {
+          c += 1;
+          System.out.println("cost: " + cost + ", accuracy: " + evaluate(this) + " after " + iterations + " iterations in " + (time / 60) + " minutes (" + ((double) iterations / time) + " ips)");
+        }
       }
 
       RealMatrix w0t = weights[0].transpose();
@@ -933,6 +935,7 @@ public class SkipGramNetwork {
     protected int window;
     protected boolean useMomentum;
     protected boolean useNesterovMomentum;
+    protected int batchSize;
   }
 
   public static class Builder {
@@ -942,6 +945,10 @@ public class SkipGramNetwork {
       this.configuration = new Configuration();
     }
 
+    public Builder withBatchSize(int batchSize) {
+      this.configuration.batchSize = batchSize;
+      return this;
+    }
 
     public Builder withWindow(int w) {
       this.configuration.window = w;
@@ -979,7 +986,6 @@ public class SkipGramNetwork {
     }
 
     public Builder useNesterovMomentum(boolean useNesterovMomentum) {
-      this.configuration.useMomentum = false;
       this.configuration.useNesterovMomentum = useNesterovMomentum;
       return this;
     }
@@ -1010,6 +1016,10 @@ public class SkipGramNetwork {
         this.configuration.maxIterations = trainingSet.size() * 100000;
       }
 
+      if (this.configuration.batchSize == 0) {
+        this.configuration.batchSize = trainingSet.size();
+      }
+
       HotEncodedSample next = trainingSet.iterator().next();
 
       this.configuration.inputs = next.getInputs().length;
@@ -1078,6 +1088,20 @@ public class SkipGramNetwork {
 
       Splitter splitter = Splitter.on(Pattern.compile("[\\n\\s]")).omitEmptyStrings().trimResults();
 
+      if (Files.isDirectory(path)) {
+        for (Path p : Files.newDirectoryStream(path)) {
+          addFragments(p, w, fragments, splitter);
+        }
+      } else {
+        addFragments(path, w, fragments, splitter);
+      }
+      long end = System.currentTimeMillis();
+      System.out.println("fragments read in " + (end - start) / 60000 + " minutes (" + fragments.size() + ")");
+      return fragments;
+
+    }
+
+    private void addFragments(Path path, int w, Queue<List<byte[]>> fragments, Splitter splitter) {
       ByteBuffer buffer = ByteBuffer.allocate(1);
       try (SeekableByteChannel inChannel = Files.newByteChannel(path)) {
 
@@ -1117,10 +1141,6 @@ public class SkipGramNetwork {
       } finally {
         buffer.clear();
       }
-      long end = System.currentTimeMillis();
-      System.out.println("fragments read in " + (end - start) / 60000 + " minutes (" + fragments.size() + ")");
-      return fragments;
-
     }
 
     private Queue<List<byte[]>> getFragmentsOld(Path path, int w) throws IOException {
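
The heart of this revision is in learnWeights(): instead of building one input/output matrix pair over the whole sample set up front, each iteration now copies the next configuration.batchSize samples into fresh matrices, wrapping around the sample array via k % samples.length (the batch counter j is presumably advanced elsewhere in the training loop, outside the hunks shown). The standalone sketch below (illustrative class and variable names, not part of the commit) shows the same wrap-around selection on plain arrays; evaluate() is likewise kept cheap by reusing its scratch collections and stopping after roughly 2000 samples.

    import java.util.Arrays;

    // Hedged sketch of the wrap-around mini-batch selection used in learnWeights() above.
    // "samples" stands in for SkipGramNetwork.samples, "batchSize" for configuration.batchSize.
    public class MiniBatchSelectionSketch {

      public static void main(String[] args) {
        double[][] samples = {{1}, {2}, {3}, {4}, {5}};  // five toy training inputs
        int batchSize = 2;

        for (int j = 0; j < 4; j++) {                    // j counts mini-batches
          double[][] batch = new double[batchSize][];
          int i = 0;
          for (int k = j * batchSize; k < j * batchSize + batchSize; k++) {
            batch[i++] = samples[k % samples.length];    // wrap around the sample array
          }
          System.out.println("batch " + j + ": " + Arrays.deepToString(batch));
        }
        // prints: [[1.0], [2.0]]  [[3.0], [4.0]]  [[5.0], [1.0]]  [[2.0], [3.0]]
      }
    }

With batchSize equal to the number of samples this degenerates to the old full-batch behaviour; with batchSize 1 every iteration works on a single sample.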

Modified: labs/yay/trunk/core/src/test/java/org/apache/yay/SkipGramNetworkTest.java
URL: http://svn.apache.org/viewvc/labs/yay/trunk/core/src/test/java/org/apache/yay/SkipGramNetworkTest.java?rev=1734572&r1=1734571&r2=1734572&view=diff
==============================================================================
--- labs/yay/trunk/core/src/test/java/org/apache/yay/SkipGramNetworkTest.java (original)
+++ labs/yay/trunk/core/src/test/java/org/apache/yay/SkipGramNetworkTest.java Fri Mar 11 14:35:40 2016
@@ -47,11 +47,12 @@ public class SkipGramNetworkTest {
             withWindow(3).
             fromTextAt(path).
             withDimension(10).
-            withAlpha(0.0003).
+            withAlpha(0.01).
             withLambda(0.0001).
             useNesterovMomentum(true).
             withMu(0.9).
-            withMaxIterations(500).
+            withMaxIterations(30000).
+            withBatchSize(10).
             build();
     RealMatrix wv = network.getWeights()[0];
     List<String> vocabulary = network.getVocabulary();
@@ -67,11 +68,12 @@ public class SkipGramNetworkTest {
             withWindow(3).
             fromTextAt(path).
             withDimension(10).
-            withAlpha(0.007).
-            withLambda(0.001).
-            useMomentum(true).
-            withMu(0.7).
-            withMaxIterations(500).
+            withAlpha(0.01).
+            withLambda(0.0001).
+            useNesterovMomentum(true).
+            withMu(0.9).
+            withMaxIterations(30000).
+            withBatchSize(1).
             build();
     RealMatrix wv = network.getWeights()[0];
     List<String> vocabulary = network.getVocabulary();
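
Together with the Builder additions above, the batch size selects the training regime: withBatchSize(1) updates the weights from a single sample per iteration, a value between 1 and the training-set size gives true mini-batches, and leaving it unset (0) falls back to trainingSet.size(), i.e. the previous full-batch behaviour. Below is a hedged usage sketch mirroring the test configuration; the way the Builder is obtained (new SkipGramNetwork.Builder()) and the corpus path are assumptions, since neither is visible in this diff.

    import java.nio.file.Path;
    import java.nio.file.Paths;
    import org.apache.yay.SkipGramNetwork;

    public class BatchSizeUsageSketch {
      public static void main(String[] args) throws Exception {
        // placeholder corpus location; any whitespace-tokenized text file should do
        Path corpus = Paths.get("core/src/test/resources/word2vec/sentences.txt");
        SkipGramNetwork network = new SkipGramNetwork.Builder().  // assumed entry point
                withWindow(3).
                fromTextAt(corpus).
                withDimension(10).
                withAlpha(0.01).
                withLambda(0.0001).
                useNesterovMomentum(true).
                withMu(0.9).
                withMaxIterations(30000).
                withBatchSize(10).   // 1 = per-sample updates, unset = full batch
                build();
        System.out.println("vocabulary size: " + network.getVocabulary().size());
      }
    }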

Modified: labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt
URL: http://svn.apache.org/viewvc/labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt?rev=1734572&r1=1734571&r2=1734572&view=diff
==============================================================================
--- labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt (original)
+++ labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt Fri Mar 11 14:35:40 2016
@@ -2,7 +2,7 @@ A calculus which combined the flexible g
 With this goal in mind , we present a formulation for logical connectives in vector spaces based on standard linear algebra , giving examples of the use of vector negation to discriminate between different senses of ambiguous words .
 It turns out that the operators developed in this way are precisely the connectives of quantum logic ( Birkhoff and von Neumann , 1936 ) , which to our knowledge have not been exploited before in natural language processing .
 In quantum logic , arbitrary sets are replaced by linear subspaces of a vector space , and set unions , intersections and complements are replaced by vector sum , intersection and orthogonal complements of subspaces .
-We demonstrate that these logical connectives (particularly the orthogonal complement for negation) are powerful tools for exploring and analysing word meanings and show distinct advantages over Boolean operators in document retrieval experiments . 
+We demonstrate that these logical connectives ( particularly the orthogonal complement for negation )  are powerful tools for exploring and analysing word meanings and show distinct advantages over Boolean operators in document retrieval experiments .
 This paper is organised as follows . 
 In Section 1.1 we describe some of the ways vectors have been used to represent the meanings of terms and documents in natural language processing , and describe the way the WORD-SPACE used in our later experiments is built automatically from text corpora .
 In Section 1.2 we define the logical connectives on vector spaces , focussing particularly on negation and disjunction . 
@@ -16,12 +16,12 @@ Secondly , the link with ‘quantum l
 We propose two novel model architectures for computing continuous vector representations of words from very large data sets The quality of these representations is measured in a word similarity task , and the results are compared to the previously best performing techniques based on different types of neural networks .
 We observe large improvements in accuracy at much lower computational cost , i . e  it takes less than a day to learn high quality word vectors from a 1.6 billion words data set .
 Furthermore , we show that these vectors provide state-of-the-art performance on our test set for measuring syntactic and semantic word similarities . 
-Information Retrieval (IR) models need to deal with two difficult issues , vocabulary mismatch and term dependencies .
+Information Retrieval ( IR)  models need to deal with two difficult issues , vocabulary mismatch and term dependencies .
 Vocabulary mismatch corresponds to the difficulty of retrieving relevant documents that do not contain exact query terms but semantically related terms .
 Term dependencies refers to the need of considering the relationship between the words of the query when estimating the relevance of a document .
 A multitude of solutions has been proposed to solve each of these two problems , but no principled model solve both .
 In parallel , in the last few years , language models based on neural networks have been used to cope with complex natural language processing tasks like emotion and paraphrase detection .
-Although they present good abilities to cope with both term dependencies and vocabulary mismatch problems , thanks to the distributed representation of words they are based upon , such models could not be used readily in IR , where the estimation of one language model per document (or query) is required .
+Although they present good abilities to cope with both term dependencies and vocabulary mismatch problems , thanks to the distributed representation of words they are based upon , such models could not be used readily in IR , where the estimation of one language model per document ( or query)  is required .
 This is both computationally unfeasible and prone to over-fitting .
 Based on a recent work that proposed to learn a generic language model that can be modified through a set of document-specific parameters , we explore use of new neural network models that are adapted to ad-hoc IR tasks .
 Within the language model IR framework , we propose and study the use of a generic language model as well as a document-specific language model .

Modified: labs/yay/trunk/core/src/test/resources/word2vec/sentences.txt
URL: http://svn.apache.org/viewvc/labs/yay/trunk/core/src/test/resources/word2vec/sentences.txt?rev=1734572&r1=1734571&r2=1734572&view=diff
==============================================================================
--- labs/yay/trunk/core/src/test/resources/word2vec/sentences.txt (original)
+++ labs/yay/trunk/core/src/test/resources/word2vec/sentences.txt Fri Mar 11 14:35:40 2016
@@ -24,12 +24,12 @@ However most of these models are built w
 This is problematic because words are often polysemous and global context can also provide useful information for learning word meanings .
 We present a new neural network architecture which 1) learns word embeddings that better capture the semantics of words by incorporating both local and global document context and 2) accounts for homonymy and polysemy by learning multiple embeddings per word .
 We introduce a new dataset with human judgments on pairs of words in sentential context and evaluate our model on it showing that our model outperforms competitive baselines and other neural language models .
-Information Retrieval (IR) models need to deal with two difficult issues vocabulary mismatch and term dependencies .
+Information Retrieval ( IR ) models need to deal with two difficult issues vocabulary mismatch and term dependencies .
 Vocabulary mismatch corresponds to the difficulty of retrieving relevant documents that do not contain exact query terms but semantically related terms .
 Term dependencies refers to the need of considering the relationship between the words of the query when estimating the relevance of a document .
 A multitude of solutions has been proposed to solve each of these two problems but no principled model solve both .
 In parallel in the last few years language models based on neural networks have been used to cope with complex natural language processing tasks like emotion and paraphrase detection .
-Although they present good abilities to cope with both term dependencies and vocabulary mismatch problems thanks to the distributed representation of words they are based upon such models could not be used readily in IR where the estimation of one language model per document (or query) is required .
+Although they present good abilities to cope with both term dependencies and vocabulary mismatch problems thanks to the distributed representation of words they are based upon such models could not be used readily in IR where the estimation of one language model per document ( or query ) is required .
 This is both computationally unfeasible and prone to over-fitting .
 Based on a recent work that proposed to learn a generic language model that can be modified through a set of document-specific parameters we explore use of new neural network models that are adapted to ad-hoc IR tasks .
 Within the language model IR framework we propose and study the use of a generic language model as well as a document-specific language model .
@@ -38,7 +38,7 @@ We experiment with such models and analy
 The word2vec model and application by Mikolov et al have attracted a great amount of attention in recent two years .
 The vector representations of words learned by word2vec models have been proven to be able to carry semantic meanings and are useful in various NLP tasks .
 As an increasing number of researchers would like to experiment with word2vec I notice that there lacks a material that comprehensively explains the parameter learning process of word2vec in details thus preventing many people with less neural network experience from understanding how exactly word2vec works .
-This note provides detailed derivations and explanations of the parameter update equations for the word2vec models including the original continuous bag-of-word (CBOW) and skip-gram models as well as advanced tricks hierarchical soft-max and negative sampling .
+This note provides detailed derivations and explanations of the parameter update equations for the word2vec models including the original continuous bag-of-word ( CBOW ) and skip-gram models as well as advanced tricks hierarchical soft-max and negative sampling .
 In the appendix a review is given on the basics of neuron network models and backpropagation .
 To avoid the inaccuracy caused by classifying the example into several categories given by TREC manually we take the word2vec to represent all attractions and user contexts in the continuous vector space learnt by neural network language models .
 The base of NNML is using neural networks for the probability function .
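
The edits to abstracts.txt and sentences.txt are functional rather than cosmetic: the fragment reader tokenizes on whitespace only (Splitter.on(Pattern.compile("[\n\s]")) in SkipGramNetwork), so a parenthesis glued to a word, as in "(IR)", becomes a vocabulary entry distinct from "IR". Surrounding brackets with spaces keeps word and punctuation as separate tokens. A small sketch of the difference (illustrative class name), using the same Guava Splitter configuration:

    import com.google.common.base.Splitter;
    import java.util.regex.Pattern;

    public class TokenizationSketch {
      public static void main(String[] args) {
        // same splitter configuration as the fragment reader in SkipGramNetwork
        Splitter splitter = Splitter.on(Pattern.compile("[\\n\\s]")).omitEmptyStrings().trimResults();
        System.out.println(splitter.splitToList("Information Retrieval (IR) models"));
        // -> [Information, Retrieval, (IR), models]
        System.out.println(splitter.splitToList("Information Retrieval ( IR ) models"));
        // -> [Information, Retrieval, (, IR, ), models]
      }
    }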


