You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/04/29 06:47:07 UTC

[05/10] incubator-joshua git commit: LanguageModelFF.estimateFutureCost refactorings and test

LanguageModelFF.estimateFutureCost refactorings and test


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/b68ccaae
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/b68ccaae
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/b68ccaae

Branch: refs/heads/master
Commit: b68ccaae50139d97bce12c2b3fbc8a0132bcb235
Parents: 5b79128
Author: Pavel Danchenko <da...@amazon.com>
Authored: Tue Dec 22 12:49:22 2015 +0100
Committer: Kellen Sunderland <ke...@amazon.com>
Committed: Thu Apr 28 15:42:05 2016 -0700

----------------------------------------------------------------------
 resources/berkeley_lm/lm                        |  16 ++++
 resources/berkeley_lm/lm.berkeleylm             | Bin 0 -> 4294 bytes
 resources/berkeley_lm/lm.berkeleylm.gz          | Bin 0 -> 1786 bytes
 resources/berkeley_lm/lm.gz                     | Bin 0 -> 162 bytes
 src/joshua/decoder/Decoder.java                 |   4 +
 src/joshua/decoder/ff/lm/LanguageModelFF.java   |  28 ++++---
 .../decoder/ff/lm/LanguageModelFFTest.java      |  76 +++++++++++++++++++
 .../lm/berkeley_lm/LMGrammarBerkeleyTest.java   |  62 +++++++++++++++
 8 files changed, 174 insertions(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b68ccaae/resources/berkeley_lm/lm
----------------------------------------------------------------------
diff --git a/resources/berkeley_lm/lm b/resources/berkeley_lm/lm
new file mode 100644
index 0000000..05b4e6b
--- /dev/null
+++ b/resources/berkeley_lm/lm
@@ -0,0 +1,16 @@
+
+\data\
+ngram 1=5
+ngram 2=3
+
+\1-grams:
+-99.000000	<unk>
+-99.000000	<s>	-1.752754
+-2.034158	the	-0.800943
+-5.318589	chat-rooms	-0.151088
+-1.495702	</s>
+
+\2-grams:
+-1.773970	<s> the
+-4.878868	the chat-rooms
+-0.499794	chat-rooms </s>

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b68ccaae/resources/berkeley_lm/lm.berkeleylm
----------------------------------------------------------------------
diff --git a/resources/berkeley_lm/lm.berkeleylm b/resources/berkeley_lm/lm.berkeleylm
new file mode 100644
index 0000000..c048464
Binary files /dev/null and b/resources/berkeley_lm/lm.berkeleylm differ

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b68ccaae/resources/berkeley_lm/lm.berkeleylm.gz
----------------------------------------------------------------------
diff --git a/resources/berkeley_lm/lm.berkeleylm.gz b/resources/berkeley_lm/lm.berkeleylm.gz
new file mode 100644
index 0000000..f9f8d16
Binary files /dev/null and b/resources/berkeley_lm/lm.berkeleylm.gz differ

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b68ccaae/resources/berkeley_lm/lm.gz
----------------------------------------------------------------------
diff --git a/resources/berkeley_lm/lm.gz b/resources/berkeley_lm/lm.gz
new file mode 100644
index 0000000..ae47266
Binary files /dev/null and b/resources/berkeley_lm/lm.gz differ

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b68ccaae/src/joshua/decoder/Decoder.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/Decoder.java b/src/joshua/decoder/Decoder.java
index 1b12dda..aab6d36 100644
--- a/src/joshua/decoder/Decoder.java
+++ b/src/joshua/decoder/Decoder.java
@@ -530,6 +530,10 @@ public class Decoder {
         e.printStackTrace();
       }
     }
+    resetGlobalState();
+  }
+  
+  public static void resetGlobalState() {
     // clear/reset static variables
     DENSE_FEATURE_NAMES.clear();
     Vocabulary.clear();

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b68ccaae/src/joshua/decoder/ff/lm/LanguageModelFF.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/lm/LanguageModelFF.java b/src/joshua/decoder/ff/lm/LanguageModelFF.java
index 38f1a74..a002de7 100644
--- a/src/joshua/decoder/ff/lm/LanguageModelFF.java
+++ b/src/joshua/decoder/ff/lm/LanguageModelFF.java
@@ -25,6 +25,8 @@ import java.util.HashMap;
 import java.util.LinkedList;
 import java.util.List;
 
+import com.google.common.primitives.Ints;
+
 import joshua.corpus.Vocabulary;
 import joshua.decoder.JoshuaConfiguration;
 import joshua.decoder.Support;
@@ -341,18 +343,12 @@ public class LanguageModelFF extends StatefulFF {
     int[] leftContext = state.getLeftLMStateWords();
 
     if (null != leftContext) {
-      List<Integer> words = new ArrayList<Integer>();
-      for (int w : leftContext)
-        words.add(w);
-
-      boolean considerIncompleteNgrams = true;
       boolean skipStart = true;
-      if (words.get(0) != startSymbolId) {
+      if (leftContext[0] != startSymbolId) {
         skipStart = false;
       }
-      estimate += scoreChunkLogP(words, considerIncompleteNgrams, skipStart);
+      estimate += scoreChunkLogP(leftContext, true, skipStart);
     }
-
     return weight * estimate;
   }
 
@@ -476,6 +472,15 @@ public class LanguageModelFF extends StatefulFF {
     return new NgramDPState(leftContext, rightContext);
   }
 
+  
+  /** List-based compatibility overload of {@link #scoreChunkLogP(int[], boolean, boolean)}. */
+  private float scoreChunkLogP(
+      List<Integer> words, boolean considerIncompleteNgrams, boolean skipStart) {
+    // Unbox the word ids once, then let the int[] overload do the scoring.
+    final int[] wordIds = Ints.toArray(words);
+    return scoreChunkLogP(wordIds, considerIncompleteNgrams, skipStart);
+  }
+  
   /**
    * This function is basically a wrapper for NGramLanguageModel::sentenceLogProbability(). It
    * computes the probability of a phrase ("chunk"), using lower-order n-grams for the first n-1
@@ -486,11 +491,11 @@ public class LanguageModelFF extends StatefulFF {
    * @param skipStart
    * @return the phrase log probability
    */
-  private float scoreChunkLogP(List<Integer> words, boolean considerIncompleteNgrams,
+  private float scoreChunkLogP(int[] words, boolean considerIncompleteNgrams,
       boolean skipStart) {
 
     float score = 0.0f;
-    if (words.size() > 0) {
+    if (words.length > 0) {
       int startIndex;
       if (!considerIncompleteNgrams) {
         startIndex = this.ngramOrder;
@@ -499,8 +504,7 @@ public class LanguageModelFF extends StatefulFF {
       } else {
         startIndex = 1;
       }
-      score = this.languageModel.sentenceLogProbability(
-          Support.subIntArray(words, 0, words.size()), this.ngramOrder, startIndex);
+      score = this.languageModel.sentenceLogProbability(words, this.ngramOrder, startIndex);
     }
 
     return score;

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b68ccaae/tst/joshua/decoder/ff/lm/LanguageModelFFTest.java
----------------------------------------------------------------------
diff --git a/tst/joshua/decoder/ff/lm/LanguageModelFFTest.java b/tst/joshua/decoder/ff/lm/LanguageModelFFTest.java
new file mode 100644
index 0000000..64c9794
--- /dev/null
+++ b/tst/joshua/decoder/ff/lm/LanguageModelFFTest.java
@@ -0,0 +1,76 @@
+package joshua.decoder.ff.lm;
+
+import static org.junit.Assert.*;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import joshua.corpus.Vocabulary;
+import joshua.decoder.Decoder;
+import joshua.decoder.JoshuaConfiguration;
+import joshua.decoder.ff.FeatureVector;
+import joshua.decoder.ff.state_maintenance.NgramDPState;
+
+/** Tests {@link LanguageModelFF#estimateFutureCost} against a bigram BerkeleyLM model. */
+public class LanguageModelFFTest {
+  private static final float WEIGHT = 0.5f;
+
+  private LanguageModelFF ff;
+
+  @Before
+  public void setUp() {
+    Decoder.resetGlobalState(); // start from cleared static state (e.g. the Vocabulary)
+    FeatureVector weights = new FeatureVector();
+    weights.set("lm_0", WEIGHT);
+    // Repo-relative path of the bigram ARPA model that ships in resources/berkeley_lm.
+    String[] args = {
+        "-lm_type", "berkeleylm", "-lm_order", "2", "-lm_file", "resources/berkeley_lm/lm"};
+    JoshuaConfiguration config = new JoshuaConfiguration();
+    ff = new LanguageModelFF(weights, args, config);
+  }
+
+  @After
+  public void tearDown() {
+    Decoder.resetGlobalState();
+  }
+
+  @Test
+  public void givenNonStartSymbol_whenEstimateFutureCost_thenMultipleWeightAndLogProbabilty() {
+    int[] left = {3};
+    NgramDPState currentState = new NgramDPState(left, new int[left.length]);
+    // -99 is the log-probability the model file assigns to <unk>; id 3 is presumably OOV.
+    float score = ff.languageModel.sentenceLogProbability(left, 2, 1);
+    assertEquals(-99.0f, score, 0.0);
+    // The estimate should be exactly the weighted LM score of the left context.
+    float cost = ff.estimateFutureCost(null, currentState, null);
+    assertEquals(score * WEIGHT, cost, 0.0);
+  }
+
+  @Test
+  public void givenOnlyStartSymbol_whenEstimateFutureCost_thenZeroResult() {
+    int startSymbolId = Vocabulary.id(Vocabulary.START_SYM);
+    int[] left = {startSymbolId};
+    NgramDPState currentState = new NgramDPState(left, new int[left.length]);
+    // Scoring from index 2 skips the lone start symbol, so the expected score is 0.
+    float score = ff.languageModel.sentenceLogProbability(left, 2, 2);
+    assertEquals(0.0f, score, 0.0);
+    // A zero LM score must yield a zero future-cost estimate.
+    float cost = ff.estimateFutureCost(null, currentState, null);
+    assertEquals(score * WEIGHT, cost, 0.0);
+  }
+
+  @Test
+  public void givenStartAndOneMoreSymbol_whenEstimateFutureCost_thenMultipleWeightAndLogProbabilty() {
+    int startSymbolId = Vocabulary.id(Vocabulary.START_SYM);
+    assertNotEquals(3, startSymbolId);
+    int[] left = {startSymbolId, 3};
+    NgramDPState currentState = new NgramDPState(left, new int[left.length]);
+    // -100.752754 matches -99 (<unk>) plus -1.752754 (<s> back-off weight) in the ARPA file.
+    float score = ff.languageModel.sentenceLogProbability(left, 2, 2);
+    assertEquals(-100.752754f, score, 0.0f);
+    // Same contract as above: weight times the LM score of the left context.
+    float cost = ff.estimateFutureCost(null, currentState, null);
+    assertEquals(score * WEIGHT, cost, 0.0f);
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b68ccaae/tst/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java
----------------------------------------------------------------------
diff --git a/tst/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java b/tst/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java
new file mode 100644
index 0000000..6e0d90f
--- /dev/null
+++ b/tst/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java
@@ -0,0 +1,62 @@
+package joshua.decoder.ff.lm.berkeley_lm;
+
+import static org.junit.Assert.assertEquals;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.junit.After;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameter;
+import org.junit.runners.Parameterized.Parameters;
+
+import joshua.decoder.Decoder;
+import joshua.decoder.JoshuaConfiguration;
+import joshua.decoder.Translation;
+import joshua.decoder.segment_file.Sentence;
+
+/**
+ * Replacement for the test/lm/berkeley/test.sh regression test, run once per LM file variant.
+ */
+@RunWith(Parameterized.class)
+public class LMGrammarBerkeleyTest {
+
+  private static final String INPUT = "the chat-rooms";
+  private static final String[] OPTIONS = "-v 0 -output-format %f".split(" ");
+
+  private JoshuaConfiguration joshuaConfig;
+  private Decoder decoder;
+
+  @Parameter
+  public String lmFile; // set by the Parameterized runner to one entry of lmFiles() per run
+
+  @Parameters
+  public static List<String> lmFiles() {
+    return Arrays.asList("resources/berkeley_lm/lm", "resources/berkeley_lm/lm.gz",
+        "resources/berkeley_lm/lm.berkeleylm", "resources/berkeley_lm/lm.berkeleylm.gz");
+  }
+
+  @After
+  public void tearDown() throws Exception {
+    if (decoder != null) { // still null when construction in verifyLM() threw
+      decoder.cleanUp();
+    }
+  }
+
+  @Test
+  public void verifyLM() {
+    joshuaConfig = new JoshuaConfiguration();
+    joshuaConfig.processCommandLineOptions(OPTIONS);
+    joshuaConfig.features.add("feature_function = LanguageModel -lm_type berkeleylm -lm_order 2 -lm_file " + lmFile);
+    decoder = new Decoder(joshuaConfig, null);
+    String translation = decode(INPUT).toString();
+    assertEquals(lmFile, "tm_glue_0=2.000 lm_0=-7.153\n", translation); // lm_0: sum of 3 bigrams
+  }
+
+  private Translation decode(String input) {
+    final Sentence sentence = new Sentence(input, 0, joshuaConfig);
+    return decoder.decode(sentence);
+  }
+}