You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/04/29 06:47:07 UTC
[05/10] incubator-joshua git commit: LanguageModelFF.estimateFutureCost refactorings and test
LanguageModelFF.estimateFutureCost refactorings and test
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/b68ccaae
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/b68ccaae
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/b68ccaae
Branch: refs/heads/master
Commit: b68ccaae50139d97bce12c2b3fbc8a0132bcb235
Parents: 5b79128
Author: Pavel Danchenko <da...@amazon.com>
Authored: Tue Dec 22 12:49:22 2015 +0100
Committer: Kellen Sunderland <ke...@amazon.com>
Committed: Thu Apr 28 15:42:05 2016 -0700
----------------------------------------------------------------------
resources/berkeley_lm/lm | 16 ++++
resources/berkeley_lm/lm.berkeleylm | Bin 0 -> 4294 bytes
resources/berkeley_lm/lm.berkeleylm.gz | Bin 0 -> 1786 bytes
resources/berkeley_lm/lm.gz | Bin 0 -> 162 bytes
src/joshua/decoder/Decoder.java | 4 +
src/joshua/decoder/ff/lm/LanguageModelFF.java | 28 ++++---
.../decoder/ff/lm/LanguageModelFFTest.java | 76 +++++++++++++++++++
.../lm/berkeley_lm/LMGrammarBerkeleyTest.java | 62 +++++++++++++++
8 files changed, 174 insertions(+), 12 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b68ccaae/resources/berkeley_lm/lm
----------------------------------------------------------------------
diff --git a/resources/berkeley_lm/lm b/resources/berkeley_lm/lm
new file mode 100644
index 0000000..05b4e6b
--- /dev/null
+++ b/resources/berkeley_lm/lm
@@ -0,0 +1,16 @@
+
+\data\
+ngram 1=5
+ngram 2=3
+
+\1-grams:
+-99.000000 <unk>
+-99.000000 <s> -1.752754
+-2.034158 the -0.800943
+-5.318589 chat-rooms -0.151088
+-1.495702 </s>
+
+\2-grams:
+-1.773970 <s> the
+-4.878868 the chat-rooms
+-0.499794 chat-rooms </s>
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b68ccaae/resources/berkeley_lm/lm.berkeleylm
----------------------------------------------------------------------
diff --git a/resources/berkeley_lm/lm.berkeleylm b/resources/berkeley_lm/lm.berkeleylm
new file mode 100644
index 0000000..c048464
Binary files /dev/null and b/resources/berkeley_lm/lm.berkeleylm differ
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b68ccaae/resources/berkeley_lm/lm.berkeleylm.gz
----------------------------------------------------------------------
diff --git a/resources/berkeley_lm/lm.berkeleylm.gz b/resources/berkeley_lm/lm.berkeleylm.gz
new file mode 100644
index 0000000..f9f8d16
Binary files /dev/null and b/resources/berkeley_lm/lm.berkeleylm.gz differ
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b68ccaae/resources/berkeley_lm/lm.gz
----------------------------------------------------------------------
diff --git a/resources/berkeley_lm/lm.gz b/resources/berkeley_lm/lm.gz
new file mode 100644
index 0000000..ae47266
Binary files /dev/null and b/resources/berkeley_lm/lm.gz differ
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b68ccaae/src/joshua/decoder/Decoder.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/Decoder.java b/src/joshua/decoder/Decoder.java
index 1b12dda..aab6d36 100644
--- a/src/joshua/decoder/Decoder.java
+++ b/src/joshua/decoder/Decoder.java
@@ -530,6 +530,10 @@ public class Decoder {
e.printStackTrace();
}
}
+ resetGlobalState();
+ }
+
+ public static void resetGlobalState() {
// clear/reset static variables
DENSE_FEATURE_NAMES.clear();
Vocabulary.clear();
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b68ccaae/src/joshua/decoder/ff/lm/LanguageModelFF.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/lm/LanguageModelFF.java b/src/joshua/decoder/ff/lm/LanguageModelFF.java
index 38f1a74..a002de7 100644
--- a/src/joshua/decoder/ff/lm/LanguageModelFF.java
+++ b/src/joshua/decoder/ff/lm/LanguageModelFF.java
@@ -25,6 +25,8 @@ import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
+import com.google.common.primitives.Ints;
+
import joshua.corpus.Vocabulary;
import joshua.decoder.JoshuaConfiguration;
import joshua.decoder.Support;
@@ -341,18 +343,12 @@ public class LanguageModelFF extends StatefulFF {
int[] leftContext = state.getLeftLMStateWords();
if (null != leftContext) {
- List<Integer> words = new ArrayList<Integer>();
- for (int w : leftContext)
- words.add(w);
-
- boolean considerIncompleteNgrams = true;
boolean skipStart = true;
- if (words.get(0) != startSymbolId) {
+ if (leftContext[0] != startSymbolId) {
skipStart = false;
}
- estimate += scoreChunkLogP(words, considerIncompleteNgrams, skipStart);
+ estimate += scoreChunkLogP(leftContext, true, skipStart);
}
-
return weight * estimate;
}
@@ -476,6 +472,15 @@ public class LanguageModelFF extends StatefulFF {
return new NgramDPState(leftContext, rightContext);
}
+
+ /**
+ * Compatibility method for {@link #scoreChunkLogP(int[], boolean, boolean)}
+ */
+ private float scoreChunkLogP(List<Integer> words, boolean considerIncompleteNgrams,
+ boolean skipStart) {
+ return scoreChunkLogP(Ints.toArray(words), considerIncompleteNgrams, skipStart);
+ }
+
/**
* This function is basically a wrapper for NGramLanguageModel::sentenceLogProbability(). It
* computes the probability of a phrase ("chunk"), using lower-order n-grams for the first n-1
@@ -486,11 +491,11 @@ public class LanguageModelFF extends StatefulFF {
* @param skipStart
* @return the phrase log probability
*/
- private float scoreChunkLogP(List<Integer> words, boolean considerIncompleteNgrams,
+ private float scoreChunkLogP(int[] words, boolean considerIncompleteNgrams,
boolean skipStart) {
float score = 0.0f;
- if (words.size() > 0) {
+ if (words.length > 0) {
int startIndex;
if (!considerIncompleteNgrams) {
startIndex = this.ngramOrder;
@@ -499,8 +504,7 @@ public class LanguageModelFF extends StatefulFF {
} else {
startIndex = 1;
}
- score = this.languageModel.sentenceLogProbability(
- Support.subIntArray(words, 0, words.size()), this.ngramOrder, startIndex);
+ score = this.languageModel.sentenceLogProbability(words, this.ngramOrder, startIndex);
}
return score;
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b68ccaae/tst/joshua/decoder/ff/lm/LanguageModelFFTest.java
----------------------------------------------------------------------
diff --git a/tst/joshua/decoder/ff/lm/LanguageModelFFTest.java b/tst/joshua/decoder/ff/lm/LanguageModelFFTest.java
new file mode 100644
index 0000000..64c9794
--- /dev/null
+++ b/tst/joshua/decoder/ff/lm/LanguageModelFFTest.java
@@ -0,0 +1,76 @@
+package joshua.decoder.ff.lm;
+
+import static org.junit.Assert.*;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import joshua.corpus.Vocabulary;
+import joshua.decoder.Decoder;
+import joshua.decoder.JoshuaConfiguration;
+import joshua.decoder.ff.FeatureVector;
+import joshua.decoder.ff.state_maintenance.NgramDPState;
+
+public class LanguageModelFFTest {
+
+ private static final float WEIGHT = 0.5f;
+
+ private LanguageModelFF ff;
+
+ @Before
+ public void setUp() {
+ Decoder.resetGlobalState();
+
+ FeatureVector weights = new FeatureVector();
+ weights.set("lm_0", WEIGHT);
+ String[] args = {"-lm_type", "berkeleylm", "-lm_order", "2", "-lm_file", "./joshua/test/lm/berkeley/lm"};
+
+ JoshuaConfiguration config = new JoshuaConfiguration();
+ ff = new LanguageModelFF(weights, args, config);
+ }
+
+ @After
+ public void tearDown() {
+ Decoder.resetGlobalState();
+ }
+
+ @Test
+ public void givenNonStartSymbol_whenEstimateFutureCost_thenMultipleWeightAndLogProbabilty() {
+ int[] left = {3};
+ NgramDPState currentState = new NgramDPState(left, new int[left.length]);
+
+ float score = ff.languageModel.sentenceLogProbability(left, 2, 1);
+ assertEquals(-99.0f, score, 0.0);
+
+ float cost = ff.estimateFutureCost(null, currentState, null);
+ assertEquals(score * WEIGHT, cost, 0.0);
+ }
+
+ @Test
+ public void givenOnlyStartSymbol_whenEstimateFutureCost_thenZeroResult() {
+ int startSymbolId = Vocabulary.id(Vocabulary.START_SYM);
+ int[] left = {startSymbolId};
+ NgramDPState currentState = new NgramDPState(left, new int[left.length]);
+
+ float score = ff.languageModel.sentenceLogProbability(left, 2, 2);
+ assertEquals(0.0f, score, 0.0);
+
+ float cost = ff.estimateFutureCost(null, currentState, null);
+ assertEquals(score * WEIGHT, cost, 0.0);
+ }
+
+ @Test
+ public void givenStartAndOneMoreSymbol_whenEstimateFutureCost_thenMultipleWeightAndLogProbabilty() {
+ int startSymbolId = Vocabulary.id(Vocabulary.START_SYM);
+ assertNotEquals(startSymbolId, 3);
+ int[] left = {startSymbolId, 3};
+ NgramDPState currentState = new NgramDPState(left, new int[left.length]);
+
+ float score = ff.languageModel.sentenceLogProbability(left, 2, 2);
+ assertEquals(-100.752754f, score, 0.0f);
+
+ float cost = ff.estimateFutureCost(null, currentState, null);
+ assertEquals(score * WEIGHT, cost, 0.0f);
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b68ccaae/tst/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java
----------------------------------------------------------------------
diff --git a/tst/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java b/tst/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java
new file mode 100644
index 0000000..6e0d90f
--- /dev/null
+++ b/tst/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java
@@ -0,0 +1,62 @@
+package joshua.decoder.ff.lm.berkeley_lm;
+
+import static org.junit.Assert.assertEquals;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.junit.After;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameter;
+import org.junit.runners.Parameterized.Parameters;
+
+import joshua.decoder.Decoder;
+import joshua.decoder.JoshuaConfiguration;
+import joshua.decoder.Translation;
+import joshua.decoder.segment_file.Sentence;
+
+/**
+ * Replacement for test/lm/berkeley/test.sh regression test
+ */
+@RunWith(Parameterized.class)
+public class LMGrammarBerkeleyTest {
+
+ private static final String INPUT = "the chat-rooms";
+ private static final String[] OPTIONS = "-v 0 -output-format %f".split(" ");
+
+ private JoshuaConfiguration joshuaConfig;
+ private Decoder decoder;
+
+ @Parameters
+ public static List<String> lmFiles() {
+ return Arrays.asList("resources/berkeley_lm/lm",
+ "resources/berkeley_lm/lm.gz",
+ "resources/berkeley_lm/lm.berkeleylm",
+ "resources/berkeley_lm/lm.berkeleylm.gz");
+ }
+
+ @After
+ public void tearDown() throws Exception {
+ decoder.cleanUp();
+ }
+
+ @Parameter
+ public String lmFile;
+
+ @Test
+ public void verifyLM() {
+ joshuaConfig = new JoshuaConfiguration();
+ joshuaConfig.processCommandLineOptions(OPTIONS);
+ joshuaConfig.features.add("feature_function = LanguageModel -lm_type berkeleylm -lm_order 2 -lm_file " + lmFile);
+ decoder = new Decoder(joshuaConfig, null);
+ String translation = decode(INPUT).toString();
+ assertEquals(lmFile, "tm_glue_0=2.000 lm_0=-7.153\n", translation);
+ }
+
+ private Translation decode(String input) {
+ final Sentence sentence = new Sentence(input, 0, joshuaConfig);
+ return decoder.decode(sentence);
+ }
+}