You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/04/26 23:55:40 UTC

[4/9] incubator-joshua git commit: Update existing unit tests to perform proper decoder/vocab cleanup. Also added two core tests to UnitTests to test phrase-based decoding and kbest extraction.

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/7fdc4cd7/resources/phrase_decoder/config
----------------------------------------------------------------------
diff --git a/resources/phrase_decoder/config b/resources/phrase_decoder/config
new file mode 100644
index 0000000..de781e3
--- /dev/null
+++ b/resources/phrase_decoder/config
@@ -0,0 +1,29 @@
+tm = moses -owner pt -maxspan 0 -path resources/phrase_decoder/rules.1.gz -max-source-len 5
+feature-function = StateMinimizingLanguageModel -lm_order 5 -lm_file resources/phrase_decoder/lm.1.gz
+
+search = stack
+
+mark-oovs = false
+pop-limit = 10
+top-n = 1
+
+output-format = %i ||| %s ||| %f ||| %c
+
+include-align-index = true
+reordering-limit = 6
+
+# And these are the feature functions to activate.
+feature-function = OOVPenalty
+feature-function = WordPenalty
+feature-function = Distortion
+feature-function = PhrasePenalty -owner pt
+
+OOVPenalty 1.0
+Distortion 0.114849
+WordPenalty -0.201544
+PhrasePenalty -0.236965
+tm_pt_0 0.0370068
+tm_pt_1 0.0495759
+tm_pt_2 0.196742
+tm_pt_3 0.0745423
+lm_0 0.204412452147565

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/7fdc4cd7/resources/phrase_decoder/constrained.config
----------------------------------------------------------------------
diff --git a/resources/phrase_decoder/constrained.config b/resources/phrase_decoder/constrained.config
new file mode 100644
index 0000000..4642650
--- /dev/null
+++ b/resources/phrase_decoder/constrained.config
@@ -0,0 +1,28 @@
+tm = moses pt 0 resources/phrase_decoder/rules.1.gz
+
+lm = kenlm 5 true false 100 resources/phrase_decoder/lm.1.gz
+
+mark-oovs = false
+pop-limit = 10
+top-n = 5
+
+output-format = %i ||| %s ||| %f ||| %c
+
+include-align-index = true
+reordering-limit = 10
+
+# And these are the feature functions to activate.
+feature-function = OOVPenalty
+feature-function = WordPenalty
+feature-function = Distortion
+feature-function = PhrasePenalty -owner pt
+
+OOVPenalty 1.0
+Distortion 0.114849
+WordPenalty -0.201544
+PhrasePenalty -0.236965
+tm_pt_0 0.0370068
+tm_pt_1 0.0495759
+tm_pt_2 0.196742
+tm_pt_3 0.0745423
+lm_0 0.204412452147565

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/7fdc4cd7/resources/phrase_decoder/constrained.output.gold
----------------------------------------------------------------------
diff --git a/resources/phrase_decoder/constrained.output.gold b/resources/phrase_decoder/constrained.output.gold
new file mode 100644
index 0000000..238387c
--- /dev/null
+++ b/resources/phrase_decoder/constrained.output.gold
@@ -0,0 +1,5 @@
+0 ||| President Obama |8-8| to |7-7| hinder |4-4| a strategy |0-1| for |3-3| Republican |2-2| re @-@ election |5-6| ||| tm_pt_0=-15.792 tm_pt_1=-17.550 tm_pt_2=-14.599 tm_pt_3=-18.298 lm_0=-29.452 OOVPenalty=0.000 WordPenalty=-4.777 Distortion=-24.000 PhrasePenalty=7.000 ||| -15.163
+0 ||| President Obama |8-8| to |7-7| hinder |4-4| a |0-0| strategy |1-1| for |3-3| Republican |2-2| re @-@ election |5-6| ||| tm_pt_0=-16.919 tm_pt_1=-17.550 tm_pt_2=-14.917 tm_pt_3=-18.298 lm_0=-29.452 OOVPenalty=0.000 WordPenalty=-4.777 Distortion=-24.000 PhrasePenalty=8.000 ||| -15.505
+0 ||| President Obama |8-8| to hinder |3-4| a strategy |0-1| for |7-7| Republican |2-2| re @-@ election |5-6| ||| tm_pt_0=-14.986 tm_pt_1=-17.951 tm_pt_2=-14.075 tm_pt_3=-18.699 lm_0=-29.452 OOVPenalty=0.000 WordPenalty=-4.777 Distortion=-32.000 PhrasePenalty=6.000 ||| -15.762
+0 ||| President Obama |8-8| to hinder |3-4| a |0-0| strategy |1-1| for |7-7| Republican |2-2| re @-@ election |5-6| ||| tm_pt_0=-16.112 tm_pt_1=-17.951 tm_pt_2=-14.393 tm_pt_3=-18.699 lm_0=-29.452 OOVPenalty=0.000 WordPenalty=-4.777 Distortion=-32.000 PhrasePenalty=7.000 ||| -16.103
+0 ||| President Obama |8-8| to |3-3| hinder |4-4| a strategy |0-1| for |7-7| Republican |2-2| re @-@ election |5-6| ||| tm_pt_0=-16.329 tm_pt_1=-17.951 tm_pt_2=-15.136 tm_pt_3=-18.699 lm_0=-29.452 OOVPenalty=0.000 WordPenalty=-4.777 Distortion=-32.000 PhrasePenalty=7.000 ||| -16.257

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/7fdc4cd7/resources/phrase_decoder/lm.1.gz
----------------------------------------------------------------------
diff --git a/resources/phrase_decoder/lm.1.gz b/resources/phrase_decoder/lm.1.gz
new file mode 100644
index 0000000..3f4c453
Binary files /dev/null and b/resources/phrase_decoder/lm.1.gz differ

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/7fdc4cd7/resources/phrase_decoder/output.gold
----------------------------------------------------------------------
diff --git a/resources/phrase_decoder/output.gold b/resources/phrase_decoder/output.gold
new file mode 100644
index 0000000..509a3de
--- /dev/null
+++ b/resources/phrase_decoder/output.gold
@@ -0,0 +1 @@
+0 ||| a strategy |0-1| republican |2-2| to hinder |3-4| reelection |5-6| Obama |7-8| ||| tm_pt_0=-9.702 tm_pt_1=-10.800 tm_pt_2=-7.543 tm_pt_3=-8.555 lm_0=-19.117 OOVPenalty=0.000 WordPenalty=-3.040 Distortion=0.000 PhrasePenalty=5.000 ||| -7.496

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/7fdc4cd7/resources/phrase_decoder/rules.1.gz
----------------------------------------------------------------------
diff --git a/resources/phrase_decoder/rules.1.gz b/resources/phrase_decoder/rules.1.gz
new file mode 100644
index 0000000..14466e9
Binary files /dev/null and b/resources/phrase_decoder/rules.1.gz differ

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/7fdc4cd7/tst/joshua/decoder/kbest_extraction/KBestExtractionTest.java
----------------------------------------------------------------------
diff --git a/tst/joshua/decoder/kbest_extraction/KBestExtractionTest.java b/tst/joshua/decoder/kbest_extraction/KBestExtractionTest.java
new file mode 100644
index 0000000..af6d670
--- /dev/null
+++ b/tst/joshua/decoder/kbest_extraction/KBestExtractionTest.java
@@ -0,0 +1,62 @@
+package joshua.decoder.kbest_extraction;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
+import joshua.corpus.Vocabulary;
+import joshua.decoder.Decoder;
+import joshua.decoder.JoshuaConfiguration;
+import joshua.decoder.Translation;
+import joshua.decoder.segment_file.Sentence;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import static com.google.common.base.Charsets.UTF_8;
+import static java.nio.file.Files.readAllBytes;
+import static joshua.decoder.ff.FeatureVector.DENSE_FEATURE_NAMES;
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Reimplements the k-best extraction regression test: decodes INPUT and compares the result to the file at GOLD_PATH.
+ * TODO (fhieber): this test strangely only works with StateMinimizing KenLM.
+ * This is to be investigated.
+ */
+public class KBestExtractionTest {
+  
+  private static final String CONFIG = "resources/kbest_extraction/joshua.config"; // decoder configuration read in setUp
+  private static final String INPUT = "a b c d e"; // sentence handed to the decoder
+  private static final Path GOLD_PATH = Paths.get("resources/kbest_extraction/output.scores.gold"); // expected output
+  
+  private JoshuaConfiguration joshuaConfig = null;
+  private Decoder decoder = null;
+  
+  @Before
+  public void setUp() throws Exception { // builds a fresh configuration and decoder for every test
+    joshuaConfig = new JoshuaConfiguration();
+    joshuaConfig.readConfigFile(CONFIG);
+    joshuaConfig.outputFormat = "%i ||| %s ||| %c"; // assigned after the config file is read, so this format is the one in effect
+    decoder = new Decoder(joshuaConfig, "");
+  }
+  
+  @After
+  public void tearDown() throws Exception { // NOTE(review): decoder is still null here if setUp threw before assigning it
+    decoder.cleanUp(); // presumably releases decoder/vocabulary state between tests — confirm against Decoder
+    decoder = null;
+  }
+  
+  @Test
+  public void givenInput_whenKbestExtraction_thenOutputIsAsExpected() throws IOException {
+    final String translation = decode(INPUT).toString();
+    final String gold = new String(readAllBytes(GOLD_PATH), UTF_8); // entire gold file decoded as UTF-8
+    assertEquals(gold, translation); // exact string comparison of gold file content and decoder output
+  }
+  
+  private Translation decode(String input) { // wraps the raw input in a Sentence and runs the decoder on it
+    final Sentence sentence = new Sentence(input, 0, joshuaConfig); // second argument 0 is presumably the sentence id — confirm
+    return decoder.decode(sentence);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/7fdc4cd7/tst/joshua/decoder/phrase/constrained/ConstrainedPhraseDecodingTest.java
----------------------------------------------------------------------
diff --git a/tst/joshua/decoder/phrase/constrained/ConstrainedPhraseDecodingTest.java b/tst/joshua/decoder/phrase/constrained/ConstrainedPhraseDecodingTest.java
new file mode 100644
index 0000000..6508a63
--- /dev/null
+++ b/tst/joshua/decoder/phrase/constrained/ConstrainedPhraseDecodingTest.java
@@ -0,0 +1,59 @@
+package joshua.decoder.phrase.constrained;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
+import joshua.corpus.Vocabulary;
+import joshua.decoder.Decoder;
+import joshua.decoder.JoshuaConfiguration;
+import joshua.decoder.Translation;
+import joshua.decoder.segment_file.Sentence;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import static com.google.common.base.Charsets.UTF_8;
+import static java.nio.file.Files.readAllBytes;
+import static joshua.decoder.ff.FeatureVector.DENSE_FEATURE_NAMES;
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Reimplements the constrained phrase decoding test: decodes INPUT and compares the result to the file at GOLD_PATH.
+ */
+public class ConstrainedPhraseDecodingTest {
+  
+  private static final String CONFIG = "resources/phrase_decoder/constrained.config"; // decoder configuration read in setUp
+  private static final String INPUT = "una estrategia republicana para obstaculizar la reelección de Obama ||| President Obama to hinder a strategy for Republican re @-@ election"; // presumably "source ||| target constraint" — confirm against Sentence
+  private static final Path GOLD_PATH = Paths.get("resources/phrase_decoder/constrained.output.gold"); // expected output
+  
+  private JoshuaConfiguration joshuaConfig = null;
+  private Decoder decoder = null;
+  
+  @Before
+  public void setUp() throws Exception { // builds a fresh configuration and decoder for every test
+    joshuaConfig = new JoshuaConfiguration();
+    joshuaConfig.readConfigFile(CONFIG);
+    decoder = new Decoder(joshuaConfig, "");
+  }
+  
+  @After
+  public void tearDown() throws Exception { // NOTE(review): decoder is still null here if setUp threw before assigning it
+    decoder.cleanUp(); // presumably releases decoder/vocabulary state between tests — confirm against Decoder
+    decoder = null;
+  }
+  
+  @Test
+  public void givenInput_whenConstrainedPhraseDecoding_thenOutputIsAsExpected() throws IOException {
+    final String translation = decode(INPUT).toString();
+    final String gold = new String(readAllBytes(GOLD_PATH), UTF_8); // entire gold file decoded as UTF-8
+    assertEquals(gold, translation); // exact string comparison of gold file content and decoder output
+  }
+  
+  private Translation decode(String input) { // wraps the raw input in a Sentence and runs the decoder on it
+    final Sentence sentence = new Sentence(input, 0, joshuaConfig); // second argument 0 is presumably the sentence id — confirm
+    return decoder.decode(sentence);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/7fdc4cd7/tst/joshua/decoder/phrase/decode/PhraseDecodingTest.java
----------------------------------------------------------------------
diff --git a/tst/joshua/decoder/phrase/decode/PhraseDecodingTest.java b/tst/joshua/decoder/phrase/decode/PhraseDecodingTest.java
new file mode 100644
index 0000000..707fa1b
--- /dev/null
+++ b/tst/joshua/decoder/phrase/decode/PhraseDecodingTest.java
@@ -0,0 +1,59 @@
+package joshua.decoder.phrase.decode;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
+import joshua.corpus.Vocabulary;
+import joshua.decoder.Decoder;
+import joshua.decoder.JoshuaConfiguration;
+import joshua.decoder.Translation;
+import joshua.decoder.segment_file.Sentence;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import static com.google.common.base.Charsets.UTF_8;
+import static java.nio.file.Files.readAllBytes;
+import static joshua.decoder.ff.FeatureVector.DENSE_FEATURE_NAMES;
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Reimplements the phrase decoding test: decodes INPUT with resources/phrase_decoder/config and compares the result to GOLD_PATH.
+ */
+public class PhraseDecodingTest {
+  
+  private static final String CONFIG = "resources/phrase_decoder/config"; // decoder configuration read in setUp
+  private static final String INPUT = "una estrategia republicana para obstaculizar la reelección de Obama"; // sentence handed to the decoder
+  private static final Path GOLD_PATH = Paths.get("resources/phrase_decoder/output.gold"); // expected output
+  
+  private JoshuaConfiguration joshuaConfig = null;
+  private Decoder decoder = null;
+  
+  @Before
+  public void setUp() throws Exception { // builds a fresh configuration and decoder for every test
+    joshuaConfig = new JoshuaConfiguration();
+    joshuaConfig.readConfigFile(CONFIG);
+    decoder = new Decoder(joshuaConfig, "");
+  }
+  
+  @After
+  public void tearDown() throws Exception { // NOTE(review): decoder is still null here if setUp threw before assigning it
+    decoder.cleanUp(); // presumably releases decoder/vocabulary state between tests — confirm against Decoder
+    decoder = null;
+  }
+  
+  @Test
+  public void givenInput_whenPhraseDecoding_thenOutputIsAsExpected() throws IOException {
+    final String translation = decode(INPUT).toString();
+    final String gold = new String(readAllBytes(GOLD_PATH), UTF_8); // entire gold file decoded as UTF-8
+    assertEquals(gold, translation); // exact string comparison of gold file content and decoder output
+  }
+  
+  private Translation decode(String input) { // wraps the raw input in a Sentence and runs the decoder on it
+    final Sentence sentence = new Sentence(input, 0, joshuaConfig); // second argument 0 is presumably the sentence id — confirm
+    return decoder.decode(sentence);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/7fdc4cd7/tst/joshua/system/AlignmentMapTest.java
----------------------------------------------------------------------
diff --git a/tst/joshua/system/AlignmentMapTest.java b/tst/joshua/system/AlignmentMapTest.java
index 0eee8c8..7d383a8 100644
--- a/tst/joshua/system/AlignmentMapTest.java
+++ b/tst/joshua/system/AlignmentMapTest.java
@@ -22,6 +22,7 @@ public class AlignmentMapTest {
 
   @Before
   public void setUp() throws Exception {
+    Vocabulary.clear();
     int[] sourceRhs = {Vocabulary.id("A1"),Vocabulary.id("A2"),-1,Vocabulary.id("B"),Vocabulary.id("C"),-2};
     int[] targetRhs = {Vocabulary.id("c"),Vocabulary.id("b1"),-1,Vocabulary.id("b2"),-4,Vocabulary.id("a")};
     int arity = 2; // 2 non terminals

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/7fdc4cd7/tst/joshua/system/KenLmTest.java
----------------------------------------------------------------------
diff --git a/tst/joshua/system/KenLmTest.java b/tst/joshua/system/KenLmTest.java
new file mode 100644
index 0000000..8445410
--- /dev/null
+++ b/tst/joshua/system/KenLmTest.java
@@ -0,0 +1,48 @@
+package joshua.system;
+
+import static org.junit.Assert.assertEquals;
+import joshua.corpus.Vocabulary;
+import joshua.decoder.Decoder;
+import joshua.decoder.JoshuaConfiguration;
+import joshua.decoder.ff.lm.KenLM;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Integration test for the KenLM bindings in Joshua. The test constructs a
+ * KenLM object directly (no Decoder), which presumably loads libkenlm.so.
+ *
+ * @author kellens
+ */
+public class KenLmTest {
+
+  @Test
+  public void givenKenLmUsed_whenTranslationsCalled_thenVerifyJniWithSampleCall() {
+    // GIVEN
+    String languageModelPath = "resources/kenlm/oilers.kenlm";
+
+    // WHEN
+    KenLM kenLm = new KenLM(3, languageModelPath); // first argument 3 is presumably the n-gram order — confirm against KenLM
+    Vocabulary.registerLanguageModel(kenLm);
+    int[] words = Vocabulary.addAll("Wayne Gretzky"); // ids for the words of the queried n-gram
+    float probability = kenLm.prob(words);
+
+    // THEN
+    assertEquals("Found the wrong probability for 2-gram \"Wayne Gretzky\"", -0.99f, probability,
+        Float.MIN_VALUE); // NOTE(review): Float.MIN_VALUE is the smallest positive float, so this delta demands (near-)exact equality — confirm intended
+  }
+  
+  @Before
+  public void setUp() throws Exception { // resets the static Vocabulary state before each test
+    Vocabulary.clear();
+    Vocabulary.unregisterLanguageModels();
+  }
+  
+  @After
+  public void tearDown() throws Exception { // same reset after each test, so the model registered above is unregistered again
+    Vocabulary.clear();
+    Vocabulary.unregisterLanguageModels();
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/7fdc4cd7/tst/joshua/system/MultithreadedTranslationTests.java
----------------------------------------------------------------------
diff --git a/tst/joshua/system/MultithreadedTranslationTests.java b/tst/joshua/system/MultithreadedTranslationTests.java
index b8d8af0..679b254 100644
--- a/tst/joshua/system/MultithreadedTranslationTests.java
+++ b/tst/joshua/system/MultithreadedTranslationTests.java
@@ -33,7 +33,6 @@ public class MultithreadedTranslationTests {
 
   @Before
   public void setUp() throws Exception {
-    Vocabulary.clear();
     joshuaConfig = new JoshuaConfiguration();
     joshuaConfig.search_algorithm = "cky";
     joshuaConfig.mark_oovs = false;
@@ -71,7 +70,6 @@ public class MultithreadedTranslationTests {
 
   @After
   public void tearDown() throws Exception {
-    Vocabulary.clear();
     this.decoder.cleanUp();
     this.decoder = null;
     Decoder.VERBOSE = previousLogLevel;