You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/09/13 16:53:13 UTC
[07/12] incubator-joshua git commit: Moved regression tests bn-en/hiero to unit test
Moved regression tests bn-en/hiero to unit test
Moved the shell script regression tests from bn-en/hiero to a unit test class. Also cleaned up the corresponding resource directory. Regenerated gold output with "%c %s" format. Dropped test-filter.sh.
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/de364bd5
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/de364bd5
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/de364bd5
Branch: refs/heads/master
Commit: de364bd5ceac03f51557fcb88ee50b6fdb549ffe
Parents: 49bbcac
Author: Michael A. Hedderich <mi...@users.noreply.github.com>
Authored: Tue Sep 13 17:32:31 2016 +0200
Committer: Michael A. Hedderich <mi...@users.noreply.github.com>
Committed: Tue Sep 13 17:32:31 2016 +0200
----------------------------------------------------------------------
.../joshua/decoder/cky/HieroDecodingTest.java | 121 ++
.../resources/bn-en/hiero/class_lm_2gram.gz | Bin 18052 -> 0 bytes
.../bn-en/hiero/joshua-berkeleylm.config | 27 +-
.../resources/bn-en/hiero/joshua-classlm.config | 31 +-
src/test/resources/bn-en/hiero/joshua.config | 28 +-
.../bn-en/hiero/output-berkeleylm.gold | 943 ++++++++++
.../resources/bn-en/hiero/output-classlm.gold | 1774 +++++++++---------
src/test/resources/bn-en/hiero/output.gold | 1610 ++++++++--------
src/test/resources/bn-en/hiero/output.gold.bleu | 14 -
.../bn-en/hiero/output.scores.berkeleylm.gold | 100 -
.../resources/bn-en/hiero/output.scores.gold | 805 --------
src/test/resources/bn-en/hiero/reference.en.0 | 100 -
src/test/resources/bn-en/hiero/reference.en.1 | 100 -
src/test/resources/bn-en/hiero/reference.en.2 | 100 -
src/test/resources/bn-en/hiero/reference.en.3 | 100 -
.../resources/bn-en/hiero/test-berkeleylm.sh | 33 -
src/test/resources/bn-en/hiero/test-classlm.sh | 32 -
src/test/resources/bn-en/hiero/test-filter.sh | 35 -
src/test/resources/bn-en/hiero/test.sh | 35 -
src/test/resources/bn-en/hiero/topN.pl | 18 -
20 files changed, 2799 insertions(+), 3207 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/de364bd5/src/test/java/org/apache/joshua/decoder/cky/HieroDecodingTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/joshua/decoder/cky/HieroDecodingTest.java b/src/test/java/org/apache/joshua/decoder/cky/HieroDecodingTest.java
new file mode 100644
index 0000000..7061d3b
--- /dev/null
+++ b/src/test/java/org/apache/joshua/decoder/cky/HieroDecodingTest.java
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.cky;
+
+import static org.testng.Assert.assertEquals;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import org.apache.joshua.decoder.Decoder;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.segment_file.Sentence;
+import org.apache.joshua.util.io.KenLmTestUtil;
+import org.testng.annotations.AfterMethod;
+import org.testng.annotations.Test;
+
+/**
+ * Decodes the bn-en hiero test input with several decoder configurations and
+ * compares the decoder output, line by line, against the stored gold files.
+ */
+public class HieroDecodingTest {
+
+  /** Separator between the entries of an n-best list in the decoder output. */
+  private static final String N_BEST_SEPARATOR = "\n";
+
+  /** Directory holding the input, configuration and gold files used here. */
+  private static final String RESOURCE_DIR = "src/test/resources/bn-en/hiero/";
+
+  private JoshuaConfiguration joshuaConfig;
+  private Decoder decoder;
+
+  /**
+   * Releases the decoder created by the test method that just ran.
+   *
+   * <p>The decoder is still {@code null} when {@code configureDecoder} threw
+   * before assigning it, so it is only cleaned up when present. Without the
+   * guard this method would raise a NullPointerException on top of the
+   * original failure.
+   *
+   * @throws Exception if cleaning up the decoder fails
+   */
+  @AfterMethod
+  public void tearDown() throws Exception {
+    if (decoder != null) {
+      decoder.cleanUp();
+      decoder = null;
+    }
+  }
+
+  /** Decodes with {@code joshua.config} and compares against {@code output.gold}. */
+  @Test
+  public void givenBnEnInput_whenPhraseDecoding_thenScoreAndTranslationCorrect() throws Exception {
+    assertDecodingMatchesGold("joshua.config", "output.gold");
+  }
+
+  /**
+   * Decodes with {@code joshua-berkeleylm.config} and compares against
+   * {@code output-berkeleylm.gold}.
+   */
+  @Test
+  public void givenBnEnInput_whenPhraseDecodingWithBerkeleyLM_thenScoreAndTranslationCorrect() throws Exception {
+    assertDecodingMatchesGold("joshua-berkeleylm.config", "output-berkeleylm.gold");
+  }
+
+  /**
+   * Decodes with {@code joshua-classlm.config} and compares against
+   * {@code output-classlm.gold}.
+   */
+  @Test
+  public void givenBnEnInput_whenPhraseDecodingWithClassLM_thenScoreAndTranslationCorrect() throws Exception {
+    assertDecodingMatchesGold("joshua-classlm.config", "output-classlm.gold");
+  }
+
+  /**
+   * Shared body of all tests: decodes {@code input.bn} with the given
+   * configuration and asserts that the output equals the given gold file.
+   *
+   * @param configName file name of the decoder configuration inside {@link #RESOURCE_DIR}
+   * @param goldName file name of the expected output inside {@link #RESOURCE_DIR}
+   * @throws Exception if a file cannot be read or the decoder cannot be set up
+   */
+  private void assertDecodingMatchesGold(String configName, String goldName) throws Exception {
+    // Given
+    final List<String> inputStrings = loadSentencesFromFile(RESOURCE_DIR + "input.bn");
+
+    // When
+    configureDecoder(RESOURCE_DIR + configName);
+    final List<String> decodedStrings = decodeList(inputStrings);
+
+    // Then
+    final List<String> goldStrings = loadSentencesFromFile(RESOURCE_DIR + goldName);
+    assertEquals(decodedStrings, goldStrings);
+  }
+
+  /**
+   * Reads a text file into a list with one entry per line.
+   *
+   * @param pathToFile path of the file to read
+   * @return the lines of the file, in order
+   * @throws IOException if the file cannot be read
+   */
+  private static List<String> loadSentencesFromFile(String pathToFile) throws IOException {
+    // readAllLines closes the file before returning; the stream returned by
+    // Files.lines would keep it open until explicitly closed.
+    return Files.readAllLines(Paths.get(pathToFile));
+  }
+
+  /**
+   * Loads the given configuration file into a fresh {@link JoshuaConfiguration}
+   * and creates the decoder from it.
+   *
+   * <p>NOTE(review): the decoder is constructed inside
+   * {@code KenLmTestUtil.Guard}; presumably this handles a missing KenLM
+   * library - confirm against that helper.
+   *
+   * @param pathToConfig path of the decoder configuration file
+   * @throws Exception if reading the configuration or creating the decoder fails
+   */
+  private void configureDecoder(String pathToConfig) throws Exception {
+    joshuaConfig = new JoshuaConfiguration();
+    joshuaConfig.readConfigFile(pathToConfig);
+    KenLmTestUtil.Guard(() -> decoder = new Decoder(joshuaConfig, ""));
+  }
+
+  /**
+   * Decodes every input string with the configured decoder.
+   *
+   * @param inputStrings A list of strings that should be decoded
+   * @return A list of decoded strings. If the decoder produces
+   *         an n-best list, then each translation of the n-best list
+   *         has its own entry in the returned list.
+   */
+  private List<String> decodeList(List<String> inputStrings) {
+    final List<String> decodedStrings = new ArrayList<>();
+
+    for (String inputString : inputStrings) {
+      // Every sentence is created with id 0.
+      final Sentence sentence = new Sentence(inputString, 0, joshuaConfig);
+      final String[] nBestList = decoder.decode(sentence).toString().split(N_BEST_SEPARATOR);
+      decodedStrings.addAll(Arrays.asList(nBestList));
+    }
+
+    return decodedStrings;
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/de364bd5/src/test/resources/bn-en/hiero/class_lm_2gram.gz
----------------------------------------------------------------------
diff --git a/src/test/resources/bn-en/hiero/class_lm_2gram.gz b/src/test/resources/bn-en/hiero/class_lm_2gram.gz
deleted file mode 100644
index 27b6d1a..0000000
Binary files a/src/test/resources/bn-en/hiero/class_lm_2gram.gz and /dev/null differ
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/de364bd5/src/test/resources/bn-en/hiero/joshua-berkeleylm.config
----------------------------------------------------------------------
diff --git a/src/test/resources/bn-en/hiero/joshua-berkeleylm.config b/src/test/resources/bn-en/hiero/joshua-berkeleylm.config
index e1bf2f6..53c8fe4 100644
--- a/src/test/resources/bn-en/hiero/joshua-berkeleylm.config
+++ b/src/test/resources/bn-en/hiero/joshua-berkeleylm.config
@@ -1,29 +1,30 @@
-feature-function = LanguageModel -lm_type berkeleylm -lm_order 5 -lm_file lm.gz
+# feature functions
+feature-function = LanguageModel -lm_type berkeleylm -lm_order 5 -lm_file src/test/resources/bn-en/hiero/lm.gz
+feature-function = OOVPenalty
+feature-function = WordPenalty
-tm = thrax pt 12 grammar.gz
-tm = thrax glue -1 glue-grammar
+# tm
+tm = thrax -owner pt -maxspan 12 -path src/test/resources/bn-en/hiero/grammar.gz
+tm = thrax -owner glue -maxspan -1 -path src/test/resources/bn-en/hiero/glue-grammar
mark-oovs = false
-#tm config
+# tm config
default_non_terminal=X
goalSymbol=GOAL
-#pruning config
+# pruning config
pop-limit = 100
-#nbest config
+# nbest config
use_unique_nbest = true
-top_n = 1
+top_n = 10
-feature-function = OOVPenalty
-feature-function = WordPenalty
+# output format
+output-format = "%c %s"
-###### model weights
-#lm order weight
+# model weights
lm_0 1.2373676802179452
-
-#phrasemodel owner column(0-indexed) weight
tm_pt_0 -2.4497429277910214
tm_pt_1 0.7224581556224123
tm_pt_2 -0.31689069155153504
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/de364bd5/src/test/resources/bn-en/hiero/joshua-classlm.config
----------------------------------------------------------------------
diff --git a/src/test/resources/bn-en/hiero/joshua-classlm.config b/src/test/resources/bn-en/hiero/joshua-classlm.config
index 3be7392..d0bee40 100644
--- a/src/test/resources/bn-en/hiero/joshua-classlm.config
+++ b/src/test/resources/bn-en/hiero/joshua-classlm.config
@@ -1,32 +1,33 @@
-feature-function = StateMinimizingLanguageModel -lm_type kenlm -lm_order 5 -lm_file lm.gz
-
-# Class LM feature
-feature-function = StateMinimizingLanguageModel -lm_type kenlm -lm_order 9 -lm_file class_lm_9gram.gz -class_map class.map
+# feature functions
+feature-function = StateMinimizingLanguageModel -lm_type kenlm -lm_order 5 -lm_file src/test/resources/bn-en/hiero/lm.gz
+feature-function = WordPenalty
+feature-function = OOVPenalty
-###### Old format for lms
-# lm = kenlm 5 false false 100 lm.gz
+# class LM feature
+feature-function = StateMinimizingLanguageModel -lm_type kenlm -lm_order 9 -lm_file src/test/resources/bn-en/hiero/class_lm_9gram.gz -class_map src/test/resources/bn-en/hiero/class.map
-# tm = TYPE OWNER MAX_SPAN PATH
-tm = thrax pt 12 grammar.gz
-tm = thrax glue -1 glue-grammar
+# tm
+tm = thrax -owner pt -maxspan 12 -path src/test/resources/bn-en/hiero/grammar.gz
+tm = thrax -owner glue -maxspan -1 -path src/test/resources/bn-en/hiero/glue-grammar
mark_oovs=false
-#tm config
+# tm config
default_non_terminal=X
goalSymbol=GOAL
-#pruning config
+# pruning config
pop-limit = 10
-#nbest config
+# nbest config
use_unique_nbest=true
top_n = 10
-feature-function = WordPenalty
-feature-function = OOVPenalty
+# output format
+output-format = "%c %s"
+
-###### model weights
+# model weights
lm_0 1.2373676802179452
lm_1 1.2373676802179452
tm_pt_0 -2.4497429277910214
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/de364bd5/src/test/resources/bn-en/hiero/joshua.config
----------------------------------------------------------------------
diff --git a/src/test/resources/bn-en/hiero/joshua.config b/src/test/resources/bn-en/hiero/joshua.config
index 5a51698..3942a41 100644
--- a/src/test/resources/bn-en/hiero/joshua.config
+++ b/src/test/resources/bn-en/hiero/joshua.config
@@ -1,31 +1,29 @@
-feature-function = LanguageModel -lm_type kenlm -lm_order 5 -minimizing false -lm_file lm.gz
-
-###### Old format for lms
-# lm = kenlm 5 false false 100 lm.gz
+# feature functions
+feature-function = LanguageModel -lm_type kenlm -lm_order 5 -minimizing false -lm_file src/test/resources/bn-en/hiero/lm.gz
+feature-function = OOVPenalty
+feature-function = WordPenalty
-# tm = TYPE OWNER MAX_SPAN PATH
-#tm = thrax pt 12 grammar.gz
-#tm = thrax glue -1 glue-grammar
-tm = thrax -owner pt -maxspan 12 -path grammar.gz
-tm = thrax -owner glue -maxspan -1 -path glue-grammar
+# tm
+tm = thrax -owner pt -maxspan 12 -path src/test/resources/bn-en/hiero/grammar.gz
+tm = thrax -owner glue -maxspan -1 -path src/test/resources/bn-en/hiero/glue-grammar
mark_oovs=false
-#tm config
+# tm config
default_non_terminal=X
goalSymbol=GOAL
-#pruning config
+# pruning config
pop-limit = 10
-#nbest config
+# nbest config
use_unique_nbest=true
top_n = 10
-feature-function = OOVPenalty
-feature-function = WordPenalty
+# output format
+output-format = "%c %s"
-###### model weights
+# model weights
lm_0 1.2373676802179452
lm_1 1.2373676802179452
tm_pt_0 -2.4497429277910214