You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/09/17 12:28:03 UTC

[14/14] incubator-joshua git commit: First BnEnHiero test working (rest following soon)

First BnEnHiero test working (rest following soon)

Added a script to convert grammar files. It needs to be run from $JOSHUA/joshua-core because it is not smart about fixing path names, and needs to look at the grammar files to figure out what class type to use. It could be rewritten better to actually build a JSON output that gets written; as it is, it just builds text. But it works!


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/dc6cf996
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/dc6cf996
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/dc6cf996

Branch: refs/heads/7_confsystem
Commit: dc6cf996772c0c58ce7098abee889d031f697505
Parents: 6f3e3c4
Author: Matt Post <po...@cs.jhu.edu>
Authored: Sat Sep 17 14:27:27 2016 +0200
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Sat Sep 17 14:27:27 2016 +0200

----------------------------------------------------------------------
 .../joshua/decoder/cky/BnEnDecodingTest.java    | 162 +++++++------------
 .../org/apache/joshua/decoder/cky/TestUtil.java | 118 +++++++-------
 scripts/compat/sevenize_my_conf_plz.py          | 114 +++++++++++++
 3 files changed, 232 insertions(+), 162 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/dc6cf996/joshua-core/src/test/java/org/apache/joshua/decoder/cky/BnEnDecodingTest.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/test/java/org/apache/joshua/decoder/cky/BnEnDecodingTest.java b/joshua-core/src/test/java/org/apache/joshua/decoder/cky/BnEnDecodingTest.java
index 82faa58..3bf3711 100644
--- a/joshua-core/src/test/java/org/apache/joshua/decoder/cky/BnEnDecodingTest.java
+++ b/joshua-core/src/test/java/org/apache/joshua/decoder/cky/BnEnDecodingTest.java
@@ -1,23 +1,23 @@
 /*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
  *
- *  http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
  *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
  */
 package org.apache.joshua.decoder.cky;
 
+import org.testng.annotations.AfterMethod;
+import org.testng.annotations.Test;
+import org.testng.annotations.BeforeMethod;
+import static com.typesafe.config.ConfigFactory.parseResources;
 import static org.apache.joshua.decoder.cky.TestUtil.decodeList;
 import static org.apache.joshua.decoder.cky.TestUtil.loadStringsFromFile;
 import static org.testng.Assert.assertEquals;
@@ -25,97 +25,57 @@ import static org.testng.Assert.assertEquals;
 import java.util.List;
 
 import org.apache.joshua.decoder.Decoder;
-import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.Translation;
 import org.apache.joshua.util.io.KenLmTestUtil;
-import org.testng.annotations.AfterMethod;
-import org.testng.annotations.Test;
-
-public class BnEnDecodingTest {
-
-	private JoshuaConfiguration joshuaConfig;
-	private Decoder decoder;
-
-	@AfterMethod
-	public void tearDown() throws Exception {
-		if(decoder != null) {
-			decoder.cleanUp();
-			decoder = null;
-		}
-	}
-
-	@Test
-	public void givenBnEnInput_whenPhraseDecoding_thenScoreAndTranslationCorrect() throws Exception {
-		// Given
-		List<String> inputStrings = loadStringsFromFile("src/test/resources/bn-en/hiero/input.bn");
+import org.testng.annotations.DataProvider;
 
-		// When
-		configureDecoder("src/test/resources/bn-en/hiero/joshua.config");
-		List<String> decodedStrings = decodeList(inputStrings, decoder, joshuaConfig);
 
-		// Then
-		List<String> goldStrings = loadStringsFromFile("src/test/resources/bn-en/hiero/output.gold");
-		assertEquals(decodedStrings, goldStrings);
-	}
 
-	@Test
-	public void givenBnEnInput_whenPhraseDecodingWithBerkeleyLM_thenScoreAndTranslationCorrect() throws Exception {
-		// Given
-		List<String> inputStrings = loadStringsFromFile("src/test/resources/bn-en/hiero/input.bn");
+import com.typesafe.config.Config;
 
-		// When
-		configureDecoder("src/test/resources/bn-en/hiero/joshua-berkeleylm.config");
-		List<String> decodedStrings = decodeList(inputStrings, decoder, joshuaConfig);
-
-		// Then
-		List<String> goldStrings = loadStringsFromFile("src/test/resources/bn-en/hiero/output-berkeleylm.gold");
-		assertEquals(decodedStrings, goldStrings);
-	}
-
-	@Test
-	public void givenBnEnInput_whenPhraseDecodingWithClassLM_thenScoreAndTranslationCorrect() throws Exception {
-		// Given
-		List<String> inputStrings = loadStringsFromFile("src/test/resources/bn-en/hiero/input.bn");
-
-		// When
-		configureDecoder("src/test/resources/bn-en/hiero/joshua-classlm.config");
-		List<String> decodedStrings = decodeList(inputStrings, decoder, joshuaConfig);
-
-		// Then
-		List<String> goldStrings = loadStringsFromFile("src/test/resources/bn-en/hiero/output-classlm.gold");
-		assertEquals(decodedStrings, goldStrings);
-	}
-	
-	@Test
-	public void givenBnEnInput_whenPhraseDecodingWithPackedGrammar_thenScoreAndTranslationCorrect() throws Exception {
-		// Given
-		List<String> inputStrings = loadStringsFromFile("src/test/resources/bn-en/packed/input.bn");
-
-		// When
-		configureDecoder("src/test/resources/bn-en/packed/joshua.config");
-		List<String> decodedStrings = decodeList(inputStrings, decoder, joshuaConfig);
-
-		// Then
-		List<String> goldStrings = loadStringsFromFile("src/test/resources/bn-en/packed/output.gold");
-		assertEquals(decodedStrings, goldStrings);
-	}
-	
-	@Test
-	public void givenBnEnInput_whenPhraseDecodingWithSAMT_thenScoreAndTranslationCorrect() throws Exception {
-		// Given
-		List<String> inputStrings = loadStringsFromFile("src/test/resources/bn-en/samt/input.bn");
-
-		// When
-		configureDecoder("src/test/resources/bn-en/samt/joshua.config");
-		List<String> decodedStrings = decodeList(inputStrings, decoder, joshuaConfig);
+public class BnEnDecodingTest {
 
-		// Then
-		List<String> goldStrings = loadStringsFromFile("src/test/resources/bn-en/samt/output.gold");
-		assertEquals(decodedStrings, goldStrings);
-	}
-	
-	public void configureDecoder(String pathToConfig) throws Exception {
-		joshuaConfig = new JoshuaConfiguration();
-		joshuaConfig.readConfigFile(pathToConfig);
-		KenLmTestUtil.Guard(() -> decoder = new Decoder(joshuaConfig));
-	}
+  private Decoder decoder;
+  Translation translation = null;
+
+  @BeforeMethod
+  public void setUp() throws Exception { // intentionally empty: the test method builds its own decoder
+  }
+  
+  @DataProvider(name = "testFiles")
+  public Object[][] lmFiles() { // each row: {config resource, input resource, gold-output resource}
+    return new Object[][]{
+      {"BnEnHieroTest.conf", "BnEnHiero.in", "BnEnHieroTest.gold"},
+//      {"BnEnBerkeleyLMTest.conf", "BnEnHiero.in", "BnEnBerkeleyLMTest.gold"},
+//      {"BnEnClassLMTest.conf" , "BnEnHiero.in", "BnEnClassLMTest.gold"},
+//      {"BnEnPackedTest.conf", "BnEn.in", "BnEnPackedTest.gold"},
+//      {"BnEnSAMTTest.conf", "BnEn.in", "BnEnSAMTTest.gold"}
+      }; // only the Hiero row is active; the rows commented out above are not run
+  }
+  
+  @AfterMethod
+  public void tearDown() throws Exception { // runs after each test method (TestNG @AfterMethod)
+    if (decoder != null) { // the field stays null if the test failed before creating a decoder
+      decoder.cleanUp();
+      decoder = null; // drop the reference so the next data-provider row starts without a decoder
+    }
+    translation = null;
+  }
+
+  @Test(dataProvider = "testFiles")
+  public void givenBnEnInput_whenDecoding_thenScoreAndTranslationCorrect(String confFile, String inFile, String goldFile) throws Exception {
+    // Given: the input sentences, one per line (resource looked up relative to this class).
+    List<String> inputStrings = loadStringsFromFile(this.getClass().getResource(inFile).getFile());
+    // NOTE(review): URL.getFile() above is not URL-decoded; a checkout path with spaces would break it.
+    // When: the test's conf resource is parsed, with the decoder's default flags as fallback.
+    Config config = parseResources(this.getClass(), confFile)
+        .withFallback(Decoder.getDefaultFlags());
+    decoder = new Decoder(config);
+
+    List<String> decodedStrings = decodeList(inputStrings, decoder);
+
+    // Then: the decoder output must equal the lines of the gold file.
+    List<String> goldStrings = loadStringsFromFile(this.getClass().getResource(goldFile).getFile());
+    assertEquals(decodedStrings, goldStrings);
+  }
 }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/dc6cf996/joshua-core/src/test/java/org/apache/joshua/decoder/cky/TestUtil.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/test/java/org/apache/joshua/decoder/cky/TestUtil.java b/joshua-core/src/test/java/org/apache/joshua/decoder/cky/TestUtil.java
index 640150e..0855e99 100644
--- a/joshua-core/src/test/java/org/apache/joshua/decoder/cky/TestUtil.java
+++ b/joshua-core/src/test/java/org/apache/joshua/decoder/cky/TestUtil.java
@@ -1,20 +1,16 @@
 /*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
  *
- *  http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
  *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
  */
 package org.apache.joshua.decoder.cky;
 
@@ -28,57 +24,57 @@ import java.util.stream.Collectors;
 
 import org.apache.joshua.decoder.Decoder;
 import org.apache.joshua.decoder.segment_file.Sentence;
+import org.apache.joshua.util.io.KenLmTestUtil;
 
-public class TestUtil {
+import com.typesafe.config.Config;
 
-	public static final String N_BEST_SEPARATOR = "\n";
+public class TestUtil {
 
-	/**
-	 * Loads a text file and returns a list containing one string per line
-	 * in the file.
-	 * @param pathToFile
-	 * @return
-	 * @throws IOException
-	 */
-	public static List<String> loadStringsFromFile(String pathToFile) throws IOException {
-		List<String> inputLines = Files.lines(Paths.get(pathToFile)).collect(Collectors.toList());
-		return inputLines;
-	}
+  public static final String N_BEST_SEPARATOR = "\n";
+  
+  /**
+   * Loads a text file and returns a list containing one string per line in the file.
+   *
+   * @param pathToFile path of the text file to read
+   * @return the lines of the file, in file order
+   * @throws IOException if the file cannot be opened or read
+   */
+  public static List<String> loadStringsFromFile(String pathToFile) throws IOException {
+    // readAllLines closes the file itself; the Stream returned by Files.lines was never closed.
+    return Files.readAllLines(Paths.get(pathToFile));
+  }
+  
+  /**
+   * Decodes every input string in order and collects the decoder output.
+   *
+   * @param inputStrings A list of strings that should be decoded,
+   * @param decoder An initialized decoder; its flags are also used to build each Sentence.
+   * @return A list of decoded strings. If the decoder produces a n-best list (separated by
+   *         N_BEST_SEPARATOR), then each translation of the n-best list has its own entry in the
+   *         returned list.
+   */
+  public static List<String> decodeList(List<String> inputStrings, Decoder decoder) {
+    final List<String> decodedStrings = new ArrayList<>();
 
-	/**
-	 * 
-	 * @param inputStrings
-	 *            A list of strings that should be decoded,
-	 * @param decoder
-	 *            An initialized decoder,
-	 * @param joshuaConfig
-	 *            The JoshuaConfiguration corresponding to the decoder.
-	 * @return A list of decoded strings. If the decoder produces a n-best list
-	 *         (separated by N_BEST_SEPARATOR), then each translation of the
-	 *         n-best list has its own entry in the returned list.
-	 */
-	public static List<String> decodeList(List<String> inputStrings, Decoder decoder) {
-		final List<String> decodedStrings = new ArrayList<>();
+    for (String inputString : inputStrings) {
+      final Sentence sentence = new Sentence(inputString, 0, decoder.getFlags());
+      final String[] nBestList = decoder.decode(sentence).toString().split(N_BEST_SEPARATOR);
+      decodedStrings.addAll(Arrays.asList(nBestList));
+    }
 
-		for (String inputString : inputStrings) {
-			final Sentence sentence = new Sentence(inputString, 0, decoder.getFlags());
-			final String[] nBestList = decoder.decode(sentence).toString().split(N_BEST_SEPARATOR);
-			decodedStrings.addAll(Arrays.asList(nBestList));
-		}
+    return decodedStrings;
+  }
 
-		return decodedStrings;
-	}
-	
-	/**
-	 * Translates the given input string and returns the translation
-	 * converted into a string.
-	 * @param input
-	 * @param decoder
-	 * @param joshuaConfig
-	 * @return
-	 */
-	public static String translate(String input, Decoder decoder) {
-	    final Sentence sentence = new Sentence(input, 0, decoder.getFlags());
-	    return decoder.decode(sentence).toString();
-	}
+  /**
+   * Translates the given input string and returns the translation converted into a string.
+   *
+   * @param input the source text; wrapped in a Sentence(input, 0, decoder.getFlags())
+   * @param decoder the decoder to run; its flags are also passed to the Sentence
+   * @return the toString() form of the decoder's result; unlike decodeList(), it is not split on
+   *         N_BEST_SEPARATOR
+   */
+  public static String translate(String input, Decoder decoder) {
+    final Sentence sentence = new Sentence(input, 0, decoder.getFlags());
+    return decoder.decode(sentence).toString();
+  }
 }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/dc6cf996/scripts/compat/sevenize_my_conf_plz.py
----------------------------------------------------------------------
diff --git a/scripts/compat/sevenize_my_conf_plz.py b/scripts/compat/sevenize_my_conf_plz.py
new file mode 100755
index 0000000..30fbf71
--- /dev/null
+++ b/scripts/compat/sevenize_my_conf_plz.py
@@ -0,0 +1,114 @@
+#!/usr/bin/python
+"""
+Converts Joshua 6 config files to the Joshua 7 format. It is not smart about paths,
+so make sure that you run it in the root directory of any relative paths.
+
+Usage:
+  cat joshua-v6.config | $JOSHUA/scripts/compat/sevenize_my_conf_plz.py \
+    > joshua.conf
+"""
+
+import os
+import re
+import sys
+
+weights = {}
+tms = []
+features = []
+
+def smooth_key(key):  # normalise an option name: '-' becomes '_', then 'maxspan' becomes 'span_limit'
+    return key.replace('-', '_').replace('maxspan', 'span_limit')
+
+def parse_args(line):
+    found = []
+    
+    """Assume the argument string is "-key value" pairs. Don't bother with error checking."""
+    tokens = line.split(' ')
+    type = tokens.pop(0)
+    for i in range(0, len(tokens), 2):
+        key = smooth_key(tokens[i][1:]) # strip leading -
+        val = tokens[i+1]
+
+        found.append('%s=%s' % (key, val))
+
+        if key == 'path':
+            if type == 'thrax' or type == 'hiero':
+                if os.path.isdir(val):
+                    type = 'PackedGrammar'
+                else:
+                    type = 'TextGrammar'
+
+    found.insert(0, 'class = %s' % (type))
+
+    return ", ".join(found)
+
+for line in sys.stdin:  # single pass over the Joshua 6 config read from stdin
+    line = line.rstrip()
+    # skip comment lines (leading '#') and blank lines
+    if line.startswith('#') or re.match(r'^\s*$', line):
+        continue
+    # a line without '=' is taken to be a "<name> <weight>" entry
+    if line.find('=') == -1:
+        name, weight = line.split(' ', 1)
+        weights[name] = weight
+
+    elif line.startswith('tm'):
+        # grammar line: keep only what follows the first '='
+        _, tm = re.split(r'\s*=\s*', line, 1)
+
+        tms.append(parse_args(tm))
+
+    elif line.startswith('feature-function'):
+        _, feature = re.split(r'\s*=\s*', line, 1)
+
+        features.append(parse_args(feature))
+
+    else:
+        key, value = re.split(r'\s*=\s*', line, 1)
+        key = smooth_key(key)
+        print key, '=', value  # any other setting is echoed right away with its key normalised
+
+print
+print 'feature_functions = ['
+for feature in features:
+    print '  {', feature, '}'
+print ']'
+
+print
+print 'grammars = ['
+for tm in tms:
+    print '  {', tm, '}'
+print ']'
+
+print
+print 'weights = {'
+for weight in weights.keys():
+    print ' ', weight, '=', weights[weight]
+print '}'
+
+"""
+top_n = 0
+use_unique_nbest = false
+output_format = %s | %a
+
+
+feature_functions = [
+  {class=OOVPenalty}
+]
+
+grammars=[
+  {class=TextGrammar, owner=pt, span_limit=20, path=src/test/resources/wa_grammar}
+  {class=TextGrammar, owner=glue, span_limit=-1, path=src/test/resources/grammar.glue}
+]
+
+weights = {
+  pt_0=-1
+  pt_1=-1
+  pt_2=-1
+  pt_3=-1
+  pt_4=-1
+  pt_5=-1
+  glue_0=-1
+  OOVPenalty=2
+}
+"""