You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/09/14 18:20:05 UTC
[06/10] incubator-joshua git commit: Moved regression test decoder/lowercaser to unit test

Moved regression test decoder/lowercaser to unit test


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/a833bd65
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/a833bd65
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/a833bd65

Branch: refs/heads/master
Commit: a833bd651dde156a19f0414f7e6d380778daaad4
Parents: 74e6c00
Author: Michael A. Hedderich <mi...@users.noreply.github.com>
Authored: Wed Sep 14 17:43:59 2016 +0200
Committer: Michael A. Hedderich <mi...@users.noreply.github.com>
Committed: Wed Sep 14 17:43:59 2016 +0200

----------------------------------------------------------------------
 .../joshua/decoder/cky/LowercaseTest.java       | 115 +++++++++++++++
 .../resources/decoder/lowercaser/joshua.config  | 140 +++++++++++++++++++
 .../resources/decoder/lowercaser/output.gold    |   5 -
 src/test/resources/decoder/lowercaser/test.sh   |  40 ------
 4 files changed, 255 insertions(+), 45 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/a833bd65/src/test/java/org/apache/joshua/decoder/cky/LowercaseTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/joshua/decoder/cky/LowercaseTest.java b/src/test/java/org/apache/joshua/decoder/cky/LowercaseTest.java
new file mode 100644
index 0000000..e3f0aac
--- /dev/null
+++ b/src/test/java/org/apache/joshua/decoder/cky/LowercaseTest.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.cky;
+
+import static org.apache.joshua.decoder.cky.TestUtil.translate;
+import static org.testng.Assert.assertEquals;
+
+import org.apache.joshua.decoder.Decoder;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.testng.annotations.AfterMethod;
+import org.testng.annotations.Test;
+
+public class LowercaseTest {
+
+  private static final String INPUT_ALL_UPPERCASED = "ELLA";
+  private static final String INPUT_CAPITALIZED = "Ella";
+
+  private static final String GOLD_UNTRANSLATED_ALL_UPPERCASED = "ELLA";
+  private static final String GOLD_LOWERCASED = "she";
+  private static final String GOLD_CAPITALIZED = "She";
+  private static final String GOLD_ALL_UPPERCASED = "SHE";
+  
+  private static final String JOSHUA_CONFIG_PATH = "src/test/resources/decoder/lowercaser/joshua.config";
+
+  private JoshuaConfiguration joshuaConfig;
+  private Decoder decoder;
+
+  /**
+   * No match in phrase table (only contains ella), therefore passed through
+   * untranslated. Lowercasing, case projection and capitalization are all off.
+   * @throws Exception if setting up the decoder or translating fails
+   */
+  @Test
+  public void givenAllUppercasedInput_whenNotLowercasing_thenLowercasedRuleNotFound() throws Exception {
+    setUp(false, false, false);
+    String output = translate(INPUT_ALL_UPPERCASED, decoder, joshuaConfig);
+    assertEquals(output.trim(), GOLD_UNTRANSLATED_ALL_UPPERCASED);
+  }
+  
+  /**
+   * Match in phrase table (only contains ella) once lowercasing is on, therefore translated.
+   * @throws Exception if setting up the decoder or translating fails
+   */
+  @Test
+  public void givenAllUppercasedInput_whenLowercasing_thenLowercasedRuleFound() throws Exception {
+    setUp(true, false, false);
+    String output = translate(INPUT_ALL_UPPERCASED, decoder, joshuaConfig);
+    assertEquals(output.trim(), GOLD_LOWERCASED);
+  }
+  
+  /**
+   * Matches phrase table, not capitalized because projected from first word of sentence.
+   * @throws Exception if setting up the decoder or translating fails
+   */
+  @Test
+  public void givenCapitalizedInput_whenLowercasingAndProjecting_thenLowercased() throws Exception {
+    setUp(true, true, false);
+    String output = translate(INPUT_CAPITALIZED, decoder, joshuaConfig);
+    assertEquals(output.trim(), GOLD_LOWERCASED);
+  }
+  
+  /**
+   * Matches phrase table, capitalized because of the "%S" output-format.
+   * @throws Exception if setting up the decoder or translating fails
+   */
+  @Test
+  public void givenCapitalizedInput_whenLowercasingAndOutputFormatCapitalization_thenCapitalized() throws Exception {
+    setUp(true, true, true);
+    String output = translate(INPUT_CAPITALIZED, decoder, joshuaConfig);
+    assertEquals(output.trim(), GOLD_CAPITALIZED);
+  }
+  
+  /**
+   * Matches phrase table, case projected because the input is all caps (output-format stays "%s").
+   * @throws Exception if setting up the decoder or translating fails
+   */
+  @Test
+  public void givenAllUppercasedInput_whenLowercasingAndProjecting_thenAllUppercased() throws Exception {
+    setUp(true, true, false);
+    String output = translate(INPUT_ALL_UPPERCASED, decoder, joshuaConfig);
+    assertEquals(output.trim(), GOLD_ALL_UPPERCASED);
+  }
+
+  public void setUp(boolean lowercase, boolean projectCase, boolean capitalize) throws Exception { // called explicitly by each test with its own flags
+    joshuaConfig = new JoshuaConfiguration();
+    joshuaConfig.readConfigFile(JOSHUA_CONFIG_PATH); // path is relative; presumably resolved against the working directory — confirm
+    joshuaConfig.lowercase = lowercase; // the three assignments below override whatever the config file set
+    joshuaConfig.project_case = projectCase;
+    joshuaConfig.outputFormat = capitalize ? "%S" : "%s"; // tests expect "%S" to give a capitalized sentence
+    decoder = new Decoder(joshuaConfig, "");
+  }
+  
+  @AfterMethod
+  public void tearDown() throws Exception {
+    decoder.cleanUp(); // NOTE(review): decoder is still null if setUp threw before assigning it — consider a null guard
+    decoder = null; // drop the reference so the next test builds a fresh decoder in setUp
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/a833bd65/src/test/resources/decoder/lowercaser/joshua.config
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/lowercaser/joshua.config b/src/test/resources/decoder/lowercaser/joshua.config
new file mode 100644
index 0000000..6f5a46b
--- /dev/null
+++ b/src/test/resources/decoder/lowercaser/joshua.config
@@ -0,0 +1,140 @@
+# This file is a template for the Joshua pipeline; variables enclosed
+# in <angle-brackets> are substituted by the pipeline script as
+# appropriate.  This file also serves to document Joshua's many
+# parameters.
+
+# These are the grammar file specifications.  Joshua supports an
+# arbitrary number of grammar files, each specified on its own line
+# using the following format:
+#
+#   tm = TYPE OWNER LIMIT FILE
+# 
+# TYPE is "packed", "thrax", or "samt".  The latter denotes the format
+# used in Zollmann and Venugopal's SAMT decoder
+# (http://www.cs.cmu.edu/~zollmann/samt/).
+# 
+# OWNER is the "owner" of the rules in the grammar; this is used to
+# determine which set of phrasal features apply to the grammar's
+# rules.  Having different owners allows different features to be
+# applied to different grammars, and for grammars to share features
+# across files.
+#
+# LIMIT is the maximum input span permitted for the application of
+# grammar rules found in the grammar file.  A value of -1 implies no limit.
+#
+# FILE is the grammar file (or directory when using packed grammars).
+# The file can be compressed with gzip, which is determined by the
+# presence or absence of a ".gz" file extension.
+#
+# By a convention defined by Chiang (2007), the grammars are split
+# into two files: the main translation grammar containing all the
+# learned translation rules, and a glue grammar which supports
+# monotonic concatenation of hierarchical phrases. The glue grammar's
+# main distinction from the regular grammar is that the span limit
+# does not apply to it.  
+
+tm = hiero -maxspan 20 -path src/test/resources/decoder/lowercaser/grammar.test -owner pt
+tm = thrax -path src/test/resources/decoder/lowercaser/grammar.glue -maxspan -1 -owner glue
+
+# This symbol is used over unknown words in the source language
+
+default-non-terminal = X
+
+# This is the goal nonterminal, used to determine when a complete
+# parse is found.  It should correspond to the root-level rules in the
+# glue grammar.
+
+goal-symbol = GOAL
+
+# Language model config.
+#
+# Multiple language models are supported.  For each language model,
+# create one of the following lines:
+#
+# feature-function = LanguageModel -lm_type TYPE -lm_order ORDER -lm_file FILE
+# feature-function = StateMinimizingLanguageModel -lm_order ORDER -lm_file FILE
+#
+# - TYPE is one of "kenlm" or "berkeleylm"
+# - ORDER is the order of the language model (default 5)
+# - FILE is the path to the LM file. This can be binarized if appropriate to the type
+#   (e.g., KenLM has a compiled format)
+#
+# A state-minimizing LM collapses left-state. Currently only KenLM supports this.
+#
+# For each LM, add a weight lm_INDEX below, where indexing starts from 0.
+
+
+
+# The suffix _OOV is appended to unknown source-language words if this
+# is set to true.
+
+mark-oovs = false
+
+# The search algorithm: "cky" for hierarchical / phrase-based decoding, 
+# "stack" for phrase-based decoding
+search = cky
+
+# The pop-limit for decoding.  This determines how many hypotheses are
+# considered over each span of the input.
+
+pop-limit = 100
+
+# How many hypotheses to output
+
+top-n = 1
+
+# Whether those hypotheses should be distinct strings
+
+use-unique-nbest = true
+
+# This is the default format of the output printed to STDOUT.  The variables that can be
+# substituted are:
+#
+# %i: the sentence number (0-indexed)
+# %s: the translated sentence
+# %t: the derivation tree
+# %f: the feature string
+# %c: the model cost
+
+output-format = %s
+
+# When printing the trees (%t in 'output-format'), this controls whether the alignments
+# are also printed.
+
+include-align-index = false
+
+# And these are the feature functions to activate.
+feature-function = OOVPenalty
+feature-function = WordPenalty
+
+## Model weights #####################################################
+
+# For each language model line listed above, create a weight in the
+# following format: the keyword "lm", a 0-based index, and the weight.
+# lm_INDEX WEIGHT
+
+
+# The phrasal weights correspond to weights stored with each of the
+# grammar rules.  The format is
+#
+#   tm_OWNER_COLUMN WEIGHT
+#
+# where COLUMN denotes the 0-based order of the parameter in the
+# grammar file and WEIGHT is the corresponding weight.  In the future,
+# we plan to add a sparse feature representation which will simplify
+# this.
+
+# The wordpenalty feature counts the number of words in each hypothesis.
+
+
+# This feature counts the number of unknown words in the hypothesis.
+
+
+# This feature weights paths through an input lattice.  It is only activated
+# when decoding lattices.
+
+WordPenalty -4.72455379476569
+OOVPenalty 0.7897219562429866
+tm_pt_0 0.3137696816891433
+tm_glue_0 -0.04493059277470993
+

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/a833bd65/src/test/resources/decoder/lowercaser/output.gold
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/lowercaser/output.gold b/src/test/resources/decoder/lowercaser/output.gold
deleted file mode 100644
index ea1d2bc..0000000
--- a/src/test/resources/decoder/lowercaser/output.gold
+++ /dev/null
@@ -1,5 +0,0 @@
-ELLA
-she
-she
-She
-SHE

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/a833bd65/src/test/resources/decoder/lowercaser/test.sh
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/lowercaser/test.sh b/src/test/resources/decoder/lowercaser/test.sh
deleted file mode 100755
index 875ae57..0000000
--- a/src/test/resources/decoder/lowercaser/test.sh
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-set -u
-
-(
-# no match to phrase table, outputs ELLA
-echo -e "ELLA" | $JOSHUA/bin/joshua-decoder -config config
-# matches phrase table, outputs she
-echo -e "ELLA" | $JOSHUA/bin/joshua-decoder -config config -lowercase
-# matches phrase table, not capitalized because projected from first word of sentence, outputs she
-echo -e "Ella" | $JOSHUA/bin/joshua-decoder -config config -lowercase -project-case
-# matches phrase table, capitalized because of output-format
-echo -e "Ella" | $JOSHUA/bin/joshua-decoder -config config -lowercase -project-case -output-format %S
-# matches phrase table, projected case because all caps, outputs SHE
-echo -e "ELLA" | $JOSHUA/bin/joshua-decoder -config config -lowercase -project-case
-) > output 2> log
-
-diff -u output output.gold > diff
-
-if [ $? -eq 0 ]; then
-    rm -f log output diff
-    exit 0
-else
-    exit 1
-fi