You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/04/24 22:53:35 UTC

[14/18] incubator-joshua git commit: Added -lowercase option to enable source-side projection of case

Added -lowercase option to enable source-side projection of case

If you add -lowercase to Joshua, it will lowercase all input, adding an annotation to each token of the form

    lettercase = {lower, upper, all-upper}

Then, at output time, the source-side input case will be projected to the target side.


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/3f4fa992
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/3f4fa992
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/3f4fa992

Branch: refs/heads/morph
Commit: 3f4fa992803fd9a7ac6dc3c51d803b65fda9d83d
Parents: 4f2bec7
Author: Matt Post <po...@cs.jhu.edu>
Authored: Sun Apr 24 14:38:26 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Sun Apr 24 14:38:26 2016 -0400

----------------------------------------------------------------------
 src/joshua/decoder/JoshuaConfiguration.java     |   7 +
 .../decoder/hypergraph/KBestExtractor.java      |  37 ++++-
 .../hypergraph/WordAlignmentExtractor.java      |   2 -
 .../decoder/hypergraph/WordAlignmentState.java  |   1 -
 src/joshua/decoder/segment_file/Sentence.java   |   8 +-
 src/joshua/decoder/segment_file/Token.java      |  26 +++-
 src/joshua/lattice/Lattice.java                 |  31 ++--
 src/joshua/util/FormatUtils.java                |  19 +++
 test/decoder/lowercaser/config                  | 140 +++++++++++++++++++
 test/decoder/lowercaser/grammar.glue            |   4 +
 test/decoder/lowercaser/grammar.test            |   1 +
 test/decoder/lowercaser/output.gold             |   3 +
 test/decoder/lowercaser/test.sh                 |  18 +++
 13 files changed, 271 insertions(+), 26 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/src/joshua/decoder/JoshuaConfiguration.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/JoshuaConfiguration.java b/src/joshua/decoder/JoshuaConfiguration.java
index c61720c..6c8edf6 100644
--- a/src/joshua/decoder/JoshuaConfiguration.java
+++ b/src/joshua/decoder/JoshuaConfiguration.java
@@ -51,6 +51,10 @@ public class JoshuaConfiguration {
   // whether to construct a StructuredTranslation object for each request instead of 
   // printing to stdout. Used when the Decoder is used from Java directly.
   public Boolean use_structured_output = false;
+  
+  // If set to true, Joshua will lowercase the input, creating an annotation that marks the
+  // original case
+  public boolean lowercase = false;
 
   // List of grammar files to read
   public ArrayList<String> tms = new ArrayList<String>();
@@ -638,6 +642,9 @@ public class JoshuaConfiguration {
           } else if (parameter.equals(normalize_key("cached-rules-size"))) {
               // Check source sentence
               cachedRuleSize = Integer.parseInt(fds[1]);
+          } else if (parameter.equals(normalize_key("lowercase"))) {
+            lowercase = true;
+            
           } else {
 
             if (parameter.equals(normalize_key("use-sent-specific-tm"))

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/src/joshua/decoder/hypergraph/KBestExtractor.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/KBestExtractor.java b/src/joshua/decoder/hypergraph/KBestExtractor.java
index 42539cc..45b9ccb 100644
--- a/src/joshua/decoder/hypergraph/KBestExtractor.java
+++ b/src/joshua/decoder/hypergraph/KBestExtractor.java
@@ -42,6 +42,8 @@ import joshua.decoder.ff.state_maintenance.DPState;
 import joshua.decoder.ff.tm.Rule;
 import joshua.decoder.io.DeNormalize;
 import joshua.decoder.segment_file.Sentence;
+import joshua.decoder.segment_file.Token;
+import joshua.util.FormatUtils;
 
 /**
  * This class implements lazy k-best extraction on a hyper-graph.
@@ -185,12 +187,12 @@ public class KBestExtractor {
           .replaceAll("-lsb-", "[")
           .replaceAll("-rsb-", "]")
           .replaceAll("-pipe-", "|");
-
+      
 
       outputString = joshuaConfiguration.outputFormat
           .replace("%k", Integer.toString(k))
-          .replace("%s", hypothesis)
-          .replace("%S", DeNormalize.processSingleLine(hypothesis))
+          .replace("%s", recapitalize(hypothesis, node))
+          .replace("%S", DeNormalize.processSingleLine(recapitalize(hypothesis, node)))
           .replace("%i", Integer.toString(sentence.id()))
           .replace("%f", joshuaConfiguration.moses ? features.mosesString() : features.toString())
           .replace("%c", String.format("%.3f", derivationState.cost));
@@ -283,6 +285,35 @@ public class KBestExtractor {
     }
     return virtualNode;
   }
+  
+  private String recapitalize(String input, HGNode goalNode) {
+    WordAlignmentState alignment = ViterbiExtractor.buildViterbiAlignment(goalNode);
+
+    String[] tokens = input.split("\\s+");
+    
+    List<List<Integer>> points = alignment.toFinalList();
+    for (int i = 0; i < points.size(); i++) {
+      List<Integer> target = points.get(i);
+      for (int source: target) {
+        Token token = sentence.getTokens().get(source + 1); // skip <s>
+        String annotation = "";
+        if (token != null && token.getAnnotation("lettercase") != null)
+          annotation = token.getAnnotation("lettercase");
+        if (source != 0 && annotation.equals("upper"))
+          tokens[i] = FormatUtils.capitalize(tokens[i]);
+        else if (annotation.equals("all-upper"))
+          tokens[i] = tokens[i].toUpperCase();
+      }
+    }
+
+    String cap = new String();
+    for (int i = 0; i < tokens.length; i++) {
+      if (i > 0)
+        cap += " ";
+      cap += tokens[i];
+    }
+    return cap; 
+  }
 
 
   /**

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java b/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java
index 63619ee..8e0c2a6 100644
--- a/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java
+++ b/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java
@@ -2,8 +2,6 @@ package joshua.decoder.hypergraph;
 
 import java.util.Stack;
 
-import joshua.corpus.Vocabulary;
-import joshua.decoder.Decoder;
 import joshua.decoder.ff.tm.Rule;
 import joshua.decoder.hypergraph.KBestExtractor.DerivationState;
 import joshua.decoder.hypergraph.KBestExtractor.DerivationVisitor;

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/src/joshua/decoder/hypergraph/WordAlignmentState.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/WordAlignmentState.java b/src/joshua/decoder/hypergraph/WordAlignmentState.java
index e3b9598..d47fa38 100644
--- a/src/joshua/decoder/hypergraph/WordAlignmentState.java
+++ b/src/joshua/decoder/hypergraph/WordAlignmentState.java
@@ -1,7 +1,6 @@
 package joshua.decoder.hypergraph;
 
 import java.util.ArrayList;
-import java.util.HashMap;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.ListIterator;

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/src/joshua/decoder/segment_file/Sentence.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/segment_file/Sentence.java b/src/joshua/decoder/segment_file/Sentence.java
index c1eeca8..b51d509 100644
--- a/src/joshua/decoder/segment_file/Sentence.java
+++ b/src/joshua/decoder/segment_file/Sentence.java
@@ -191,7 +191,7 @@ public class Sentence {
             for (int i = 0; i <= chars.length - width; i++) {
               int j = i + width;
               if (width != chars.length) {
-                Token token = new Token(word.substring(i, j));
+                Token token = new Token(word.substring(i, j), config);
                 if (vocabulary.contains(id)) {
                   nodes.get(i).addArc(nodes.get(j), 0.0f, token);
                   wordChart.set(i, j, true);
@@ -386,7 +386,7 @@ public class Sentence {
    */
   public Lattice<String> stringLattice() {
     assert isLinearChain();
-    return Lattice.createStringLatticeFromString(source());
+    return Lattice.createStringLatticeFromString(source(), config);
   }
 
   public List<ConstraintSpan> constraints() {
@@ -400,10 +400,10 @@ public class Sentence {
           System.err.println("* FATAL: lattice decoding currently not supported for stack-based search algorithm.");
           System.exit(12);
         }
-        this.sourceLattice = Lattice.createTokenLatticeFromPLF(rawSource());
+        this.sourceLattice = Lattice.createTokenLatticeFromPLF(rawSource(), config);
       } else
         this.sourceLattice = Lattice.createTokenLatticeFromString(String.format("%s %s %s", Vocabulary.START_SYM,
-            rawSource(), Vocabulary.STOP_SYM));
+            rawSource(), Vocabulary.STOP_SYM), config);
     }
     return this.sourceLattice;
   }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/src/joshua/decoder/segment_file/Token.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/segment_file/Token.java b/src/joshua/decoder/segment_file/Token.java
index 12e2b68..ebe9a43 100644
--- a/src/joshua/decoder/segment_file/Token.java
+++ b/src/joshua/decoder/segment_file/Token.java
@@ -23,6 +23,9 @@ import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 import joshua.corpus.Vocabulary;
+import joshua.decoder.Decoder;
+import joshua.decoder.JoshuaConfiguration;
+import joshua.util.FormatUtils;
 
 /**
  * Stores the identity of a word and its annotations in a sentence.
@@ -36,6 +39,7 @@ public class Token {
   private int tokenID;
 
   private HashMap<String,String> annotations = null;
+  private JoshuaConfiguration joshuaConfiguration;
 
   /**
    * Constructor : Creates a Token object from a raw word
@@ -58,7 +62,9 @@ public class Token {
    * @param rawWord A word with annotation information (possibly)
    *  
    */
-  public Token(String rawWord) {
+  public Token(String rawWord, JoshuaConfiguration config) {
+    
+    this.joshuaConfiguration = config;
     
     annotations = new HashMap<String,String>();
     
@@ -89,9 +95,21 @@ public class Token {
         .replaceAll("\\]",  "-rsb-")
         .replaceAll("\\|",  "-pipe-");
 
+    if (joshuaConfiguration != null && joshuaConfiguration.lowercase) {
+      if (FormatUtils.ISALLUPPERCASE(token))
+        annotations.put("lettercase", "all-upper");
+      else if (Character.isUpperCase(token.charAt(0)))
+        annotations.put("lettercase",  "upper");
+      else
+        annotations.put("lettercase",  "lower");
+      
+      Decoder.LOG(2, String.format("TOKEN: %s -> %s (%s)", token, token.toLowerCase(), annotations.get("lettercase")));
+      token = token.toLowerCase(); 
+    }
+    
     tokenID = Vocabulary.id(token);
   }
-
+  
   /**
    * Returns the word ID (vocab ID) for this token
    * 
@@ -108,6 +126,10 @@ public class Token {
   public String getWordIdentity() {
     return token;
   }
+  
+  public String toString() {
+    return token;
+  }
 
   /**
    * Returns the annotationID (vocab ID)

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/src/joshua/lattice/Lattice.java
----------------------------------------------------------------------
diff --git a/src/joshua/lattice/Lattice.java b/src/joshua/lattice/Lattice.java
index abe43b2..bf2bf87 100644
--- a/src/joshua/lattice/Lattice.java
+++ b/src/joshua/lattice/Lattice.java
@@ -30,6 +30,7 @@ import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 import joshua.corpus.Vocabulary;
+import joshua.decoder.JoshuaConfiguration;
 import joshua.decoder.segment_file.Token;
 import joshua.util.ChartSpan;
 
@@ -61,6 +62,8 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
 
   /** Logger for this class. */
   private static final Logger logger = Logger.getLogger(Lattice.class.getName());
+  
+  JoshuaConfiguration config = null;
 
   /**
    * Constructs a new lattice from an existing list of (connected) nodes.
@@ -70,13 +73,13 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
    * 
    * @param nodes A list of nodes which must be in topological order.
    */
-  public Lattice(List<Node<Value>> nodes) {
+  public Lattice(List<Node<Value>> nodes, JoshuaConfiguration config) {
     this.nodes = nodes;
 //    this.distances = calculateAllPairsShortestPath();
     this.latticeHasAmbiguity = true;
   }
 
-  public Lattice(List<Node<Value>> nodes, boolean isAmbiguous) {
+  public Lattice(List<Node<Value>> nodes, boolean isAmbiguous, JoshuaConfiguration config) {
     // Node<Value> sink = new Node<Value>(nodes.size());
     // nodes.add(sink);
     this.nodes = nodes;
@@ -89,7 +92,7 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
    * 
    * @param linearChain a sequence of Value objects
    */
-  public Lattice(Value[] linearChain) {
+  public Lattice(Value[] linearChain, JoshuaConfiguration config) {
     this.latticeHasAmbiguity = false;
     this.nodes = new ArrayList<Node<Value>>();
 
@@ -140,17 +143,17 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
    * @param linearChain
    * @return Lattice representation of the linear chain.
    */
-  public static Lattice<Token> createTokenLatticeFromString(String source) {
+  public static Lattice<Token> createTokenLatticeFromString(String source, JoshuaConfiguration config) {
     String[] tokens = source.split("\\s+");
     Token[] integerSentence = new Token[tokens.length];
     for (int i = 0; i < tokens.length; i++) {
-      integerSentence[i] = new Token(tokens[i]);
+      integerSentence[i] = new Token(tokens[i], config);
     }
 
-    return new Lattice<Token>(integerSentence);
+    return new Lattice<Token>(integerSentence, config);
   }
 
-  public static Lattice<Token> createTokenLatticeFromPLF(String data) {
+  public static Lattice<Token> createTokenLatticeFromPLF(String data, JoshuaConfiguration config) {
     ArrayList<Node<Token>> nodes = new ArrayList<Node<Token>>();
     
     // This matches a sequence of tuples, which describe arcs leaving this node
@@ -211,7 +214,7 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
 
         String remainingArcs = arcMatcher.group(4);
 
-        Token arcToken = new Token(arcLabel);
+        Token arcToken = new Token(arcLabel, config);
         currentNode.addArc(destinationNode, arcWeight, arcToken);
 
         arcMatcher = arcPattern.matcher(remainingArcs);
@@ -225,16 +228,16 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
     /* Add <s> to the start of the lattice. */
     if (nodes.size() > 1 && nodes.get(1) != null) {
       Node<Token> firstNode = nodes.get(1);
-      startNode.addArc(firstNode, 0.0f, new Token(Vocabulary.START_SYM));
+      startNode.addArc(firstNode, 0.0f, new Token(Vocabulary.START_SYM, config));
     }
 
     /* Add </s> as a final state, connect it to the previous end-state */
     nodeID = nodes.get(nodes.size()-1).getNumber() + 1;
     Node<Token> endNode = new Node<Token>(nodeID);
-    nodes.get(nodes.size()-1).addArc(endNode, 0.0f, new Token(Vocabulary.STOP_SYM));
+    nodes.get(nodes.size()-1).addArc(endNode, 0.0f, new Token(Vocabulary.STOP_SYM, config));
     nodes.add(endNode);
 
-    return new Lattice<Token>(nodes, latticeIsAmbiguous);
+    return new Lattice<Token>(nodes, latticeIsAmbiguous, config);
   }
 
   /**
@@ -243,7 +246,7 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
    * @param data String representation of a lattice.
    * @return A lattice that corresponds to the given string.
    */
-  public static Lattice<String> createStringLatticeFromString(String data) {
+  public static Lattice<String> createStringLatticeFromString(String data, JoshuaConfiguration config) {
 
     Map<Integer, Node<String>> nodes = new HashMap<Integer, Node<String>>();
 
@@ -303,7 +306,7 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
 
     logger.fine(nodeList.toString());
 
-    return new Lattice<String>(nodeList);
+    return new Lattice<String>(nodeList, config);
   }
 
   /**
@@ -431,7 +434,7 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
     nodes.get(2).addArc(nodes.get(3), 3.0f, "b");
     nodes.get(2).addArc(nodes.get(3), 5.0f, "c");
 
-    Lattice<String> graph = new Lattice<String>(nodes);
+    Lattice<String> graph = new Lattice<String>(nodes, null);
 
     System.out.println("Shortest path from 0 to 3: " + graph.getShortestPath(0, 3));
   }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/src/joshua/util/FormatUtils.java
----------------------------------------------------------------------
diff --git a/src/joshua/util/FormatUtils.java b/src/joshua/util/FormatUtils.java
index 3bd53e9..c196328 100644
--- a/src/joshua/util/FormatUtils.java
+++ b/src/joshua/util/FormatUtils.java
@@ -170,4 +170,23 @@ public class FormatUtils {
       return false;
     }
   }
+  
+  /**
+   * Determines if a string contains ALL CAPS
+   * 
+   * @param token
+   * @return true if the string is all in uppercase, false otherwise
+   */
+  public static boolean ISALLUPPERCASE(String token) {
+    for (int i = 0; i < token.length(); i++)
+      if (! Character.isUpperCase(token.charAt(i)))
+        return false;
+    return true;
+  }
+
+  public static String capitalize(String word) {
+    if (word == null || word.length() == 0)
+      return word;
+    return word.substring(0, 1).toUpperCase() + word.substring(1);
+  }
 }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/test/decoder/lowercaser/config
----------------------------------------------------------------------
diff --git a/test/decoder/lowercaser/config b/test/decoder/lowercaser/config
new file mode 100644
index 0000000..efa787e
--- /dev/null
+++ b/test/decoder/lowercaser/config
@@ -0,0 +1,140 @@
+# This file is a template for the Joshua pipeline; variables enclosed
+# in <angle-brackets> are substituted by the pipeline script as
+# appropriate.  This file also serves to document Joshua's many
+# parameters.
+
+# These are the grammar file specifications.  Joshua supports an
+# arbitrary number of grammar files, each specified on its own line
+# using the following format:
+#
+#   tm = TYPE OWNER LIMIT FILE
+# 
+# TYPE is "packed", "thrax", or "samt".  The latter denotes the format
+# used in Zollmann and Venugopal's SAMT decoder
+# (http://www.cs.cmu.edu/~zollmann/samt/).
+# 
+# OWNER is the "owner" of the rules in the grammar; this is used to
+# determine which set of phrasal features apply to the grammar's
+# rules.  Having different owners allows different features to be
+# applied to different grammars, and for grammars to share features
+# across files.
+#
+# LIMIT is the maximum input span permitted for the application of
+# grammar rules found in the grammar file.  A value of -1 implies no limit.
+#
+# FILE is the grammar file (or directory when using packed grammars).
+# The file can be compressed with gzip, which is determined by the
+# presence or absence of a ".gz" file extension.
+#
+# By a convention defined by Chiang (2007), the grammars are split
+# into two files: the main translation grammar containing all the
+# learned translation rules, and a glue grammar which supports
+# monotonic concatenation of hierarchical phrases. The glue grammar's
+# main distinction from the regular grammar is that the span limit
+# does not apply to it.  
+
+tm = hiero -maxspan 20 -path grammar.test -owner pt
+tm = thrax -path grammar.glue -maxspan -1 -owner glue
+
+# This symbol is used over unknown words in the source language
+
+default-non-terminal = X
+
+# This is the goal nonterminal, used to determine when a complete
+# parse is found.  It should correspond to the root-level rules in the
+# glue grammar.
+
+goal-symbol = GOAL
+
+# Language model config.
+#
+# Multiple language models are supported.  For each language model,
+# create one of the following lines:
+#
+# feature-function = LanguageModel -lm_type TYPE -lm_order ORDER -lm_file FILE
+# feature-function = StateMinimizingLanguageModel -lm_order ORDER -lm_file FILE
+#
+# - TYPE is one of "kenlm" or "berkeleylm"
+# - ORDER is the order of the language model (default 5)
+# - FILE is the path to the LM file. This can be binarized if appropriate to the type
+#   (e.g., KenLM has a compiled format)
+#
+# A state-minimizing LM collapses left-state. Currently only KenLM supports this.
+#
+# For each LM, add a weight lm_INDEX below, where indexing starts from 0.
+
+
+
+# The suffix _OOV is appended to unknown source-language words if this
+# is set to true.
+
+mark-oovs = false
+
+# The search algorithm: "cky" for hierarchical / phrase-based decoding, 
+# "stack" for phrase-based decoding
+search = cky
+
+# The pop-limit for decoding.  This determines how many hypotheses are
+# considered over each span of the input.
+
+pop-limit = 100
+
+# How many hypotheses to output
+
+top-n = 1
+
+# Whether those hypotheses should be distinct strings
+
+use-unique-nbest = true
+
+# This is the default format of the output printed to STDOUT.  The variables that can be
+# substituted are:
+#
+# %i: the sentence number (0-indexed)
+# %s: the translated sentence
+# %t: the derivation tree
+# %f: the feature string
+# %c: the model cost
+
+output-format = %s
+
+# When printing the trees (%t in 'output-format'), this controls whether the alignments
+# are also printed.
+
+include-align-index = false
+
+# And these are the feature functions to activate.
+feature-function = OOVPenalty
+feature-function = WordPenalty
+
+## Model weights #####################################################
+
+# For each language model line listed above, create a weight in the
+# following format: the keyword "lm", a 0-based index, and the weight.
+# lm_INDEX WEIGHT
+
+
+# The phrasal weights correspond to weights stored with each of the
+# grammar rules.  The format is
+#
+#   tm_OWNER_COLUMN WEIGHT
+#
+# where COLUMN denotes the 0-based order of the parameter in the
+# grammar file and WEIGHT is the corresponding weight.  In the future,
+# we plan to add a sparse feature representation which will simplify
+# this.
+
+# The wordpenalty feature counts the number of words in each hypothesis.
+
+
+# This feature counts the number of unknown words in the hypothesis.
+
+
+# This feature weights paths through an input lattice.  It is only activated
+# when decoding lattices.
+
+WordPenalty -4.72455379476569
+OOVPenalty 0.7897219562429866
+tm_pt_0 0.3137696816891433
+tm_glue_0 -0.04493059277470993
+

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/test/decoder/lowercaser/grammar.glue
----------------------------------------------------------------------
diff --git a/test/decoder/lowercaser/grammar.glue b/test/decoder/lowercaser/grammar.glue
new file mode 100644
index 0000000..69e1520
--- /dev/null
+++ b/test/decoder/lowercaser/grammar.glue
@@ -0,0 +1,4 @@
+[GOAL] ||| <s> ||| <s> ||| 0
+[GOAL] ||| [GOAL,1] [X,2] ||| [GOAL,1] [X,2] ||| -1
+[GOAL] ||| [GOAL,1] </s> ||| [GOAL,1] </s> ||| 0
+[GOAL] ||| <s> [X,1] </s> ||| <s> [X,1] </s> ||| 0

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/test/decoder/lowercaser/grammar.test
----------------------------------------------------------------------
diff --git a/test/decoder/lowercaser/grammar.test b/test/decoder/lowercaser/grammar.test
new file mode 100644
index 0000000..3745008
--- /dev/null
+++ b/test/decoder/lowercaser/grammar.test
@@ -0,0 +1 @@
+[X] ||| ella ||| she ||| 1 ||| 0-0

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/test/decoder/lowercaser/output.gold
----------------------------------------------------------------------
diff --git a/test/decoder/lowercaser/output.gold b/test/decoder/lowercaser/output.gold
new file mode 100644
index 0000000..0c9c1eb
--- /dev/null
+++ b/test/decoder/lowercaser/output.gold
@@ -0,0 +1,3 @@
+ELLA
+she
+SHE

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/test/decoder/lowercaser/test.sh
----------------------------------------------------------------------
diff --git a/test/decoder/lowercaser/test.sh b/test/decoder/lowercaser/test.sh
new file mode 100755
index 0000000..4db1251
--- /dev/null
+++ b/test/decoder/lowercaser/test.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+set -u
+
+(
+echo -e "ELLA" | $JOSHUA/bin/joshua-decoder -config config
+echo -e "Ella" | $JOSHUA/bin/joshua-decoder -config config -lowercase
+echo -e "ELLA" | $JOSHUA/bin/joshua-decoder -config config -lowercase
+) > output 2> .log
+
+diff -u output output.gold > diff
+
+if [ $? -eq 0 ]; then
+    rm -f log output diff
+    exit 0
+else
+    exit 1
+fi