You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/04/24 20:38:30 UTC
incubator-joshua git commit: Added -lowercase option to enable
source-side projection of case
Repository: incubator-joshua
Updated Branches:
refs/heads/master 4f2bec7c0 -> 3f4fa9928
Added -lowercase option to enable source-side projection of case
If you add -lowercase to Joshua, it will lowercase all input, adding an annotation to each token of the form:
lettercase = {lower, upper, all-upper}
Then, at output time, the source-side input case will be projected to the target side.
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/3f4fa992
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/3f4fa992
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/3f4fa992
Branch: refs/heads/master
Commit: 3f4fa992803fd9a7ac6dc3c51d803b65fda9d83d
Parents: 4f2bec7
Author: Matt Post <po...@cs.jhu.edu>
Authored: Sun Apr 24 14:38:26 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Sun Apr 24 14:38:26 2016 -0400
----------------------------------------------------------------------
src/joshua/decoder/JoshuaConfiguration.java | 7 +
.../decoder/hypergraph/KBestExtractor.java | 37 ++++-
.../hypergraph/WordAlignmentExtractor.java | 2 -
.../decoder/hypergraph/WordAlignmentState.java | 1 -
src/joshua/decoder/segment_file/Sentence.java | 8 +-
src/joshua/decoder/segment_file/Token.java | 26 +++-
src/joshua/lattice/Lattice.java | 31 ++--
src/joshua/util/FormatUtils.java | 19 +++
test/decoder/lowercaser/config | 140 +++++++++++++++++++
test/decoder/lowercaser/grammar.glue | 4 +
test/decoder/lowercaser/grammar.test | 1 +
test/decoder/lowercaser/output.gold | 3 +
test/decoder/lowercaser/test.sh | 18 +++
13 files changed, 271 insertions(+), 26 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/src/joshua/decoder/JoshuaConfiguration.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/JoshuaConfiguration.java b/src/joshua/decoder/JoshuaConfiguration.java
index c61720c..6c8edf6 100644
--- a/src/joshua/decoder/JoshuaConfiguration.java
+++ b/src/joshua/decoder/JoshuaConfiguration.java
@@ -51,6 +51,10 @@ public class JoshuaConfiguration {
// whether to construct a StructuredTranslation object for each request instead of
// printing to stdout. Used when the Decoder is used from Java directly.
public Boolean use_structured_output = false;
+
+ // If set to true, Joshua will lowercase the input, creating an annotation that marks the
+ // original case
+ public boolean lowercase = false;
// List of grammar files to read
public ArrayList<String> tms = new ArrayList<String>();
@@ -638,6 +642,9 @@ public class JoshuaConfiguration {
} else if (parameter.equals(normalize_key("cached-rules-size"))) {
// Check source sentence
cachedRuleSize = Integer.parseInt(fds[1]);
+ } else if (parameter.equals(normalize_key("lowercase"))) {
+ lowercase = true;
+
} else {
if (parameter.equals(normalize_key("use-sent-specific-tm"))
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/src/joshua/decoder/hypergraph/KBestExtractor.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/KBestExtractor.java b/src/joshua/decoder/hypergraph/KBestExtractor.java
index 42539cc..45b9ccb 100644
--- a/src/joshua/decoder/hypergraph/KBestExtractor.java
+++ b/src/joshua/decoder/hypergraph/KBestExtractor.java
@@ -42,6 +42,8 @@ import joshua.decoder.ff.state_maintenance.DPState;
import joshua.decoder.ff.tm.Rule;
import joshua.decoder.io.DeNormalize;
import joshua.decoder.segment_file.Sentence;
+import joshua.decoder.segment_file.Token;
+import joshua.util.FormatUtils;
/**
* This class implements lazy k-best extraction on a hyper-graph.
@@ -185,12 +187,12 @@ public class KBestExtractor {
.replaceAll("-lsb-", "[")
.replaceAll("-rsb-", "]")
.replaceAll("-pipe-", "|");
-
+
outputString = joshuaConfiguration.outputFormat
.replace("%k", Integer.toString(k))
- .replace("%s", hypothesis)
- .replace("%S", DeNormalize.processSingleLine(hypothesis))
+ .replace("%s", recapitalize(hypothesis, node))
+ .replace("%S", DeNormalize.processSingleLine(recapitalize(hypothesis, node)))
.replace("%i", Integer.toString(sentence.id()))
.replace("%f", joshuaConfiguration.moses ? features.mosesString() : features.toString())
.replace("%c", String.format("%.3f", derivationState.cost));
@@ -283,6 +285,35 @@ public class KBestExtractor {
}
return virtualNode;
}
+
+ private String recapitalize(String input, HGNode goalNode) {
+ WordAlignmentState alignment = ViterbiExtractor.buildViterbiAlignment(goalNode);
+
+ String[] tokens = input.split("\\s+");
+
+ List<List<Integer>> points = alignment.toFinalList();
+ for (int i = 0; i < points.size(); i++) {
+ List<Integer> target = points.get(i);
+ for (int source: target) {
+ Token token = sentence.getTokens().get(source + 1); // skip <s>
+ String annotation = "";
+ if (token != null && token.getAnnotation("lettercase") != null)
+ annotation = token.getAnnotation("lettercase");
+ if (source != 0 && annotation.equals("upper"))
+ tokens[i] = FormatUtils.capitalize(tokens[i]);
+ else if (annotation.equals("all-upper"))
+ tokens[i] = tokens[i].toUpperCase();
+ }
+ }
+
+ String cap = new String();
+ for (int i = 0; i < tokens.length; i++) {
+ if (i > 0)
+ cap += " ";
+ cap += tokens[i];
+ }
+ return cap;
+ }
/**
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java b/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java
index 63619ee..8e0c2a6 100644
--- a/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java
+++ b/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java
@@ -2,8 +2,6 @@ package joshua.decoder.hypergraph;
import java.util.Stack;
-import joshua.corpus.Vocabulary;
-import joshua.decoder.Decoder;
import joshua.decoder.ff.tm.Rule;
import joshua.decoder.hypergraph.KBestExtractor.DerivationState;
import joshua.decoder.hypergraph.KBestExtractor.DerivationVisitor;
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/src/joshua/decoder/hypergraph/WordAlignmentState.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/WordAlignmentState.java b/src/joshua/decoder/hypergraph/WordAlignmentState.java
index e3b9598..d47fa38 100644
--- a/src/joshua/decoder/hypergraph/WordAlignmentState.java
+++ b/src/joshua/decoder/hypergraph/WordAlignmentState.java
@@ -1,7 +1,6 @@
package joshua.decoder.hypergraph;
import java.util.ArrayList;
-import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/src/joshua/decoder/segment_file/Sentence.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/segment_file/Sentence.java b/src/joshua/decoder/segment_file/Sentence.java
index c1eeca8..b51d509 100644
--- a/src/joshua/decoder/segment_file/Sentence.java
+++ b/src/joshua/decoder/segment_file/Sentence.java
@@ -191,7 +191,7 @@ public class Sentence {
for (int i = 0; i <= chars.length - width; i++) {
int j = i + width;
if (width != chars.length) {
- Token token = new Token(word.substring(i, j));
+ Token token = new Token(word.substring(i, j), config);
if (vocabulary.contains(id)) {
nodes.get(i).addArc(nodes.get(j), 0.0f, token);
wordChart.set(i, j, true);
@@ -386,7 +386,7 @@ public class Sentence {
*/
public Lattice<String> stringLattice() {
assert isLinearChain();
- return Lattice.createStringLatticeFromString(source());
+ return Lattice.createStringLatticeFromString(source(), config);
}
public List<ConstraintSpan> constraints() {
@@ -400,10 +400,10 @@ public class Sentence {
System.err.println("* FATAL: lattice decoding currently not supported for stack-based search algorithm.");
System.exit(12);
}
- this.sourceLattice = Lattice.createTokenLatticeFromPLF(rawSource());
+ this.sourceLattice = Lattice.createTokenLatticeFromPLF(rawSource(), config);
} else
this.sourceLattice = Lattice.createTokenLatticeFromString(String.format("%s %s %s", Vocabulary.START_SYM,
- rawSource(), Vocabulary.STOP_SYM));
+ rawSource(), Vocabulary.STOP_SYM), config);
}
return this.sourceLattice;
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/src/joshua/decoder/segment_file/Token.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/segment_file/Token.java b/src/joshua/decoder/segment_file/Token.java
index 12e2b68..ebe9a43 100644
--- a/src/joshua/decoder/segment_file/Token.java
+++ b/src/joshua/decoder/segment_file/Token.java
@@ -23,6 +23,9 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
import joshua.corpus.Vocabulary;
+import joshua.decoder.Decoder;
+import joshua.decoder.JoshuaConfiguration;
+import joshua.util.FormatUtils;
/**
* Stores the identity of a word and its annotations in a sentence.
@@ -36,6 +39,7 @@ public class Token {
private int tokenID;
private HashMap<String,String> annotations = null;
+ private JoshuaConfiguration joshuaConfiguration;
/**
* Constructor : Creates a Token object from a raw word
@@ -58,7 +62,9 @@ public class Token {
* @param rawWord A word with annotation information (possibly)
*
*/
- public Token(String rawWord) {
+ public Token(String rawWord, JoshuaConfiguration config) {
+
+ this.joshuaConfiguration = config;
annotations = new HashMap<String,String>();
@@ -89,9 +95,21 @@ public class Token {
.replaceAll("\\]", "-rsb-")
.replaceAll("\\|", "-pipe-");
+ if (joshuaConfiguration != null && joshuaConfiguration.lowercase) {
+ if (FormatUtils.ISALLUPPERCASE(token))
+ annotations.put("lettercase", "all-upper");
+ else if (Character.isUpperCase(token.charAt(0)))
+ annotations.put("lettercase", "upper");
+ else
+ annotations.put("lettercase", "lower");
+
+ Decoder.LOG(2, String.format("TOKEN: %s -> %s (%s)", token, token.toLowerCase(), annotations.get("lettercase")));
+ token = token.toLowerCase();
+ }
+
tokenID = Vocabulary.id(token);
}
-
+
/**
* Returns the word ID (vocab ID) for this token
*
@@ -108,6 +126,10 @@ public class Token {
public String getWordIdentity() {
return token;
}
+
+ public String toString() {
+ return token;
+ }
/**
* Returns the annotationID (vocab ID)
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/src/joshua/lattice/Lattice.java
----------------------------------------------------------------------
diff --git a/src/joshua/lattice/Lattice.java b/src/joshua/lattice/Lattice.java
index abe43b2..bf2bf87 100644
--- a/src/joshua/lattice/Lattice.java
+++ b/src/joshua/lattice/Lattice.java
@@ -30,6 +30,7 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
import joshua.corpus.Vocabulary;
+import joshua.decoder.JoshuaConfiguration;
import joshua.decoder.segment_file.Token;
import joshua.util.ChartSpan;
@@ -61,6 +62,8 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
/** Logger for this class. */
private static final Logger logger = Logger.getLogger(Lattice.class.getName());
+
+ JoshuaConfiguration config = null;
/**
* Constructs a new lattice from an existing list of (connected) nodes.
@@ -70,13 +73,13 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
*
* @param nodes A list of nodes which must be in topological order.
*/
- public Lattice(List<Node<Value>> nodes) {
+ public Lattice(List<Node<Value>> nodes, JoshuaConfiguration config) {
this.nodes = nodes;
// this.distances = calculateAllPairsShortestPath();
this.latticeHasAmbiguity = true;
}
- public Lattice(List<Node<Value>> nodes, boolean isAmbiguous) {
+ public Lattice(List<Node<Value>> nodes, boolean isAmbiguous, JoshuaConfiguration config) {
// Node<Value> sink = new Node<Value>(nodes.size());
// nodes.add(sink);
this.nodes = nodes;
@@ -89,7 +92,7 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
*
* @param linearChain a sequence of Value objects
*/
- public Lattice(Value[] linearChain) {
+ public Lattice(Value[] linearChain, JoshuaConfiguration config) {
this.latticeHasAmbiguity = false;
this.nodes = new ArrayList<Node<Value>>();
@@ -140,17 +143,17 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
* @param linearChain
* @return Lattice representation of the linear chain.
*/
- public static Lattice<Token> createTokenLatticeFromString(String source) {
+ public static Lattice<Token> createTokenLatticeFromString(String source, JoshuaConfiguration config) {
String[] tokens = source.split("\\s+");
Token[] integerSentence = new Token[tokens.length];
for (int i = 0; i < tokens.length; i++) {
- integerSentence[i] = new Token(tokens[i]);
+ integerSentence[i] = new Token(tokens[i], config);
}
- return new Lattice<Token>(integerSentence);
+ return new Lattice<Token>(integerSentence, config);
}
- public static Lattice<Token> createTokenLatticeFromPLF(String data) {
+ public static Lattice<Token> createTokenLatticeFromPLF(String data, JoshuaConfiguration config) {
ArrayList<Node<Token>> nodes = new ArrayList<Node<Token>>();
// This matches a sequence of tuples, which describe arcs leaving this node
@@ -211,7 +214,7 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
String remainingArcs = arcMatcher.group(4);
- Token arcToken = new Token(arcLabel);
+ Token arcToken = new Token(arcLabel, config);
currentNode.addArc(destinationNode, arcWeight, arcToken);
arcMatcher = arcPattern.matcher(remainingArcs);
@@ -225,16 +228,16 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
/* Add <s> to the start of the lattice. */
if (nodes.size() > 1 && nodes.get(1) != null) {
Node<Token> firstNode = nodes.get(1);
- startNode.addArc(firstNode, 0.0f, new Token(Vocabulary.START_SYM));
+ startNode.addArc(firstNode, 0.0f, new Token(Vocabulary.START_SYM, config));
}
/* Add </s> as a final state, connect it to the previous end-state */
nodeID = nodes.get(nodes.size()-1).getNumber() + 1;
Node<Token> endNode = new Node<Token>(nodeID);
- nodes.get(nodes.size()-1).addArc(endNode, 0.0f, new Token(Vocabulary.STOP_SYM));
+ nodes.get(nodes.size()-1).addArc(endNode, 0.0f, new Token(Vocabulary.STOP_SYM, config));
nodes.add(endNode);
- return new Lattice<Token>(nodes, latticeIsAmbiguous);
+ return new Lattice<Token>(nodes, latticeIsAmbiguous, config);
}
/**
@@ -243,7 +246,7 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
* @param data String representation of a lattice.
* @return A lattice that corresponds to the given string.
*/
- public static Lattice<String> createStringLatticeFromString(String data) {
+ public static Lattice<String> createStringLatticeFromString(String data, JoshuaConfiguration config) {
Map<Integer, Node<String>> nodes = new HashMap<Integer, Node<String>>();
@@ -303,7 +306,7 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
logger.fine(nodeList.toString());
- return new Lattice<String>(nodeList);
+ return new Lattice<String>(nodeList, config);
}
/**
@@ -431,7 +434,7 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
nodes.get(2).addArc(nodes.get(3), 3.0f, "b");
nodes.get(2).addArc(nodes.get(3), 5.0f, "c");
- Lattice<String> graph = new Lattice<String>(nodes);
+ Lattice<String> graph = new Lattice<String>(nodes, null);
System.out.println("Shortest path from 0 to 3: " + graph.getShortestPath(0, 3));
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/src/joshua/util/FormatUtils.java
----------------------------------------------------------------------
diff --git a/src/joshua/util/FormatUtils.java b/src/joshua/util/FormatUtils.java
index 3bd53e9..c196328 100644
--- a/src/joshua/util/FormatUtils.java
+++ b/src/joshua/util/FormatUtils.java
@@ -170,4 +170,23 @@ public class FormatUtils {
return false;
}
}
+
+ /**
+ * Determines whether every character of a string is an uppercase letter (true for the empty string)
+ *
+ * @param token
+ * @return true if the string is all in uppercase, false otherwise
+ */
+ public static boolean ISALLUPPERCASE(String token) {
+ for (int i = 0; i < token.length(); i++)
+ if (! Character.isUpperCase(token.charAt(i)))
+ return false;
+ return true;
+ }
+
+ public static String capitalize(String word) {
+ if (word == null || word.length() == 0)
+ return word;
+ return word.substring(0, 1).toUpperCase() + word.substring(1);
+ }
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/test/decoder/lowercaser/config
----------------------------------------------------------------------
diff --git a/test/decoder/lowercaser/config b/test/decoder/lowercaser/config
new file mode 100644
index 0000000..efa787e
--- /dev/null
+++ b/test/decoder/lowercaser/config
@@ -0,0 +1,140 @@
+# This file is a template for the Joshua pipeline; variables enclosed
+# in <angle-brackets> are substituted by the pipeline script as
+# appropriate. This file also serves to document Joshua's many
+# parameters.
+
+# These are the grammar file specifications. Joshua supports an
+# arbitrary number of grammar files, each specified on its own line
+# using the following format:
+#
+# tm = TYPE OWNER LIMIT FILE
+#
+# TYPE is "packed", "thrax", or "samt". The latter denotes the format
+# used in Zollmann and Venugopal's SAMT decoder
+# (http://www.cs.cmu.edu/~zollmann/samt/).
+#
+# OWNER is the "owner" of the rules in the grammar; this is used to
+# determine which set of phrasal features apply to the grammar's
+# rules. Having different owners allows different features to be
+# applied to different grammars, and for grammars to share features
+# across files.
+#
+# LIMIT is the maximum input span permitted for the application of
+# grammar rules found in the grammar file. A value of -1 implies no limit.
+#
+# FILE is the grammar file (or directory when using packed grammars).
+# The file can be compressed with gzip, which is determined by the
+# presence or absence of a ".gz" file extension.
+#
+# By a convention defined by Chiang (2007), the grammars are split
+# into two files: the main translation grammar containing all the
+# learned translation rules, and a glue grammar which supports
+# monotonic concatenation of hierarchical phrases. The glue grammar's
+# main distinction from the regular grammar is that the span limit
+# does not apply to it.
+
+tm = hiero -maxspan 20 -path grammar.test -owner pt
+tm = thrax -path grammar.glue -maxspan -1 -owner glue
+
+# This symbol is used over unknown words in the source language
+
+default-non-terminal = X
+
+# This is the goal nonterminal, used to determine when a complete
+# parse is found. It should correspond to the root-level rules in the
+# glue grammar.
+
+goal-symbol = GOAL
+
+# Language model config.
+#
+# Multiple language models are supported. For each language model,
+# create one of the following lines:
+#
+# feature-function = LanguageModel -lm_type TYPE -lm_order ORDER -lm_file FILE
+# feature-function = StateMinimizingLanguageModel -lm_order ORDER -lm_file FILE
+#
+# - TYPE is one of "kenlm" or "berkeleylm"
+# - ORDER is the order of the language model (default 5)
+# - FILE is the path to the LM file. This can be binarized if appropriate to the type
+# (e.g., KenLM has a compiled format)
+#
+# A state-minimizing LM collapses left-state. Currently only KenLM supports this.
+#
+# For each LM, add a weight lm_INDEX below, where indexing starts from 0.
+
+
+
+# The suffix _OOV is appended to unknown source-language words if this
+# is set to true.
+
+mark-oovs = false
+
+# The search algorithm: "cky" for hierarchical / phrase-based decoding,
+# "stack" for phrase-based decoding
+search = cky
+
+# The pop-limit for decoding. This determines how many hypotheses are
+# considered over each span of the input.
+
+pop-limit = 100
+
+# How many hypotheses to output
+
+top-n = 1
+
+# Whether those hypotheses should be distinct strings
+
+use-unique-nbest = true
+
+# This is the default format of the output printed to STDOUT. The variables that can be
+# substituted are:
+#
+# %i: the sentence number (0-indexed)
+# %s: the translated sentence
+# %t: the derivation tree
+# %f: the feature string
+# %c: the model cost
+
+output-format = %s
+
+# When printing the trees (%t in 'output-format'), this controls whether the alignments
+# are also printed.
+
+include-align-index = false
+
+# And these are the feature functions to activate.
+feature-function = OOVPenalty
+feature-function = WordPenalty
+
+## Model weights #####################################################
+
+# For each language model line listed above, create a weight in the
+# following format: the keyword "lm", a 0-based index, and the weight.
+# lm_INDEX WEIGHT
+
+
+# The phrasal weights correspond to weights stored with each of the
+# grammar rules. The format is
+#
+# tm_OWNER_COLUMN WEIGHT
+#
+# where COLUMN denotes the 0-based order of the parameter in the
+# grammar file and WEIGHT is the corresponding weight. In the future,
+# we plan to add a sparse feature representation which will simplify
+# this.
+
+# The wordpenalty feature counts the number of words in each hypothesis.
+
+
+# This feature counts the number of unknown words in the hypothesis.
+
+
+# This feature weights paths through an input lattice. It is only activated
+# when decoding lattices.
+
+WordPenalty -4.72455379476569
+OOVPenalty 0.7897219562429866
+tm_pt_0 0.3137696816891433
+tm_glue_0 -0.04493059277470993
+
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/test/decoder/lowercaser/grammar.glue
----------------------------------------------------------------------
diff --git a/test/decoder/lowercaser/grammar.glue b/test/decoder/lowercaser/grammar.glue
new file mode 100644
index 0000000..69e1520
--- /dev/null
+++ b/test/decoder/lowercaser/grammar.glue
@@ -0,0 +1,4 @@
+[GOAL] ||| <s> ||| <s> ||| 0
+[GOAL] ||| [GOAL,1] [X,2] ||| [GOAL,1] [X,2] ||| -1
+[GOAL] ||| [GOAL,1] </s> ||| [GOAL,1] </s> ||| 0
+[GOAL] ||| <s> [X,1] </s> ||| <s> [X,1] </s> ||| 0
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/test/decoder/lowercaser/grammar.test
----------------------------------------------------------------------
diff --git a/test/decoder/lowercaser/grammar.test b/test/decoder/lowercaser/grammar.test
new file mode 100644
index 0000000..3745008
--- /dev/null
+++ b/test/decoder/lowercaser/grammar.test
@@ -0,0 +1 @@
+[X] ||| ella ||| she ||| 1 ||| 0-0
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/test/decoder/lowercaser/output.gold
----------------------------------------------------------------------
diff --git a/test/decoder/lowercaser/output.gold b/test/decoder/lowercaser/output.gold
new file mode 100644
index 0000000..0c9c1eb
--- /dev/null
+++ b/test/decoder/lowercaser/output.gold
@@ -0,0 +1,3 @@
+ELLA
+she
+SHE
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/test/decoder/lowercaser/test.sh
----------------------------------------------------------------------
diff --git a/test/decoder/lowercaser/test.sh b/test/decoder/lowercaser/test.sh
new file mode 100755
index 0000000..4db1251
--- /dev/null
+++ b/test/decoder/lowercaser/test.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+set -u
+
+(
+echo -e "ELLA" | $JOSHUA/bin/joshua-decoder -config config
+echo -e "Ella" | $JOSHUA/bin/joshua-decoder -config config -lowercase
+echo -e "ELLA" | $JOSHUA/bin/joshua-decoder -config config -lowercase
+) > output 2> .log
+
+diff -u output output.gold > diff
+
+if [ $? -eq 0 ]; then
+ rm -f log output diff
+ exit 0
+else
+ exit 1
+fi