You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/05/31 19:39:21 UTC
[1/5] incubator-joshua git commit: Added Sparse lexical feature
function. Revised various other sparse feature functions to avoid String
formatting. Expensive feature functions now use an LRU cache to avoid
re-calculation of feature hashes for commonly used rules.
Repository: incubator-joshua
Updated Branches:
refs/heads/JOSHUA-252 9e7026665 -> 8793c45d7
Added Sparse lexical feature function. Revised various other sparse feature functions to avoid String formatting. Expensive feature functions now use an LRU cache to avoid re-calculation of feature hashes for commonly used rules. Also cleaned up the feature string parsing a little bit.
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/25a92cbc
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/25a92cbc
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/25a92cbc
Branch: refs/heads/JOSHUA-252
Commit: 25a92cbca7c3a11c1d99c3e71686aea9874e0133
Parents: fadc285
Author: Felix Hieber <fh...@amazon.com>
Authored: Sat Apr 30 09:35:10 2016 -0700
Committer: Felix Hieber <fh...@amazon.com>
Committed: Mon May 30 11:44:51 2016 +0200
----------------------------------------------------------------------
src/joshua/corpus/Vocabulary.java | 13 +-
src/joshua/decoder/Decoder.java | 29 ++--
src/joshua/decoder/JoshuaConfiguration.java | 10 +-
src/joshua/decoder/ff/LexicalFeatures.java | 131 +++++++++++++++++++
src/joshua/decoder/ff/OOVPenalty.java | 15 ++-
src/joshua/decoder/ff/RuleFF.java | 110 ++++++++++------
src/joshua/decoder/ff/RuleLength.java | 13 +-
src/joshua/decoder/ff/RuleShape.java | 66 +++++++---
src/joshua/decoder/ff/WordPenalty.java | 10 +-
.../lm/berkeley_lm/LMGrammarBerkeleyTest.java | 2 +-
.../system/MultithreadedTranslationTests.java | 2 +-
.../system/StructuredTranslationTest.java | 2 +-
12 files changed, 301 insertions(+), 102 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/25a92cbc/src/joshua/corpus/Vocabulary.java
----------------------------------------------------------------------
diff --git a/src/joshua/corpus/Vocabulary.java b/src/joshua/corpus/Vocabulary.java
index 74f6a47..2193629 100644
--- a/src/joshua/corpus/Vocabulary.java
+++ b/src/joshua/corpus/Vocabulary.java
@@ -205,10 +205,17 @@ public class Vocabulary {
}
public static String getWords(int[] ids) {
- if (ids.length == 0) return "";
+ return getWords(ids, " ");
+ }
+
+ public static String getWords(int[] ids, final String separator) {
+ if (ids.length == 0) {
+ return "";
+ }
StringBuilder sb = new StringBuilder();
- for (int i = 0; i < ids.length - 1; i++)
- sb.append(word(ids[i])).append(" ");
+ for (int i = 0; i < ids.length - 1; i++) {
+ sb.append(word(ids[i])).append(separator);
+ }
return sb.append(word(ids[ids.length - 1])).toString();
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/25a92cbc/src/joshua/decoder/Decoder.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/Decoder.java b/src/joshua/decoder/Decoder.java
index 22ed8b9..97ac9aa 100644
--- a/src/joshua/decoder/Decoder.java
+++ b/src/joshua/decoder/Decoder.java
@@ -20,7 +20,7 @@ package joshua.decoder;
import static joshua.decoder.ff.FeatureVector.DENSE_FEATURE_NAMES;
-import java.io.BufferedWriter;
+import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
@@ -34,8 +34,6 @@ import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
-import com.google.common.base.Strings;
-
import joshua.corpus.Vocabulary;
import joshua.decoder.ff.FeatureVector;
import joshua.decoder.JoshuaConfiguration.INPUT_TYPE;
@@ -59,6 +57,8 @@ import joshua.util.FormatUtils;
import joshua.util.Regex;
import joshua.util.io.LineReader;
+import com.google.common.base.Strings;
+
/**
* This class handles decoder initialization and the complication introduced by multithreading.
*
@@ -914,7 +914,7 @@ public class Decoder {
* Feature functions are instantiated with a line of the form
*
* <pre>
- * feature_function = FEATURE OPTIONS
+ * FEATURE OPTIONS
* </pre>
*
* Weights for features are listed separately.
@@ -926,31 +926,26 @@ public class Decoder {
private void initializeFeatureFunctions() throws IOException {
for (String featureLine : joshuaConfiguration.features) {
- // feature-function = NAME args
+ // line starts with NAME, followed by args
// 1. create new class named NAME, pass it config, weights, and the args
- // Get rid of the leading crap.
- featureLine = featureLine.replaceFirst("^feature_function\\s*=\\s*", "");
-
String fields[] = featureLine.split("\\s+");
String featureName = fields[0];
+
try {
+
Class<?> clas = getClass(featureName);
Constructor<?> constructor = clas.getConstructor(FeatureVector.class,
String[].class, JoshuaConfiguration.class);
- this.featureFunctions.add((FeatureFunction) constructor.newInstance(weights, fields, joshuaConfiguration));
+ FeatureFunction feature = (FeatureFunction) constructor.newInstance(weights, fields, joshuaConfiguration);
+ Decoder.LOG(1, String.format("FEATURE: %s", feature.logString()));
+ this.featureFunctions.add(feature);
+
} catch (Exception e) {
- e.printStackTrace();
- System.err.println("* FATAL: could not find a feature '" + featureName + "'");
- System.exit(1);
+ throw new RuntimeException(String.format("Unable to instantiate feature function '%s'!", featureLine), e);
}
}
- for (FeatureFunction feature : featureFunctions) {
- Decoder.LOG(1, String.format("FEATURE: %s", feature.logString()));
-
- }
-
weights.registerDenseFeatures(featureFunctions);
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/25a92cbc/src/joshua/decoder/JoshuaConfiguration.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/JoshuaConfiguration.java b/src/joshua/decoder/JoshuaConfiguration.java
index c874534..05197e5 100644
--- a/src/joshua/decoder/JoshuaConfiguration.java
+++ b/src/joshua/decoder/JoshuaConfiguration.java
@@ -390,21 +390,21 @@ public class JoshuaConfiguration {
*
* LMs are now loaded as general feature functions, so we transform that to either
*
- * feature-function = LanguageModel -lm_order 5 -lm_type berkeleylm -lm_file lm.gz
+ * LanguageModel -lm_order 5 -lm_type berkeleylm -lm_file lm.gz
*
* If the line were state minimizing:
*
* lm = kenlm 5 true false 100 lm.gz
*
- * feature-function = StateMinimizingLanguageModel -lm_order 5 -lm_file lm.gz
+ * StateMinimizingLanguageModel -lm_order 5 -lm_file lm.gz
*/
String[] tokens = fds[1].split("\\s+");
if (tokens[2].equals("true"))
- features.add(String.format("feature_function = StateMinimizingLanguageModel -lm_type kenlm -lm_order %s -lm_file %s",
+ features.add(String.format("StateMinimizingLanguageModel -lm_type kenlm -lm_order %s -lm_file %s",
tokens[1], tokens[5]));
else
- features.add(String.format("feature_function = LanguageModel -lm_type %s -lm_order %s -lm_file %s",
+ features.add(String.format("LanguageModel -lm_type %s -lm_order %s -lm_file %s",
tokens[0], tokens[1], tokens[5]));
} else if (parameter.equals(normalize_key("tm"))) {
@@ -582,7 +582,7 @@ public class JoshuaConfiguration {
} else if (parameter.equals(normalize_key("feature-function"))) {
// add the feature to the list of features for later processing
- features.add("feature_function = " + fds[1]);
+ features.add(fds[1]);
} else if (parameter.equals(normalize_key("maxlen"))) {
// add the feature to the list of features for later processing
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/25a92cbc/src/joshua/decoder/ff/LexicalFeatures.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/LexicalFeatures.java b/src/joshua/decoder/ff/LexicalFeatures.java
new file mode 100644
index 0000000..128df87
--- /dev/null
+++ b/src/joshua/decoder/ff/LexicalFeatures.java
@@ -0,0 +1,131 @@
+package joshua.decoder.ff;
+
+import static com.google.common.cache.CacheBuilder.newBuilder;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import joshua.corpus.Vocabulary;
+import joshua.decoder.JoshuaConfiguration;
+import joshua.decoder.chart_parser.SourcePath;
+import joshua.decoder.ff.state_maintenance.DPState;
+import joshua.decoder.ff.tm.Rule;
+import joshua.decoder.hypergraph.HGNode;
+import joshua.decoder.segment_file.Sentence;
+
+import com.google.common.cache.Cache;
+
+/**
+ * Lexical alignment features denoting alignments, deletions, and insertions.
+ */
+public class LexicalFeatures extends StatelessFF {
+
+ private final boolean useAlignments;
+ private final boolean useDeletions;
+ private final boolean useInsertions;
+
+ private static final String NAME = "LexicalFeatures";
+ // value to fire for features
+ private static final int VALUE = 1;
+ //whether this feature is restricted to a certain grammar/owner
+ private final boolean ownerRestriction;
+ // the grammar/owner this feature is restricted to fire
+ private final int owner;
+ // Strings separating words
+ private static final String SEPARATOR = "~";
+
+ private final Cache<Rule, List<String>> featureCache;
+
+ public LexicalFeatures(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+ super(weights, NAME, args, config);
+
+ ownerRestriction = (parsedArgs.containsKey("owner")) ? true : false;
+ owner = ownerRestriction ? Vocabulary.id(parsedArgs.get("owner")) : 0;
+
+ useAlignments = parsedArgs.containsKey("alignments");
+ useDeletions = parsedArgs.containsKey("deletions");
+ useInsertions = parsedArgs.containsKey("insertions");
+
+ // initialize cache
+ if (parsedArgs.containsKey("cacheSize")) {
+ featureCache = newBuilder().maximumSize(Integer.parseInt(parsedArgs.get("cacheSize"))).build();
+ } else {
+ featureCache = newBuilder().maximumSize(config.cachedRuleSize).build();
+ }
+ }
+
+ @Override
+ public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+ Sentence sentence, Accumulator acc) {
+
+ if (ownerRestriction && rule.getOwner() != owner) {
+ return null;
+ }
+
+ List<String> featureNames = featureCache.getIfPresent(rule);
+ if (featureNames == null) {
+ featureNames = getFeatures(rule);
+ featureCache.put(rule, featureNames);
+ }
+ for (String feature : featureNames) {
+ acc.add(feature, VALUE);
+ }
+
+ return null;
+ }
+
+ /**
+ * Obtains the feature names for the given rule.
+ * @param rule the rule to extract lexical features from
+ * @return list of feature names for the rule (empty if the rule has no alignment).
+ */
+ private List<String> getFeatures(final Rule rule) {
+ final List<String> result = new ArrayList<>();
+
+ byte[] alignments = rule.getAlignment();
+ if (alignments == null) {
+ return result;
+ }
+ int[] sourceWords = rule.getFrench();
+ int[] targetWords = rule.getEnglish();
+
+ // sourceAligned & targetAligned indicate whether an index is covered by alignments
+ boolean[] sourceAligned = new boolean[sourceWords.length];
+ boolean[] targetAligned = new boolean[targetWords.length];
+
+ // translations: aligned words
+ for (int i = 0; i < alignments.length; i+=2) {
+ byte sourceIndex = alignments[i];
+ byte targetIndex = alignments[i + 1];
+ sourceAligned[sourceIndex] = true;
+ targetAligned[targetIndex] = true;
+ if (useAlignments) {
+ result.add(
+ "T:" +
+ Vocabulary.word(sourceWords[sourceIndex]) +
+ SEPARATOR +
+ Vocabulary.word(targetWords[targetIndex]));
+ }
+ }
+
+ // deletions: unaligned source words
+ if (useDeletions) {
+ for (int i = 0; i < sourceAligned.length; i++) {
+ if (!sourceAligned[i] && !Vocabulary.nt(sourceWords[i])) {
+ result.add("D:" + Vocabulary.word(sourceWords[i]));
+ }
+ }
+ }
+
+ // insertions: unaligned target words
+ if (useInsertions) {
+ for (int i = 0; i < targetAligned.length; i++) {
+ if (useInsertions && !targetAligned[i] && !Vocabulary.nt(targetWords[i])) {
+ result.add("I:" + Vocabulary.word(targetWords[i]));
+ }
+ }
+ }
+
+ return result;
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/25a92cbc/src/joshua/decoder/ff/OOVPenalty.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/OOVPenalty.java b/src/joshua/decoder/ff/OOVPenalty.java
index 6a06548..47a83ef 100644
--- a/src/joshua/decoder/ff/OOVPenalty.java
+++ b/src/joshua/decoder/ff/OOVPenalty.java
@@ -42,11 +42,11 @@ import joshua.decoder.chart_parser.SourcePath;
* @author Matt Post <po...@cs.jhu.edu>
*/
public class OOVPenalty extends StatelessFF {
- private int ownerID = -1;
+ private final int ownerID;
/* The default value returned for OOVs. Can be overridden with -oov-list */
- private float defaultValue = -100f;
- private HashMap<Integer,Float> oovWeights = null;
+ private final float defaultValue = -100f;
+ private final HashMap<Integer,Float> oovWeights;
public OOVPenalty(FeatureVector weights, String[] args, JoshuaConfiguration config) {
super(weights, "OOVPenalty", args, config);
@@ -54,16 +54,18 @@ public class OOVPenalty extends StatelessFF {
ownerID = Vocabulary.id("oov");
oovWeights = new HashMap<Integer,Float>();
- if (config.oovList != null)
- for (OOVItem item: config.oovList)
+ if (config.oovList != null) {
+ for (OOVItem item: config.oovList) {
oovWeights.put(Vocabulary.id(item.label), item.weight);
+ }
+ }
}
@Override
public ArrayList<String> reportDenseFeatures(int index) {
denseFeatureIndex = index;
- ArrayList<String> names = new ArrayList<String>();
+ ArrayList<String> names = new ArrayList<>(1);
names.add(name);
return names;
}
@@ -78,7 +80,6 @@ public class OOVPenalty extends StatelessFF {
Sentence sentence, Accumulator acc) {
if (rule != null && this.ownerID == rule.getOwner()) {
-// acc.add(name, getValue(rule.getLHS()));
acc.add(denseFeatureIndex, getValue(rule.getLHS()));
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/25a92cbc/src/joshua/decoder/ff/RuleFF.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/RuleFF.java b/src/joshua/decoder/ff/RuleFF.java
index 9fb7d3e..48e4340 100644
--- a/src/joshua/decoder/ff/RuleFF.java
+++ b/src/joshua/decoder/ff/RuleFF.java
@@ -18,6 +18,9 @@
*/
package joshua.decoder.ff;
+import static com.google.common.cache.CacheBuilder.newBuilder;
+import static joshua.corpus.Vocabulary.getWords;
+
import java.util.List;
import joshua.corpus.Vocabulary;
@@ -28,61 +31,94 @@ import joshua.decoder.ff.tm.Rule;
import joshua.decoder.hypergraph.HGNode;
import joshua.decoder.segment_file.Sentence;
+import com.google.common.cache.Cache;
+
/**
- * This feature just counts rules that are used. You can restrict it with a number of flags:
- *
- * -owner OWNER
- * Only count rules owned by OWNER
- * -target|-source
- * Only count the target or source side (plus the LHS)
- *
- * TODO: add an option to separately provide a list of rule counts, restrict to counts above a threshold.
+ * This feature fires for rule ids.
+ * Firing can be restricted to rules from a certain owner, and rule ids
+ * can be generated from source side and/or target side.
*/
public class RuleFF extends StatelessFF {
private enum Sides { SOURCE, TARGET, BOTH };
- private int owner = 0;
- private Sides sides = Sides.BOTH;
+ private static final String NAME = "RuleFF";
+ // value to fire for features
+ private static final int VALUE = 1;
+ // whether this feature is restricted to a certain grammar/owner
+ private final boolean ownerRestriction;
+ // the grammar/owner this feature is restricted to fire
+ private final int owner;
+ // what part of the rule should be extracted;
+ private final Sides sides;
+ // Strings separating words and rule sides
+ private static final String SEPARATOR = "~";
+ private static final String SIDES_SEPARATOR = "->";
+
+ private final Cache<Rule, String> featureCache;
public RuleFF(FeatureVector weights, String[] args, JoshuaConfiguration config) {
- super(weights, "RuleFF", args, config);
+ super(weights, NAME, args, config);
+
+ ownerRestriction = (parsedArgs.containsKey("owner")) ? true : false;
+ owner = ownerRestriction ? Vocabulary.id(parsedArgs.get("owner")) : 0;
- owner = Vocabulary.id(parsedArgs.get("owner"));
- if (parsedArgs.containsKey("source"))
- sides = Sides.SOURCE;
- else if (parsedArgs.containsKey("target"))
- sides = Sides.TARGET;
+ if (parsedArgs.containsKey("sides")) {
+ final String sideValue = parsedArgs.get("sides");
+ if (sideValue.equalsIgnoreCase("source")) {
+ sides = Sides.SOURCE;
+ } else if (sideValue.equalsIgnoreCase("target")) {
+ sides = Sides.TARGET;
+ } else if (sideValue.equalsIgnoreCase("both")){
+ sides = Sides.BOTH;
+ } else {
+ throw new RuntimeException("Unknown side value.");
+ }
+ } else {
+ sides = Sides.BOTH;
+ }
+
+ // initialize cache
+ if (parsedArgs.containsKey("cacheSize")) {
+ featureCache = newBuilder().maximumSize(Integer.parseInt(parsedArgs.get("cacheSize"))).build();
+ } else {
+ featureCache = newBuilder().maximumSize(config.cachedRuleSize).build();
+ }
}
@Override
public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
Sentence sentence, Accumulator acc) {
-
- if (owner > 0 && rule.getOwner() == owner) {
- String ruleString = getRuleString(rule);
- acc.add(ruleString, 1);
+
+ if (ownerRestriction && rule.getOwner() != owner) {
+ return null;
}
+ String featureName = featureCache.getIfPresent(rule);
+ if (featureName == null) {
+ featureName = getRuleString(rule);
+ featureCache.put(rule, featureName);
+ }
+ acc.add(featureName, VALUE);
+
return null;
}
-
- private String getRuleString(Rule rule) {
- String ruleString = "";
- switch(sides) {
- case BOTH:
- ruleString = String.format("%s %s %s", Vocabulary.word(rule.getLHS()), rule.getFrenchWords(),
- rule.getEnglishWords());
- break;
-
- case SOURCE:
- ruleString = String.format("%s %s", Vocabulary.word(rule.getLHS()), rule.getFrenchWords());
- break;
-
- case TARGET:
- ruleString = String.format("%s %s", Vocabulary.word(rule.getLHS()), rule.getEnglishWords());
- break;
+
+ /**
+ * Obtains the feature name for the given rule.
+ * @param rule the rule to build the feature name from
+ * @return String representing the feature name.
+ */
+ private String getRuleString(final Rule rule) {
+ final StringBuilder sb = new StringBuilder(Vocabulary.word(rule.getLHS()))
+ .append(SIDES_SEPARATOR);
+ if (sides == Sides.SOURCE || sides == Sides.BOTH) {
+ sb.append(getWords(rule.getFrench(), SEPARATOR));
+ }
+ sb.append(SIDES_SEPARATOR);
+ if (sides == Sides.TARGET || sides == Sides.BOTH) {
+ sb.append(getWords(rule.getEnglish(), SEPARATOR));
}
- return ruleString.replaceAll("[ =]", "~");
+ return sb.toString();
}
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/25a92cbc/src/joshua/decoder/ff/RuleLength.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/RuleLength.java b/src/joshua/decoder/ff/RuleLength.java
index 645905a..ce02282 100644
--- a/src/joshua/decoder/ff/RuleLength.java
+++ b/src/joshua/decoder/ff/RuleLength.java
@@ -32,6 +32,8 @@ import joshua.decoder.segment_file.Sentence;
* source side, its target side, and a feature that pairs them.
*/
public class RuleLength extends StatelessFF {
+
+ private static final int VALUE = 1;
public RuleLength(FeatureVector weights, String[] args, JoshuaConfiguration config) {
super(weights, "RuleLength", args, config);
@@ -40,12 +42,11 @@ public class RuleLength extends StatelessFF {
@Override
public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
Sentence sentence, Accumulator acc) {
- int sourceLen = rule.getFrench().length;
- int targetLen = rule.getEnglish().length;
- acc.add(String.format("%s_sourceLength%d", name, sourceLen), 1);
- acc.add(String.format("%s_targetLength%d", name, targetLen), 1);
- acc.add(String.format("%s_pairLength%d-%d", name, sourceLen, targetLen), 1);
-
+ int sourceLength = rule.getFrench().length;
+ int targetLength = rule.getEnglish().length;
+ acc.add(name + "_source" + sourceLength, VALUE);
+ acc.add(name + "_target" + targetLength, VALUE); // target-side feature must use the target length
+ acc.add(name + "_sourceTarget" + sourceLength + "-" + targetLength, VALUE);
return null;
}
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/25a92cbc/src/joshua/decoder/ff/RuleShape.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/RuleShape.java b/src/joshua/decoder/ff/RuleShape.java
index e243528..3bd10a8 100644
--- a/src/joshua/decoder/ff/RuleShape.java
+++ b/src/joshua/decoder/ff/RuleShape.java
@@ -20,6 +20,7 @@ package joshua.decoder.ff;
import java.util.List;
+import joshua.corpus.Vocabulary;
import joshua.decoder.JoshuaConfiguration;
import joshua.decoder.chart_parser.SourcePath;
import joshua.decoder.ff.state_maintenance.DPState;
@@ -36,38 +37,63 @@ public class RuleShape extends StatelessFF {
super(weights, "RuleShape", args, config);
}
- private int gettype(int id) {
- if (id < 0)
- return -1;
- return 1;
+ /** Word classes that make up a rule pattern; each carries its pattern symbol. */
+ private enum WordType {
+ N("N"), T("x"), P("+");
+
+ private final String string;
+
+ private WordType(final String string) {
+ this.string = string;
+ }
+
+ /**
+ * Renders this type, appending "+" when it covers a run of several words.
+ * The flag is passed in rather than stored: enum constants are shared
+ * singletons, so per-pattern state must not live on them.
+ */
+ private String render(final boolean repeats) {
+ return repeats ? string + "+" : string;
+ }
+ }
+
+ /** Maps an id to N when Vocabulary.nt(id) holds, and to T otherwise. */
+ private WordType getWordType(int id) {
+ if (Vocabulary.nt(id)) {
+ return WordType.N;
+ } else {
+ return WordType.T;
+ }
}
- private String pattern(int[] ids) {
- StringBuilder pattern = new StringBuilder();
- int curtype = gettype(ids[0]);
- int curcount = 1;
+ /**
+ * Returns a String describing the rule pattern.
+ */
+ private String getRulePattern(int[] ids) {
+ final StringBuilder pattern = new StringBuilder();
+ WordType currentType = getWordType(ids[0]);
+ boolean repeats = false;
for (int i = 1; i < ids.length; i++) {
- if (gettype(ids[i]) != curtype) {
- pattern.append(String.format("%s%s_", curtype < 0 ? "N" : "x", curcount > 1 ? "+" : ""));
- curtype = gettype(ids[i]);
- curcount = 1;
+ if (getWordType(ids[i]) != currentType) {
+ pattern.append(currentType.render(repeats));
+ currentType = getWordType(ids[i]);
+ repeats = false;
} else {
- curcount++;
+ repeats = true;
}
}
- pattern.append(String.format("%s%s_", curtype < 0 ? "N" : "x", curcount > 1 ? "+" : ""));
+ pattern.append(currentType.render(repeats));
return pattern.toString();
}
@Override
public DPState compute(Rule rule, List<HGNode> tailNodes, int i_, int j, SourcePath sourcePath,
Sentence sentence, Accumulator acc) {
- String sourceShape = pattern(rule.getFrench());
- String targetShape = pattern(rule.getEnglish());
- acc.add(String.format("%s_source_%s", name, sourceShape), 1);
- acc.add(String.format("%s_target_%s", name, targetShape), 1);
- acc.add(String.format("%s_both_%s__%s", name, sourceShape, targetShape), 1);
-
+ final String sourceShape = getRulePattern(rule.getFrench());
+ final String targetShape = getRulePattern(rule.getEnglish());
+ acc.add(name + "_source_" + sourceShape, 1);
+ acc.add(name + "_target_" + targetShape, 1); // target-side feature must use the target shape
+ acc.add(name + "_sourceTarget_" + sourceShape + "_" + targetShape, 1);
return null;
}
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/25a92cbc/src/joshua/decoder/ff/WordPenalty.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/WordPenalty.java b/src/joshua/decoder/ff/WordPenalty.java
index 583b59c..d72a4e6 100644
--- a/src/joshua/decoder/ff/WordPenalty.java
+++ b/src/joshua/decoder/ff/WordPenalty.java
@@ -37,12 +37,15 @@ import joshua.decoder.segment_file.Sentence;
public final class WordPenalty extends StatelessFF {
private float OMEGA = -(float) Math.log10(Math.E); // -0.435
+ private final boolean isCky;
public WordPenalty(final FeatureVector weights, String[] args, JoshuaConfiguration config) {
super(weights, "WordPenalty", args, config);
if (parsedArgs.containsKey("value"))
OMEGA = Float.parseFloat(parsedArgs.get("value"));
+
+ isCky = config.search_algorithm.equals("cky");
}
@Override
@@ -52,10 +55,9 @@ public final class WordPenalty extends StatelessFF {
if (rule != null) {
// TODO: this is an inefficient way to do this. Find a better way to not apply this rule
// to start and stop glue rules when phrase-based decoding.
- if (config.search_algorithm.equals("cky")
- || (rule != Hypothesis.BEGIN_RULE && rule != Hypothesis.END_RULE))
- // acc.add(name, OMEGA * (rule.getEnglish().length - rule.getArity()));
+ if (isCky || (rule != Hypothesis.BEGIN_RULE && rule != Hypothesis.END_RULE)) {
acc.add(denseFeatureIndex, OMEGA * (rule.getEnglish().length - rule.getArity()));
+ }
}
return null;
@@ -64,7 +66,7 @@ public final class WordPenalty extends StatelessFF {
@Override
public ArrayList<String> reportDenseFeatures(int index) {
denseFeatureIndex = index;
- ArrayList<String> names = new ArrayList<String>();
+ ArrayList<String> names = new ArrayList<>(1);
names.add(name);
return names;
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/25a92cbc/tst/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java
----------------------------------------------------------------------
diff --git a/tst/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java b/tst/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java
index 6e0d90f..0a29646 100644
--- a/tst/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java
+++ b/tst/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java
@@ -49,7 +49,7 @@ public class LMGrammarBerkeleyTest {
public void verifyLM() {
joshuaConfig = new JoshuaConfiguration();
joshuaConfig.processCommandLineOptions(OPTIONS);
- joshuaConfig.features.add("feature_function = LanguageModel -lm_type berkeleylm -lm_order 2 -lm_file " + lmFile);
+ joshuaConfig.features.add("LanguageModel -lm_type berkeleylm -lm_order 2 -lm_file " + lmFile);
decoder = new Decoder(joshuaConfig, null);
String translation = decode(INPUT).toString();
assertEquals(lmFile, "tm_glue_0=2.000 lm_0=-7.153\n", translation);
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/25a92cbc/tst/joshua/system/MultithreadedTranslationTests.java
----------------------------------------------------------------------
diff --git a/tst/joshua/system/MultithreadedTranslationTests.java b/tst/joshua/system/MultithreadedTranslationTests.java
index f438ccd..220bced 100644
--- a/tst/joshua/system/MultithreadedTranslationTests.java
+++ b/tst/joshua/system/MultithreadedTranslationTests.java
@@ -64,7 +64,7 @@ public class MultithreadedTranslationTests {
joshuaConfig.tms.add("thrax -owner glue -maxspan -1 -path resources/grammar.glue");
joshuaConfig.goal_symbol = "[GOAL]";
joshuaConfig.default_non_terminal = "[X]";
- joshuaConfig.features.add("feature_function = OOVPenalty");
+ joshuaConfig.features.add("OOVPenalty");
joshuaConfig.weights.add("tm_pt_0 1");
joshuaConfig.weights.add("tm_pt_1 1");
joshuaConfig.weights.add("tm_pt_2 1");
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/25a92cbc/tst/joshua/system/StructuredTranslationTest.java
----------------------------------------------------------------------
diff --git a/tst/joshua/system/StructuredTranslationTest.java b/tst/joshua/system/StructuredTranslationTest.java
index 0608a65..249eabf 100644
--- a/tst/joshua/system/StructuredTranslationTest.java
+++ b/tst/joshua/system/StructuredTranslationTest.java
@@ -85,7 +85,7 @@ public class StructuredTranslationTest {
joshuaConfig.tms.add("thrax -owner glue -maxspan -1 -path resources/grammar.glue");
joshuaConfig.goal_symbol = "[GOAL]";
joshuaConfig.default_non_terminal = "[X]";
- joshuaConfig.features.add("feature_function = OOVPenalty");
+ joshuaConfig.features.add("OOVPenalty");
joshuaConfig.weights.add("tm_pt_0 1");
joshuaConfig.weights.add("tm_pt_1 1");
joshuaConfig.weights.add("tm_pt_2 1");
[2/5] incubator-joshua git commit: revert change to ivy.xml
Posted by mj...@apache.org.
revert change to ivy.xml
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/5591c676
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/5591c676
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/5591c676
Branch: refs/heads/JOSHUA-252
Commit: 5591c6769c162e3268243aa3324c367c6ba9c945
Parents: 25a92cb
Author: Felix Hieber <fh...@amazon.com>
Authored: Mon May 30 11:54:53 2016 +0200
Committer: Felix Hieber <fh...@amazon.com>
Committed: Mon May 30 11:56:00 2016 +0200
----------------------------------------------------------------------
lib/ivy.xml | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5591c676/lib/ivy.xml
----------------------------------------------------------------------
diff --git a/lib/ivy.xml b/lib/ivy.xml
index 66034c6..d41595d 100644
--- a/lib/ivy.xml
+++ b/lib/ivy.xml
@@ -1,11 +1,12 @@
<ivy-module version="2.0">
<info organisation="joshua" module="joshua"/>
<dependencies>
+ <dependency org="net.sourceforge.ant-doxygen" name="ant-doxygen" rev="1.6.1" />
<dependency org="net.sf.jung" name="jung-algorithms" rev="2.0"/>
<dependency org="net.sf.jung" name="jung-api" rev="2.0"/>
<dependency org="net.sf.jung" name="jung-graph-impl" rev="2.0"/>
<dependency org="net.sf.jung" name="jung-visualization" rev="2.0"/>
- <dependency org="org.apache.commons" name="commons-cli" rev="1.3.1"/>
+ <dependency org="org.apache.commons" name="commons-cli" rev="1.2"/>
<dependency org="org.testng" name="testng" rev="6.7"/>
<dependency org="junit" name="junit" rev="4.10" />
<dependency org="net.sourceforge.collections" name="collections-generic" rev="4.01"/>
[3/5] incubator-joshua git commit: Merge branch 'sparse' of
https://github.com/fhieber/incubator-joshua into JOSHUA-PR21
Posted by mj...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5c0d5388/src/main/java/org/apache/joshua/decoder/ff/RuleFF.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/decoder/ff/RuleFF.java
index bc6d67b,0000000..20f91ee
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/decoder/ff/RuleFF.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/RuleFF.java
@@@ -1,100 -1,0 +1,135 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff;
+
++import static com.google.common.cache.CacheBuilder.newBuilder;
++
+import java.util.List;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.chart_parser.SourcePath;
+import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.hypergraph.HGNode;
+import org.apache.joshua.decoder.segment_file.Sentence;
+
++import com.google.common.cache.Cache;
++
+/**
- * This feature just counts rules that are used. You can restrict it with a number of flags:
- *
- * -owner OWNER
- * Only count rules owned by OWNER
- * -target|-source
- * Only count the target or source side (plus the LHS)
- *
- * TODO: add an option to separately provide a list of rule counts, restrict to counts above a threshold.
++ * This feature fires for rule ids.
++ * Firing can be restricted to rules from a certain owner, and rule ids
++ * can be generated from source side and/or target side.
+ */
+public class RuleFF extends StatelessFF {
+
+ private enum Sides { SOURCE, TARGET, BOTH };
+
- private int owner = 0;
- private Sides sides = Sides.BOTH;
++ private static final String NAME = "RuleFF";
++ // value to fire for features
++ private static final int VALUE = 1;
++ // whether this feature is restricted to a certain grammar/owner
++ private final boolean ownerRestriction;
++ // the grammar/owner this feature is restricted to fire
++ private final int owner;
++ // what part of the rule should be extracted;
++ private final Sides sides;
++ // Strings separating words and rule sides
++ private static final String SEPARATOR = "~";
++ private static final String SIDES_SEPARATOR = "->";
++
++ private final Cache<Rule, String> featureCache;
+
+ public RuleFF(FeatureVector weights, String[] args, JoshuaConfiguration config) {
- super(weights, "RuleFF", args, config);
++ super(weights, NAME, args, config);
++
++ ownerRestriction = parsedArgs.containsKey("owner"); // restrict firing only when -owner is given
++ owner = ownerRestriction ? Vocabulary.id(parsedArgs.get("owner")) : 0;
+
- owner = Vocabulary.id(parsedArgs.get("owner"));
- if (parsedArgs.containsKey("source"))
- sides = Sides.SOURCE;
- else if (parsedArgs.containsKey("target"))
- sides = Sides.TARGET;
++ if (parsedArgs.containsKey("sides")) {
++ final String sideValue = parsedArgs.get("sides");
++ if (sideValue.equalsIgnoreCase("source")) {
++ sides = Sides.SOURCE;
++ } else if (sideValue.equalsIgnoreCase("target")) {
++ sides = Sides.TARGET;
++ } else if (sideValue.equalsIgnoreCase("both")){
++ sides = Sides.BOTH;
++ } else {
++ throw new IllegalArgumentException("Unknown sides value '" + sideValue + "' (expected source, target or both).");
++ }
++ } else {
++ sides = Sides.BOTH;
++ }
++
++ // bounded rule -> feature-name cache; size comes from -cacheSize, else config.cachedRuleSize
++ if (parsedArgs.containsKey("cacheSize")) {
++ featureCache = newBuilder().maximumSize(Integer.parseInt(parsedArgs.get("cacheSize"))).build();
++ } else {
++ featureCache = newBuilder().maximumSize(config.cachedRuleSize).build();
++ }
+ }
+
+ @Override
+ public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+ Sentence sentence, Accumulator acc) {
-
- if (owner > 0 && rule.getOwner() == owner) {
- String ruleString = getRuleString(rule);
- acc.add(ruleString, 1);
++
++ if (ownerRestriction && rule.getOwner() != owner) {
++ return null;
+ }
+
++ String featureName = featureCache.getIfPresent(rule);
++ if (featureName == null) {
++ featureName = getRuleString(rule);
++ featureCache.put(rule, featureName);
++ }
++ acc.add(featureName, VALUE);
++
+ return null;
+ }
-
- private String getRuleString(Rule rule) {
- String ruleString = "";
- switch(sides) {
- case BOTH:
- ruleString = String.format("%s %s %s", Vocabulary.word(rule.getLHS()), rule.getFrenchWords(),
- rule.getEnglishWords());
- break;
-
- case SOURCE:
- ruleString = String.format("%s %s", Vocabulary.word(rule.getLHS()), rule.getFrenchWords());
- break;
-
- case TARGET:
- ruleString = String.format("%s %s", Vocabulary.word(rule.getLHS()), rule.getEnglishWords());
- break;
++
++ /**
++ * Builds the feature name for a rule as LHS->source->target, words joined by "~"; a side not selected by {@code sides} is left empty.
++ * @param rule the rule whose LHS and source/target words form the name
++ * @return String representing the feature name.
++ */
++ private String getRuleString(final Rule rule) {
++ final StringBuilder sb = new StringBuilder(Vocabulary.word(rule.getLHS()))
++ .append(SIDES_SEPARATOR);
++ if (sides == Sides.SOURCE || sides == Sides.BOTH) {
++ sb.append(Vocabulary.getWords(rule.getFrench(), SEPARATOR));
++ }
++ sb.append(SIDES_SEPARATOR);
++ if (sides == Sides.TARGET || sides == Sides.BOTH) {
++ sb.append(Vocabulary.getWords(rule.getEnglish(), SEPARATOR));
+ }
- return ruleString.replaceAll("[ =]", "~");
++ return sb.toString();
+ }
+
+ @Override
+ public double estimateLogP(Rule rule, int sentID) {
+ // TODO Auto-generated method stub
+ return 0;
+ }
+
+ @Override
+ public double getWeight() {
+ // TODO Auto-generated method stub
+ return 0;
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5c0d5388/src/main/java/org/apache/joshua/decoder/ff/RuleLength.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/decoder/ff/RuleLength.java
index 59b1c20,0000000..02c520b
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/decoder/ff/RuleLength.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/RuleLength.java
@@@ -1,51 -1,0 +1,52 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff;
+
+import java.util.List;
+
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.chart_parser.SourcePath;
+import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.hypergraph.HGNode;
+import org.apache.joshua.decoder.segment_file.Sentence;
+
+/*
+ * This feature computes three feature templates: a feature indicating the length of the rule's
+ * source side, its target side, and a feature that pairs them.
+ */
+public abstract class RuleLength extends StatelessFF {
++
++ private static final int VALUE = 1;
+
+ public RuleLength(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+ super(weights, "RuleLength", args, config);
+ }
+
+ @Override
+ public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+ Sentence sentence, Accumulator acc) {
- int sourceLen = rule.getFrench().length;
- int targetLen = rule.getEnglish().length;
- acc.add(String.format("%s_sourceLength%d", name, sourceLen), 1);
- acc.add(String.format("%s_targetLength%d", name, targetLen), 1);
- acc.add(String.format("%s_pairLength%d-%d", name, sourceLen, targetLen), 1);
-
++ int sourceLength = rule.getFrench().length; // number of source-side symbols
++ int targetLength = rule.getEnglish().length; // number of target-side symbols
++ acc.add(name + "_source" + sourceLength, VALUE);
++ acc.add(name + "_target" + targetLength, VALUE); // target feature must use the target length
++ acc.add(name + "_sourceTarget" + sourceLength + "-" + targetLength, VALUE);
+ return null; // stateless feature: no dynamic-programming state
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5c0d5388/src/main/java/org/apache/joshua/decoder/ff/RuleShape.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/decoder/ff/RuleShape.java
index a514021,0000000..6333701
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/decoder/ff/RuleShape.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/RuleShape.java
@@@ -1,85 -1,0 +1,112 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff;
+
+import java.util.List;
+
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.chart_parser.SourcePath;
+import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.hypergraph.HGNode;
+import org.apache.joshua.decoder.segment_file.Sentence;
++import org.apache.joshua.util.FormatUtils;
++import org.apache.joshua.corpus.Vocabulary;
+
+/*
+ * Implements the RuleShape feature for source, target, and paired source+target sides.
+ */
+public class RuleShape extends StatelessFF {
+
+ public RuleShape(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+ super(weights, "RuleShape", args, config);
+ }
+
- private int gettype(int id) {
- if (id < 0)
- return -1;
- return 1;
++ /** Kind of a rule symbol: N = nonterminal, T = terminal (printed as "x"); P is currently unused. */
++ private enum WordType {
++ N("N"), T("x"), P("+");
++ // Deliberately immutable: enum constants are shared singletons, so a per-pattern
++ // flag stored here would leak into every later pattern and across decoder threads.
++ private final String string;
++
++ private WordType(final String string) {
++ this.string = string;
++ }
++
++ @Override
++ public String toString() {
++ return this.string;
++ }
++ }
++
++ /** Classifies a symbol id as nonterminal (N) or terminal (T) via FormatUtils.isNonterminal. */
++ private WordType getWordType(int id) {
++ if (FormatUtils.isNonterminal(id)) {
++ return WordType.N;
++ } else {
++ return WordType.T;
++ }
+ }
+
- private String pattern(int[] ids) {
- StringBuilder pattern = new StringBuilder();
- int curtype = gettype(ids[0]);
- int curcount = 1;
++ /**
++ * Returns a String describing the rule pattern: one symbol per run of same-typed
++ * words, followed by "+" when the run is longer than one word (e.g. "Nx+N").
++ * @param ids symbol ids of one rule side; must be non-empty since ids[0] is read
++ */
++ private String getRulePattern(int[] ids) {
++ final StringBuilder pattern = new StringBuilder();
++ WordType currentType = getWordType(ids[0]);
++ boolean repeats = false; // true once the current run has more than one word
+ for (int i = 1; i < ids.length; i++) {
- if (gettype(ids[i]) != curtype) {
- pattern.append(String.format("%s%s_", curtype < 0 ? "N" : "x", curcount > 1 ? "+" : ""));
- curtype = gettype(ids[i]);
- curcount = 1;
++ final WordType type = getWordType(ids[i]);
++ if (type != currentType) {
++ pattern.append(currentType).append(repeats ? "+" : "");
++ currentType = type;
++ repeats = false; // a new run starts with a single word
+ } else {
- curcount++;
++ repeats = true;
+ }
+ }
- pattern.append(String.format("%s%s_", curtype < 0 ? "N" : "x", curcount > 1 ? "+" : ""));
++ pattern.append(currentType).append(repeats ? "+" : ""); // flush the final run
+ return pattern.toString();
+ }
+
+ @Override
+ public DPState compute(Rule rule, List<HGNode> tailNodes, int i_, int j, SourcePath sourcePath,
+ Sentence sentence, Accumulator acc) {
- String sourceShape = pattern(rule.getFrench());
- String targetShape = pattern(rule.getEnglish());
- acc.add(String.format("%s_source_%s", name, sourceShape), 1);
- acc.add(String.format("%s_target_%s", name, targetShape), 1);
- acc.add(String.format("%s_both_%s__%s", name, sourceShape, targetShape), 1);
-
++ final String sourceShape = getRulePattern(rule.getFrench()); // shape of the source side
++ final String targetShape = getRulePattern(rule.getEnglish()); // shape of the target side
++ acc.add(name + "_source_" + sourceShape, 1);
++ acc.add(name + "_target_" + targetShape, 1); // target feature must use the target shape
++ acc.add(name + "_sourceTarget_" + sourceShape + "_" + targetShape, 1);
+ return null; // stateless feature: no dynamic-programming state
+ }
+
+ @Override
+ public double estimateLogP(Rule rule, int sentID) {
+ // TODO Auto-generated method stub
+ return 0;
+ }
+
+ @Override
+ public double getWeight() {
+ // TODO Auto-generated method stub
+ return 0;
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5c0d5388/src/main/java/org/apache/joshua/decoder/ff/WordPenalty.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/decoder/ff/WordPenalty.java
index 62c889f,0000000..e1f74c2
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/decoder/ff/WordPenalty.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/WordPenalty.java
@@@ -1,90 -1,0 +1,92 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.chart_parser.SourcePath;
+import org.apache.joshua.decoder.hypergraph.HGNode;
+import org.apache.joshua.decoder.phrase.Hypothesis;
+import org.apache.joshua.decoder.segment_file.Sentence;
+
+/**
+ *
+ * @author Zhifei Li zhifei.work@gmail.com
+ * @author Matt Post post@cs.jhu.edu
+ */
+public final class WordPenalty extends StatelessFF {
+
+ private float OMEGA = -(float) Math.log10(Math.E); // -0.435
++ private final boolean isCky;
+
+ public WordPenalty(final FeatureVector weights, String[] args, JoshuaConfiguration config) {
+ super(weights, "WordPenalty", args, config);
+
+ if (parsedArgs.containsKey("value"))
+ OMEGA = Float.parseFloat(parsedArgs.get("value"));
++
++ isCky = config.search_algorithm.equals("cky");
+ }
+
+ @Override
+ public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+ Sentence sentence, Accumulator acc) {
+
+ if (rule != null) {
+ // TODO: this is an inefficient way to do this. Find a better way to not apply this rule
+ // to start and stop glue rules when phrase-based decoding.
- if (config.search_algorithm.equals("cky")
- || (rule != Hypothesis.BEGIN_RULE && rule != Hypothesis.END_RULE))
- // acc.add(name, OMEGA * (rule.getEnglish().length - rule.getArity()));
++ if (isCky || (rule != Hypothesis.BEGIN_RULE && rule != Hypothesis.END_RULE)) {
+ acc.add(denseFeatureIndex, OMEGA * (rule.getEnglish().length - rule.getArity()));
++ }
+ }
+
+ return null;
+ }
+
+ @Override
+ public ArrayList<String> reportDenseFeatures(int index) {
+ denseFeatureIndex = index;
- ArrayList<String> names = new ArrayList<String>();
++ ArrayList<String> names = new ArrayList<>(1);
+ names.add(name);
+ return names;
+ }
+
+ @Override
+ public float estimateCost(Rule rule, Sentence sentence) {
+ if (rule != null)
+ return weights.getDense(denseFeatureIndex) * OMEGA * (rule.getEnglish().length - rule.getArity());
+ return 0.0f;
+ }
+
+ @Override
+ public double estimateLogP(Rule rule, int sentID) {
+ // TODO Auto-generated method stub
+ return 0;
+ }
+
+ @Override
+ public double getWeight() {
+ // TODO Auto-generated method stub
+ return 0;
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5c0d5388/src/test/java/org/apache/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java
----------------------------------------------------------------------
diff --cc src/test/java/org/apache/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java
index df73136,0000000..00a6a36
mode 100644,000000..100644
--- a/src/test/java/org/apache/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java
+++ b/src/test/java/org/apache/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java
@@@ -1,79 -1,0 +1,79 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.lm.berkeley_lm;
+
+import static org.junit.Assert.assertEquals;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.junit.After;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+import org.apache.joshua.decoder.Decoder;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.Translation;
+import org.apache.joshua.decoder.segment_file.Sentence;
+
+/**
+ * Replacement for test/lm/berkeley/test.sh regression test
+ */
+@RunWith(value = Parameterized.class)
+public class LMGrammarBerkeleyTest {
+
+ private static final String INPUT = "the chat-rooms";
+ private static final String[] OPTIONS = "-v 0 -output-format %f".split(" ");
+
+ private JoshuaConfiguration joshuaConfig;
+ private Decoder decoder;
+
+ @Parameters
+ public static List<String> lmFiles() {
+ return Arrays.asList("resources/berkeley_lm/lm",
+ "resources/berkeley_lm/lm.gz",
+ "resources/berkeley_lm/lm.berkeleylm",
+ "resources/berkeley_lm/lm.berkeleylm.gz");
+ }
+
+ @After
+ public void tearDown() throws Exception {
+ decoder.cleanUp();
+ }
+
+ //TODO @Parameters
+ public String lmFile;
+
+ @Test
+ public void verifyLM() {
+ joshuaConfig = new JoshuaConfiguration();
+ joshuaConfig.processCommandLineOptions(OPTIONS);
- joshuaConfig.features.add("feature_function = LanguageModel -lm_type berkeleylm -lm_order 2 -lm_file " + lmFile);
++ joshuaConfig.features.add("LanguageModel -lm_type berkeleylm -lm_order 2 -lm_file " + lmFile);
+ decoder = new Decoder(joshuaConfig, null);
+ String translation = decode(INPUT).toString();
+ assertEquals(lmFile, "tm_glue_0=2.000 lm_0=-7.153\n", translation);
+ }
+
+ private Translation decode(String input) {
+ final Sentence sentence = new Sentence(input, 0, joshuaConfig);
+ return decoder.decode(sentence);
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5c0d5388/src/test/java/org/apache/joshua/system/MultithreadedTranslationTests.java
----------------------------------------------------------------------
diff --cc src/test/java/org/apache/joshua/system/MultithreadedTranslationTests.java
index f006363,0000000..c760586
mode 100644,000000..100644
--- a/src/test/java/org/apache/joshua/system/MultithreadedTranslationTests.java
+++ b/src/test/java/org/apache/joshua/system/MultithreadedTranslationTests.java
@@@ -1,164 -1,0 +1,164 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+ package org.apache.joshua.system;
+
+import static org.junit.Assert.assertTrue;
+
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+
+import org.apache.joshua.decoder.Decoder;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.MetaDataException;
+import org.apache.joshua.decoder.io.TranslationRequestStream;
+import org.apache.joshua.decoder.segment_file.Sentence;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Integration test for multithreaded Joshua decoder tests. Grammar used is a
+ * toy packed grammar.
+ *
+ * @author kellens
+ */
+public class MultithreadedTranslationTests {
+
+ private JoshuaConfiguration joshuaConfig = null;
+ private Decoder decoder = null;
+ private static final String INPUT = "A K B1 U Z1 Z2 B2 C";
+ private int previousLogLevel;
+ private final static long NANO_SECONDS_PER_SECOND = 1_000_000_000;
+
+ @Before
+ public void setUp() throws Exception {
+ joshuaConfig = new JoshuaConfiguration();
+ joshuaConfig.search_algorithm = "cky";
+ joshuaConfig.mark_oovs = false;
+ joshuaConfig.pop_limit = 100;
+ joshuaConfig.use_unique_nbest = false;
+ joshuaConfig.include_align_index = false;
+ joshuaConfig.topN = 0;
+ joshuaConfig.tms.add("thrax -owner pt -maxspan 20 -path resources/wa_grammar.packed");
+ joshuaConfig.tms.add("thrax -owner glue -maxspan -1 -path resources/grammar.glue");
+ joshuaConfig.goal_symbol = "[GOAL]";
+ joshuaConfig.default_non_terminal = "[X]";
- joshuaConfig.features.add("feature_function = OOVPenalty");
++ joshuaConfig.features.add("OOVPenalty");
+ joshuaConfig.weights.add("tm_pt_0 1");
+ joshuaConfig.weights.add("tm_pt_1 1");
+ joshuaConfig.weights.add("tm_pt_2 1");
+ joshuaConfig.weights.add("tm_pt_3 1");
+ joshuaConfig.weights.add("tm_pt_4 1");
+ joshuaConfig.weights.add("tm_pt_5 1");
+ joshuaConfig.weights.add("tm_glue_0 1");
+ joshuaConfig.weights.add("OOVPenalty 2");
+ joshuaConfig.num_parallel_decoders = 500; // This will enable 500 parallel
+ // decoders to run at once.
+ // Useful to help flush out
+ // concurrency errors in
+ // underlying
+ // data-structures.
+ this.decoder = new Decoder(joshuaConfig, ""); // Second argument
+ // (configFile)
+ // is not even used by the
+ // constructor/initialize.
+
+ previousLogLevel = Decoder.VERBOSE;
+ Decoder.VERBOSE = 0;
+ }
+
+ @After
+ public void tearDown() throws Exception {
+ this.decoder.cleanUp();
+ this.decoder = null;
+ Decoder.VERBOSE = previousLogLevel;
+ }
+
+
+
+ // This test was created specifically to reproduce a multithreaded issue
+ // related to mapped byte array access in the PackedGrammar getAlignmentArray
+ // function.
+
+ // We'll test the decoding engine using N = 10,000 identical inputs. This
+ // should be sufficient to induce concurrent data access for many shared
+ // data structures.
+
+ @Test
+ public void givenPackedGrammar_whenNTranslationsCalledConcurrently_thenReturnNResults() {
+ // GIVEN
+
+ int inputLines = 10000;
+ joshuaConfig.use_structured_output = true; // Enabled alignments.
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0; i < inputLines; i++) {
+ sb.append(INPUT + "\n");
+ }
+
+ // Append a large string together to simulate N requests to the decoding
+ // engine.
+ TranslationRequestStream req = new TranslationRequestStream(
+ new BufferedReader(new InputStreamReader(new ByteArrayInputStream(sb.toString()
+ .getBytes(Charset.forName("UTF-8"))))), joshuaConfig);
+
+ ByteArrayOutputStream output = new ByteArrayOutputStream();
+
+ // WHEN
+ // Translate all spans in parallel.
+ try {
+ this.decoder.decodeAll(req, output);
+ } catch (IOException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ ArrayList<Sentence> translationResults = new ArrayList<Sentence>();
+
+
+ final long translationStartTime = System.nanoTime();
+ Sentence t;
+ try {
+ while ((t = req.next()) != null) {
+ translationResults.add(t);
+ }
+ } catch (MetaDataException e) {
+ e.printStackTrace();
+ } finally {
+ if (output != null) {
+ try {
+ output.close();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+ }
+
+ final long translationEndTime = System.nanoTime();
+ final double pipelineLoadDurationInSeconds = (translationEndTime - translationStartTime) / ((double)NANO_SECONDS_PER_SECOND);
+ System.err.println(String.format("%.2f seconds", pipelineLoadDurationInSeconds));
+
+ // THEN
+ assertTrue(translationResults.size() == inputLines);
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5c0d5388/src/test/java/org/apache/joshua/system/StructuredTranslationTest.java
----------------------------------------------------------------------
diff --cc src/test/java/org/apache/joshua/system/StructuredTranslationTest.java
index a78a4a1,0000000..69412e2
mode 100644,000000..100644
--- a/src/test/java/org/apache/joshua/system/StructuredTranslationTest.java
+++ b/src/test/java/org/apache/joshua/system/StructuredTranslationTest.java
@@@ -1,272 -1,0 +1,272 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.system;
+
+import static java.util.Arrays.asList;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.joshua.decoder.Decoder;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.StructuredTranslation;
+import org.apache.joshua.decoder.Translation;
+import org.apache.joshua.decoder.segment_file.Sentence;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Integration test for the complete Joshua decoder using a toy grammar that translates
+ * a bunch of capital letters to lowercase letters. Rules in the test grammar
+ * drop and generate additional words and simulate reordering of rules, so that
+ * proper extraction of word alignments and other information from the decoder
+ * can be tested.
+ *
+ * @author fhieber
+ */
+public class StructuredTranslationTest {
+
+ private JoshuaConfiguration joshuaConfig = null;
+ private Decoder decoder = null;
+ private static final String INPUT = "A K B1 U Z1 Z2 B2 C";
+ private static final String EXPECTED_TRANSLATION = "a b n1 u z c1 k1 k2 k3 n1 n2 n3 c2";
+ private static final List<String> EXPECTED_TRANSLATED_TOKENS = asList(EXPECTED_TRANSLATION.split("\\s+"));
+ private static final String EXPECTED_WORD_ALIGNMENT_STRING = "0-0 2-1 6-1 3-3 4-4 5-4 7-5 1-6 1-7 1-8 7-12";
+ private static final List<List<Integer>> EXPECTED_WORD_ALIGNMENT = asList(
+ asList(0), asList(2, 6), asList(), asList(3),
+ asList(4, 5), asList(7), asList(1),
+ asList(1), asList(1), asList(), asList(),
+ asList(), asList(7));
+ private static final double EXPECTED_SCORE = -17.0;
+ private static final Map<String,Float> EXPECTED_FEATURES = new HashMap<>();
+ private static final int EXPECTED_NBEST_LIST_SIZE = 8;
+ // Feature name -> value pairs that the tests below expect on the best translation of INPUT.
+ static {
+ EXPECTED_FEATURES.put("tm_glue_0", 1.0f);
+ EXPECTED_FEATURES.put("tm_pt_0", -3.0f);
+ EXPECTED_FEATURES.put("tm_pt_1", -3.0f);
+ EXPECTED_FEATURES.put("tm_pt_2", -3.0f);
+ EXPECTED_FEATURES.put("tm_pt_3", -3.0f);
+ EXPECTED_FEATURES.put("tm_pt_4", -3.0f);
+ EXPECTED_FEATURES.put("tm_pt_5", -3.0f);
+ EXPECTED_FEATURES.put("OOV", 7.0f);
+ }
+
+ /**
+ * Creates a fresh configuration and decoder before each test: CKY search, the "pt" grammar
+ * and the glue grammar, the OOVPenalty feature function, and a weight of 1 for every
+ * feature listed below.
+ */
+ @Before
+ public void setUp() throws Exception {
+ joshuaConfig = new JoshuaConfiguration();
+ joshuaConfig.search_algorithm = "cky";
+ joshuaConfig.mark_oovs = false;
+ joshuaConfig.pop_limit = 100;
+ joshuaConfig.use_unique_nbest = false;
+ joshuaConfig.include_align_index = false;
+ joshuaConfig.topN = 0;
+ joshuaConfig.tms.add("thrax -owner pt -maxspan 20 -path resources/wa_grammar");
+ joshuaConfig.tms.add("thrax -owner glue -maxspan -1 -path resources/grammar.glue");
+ joshuaConfig.goal_symbol = "[GOAL]";
+ joshuaConfig.default_non_terminal = "[X]";
- joshuaConfig.features.add("feature_function = OOVPenalty");
++ joshuaConfig.features.add("OOVPenalty");
+ joshuaConfig.weights.add("tm_pt_0 1");
+ joshuaConfig.weights.add("tm_pt_1 1");
+ joshuaConfig.weights.add("tm_pt_2 1");
+ joshuaConfig.weights.add("tm_pt_3 1");
+ joshuaConfig.weights.add("tm_pt_4 1");
+ joshuaConfig.weights.add("tm_pt_5 1");
+ joshuaConfig.weights.add("tm_glue_0 1");
+ joshuaConfig.weights.add("OOVPenalty 1");
+ decoder = new Decoder(joshuaConfig, ""); // second argument (configFile
+ // is not even used by the
+ // constructor/initialize)
+ }
+
+ /** Releases the decoder after each test so the next test starts from a fresh instance. */
+ @After
+ public void tearDown() throws Exception {
+ decoder.cleanUp();
+ decoder = null;
+ }
+
+ /**
+ * Decodes a single input string with the decoder built in setUp().
+ *
+ * @param input the source text to translate
+ * @return the Translation returned by the decoder
+ */
+ private Translation decode(String input) {
+ // The 0 is presumably the sentence id -- TODO confirm against the Sentence constructor.
+ Sentence sentence = new Sentence(input, 0, joshuaConfig);
+ return decoder.decode(sentence);
+ }
+
+ /**
+ * With structured output disabled and the output format "%s | %a ", the trimmed output
+ * must be the expected translation followed by the expected word alignment string.
+ */
+ @Test
+ public void givenInput_whenRegularOutputFormat_thenExpectedOutput() {
+ // GIVEN
+ joshuaConfig.use_structured_output = false;
+ joshuaConfig.outputFormat = "%s | %a ";
+
+ // WHEN
+ final String translation = decode(INPUT).toString().trim();
+
+ // THEN
+ assertEquals(EXPECTED_TRANSLATION + " | " + EXPECTED_WORD_ALIGNMENT_STRING, translation);
+ }
+
+ /**
+ * With structured output disabled, topN = 1 and the output format "%s | %e | %a | %c", the
+ * trimmed output must contain the expected translation, the input, the expected word
+ * alignment and the expected score formatted with three decimals.
+ */
+ @Test
+ public void givenInput_whenRegularOutputFormatWithTopN1_thenExpectedOutput() {
+ // GIVEN
+ joshuaConfig.use_structured_output = false;
+ joshuaConfig.outputFormat = "%s | %e | %a | %c";
+ joshuaConfig.topN = 1;
+
+ // WHEN
+ final String translation = decode(INPUT).toString().trim();
+
+ // THEN
+ assertEquals(EXPECTED_TRANSLATION + " | " + INPUT + " | " + EXPECTED_WORD_ALIGNMENT_STRING + String.format(" | %.3f", EXPECTED_SCORE),
+ translation);
+ }
+
+ /**
+ * With structured output enabled and topN = 0, decoding INPUT must yield exactly one
+ * structured translation carrying the expected string, tokens, score, word alignment and
+ * feature values.
+ */
+ @Test
+ public void givenInput_whenStructuredOutputFormatWithTopN0_thenExpectedOutput() {
+   // GIVEN
+   joshuaConfig.use_structured_output = true;
+   joshuaConfig.topN = 0;
+
+   // WHEN
+   final Translation translation = decode(INPUT);
+   final StructuredTranslation structuredTranslation = translation.getStructuredTranslations().get(0);
+   final String translationString = structuredTranslation.getTranslationString();
+   final List<String> translatedTokens = structuredTranslation.getTranslationTokens();
+   final float translationScore = structuredTranslation.getTranslationScore();
+   final List<List<Integer>> wordAlignment = structuredTranslation.getTranslationWordAlignments();
+   final Map<String,Float> translationFeatures = structuredTranslation.getTranslationFeatures();
+
+   // THEN
+   // assertEquals reports the actual size on failure, unlike assertTrue(size == 1).
+   assertEquals(1, translation.getStructuredTranslations().size());
+   assertEquals(EXPECTED_TRANSLATION, translationString);
+   assertEquals(EXPECTED_TRANSLATED_TOKENS, translatedTokens);
+   assertEquals(EXPECTED_SCORE, translationScore, 0.00001);
+   assertEquals(EXPECTED_WORD_ALIGNMENT, wordAlignment);
+   assertEquals(wordAlignment.size(), translatedTokens.size());
+   assertEquals(EXPECTED_FEATURES.entrySet(), translationFeatures.entrySet());
+ }
+
+ /**
+ * With structured output enabled and topN = 1, decoding INPUT must yield exactly one
+ * structured translation carrying the expected string, tokens, score, word alignment and
+ * feature values.
+ */
+ @Test
+ public void givenInput_whenStructuredOutputFormatWithTopN1_thenExpectedOutput() {
+   // GIVEN
+   joshuaConfig.use_structured_output = true;
+   joshuaConfig.topN = 1;
+
+   // WHEN
+   final Translation translation = decode(INPUT);
+   final List<StructuredTranslation> structuredTranslations = translation.getStructuredTranslations();
+   final StructuredTranslation structuredTranslation = structuredTranslations.get(0);
+   final String translationString = structuredTranslation.getTranslationString();
+   final List<String> translatedTokens = structuredTranslation.getTranslationTokens();
+   final float translationScore = structuredTranslation.getTranslationScore();
+   final List<List<Integer>> wordAlignment = structuredTranslation.getTranslationWordAlignments();
+   final Map<String,Float> translationFeatures = structuredTranslation.getTranslationFeatures();
+
+   // THEN
+   // assertEquals reports the actual size on failure, unlike assertTrue(size == 1).
+   assertEquals(1, structuredTranslations.size());
+   assertEquals(EXPECTED_TRANSLATION, translationString);
+   assertEquals(EXPECTED_TRANSLATED_TOKENS, translatedTokens);
+   assertEquals(EXPECTED_SCORE, translationScore, 0.00001);
+   assertEquals(EXPECTED_WORD_ALIGNMENT, wordAlignment);
+   assertEquals(wordAlignment.size(), translatedTokens.size());
+   assertEquals(EXPECTED_FEATURES.entrySet(), translationFeatures.entrySet());
+ }
+
+ /**
+ * With structured output enabled and topN = 100, decoding INPUT must yield the expected
+ * number of k-best entries: the first must match the expected best translation, and the last
+ * must be the untranslated input with an OOVPenalty feature value of -800.
+ */
+ @Test
+ public void givenInput_whenStructuredOutputFormatWithKBest_thenExpectedOutput() {
+   // GIVEN
+   joshuaConfig.use_structured_output = true;
+   joshuaConfig.topN = 100;
+
+   // WHEN
+   final Translation translation = decode(INPUT);
+   final List<StructuredTranslation> structuredTranslations = translation.getStructuredTranslations();
+   final StructuredTranslation viterbiTranslation = structuredTranslations.get(0);
+   final StructuredTranslation lastKBest = structuredTranslations.get(structuredTranslations.size() - 1);
+
+   // THEN
+   // JUnit's assertEquals takes (expected, actual); the original had them swapped, which
+   // produces a misleading failure message.
+   assertEquals(EXPECTED_NBEST_LIST_SIZE, structuredTranslations.size());
+   assertTrue(structuredTranslations.size() > 1);
+   assertEquals(EXPECTED_TRANSLATION, viterbiTranslation.getTranslationString());
+   assertEquals(EXPECTED_TRANSLATED_TOKENS, viterbiTranslation.getTranslationTokens());
+   assertEquals(EXPECTED_SCORE, viterbiTranslation.getTranslationScore(), 0.00001);
+   assertEquals(EXPECTED_WORD_ALIGNMENT, viterbiTranslation.getTranslationWordAlignments());
+   assertEquals(EXPECTED_FEATURES.entrySet(), viterbiTranslation.getTranslationFeatures().entrySet());
+   // last entry in KBEST is all input words untranslated, should have 8 OOVs.
+   assertEquals(INPUT, lastKBest.getTranslationString());
+   assertEquals(-800.0, lastKBest.getTranslationFeatures().get("OOVPenalty"), 0.0001);
+
+ }
+
+ /**
+ * With structured output enabled, decoding the empty string must yield a structured
+ * translation with an empty string, no tokens, a score of 0 and no word alignments.
+ */
+ @Test
+ public void givenEmptyInput_whenStructuredOutputFormat_thenEmptyOutput() {
+ // GIVEN
+ joshuaConfig.use_structured_output = true;
+
+ // WHEN
+ final Translation translation = decode("");
+ final StructuredTranslation structuredTranslation = translation.getStructuredTranslations().get(0);
+ final String translationString = structuredTranslation.getTranslationString();
+ final List<String> translatedTokens = structuredTranslation.getTranslationTokens();
+ final float translationScore = structuredTranslation.getTranslationScore();
+ final List<List<Integer>> wordAlignment = structuredTranslation.getTranslationWordAlignments();
+
+ // THEN
+ assertEquals("", translationString);
+ assertTrue(translatedTokens.isEmpty());
+ assertEquals(0, translationScore, 0.00001);
+ assertTrue(wordAlignment.isEmpty());
+ }
+
+ /**
+ * With structured output enabled, decoding a single word ("gabarbl") must return that word
+ * unchanged as the translation, with a score of -99 and an alignment entry pointing to
+ * source index 0.
+ */
+ @Test
+ public void givenOOVInput_whenStructuredOutputFormat_thenOOVOutput() {
+ // GIVEN
+ joshuaConfig.use_structured_output = true;
+ final String input = "gabarbl";
+
+ // WHEN
+ final Translation translation = decode(input);
+ final StructuredTranslation structuredTranslation = translation.getStructuredTranslations().get(0);
+ final String translationString = structuredTranslation.getTranslationString();
+ final List<String> translatedTokens = structuredTranslation.getTranslationTokens();
+ final float translationScore = structuredTranslation.getTranslationScore();
+ final List<List<Integer>> wordAlignment = structuredTranslation.getTranslationWordAlignments();
+
+ // THEN
+ assertEquals(input, translationString);
+ assertTrue(translatedTokens.contains(input));
+ assertEquals(-99.0, translationScore, 0.00001);
+ assertTrue(wordAlignment.contains(asList(0)));
+ }
+
+ /**
+ * With structured output disabled, decoding the empty string must produce a translation
+ * whose string form is a single newline.
+ */
+ @Test
+ public void givenEmptyInput_whenRegularOutputFormat_thenNewlineOutput() {
+ // GIVEN
+ joshuaConfig.use_structured_output = false;
+
+ // WHEN
+ final Translation translation = decode("");
+ final String translationString = translation.toString();
+
+ // THEN
+ assertEquals("\n", translationString);
+ }
+
+}
[4/5] incubator-joshua git commit: Merge branch 'sparse' of
https://github.com/fhieber/incubator-joshua into JOSHUA-PR21
Posted by mj...@apache.org.
Merge branch 'sparse' of https://github.com/fhieber/incubator-joshua into JOSHUA-PR21
# Conflicts:
# lib/ivy.xml
# src/main/java/org/apache/joshua/decoder/Decoder.java
# src/main/java/org/apache/joshua/decoder/ff/RuleLength.java
# src/main/java/org/apache/joshua/decoder/ff/RuleShape.java
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/5c0d5388
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/5c0d5388
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/5c0d5388
Branch: refs/heads/JOSHUA-252
Commit: 5c0d5388ae7a76538337bf89bd6ac9a04d2c6dff
Parents: 9e70266 5591c67
Author: Matt Post <po...@cs.jhu.edu>
Authored: Tue May 31 15:39:04 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Tue May 31 15:39:04 2016 -0400
----------------------------------------------------------------------
lib/ivy.xml | 17 +++
src/joshua/decoder/ff/LexicalFeatures.java | 131 +++++++++++++++++++
.../org/apache/joshua/corpus/Vocabulary.java | 13 +-
.../java/org/apache/joshua/decoder/Decoder.java | 17 ++-
.../joshua/decoder/JoshuaConfiguration.java | 10 +-
.../apache/joshua/decoder/ff/OOVPenalty.java | 15 ++-
.../org/apache/joshua/decoder/ff/RuleFF.java | 109 +++++++++------
.../apache/joshua/decoder/ff/RuleLength.java | 13 +-
.../org/apache/joshua/decoder/ff/RuleShape.java | 67 +++++++---
.../apache/joshua/decoder/ff/WordPenalty.java | 10 +-
.../lm/berkeley_lm/LMGrammarBerkeleyTest.java | 2 +-
.../system/MultithreadedTranslationTests.java | 2 +-
.../system/StructuredTranslationTest.java | 2 +-
13 files changed, 314 insertions(+), 94 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5c0d5388/src/main/java/org/apache/joshua/corpus/Vocabulary.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/corpus/Vocabulary.java
index 8416e4a,0000000..f1bf53d
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/corpus/Vocabulary.java
+++ b/src/main/java/org/apache/joshua/corpus/Vocabulary.java
@@@ -1,295 -1,0 +1,302 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.corpus;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.Externalizable;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.ObjectInput;
+import java.io.ObjectOutput;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.locks.StampedLock;
+
+import org.apache.joshua.decoder.Decoder;
+import org.apache.joshua.decoder.ff.lm.NGramLanguageModel;
+import org.apache.joshua.util.FormatUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Static singular vocabulary class.
+ * Supports (de-)serialization into a vocabulary file.
+ *
+ * @author Juri Ganitkevitch
+ */
+
+public class Vocabulary implements Externalizable {
+
+ private static final Logger LOG = LoggerFactory.getLogger(Vocabulary.class);
+ private final static ArrayList<NGramLanguageModel> LMs = new ArrayList<>();
+
+ private static List<String> idToString;
+ private static Map<String, Integer> stringToId;
+ private static final StampedLock lock = new StampedLock();
+
+ static final int UNKNOWN_ID = 0;
+ static final String UNKNOWN_WORD = "<unk>";
+
+ public static final String START_SYM = "<s>";
+ public static final String STOP_SYM = "</s>";
+
+ static {
+ clear();
+ }
+
+ /**
+ * Registers a language model and notifies it of every word currently in the vocabulary.
+ * Only ids greater than 0 are forwarded; the unknown word at id 0 is skipped.
+ *
+ * @param lm the language model to register
+ * @return true if {@code lm.registerWord} returned true for at least one word
+ */
+ public static boolean registerLanguageModel(NGramLanguageModel lm) {
+   long lock_stamp = lock.writeLock();
+   try {
+     // Store the language model.
+     LMs.add(lm);
+     // Notify it of all the existing words.
+     boolean collision = false;
+     for (int i = idToString.size() - 1; i > 0; i--) {
+       // Non-short-circuit OR: "collision || registerWord(...)" would stop registering
+       // words with the language model as soon as one collision had been seen.
+       collision |= lm.registerWord(idToString.get(i), i);
+     }
+     return collision;
+   } finally {
+     lock.unlockWrite(lock_stamp);
+   }
+ }
+
+ /**
+ * Reads a vocabulary from file. This deletes any additions to the vocabulary made prior to
+ * reading the file.
+ *
+ * @param vocab_file path to a vocabulary file
+ * @return Returns true if vocabulary was read without mismatches or collisions.
+ * @throws IOException if the file cannot be found or read properly
+ */
+ public static boolean read(final File vocab_file) throws IOException {
+   // try-with-resources closes the stream on every exit path, including when
+   // readInt()/readUTF() throw part-way through the file.
+   try (DataInputStream vocab_stream =
+       new DataInputStream(new BufferedInputStream(new FileInputStream(vocab_file)))) {
+     int size = vocab_stream.readInt();
+     LOG.info("Read {} entries from the vocabulary", size);
+     clear();
+     for (int i = 0; i < size; i++) {
+       int id = vocab_stream.readInt();
+       String token = vocab_stream.readUTF();
+       // The id stored in the file must match the id the vocabulary assigns to the token.
+       if (id != Math.abs(id(token))) {
+         return false;
+       }
+     }
+     // clear() leaves the unknown word at id 0, hence the "+ 1".
+     return (size + 1 == idToString.size());
+   }
+ }
+
+ public static void write(String file_name) throws IOException {
+ long lock_stamp =lock.readLock();
+ try {
+ File vocab_file = new File(file_name);
+ DataOutputStream vocab_stream =
+ new DataOutputStream(new BufferedOutputStream(new FileOutputStream(vocab_file)));
+ vocab_stream.writeInt(idToString.size() - 1);
+ LOG.info("Writing vocabulary: {} tokens", idToString.size() - 1);
+ for (int i = 1; i < idToString.size(); i++) {
+ vocab_stream.writeInt(i);
+ vocab_stream.writeUTF(idToString.get(i));
+ }
+ vocab_stream.close();
+ }
+ finally{
+ lock.unlockRead(lock_stamp);
+ }
+ }
+
+ /**
+ * Get the id of the token if it already exists, new id is created otherwise.
+ *
+ * TODO: currently locks for every call. Separate constant (frozen) ids from
+ * changing (e.g. OOV) ids. Constant ids could be immutable -> no locking.
+ * Alternatively: could we use ConcurrentHashMap to not have to lock if
+ * actually contains it and only lock for modifications?
+ *
+ * @param token a token to obtain an id for
+ * @return the token id
+ */
+ public static int id(String token) {
+ // First attempt an optimistic read
+ long attempt_read_lock = lock.tryOptimisticRead();
+ if (stringToId.containsKey(token)) {
+ int resultId = stringToId.get(token);
+ if (lock.validate(attempt_read_lock)) {
+ return resultId;
+ }
+ }
+
+ // The optimistic read failed, try a read with a stamped read lock
+ long read_lock_stamp = lock.readLock();
+ try {
+ if (stringToId.containsKey(token)) {
+ return stringToId.get(token);
+ }
+ } finally {
+ lock.unlockRead(read_lock_stamp);
+ }
+
+ // Looks like the id we want is not there, let's get a write lock and add it
+ long write_lock_stamp = lock.writeLock();
+ try {
+ if (stringToId.containsKey(token)) {
+ return stringToId.get(token);
+ }
+ int id = idToString.size() * (FormatUtils.isNonterminal(token) ? -1 : 1);
+
+ // register this (token,id) mapping with each language
+ // model, so that they can map it to their own private
+ // vocabularies
+ for (NGramLanguageModel lm : LMs)
+ lm.registerWord(token, Math.abs(id));
+
+ idToString.add(token);
+ stringToId.put(token, id);
+ return id;
+ } finally {
+ lock.unlockWrite(write_lock_stamp);
+ }
+ }
+
+ /**
+ * Checks whether an id lies within the range of ids assigned so far.
+ *
+ * @param id a token id; its sign is ignored
+ * @return true if the absolute value of id is smaller than the current vocabulary size
+ */
+ public static boolean hasId(int id) {
+ long lock_stamp = lock.readLock();
+ try {
+ id = Math.abs(id);
+ return (id < idToString.size());
+ }
+ finally{
+ lock.unlockRead(lock_stamp);
+ }
+ }
+
+ /**
+ * Splits a sentence on runs of whitespace and returns the id of every resulting token.
+ *
+ * @param sentence whitespace-separated tokens
+ * @return the token ids, in sentence order
+ */
+ public static int[] addAll(String sentence) {
+ return addAll(sentence.split("\\s+"));
+ }
+
+ /**
+ * Returns the id of every token, in order. Ids are obtained through id(String), which
+ * creates a new id for tokens not yet in the vocabulary.
+ *
+ * @param tokens the tokens to look up
+ * @return an array of the same length as tokens holding their ids
+ */
+ public static int[] addAll(String[] tokens) {
+ int[] ids = new int[tokens.length];
+ for (int i = 0; i < tokens.length; i++)
+ ids[i] = id(tokens[i]);
+ return ids;
+ }
+
+ /**
+ * Returns the token stored for an id.
+ *
+ * @param id a token id; its sign is ignored
+ * @return the token at index |id| of the id-to-string list
+ */
+ public static String word(int id) {
+ long lock_stamp = lock.readLock();
+ try {
+ id = Math.abs(id);
+ return idToString.get(id);
+ }
+ finally{
+ lock.unlockRead(lock_stamp);
+ }
+ }
+
+ /**
+ * Returns the words for the given ids joined by single spaces.
+ *
+ * @param ids token ids
+ * @return the space-separated words, or "" if ids is empty
+ */
+ public static String getWords(int[] ids) {
- if (ids.length == 0) return "";
++ return getWords(ids, " ");
++ }
++
++ /**
++ * Returns the words for the given ids joined by the given separator.
++ *
++ * @param ids token ids
++ * @param separator string placed between consecutive words
++ * @return the joined words, or "" if ids is empty
++ */
++ public static String getWords(int[] ids, final String separator) {
++ if (ids.length == 0) {
++ return "";
++ }
+ StringBuilder sb = new StringBuilder();
- for (int i = 0; i < ids.length - 1; i++)
- sb.append(word(ids[i])).append(" ");
++ for (int i = 0; i < ids.length - 1; i++) {
++ sb.append(word(ids[i])).append(separator);
++ }
+ return sb.append(word(ids[ids.length - 1])).toString();
+ }
+
+ /**
+ * Returns the words for the given ids joined by single spaces.
+ *
+ * @param ids token ids
+ * @return the space-separated words, or "" if ids yields no elements
+ */
+ public static String getWords(final Iterable<Integer> ids) {
+   StringBuilder sb = new StringBuilder();
+   for (int id : ids) {
+     sb.append(word(id)).append(" ");
+   }
+   if (sb.length() == 0) {
+     // Nothing was appended: deleteCharAt(-1) would throw
+     // StringIndexOutOfBoundsException, so return "" as getWords(int[]) does.
+     return "";
+   }
+   // Drop the trailing separator.
+   return sb.deleteCharAt(sb.length() - 1).toString();
+ }
+
+ /** @return the id reserved for the unknown word (UNKNOWN_ID, i.e. 0) */
+ public static int getUnknownId() {
+ return UNKNOWN_ID;
+ }
+
+ /** @return the token used for the unknown word (UNKNOWN_WORD, i.e. "&lt;unk&gt;") */
+ public static String getUnknownWord() {
+ return UNKNOWN_WORD;
+ }
+
+ /** @return the number of entries in the vocabulary, including the unknown word at id 0 */
+ public static int size() {
+ long lock_stamp = lock.readLock();
+ try {
+ return idToString.size();
+ } finally {
+ lock.unlockRead(lock_stamp);
+ }
+ }
+
+ /**
+ * Looks up the word for the given id and returns the result of
+ * FormatUtils.getNonterminalIndex for that word.
+ *
+ * @param id a token id
+ * @return the value computed by FormatUtils.getNonterminalIndex
+ */
+ public static synchronized int getTargetNonterminalIndex(int id) {
+ return FormatUtils.getNonterminalIndex(word(id));
+ }
+
+ /**
+ * Clears the vocabulary and initializes it with an unknown word. Registered
+ * language models are left unchanged.
+ */
+ public static void clear() {
+ long lock_stamp = lock.writeLock();
+ try {
+ // Replace both lookup structures with fresh, empty ones.
+ idToString = new ArrayList<String>();
+ stringToId = new HashMap<String, Integer>();
+
+ // The unknown word always occupies id 0.
+ idToString.add(UNKNOWN_ID, UNKNOWN_WORD);
+ stringToId.put(UNKNOWN_WORD, UNKNOWN_ID);
+ } finally {
+ lock.unlockWrite(lock_stamp);
+ }
+ }
+
+ /**
+ * Removes all registered language models. The vocabulary entries themselves are untouched.
+ */
+ public static void unregisterLanguageModels() {
+ // NOTE(review): unlike registerLanguageModel() and id(), this touches LMs without
+ // holding the write lock -- confirm it is never called concurrently with them.
+ LMs.clear();
+ }
+
+ /** Not implemented: the body is empty, so nothing is written to {@code out}. */
+ @Override
+ public void writeExternal(ObjectOutput out) throws IOException {
+ // TODO Auto-generated method stub
+
+ }
+
+ /** Not implemented: the body is empty, so nothing is read from {@code in}. */
+ @Override
+ public void readExternal(ObjectInput in)
+ throws IOException, ClassNotFoundException {
+ // TODO Auto-generated method stub
+
+ }
+
+ /**
+ * Two objects are considered equal when they have exactly the same runtime class.
+ *
+ * @param o the object to compare with; may be null
+ * @return true if o is non-null and of the same class as this object
+ */
+ @Override
+ public boolean equals(Object o) {
+   // equals(null) must return false; the original threw a NullPointerException.
+   return o != null && getClass() == o.getClass();
+ }
+
+ /**
+ * Hash code consistent with equals(Object): objects of the same class are equal and
+ * therefore must share a hash code.
+ */
+ @Override
+ public int hashCode() {
+   return getClass().hashCode();
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5c0d5388/src/main/java/org/apache/joshua/decoder/Decoder.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/decoder/Decoder.java
index 8535b11,0000000..6fa5eb8
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/decoder/Decoder.java
+++ b/src/main/java/org/apache/joshua/decoder/Decoder.java
@@@ -1,975 -1,0 +1,974 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder;
+
+import static org.apache.joshua.decoder.ff.FeatureVector.DENSE_FEATURE_NAMES;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.FileNotFoundException;
+import java.io.FileWriter;
+import java.lang.reflect.Constructor;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.BlockingQueue;
+
+import com.google.common.base.Strings;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.ff.FeatureVector;
+import org.apache.joshua.decoder.JoshuaConfiguration.INPUT_TYPE;
+import org.apache.joshua.decoder.JoshuaConfiguration.SERVER_TYPE;
+import org.apache.joshua.decoder.ff.FeatureFunction;
+import org.apache.joshua.decoder.ff.PhraseModel;
+import org.apache.joshua.decoder.ff.StatefulFF;
+import org.apache.joshua.decoder.ff.lm.LanguageModelFF;
+import org.apache.joshua.decoder.ff.tm.Grammar;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.ff.tm.Trie;
+import org.apache.joshua.decoder.ff.tm.format.HieroFormatReader;
+import org.apache.joshua.decoder.ff.tm.hash_based.MemoryBasedBatchGrammar;
+import org.apache.joshua.decoder.ff.tm.packed.PackedGrammar;
+import org.apache.joshua.decoder.io.JSONMessage;
+import org.apache.joshua.decoder.io.TranslationRequestStream;
+import org.apache.joshua.decoder.phrase.PhraseTable;
+import org.apache.joshua.decoder.segment_file.Sentence;
+import org.apache.joshua.util.FileUtility;
+import org.apache.joshua.util.FormatUtils;
+import org.apache.joshua.util.Regex;
+import org.apache.joshua.util.io.LineReader;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class handles decoder initialization and the complication introduced by multithreading.
+ *
+ * After initialization, the main entry point to the Decoder object is
+ * decodeAll(TranslationRequest), which returns a set of Translation objects wrapped in an iterable
+ * Translations object. It is important that we support multithreading both (a) across the sentences
+ * within a request and (b) across requests, in a round-robin fashion. This is done by maintaining a
+ * fixed sized concurrent thread pool. When a new request comes in, a RequestParallelizer thread is
+ * launched. This object iterates over the request's sentences, obtaining a thread from the
+ * thread pool, and using that thread to decode the sentence. If a decoding thread is not available,
+ * it will block until one is in a fair (FIFO) manner. RequestParallelizer thereby permits intra-request
+ * parallelization by separating out reading the input stream from processing the translated sentences,
+ * but also ensures that round-robin parallelization occurs, since RequestParallelizer uses the
+ * thread pool before translating each request.
+ *
+ * A decoding thread is handled by DecoderThread and launched from DecoderThreadRunner. The purpose
+ * of the runner is to record where to place the translated sentence when it is done (i.e., which
+ * Translations object). Translations itself is an iterator whose next() call blocks until the next
+ * translation is available.
+ *
+ * @author Matt Post post@cs.jhu.edu
+ * @author Zhifei Li, zhifei.work@gmail.com
+ * @author wren ng thornton wren@users.sourceforge.net
+ * @author Lane Schwartz dowobeha@users.sourceforge.net
+ */
+public class Decoder {
+
+ private static final Logger LOG = LoggerFactory.getLogger(Decoder.class);
+
+ private final JoshuaConfiguration joshuaConfiguration;
+
+ /** @return the configuration this decoder was constructed with */
+ public JoshuaConfiguration getJoshuaConfiguration() {
+ return joshuaConfiguration;
+ }
+
+ /*
+ * Many of these objects themselves are global objects. We pass them in when constructing other
+ * objects, so that they all share pointers to the same object. This is good because it reduces
+ * overhead, but it can be problematic because of unseen dependencies (for example, in the
+ * Vocabulary shared by language model, translation grammar, etc).
+ */
+ private List<Grammar> grammars;
+ private ArrayList<FeatureFunction> featureFunctions;
+ private PhraseTable customPhraseTable;
+
+ /* The feature weights. */
+ public static FeatureVector weights;
+
+ public static int VERBOSE = 1;
+
+ private BlockingQueue<DecoderThread> threadPool = null;
+
+ // ===============================================================
+ // Constructors
+ // ===============================================================
+
+ /**
+ * Constructor method that creates a new decoder from the given configuration and then
+ * initializes it by calling initialize(configFile).
+ *
+ * @param joshuaConfiguration a populated {@link org.apache.joshua.decoder.JoshuaConfiguration}
+ * @param configFile name of configuration file.
+ */
+ public Decoder(JoshuaConfiguration joshuaConfiguration, String configFile) {
+ this(joshuaConfiguration);
+ this.initialize(configFile);
+ }
+
+ /**
+ * Factory method that creates a new decoder from a newly constructed
+ * {@link org.apache.joshua.decoder.JoshuaConfiguration} and the specified configuration file.
+ *
+ * @param configFile Name of configuration file.
+ * @return a configured {@link org.apache.joshua.decoder.Decoder}
+ */
+ public static Decoder createDecoder(String configFile) {
+ JoshuaConfiguration joshuaConfiguration = new JoshuaConfiguration();
+ return new Decoder(joshuaConfiguration, configFile);
+ }
+
+ /**
+ * Constructs an uninitialized decoder.
+ * <p>
+ * This constructor is private. It is called by the public two-argument constructor before
+ * initialization, and by the {@link #getUninitalizedDecoder()} method to provide an
+ * uninitialized decoder for use in testing.
+ */
+ private Decoder(JoshuaConfiguration joshuaConfiguration) {
+ this.joshuaConfiguration = joshuaConfiguration;
+ this.grammars = new ArrayList<Grammar>();
+ // Fair (FIFO) bounded queue with room for num_parallel_decoders decoder threads.
+ this.threadPool = new ArrayBlockingQueue<DecoderThread>(
+ this.joshuaConfiguration.num_parallel_decoders, true);
+ this.customPhraseTable = null;
+ }
+
+ /**
+ * Gets an uninitialized decoder for use in testing.
+ * <p>
+ * This method is called by unit tests or any outside packages (e.g., MERT) relying on the
+ * decoder. Unlike the public constructor, it does not call initialize.
+ * @param joshuaConfiguration a {@link org.apache.joshua.decoder.JoshuaConfiguration} object
+ * @return an uninitialized decoder for use in testing
+ */
+ static public Decoder getUninitalizedDecoder(JoshuaConfiguration joshuaConfiguration) {
+ return new Decoder(joshuaConfiguration);
+ }
+
+ // ===============================================================
+ // Public Methods
+ // ===============================================================
+
+ /**
+ * This class is responsible for getting sentences from the TranslationRequest and procuring a
+ * DecoderThreadRunner to translate it. Each call to decodeAll(TranslationRequest) launches a
+ * thread that will read the request's sentences, obtain a DecoderThread to translate them, and
+ * then place the Translation in the appropriate place.
+ *
+ * @author Matt Post <po...@cs.jhu.edu>
+ *
+ */
+ private class RequestParallelizer extends Thread {
+ /* Source of sentences to translate. */
+ private final TranslationRequestStream request;
+
+ /* Where to put translated sentences. */
+ private final Translations response;
+
+ /* Sometimes we need to communicate with the client even when we didn't get a new sentence
+ * (e.g., metadata)
+ */
+ private OutputStream out;
+
+ /**
+ * @param request source of the sentences to translate
+ * @param response where translated sentences are put
+ * @param out stream used to answer metadata requests
+ */
+ RequestParallelizer(TranslationRequestStream request, Translations response, OutputStream out) {
+ this.request = request;
+ this.response = response;
+ this.out = out;
+ }
+
+ /**
+ * Reads the request sentence by sentence until it is exhausted, handing each sentence to a
+ * DecoderThreadRunner. Metadata found on the input is passed to handleMetadata instead.
+ */
+ @Override
+ public void run() {
+   /*
+    * Repeatedly get an input sentence, wait for a DecoderThread, and then start a new thread to
+    * translate the sentence. We start a new thread (via DecoderThreadRunner) as opposed to
+    * blocking, so that the RequestParallelizer can go on to the next sentence in this request,
+    * which allows parallelization across the sentences of the request.
+    */
+   for (;;) {
+     Sentence sentence = null;
+     try {
+       sentence = request.next();
+
+     } catch (MetaDataException meta) {
+       try {
+         handleMetadata(meta);
+       } catch (IOException e) {
+         // Report through the class logger instead of printing the stack trace to stderr.
+         LOG.error("Failed to handle metadata request", e);
+       }
+
+       // Metadata carries no sentence to translate; move on to the next input.
+       continue;
+     }
+
+     if (sentence == null) {
+       // End of input: tell the response that no more translations will arrive.
+       response.finish();
+       break;
+     }
+
+     // This will block until a DecoderThread becomes available.
+     DecoderThread thread = Decoder.this.getThread();
+     new DecoderThreadRunner(thread, sentence, response).start();
+   }
+ }
+
+ /**
+ * When metadata is found on the input, it needs to be processed. That is done here. Sometimes
+ * this involves returning data to the client.
+ *
+ * @param meta
+ * @throws IOException
+ */
+ private void handleMetadata(MetaDataException meta) throws IOException {
+ if (meta.type().equals("set_weight")) {
+ // Change a decoder weight
+ String[] tokens = meta.tokens();
+ if (tokens.length != 3) {
+ LOG.error("weight change requires three tokens");
+ } else {
+ float old_weight = Decoder.weights.getWeight(tokens[1]);
+ Decoder.weights.set(tokens[1], Float.parseFloat(tokens[2]));
+ LOG.error("@set_weight: {} {} -> {}", tokens[1], old_weight,
+ Decoder.weights.getWeight(tokens[1]));
+ }
+
+ // TODO: return a JSON object with this weight or all weights
+ out.write("".getBytes());
+
+ } else if (meta.type().equals("get_weight")) {
+ // TODO: add to JSON object, send back
+
+ String[] tokens = meta.tokens();
+
+ LOG.error("{} = {}", tokens[1], Decoder.weights.getWeight(tokens[1]));
+
+ out.write("".getBytes());
+
+ } else if (meta.type().equals("add_rule")) {
+ String tokens[] = meta.tokens(" \\|\\|\\| ");
+
+ if (tokens.length != 2) {
+ LOG.error("* INVALID RULE '{}'", meta);
+ out.write("bad rule".getBytes());
+ return;
+ }
+
+ Rule rule = new HieroFormatReader().parseLine(
+ String.format("[X] ||| [X,1] %s ||| [X,1] %s ||| custom=1", tokens[0], tokens[1]));
+ Decoder.this.customPhraseTable.addRule(rule);
+ rule.estimateRuleCost(featureFunctions);
+ LOG.info("Added custom rule {}", formatRule(rule));
+
+ String response = String.format("Added rule %s", formatRule(rule));
+ out.write(response.getBytes());
+
+ } else if (meta.type().equals("list_rules")) {
+
+ JSONMessage message = new JSONMessage();
+
+ // Walk the the grammar trie
+ ArrayList<Trie> nodes = new ArrayList<Trie>();
+ nodes.add(customPhraseTable.getTrieRoot());
+
+ while (nodes.size() > 0) {
+ Trie trie = nodes.remove(0);
+
+ if (trie == null)
+ continue;
+
+ if (trie.hasRules()) {
+ for (Rule rule: trie.getRuleCollection().getRules()) {
+ message.addRule(formatRule(rule));
+ }
+ }
+
+ if (trie.getExtensions() != null)
+ nodes.addAll(trie.getExtensions());
+ }
+
+ out.write(message.toString().getBytes());
+
+ } else if (meta.type().equals("remove_rule")) {
+ // Remove a rule from a custom grammar, if present
+ String[] tokens = meta.tokenString().split(" \\|\\|\\| ");
+ if (tokens.length != 2) {
+ out.write(String.format("Invalid delete request: '%s'", meta.tokenString()).getBytes());
+ return;
+ }
+
+ // Search for the rule in the trie
+ int nt_i = Vocabulary.id(joshuaConfiguration.default_non_terminal);
+ Trie trie = customPhraseTable.getTrieRoot().match(nt_i);
+
+ for (String word: tokens[0].split("\\s+")) {
+ int id = Vocabulary.id(word);
+ Trie nextTrie = trie.match(id);
+ if (nextTrie != null)
+ trie = nextTrie;
+ }
+
+ if (trie.hasRules()) {
+ Rule matched = null;
+ for (Rule rule: trie.getRuleCollection().getRules()) {
+ String target = rule.getEnglishWords();
+ target = target.substring(target.indexOf(' ') + 1);
+
+ if (tokens[1].equals(target)) {
+ matched = rule;
+ break;
+ }
+ }
+ trie.getRuleCollection().getRules().remove(matched);
+ out.write(String.format("Removed rule %s", formatRule(matched)).getBytes());
+ return;
+ }
+
+ out.write(String.format("No such rule %s", meta.tokenString()).getBytes());
+ }
+ }
+
+ /**
+ * Strips the nonterminals from the lefthand side of the rule.
+ *
+ * @param rule
+ * @return
+ */
+ private String formatRule(Rule rule) {
+ String ruleString = "";
+ boolean first = true;
+ for (int word: rule.getFrench()) {
+ if (!first)
+ ruleString += " " + Vocabulary.word(word);
+ first = false;
+ }
+
+ ruleString += " |||"; // space will get added with first English word
+ first = true;
+ for (int word: rule.getEnglish()) {
+ if (!first)
+ ruleString += " " + Vocabulary.word(word);
+ first = false;
+ }
+
+ // strip of the leading space
+ return ruleString.substring(1);
+ }
+ }
+
+ /**
+  * Retrieve a thread from the thread pool, blocking until one is available. The blocking occurs in
+  * a fair fashion (i.e., FIFO across requests).
+  *
+  * @return a thread that can be used for decoding, or {@code null} if the calling thread was
+  *         interrupted while waiting
+  */
+ public DecoderThread getThread() {
+   try {
+     return threadPool.take();
+   } catch (InterruptedException e) {
+     // Restore the interrupt flag so that code further up the stack can still observe it.
+     Thread.currentThread().interrupt();
+     LOG.warn("Interrupted while waiting for a free DecoderThread", e);
+     return null;
+   }
+ }
+
+ /**
+ * This class handles running a DecoderThread (which takes care of the actual translation of an
+ * input Sentence, returning a Translation object when its done). This is done in a thread so as
+ * not to tie up the RequestHandler that launched it, freeing it to go on to the next sentence in
+ * the TranslationRequest, in turn permitting parallelization across the sentences of a request.
+ *
+ * When the decoder thread is finshed, the Translation object is placed in the correct place in
+ * the corresponding Translations object that was returned to the caller of
+ * Decoder.decodeAll(TranslationRequest).
+ *
+ * @author Matt Post <po...@cs.jhu.edu>
+ */
+ private class DecoderThreadRunner extends Thread {
+
+ private final DecoderThread decoderThread;
+ private final Sentence sentence;
+ private final Translations translations;
+
+ DecoderThreadRunner(DecoderThread thread, Sentence sentence, Translations translations) {
+ this.decoderThread = thread;
+ this.sentence = sentence;
+ this.translations = translations;
+ }
+
+ @Override
+ public void run() {
+ /*
+ * Use the thread to translate the sentence. Then record the translation with the
+ * corresponding Translations object, and return the thread to the pool.
+ */
+ try {
+ Translation translation = decoderThread.translate(this.sentence);
+ translations.record(translation);
+
+ /*
+ * This is crucial! It's what makes the thread available for the next sentence to be
+ * translated.
+ */
+ threadPool.put(decoderThread);
+ } catch (Exception e) {
+ throw new RuntimeException(String.format(
+ "Input %d: FATAL UNCAUGHT EXCEPTION: %s", sentence.id(), e.getMessage()), e);
+ // translations.record(new Translation(sentence, null, featureFunctions, joshuaConfiguration));
+ }
+ }
+ }
+
+ /**
+  * This function is the main entry point into the decoder. It translates all the sentences in a
+  * (possibly boundless) set of input sentences. Each request launches its own thread to read the
+  * sentences of the request.
+  *
+  * If an n-best file is configured it is opened here and is closed again when this method
+  * returns, whether normally or with an exception.
+  *
+  * @param request the populated {@link org.apache.joshua.decoder.io.TranslationRequestStream}
+  * @param out an appropriate {@link java.io.OutputStream} to write results to
+  * @throws IOException if there is an error with the input stream or writing the output
+  */
+ public void decodeAll(TranslationRequestStream request, OutputStream out) throws IOException {
+   Translations translations = new Translations(request);
+
+   /* Start a thread to handle requests on the input stream */
+   new RequestParallelizer(request, translations, out).start();
+
+   // Create the n-best output stream
+   FileWriter nbest_out = null;
+   if (joshuaConfiguration.n_best_file != null)
+     nbest_out = new FileWriter(joshuaConfiguration.n_best_file);
+
+   try {
+     for (;;) {
+       Translation translation = translations.next();
+       if (translation == null)
+         break;
+
+       if (joshuaConfiguration.input_type == INPUT_TYPE.json || joshuaConfiguration.server_type == SERVER_TYPE.HTTP) {
+         JSONMessage message = JSONMessage.buildMessage(translation);
+         out.write(message.toString().getBytes());
+
+       } else {
+         /*
+          * We need to munge the feature value outputs in order to be compatible with Moses tuners.
+          * Whereas Joshua writes to STDOUT whatever is specified in the `output-format` parameter,
+          * Moses expects the simple translation on STDOUT and the n-best list in a file with a fixed
+          * format.
+          */
+         String text;
+         if (joshuaConfiguration.moses) {
+           text = translation.toString().replaceAll("=", "= ");
+           // Write the complete formatted string to the n-best file
+           if (nbest_out != null)
+             nbest_out.write(text);
+
+           // Extract just the translation and output that to STDOUT. The first line is used;
+           // if there is no line break the whole text is that line.
+           int eol = text.indexOf('\n');
+           if (eol >= 0)
+             text = text.substring(0, eol);
+           String[] fields = text.split(" \\|\\|\\| ");
+           text = fields[1] + "\n";
+
+         } else {
+           text = translation.toString();
+         }
+
+         out.write(text.getBytes());
+       }
+       out.flush();
+     }
+   } finally {
+     // Release the file handle even when decoding or writing failed part-way through.
+     if (nbest_out != null)
+       nbest_out.close();
+   }
+ }
+
+
+ /**
+  * We can also just decode a single sentence.
+  *
+  * @param sentence the input {@code Sentence} to translate
+  * @return the {@link org.apache.joshua.decoder.Translation} of the sentence, or {@code null}
+  *         if the calling thread was interrupted while waiting on the thread pool
+  */
+ public Translation decode(Sentence sentence) {
+   try {
+     // Get a thread.
+     DecoderThread thread = threadPool.take();
+     try {
+       return thread.translate(sentence);
+     } finally {
+       // Always hand the thread back, otherwise a failed translation shrinks the pool.
+       threadPool.put(thread);
+     }
+
+   } catch (InterruptedException e) {
+     // Keep the interrupt visible to the caller instead of silently dropping it.
+     Thread.currentThread().interrupt();
+     LOG.warn("Interrupted while decoding a single sentence", e);
+   }
+
+   return null;
+ }
+
+ /**
+ * Clean shutdown of Decoder, resetting all
+ * static variables, such that any other instance of Decoder
+ * afterwards gets a fresh start.
+ */
+ public void cleanUp() {
+ // shut down DecoderThreads
+ for (DecoderThread thread : threadPool) {
+ try {
+ thread.join();
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ }
+ }
+ resetGlobalState();
+ }
+
+ /**
+  * Resets the static state used by the decoder: the dense feature name list, the Vocabulary
+  * (contents and registered language models), the language model index of LanguageModelFF and
+  * the global state index of StatefulFF.
+  */
+ public static void resetGlobalState() {
+ // clear/reset static variables
+ DENSE_FEATURE_NAMES.clear();
+ Vocabulary.clear();
+ Vocabulary.unregisterLanguageModels();
+ LanguageModelFF.resetLmIndex();
+ StatefulFF.resetGlobalStateIndex();
+ }
+
+ public static void writeConfigFile(double[] newWeights, String template, String outputFile,
+ String newDiscriminativeModel) {
+ try {
+ int columnID = 0;
+
+ BufferedWriter writer = FileUtility.getWriteFileStream(outputFile);
+ LineReader reader = new LineReader(template);
+ try {
+ for (String line : reader) {
+ line = line.trim();
+ if (Regex.commentOrEmptyLine.matches(line) || line.indexOf("=") != -1) {
+ // comment, empty line, or parameter lines: just copy
+ writer.write(line);
+ writer.newLine();
+
+ } else { // models: replace the weight
+ String[] fds = Regex.spaces.split(line);
+ StringBuffer newSent = new StringBuffer();
+ if (!Regex.floatingNumber.matches(fds[fds.length - 1])) {
+ throw new IllegalArgumentException("last field is not a number; the field is: "
+ + fds[fds.length - 1]);
+ }
+
+ if (newDiscriminativeModel != null && "discriminative".equals(fds[0])) {
+ newSent.append(fds[0]).append(' ');
+ newSent.append(newDiscriminativeModel).append(' ');// change the
+ // file name
+ for (int i = 2; i < fds.length - 1; i++) {
+ newSent.append(fds[i]).append(' ');
+ }
+ } else {// regular
+ for (int i = 0; i < fds.length - 1; i++) {
+ newSent.append(fds[i]).append(' ');
+ }
+ }
+ if (newWeights != null)
+ newSent.append(newWeights[columnID++]);// change the weight
+ else
+ newSent.append(fds[fds.length - 1]);// do not change
+
+ writer.write(newSent.toString());
+ writer.newLine();
+ }
+ }
+ } finally {
+ reader.close();
+ writer.close();
+ }
+
+ if (newWeights != null && columnID != newWeights.length) {
+ throw new IllegalArgumentException("number of models does not match number of weights");
+ }
+
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+
+ // ===============================================================
+ // Initialization Methods
+ // ===============================================================
+
+ /**
+  * Converts a feature name to the form used for Moses. When Moses compatibility is switched on,
+  * names starting with "tm_" or "lm_" get every underscore replaced with a hyphen; every other
+  * name, and every name when Moses compatibility is off, is returned as it is.
+  *
+  * @param feature the Joshua feature name
+  * @return the feature in Moses format
+  */
+ private String mosesize(String feature) {
+   boolean needsHyphens = joshuaConfiguration.moses
+       && (feature.startsWith("tm_") || feature.startsWith("lm_"));
+
+   return needsHyphens ? feature.replace("_", "-") : feature;
+ }
+
+ /**
+  * Initialize all parts of the JoshuaDecoder: reads the weights, loads the grammars, creates the
+  * feature functions, sorts the grammars unless amortized sorting is enabled, and fills the
+  * pool of decoder threads.
+  *
+  * An IOException or InterruptedException raised on the way is logged as a warning and not
+  * propagated; in the interrupted case the interrupt status of the thread is restored.
+  *
+  * @param configFile File containing configuration options (not read by this method itself,
+  *        which works from the {@code joshuaConfiguration} field)
+  * @return An initialized decoder
+  * @throws IllegalArgumentException if the command-line weight overrides are not a list of
+  *         "name value" pairs
+  */
+ public Decoder initialize(String configFile) {
+   try {
+
+     long pre_load_time = System.currentTimeMillis();
+
+     /*
+      * Weights can be listed in a separate file (denoted by parameter "weights-file") or directly
+      * in the Joshua config file. Config file values take precedence.
+      */
+     this.readWeights(joshuaConfiguration.weights_file);
+
+     /* Add command-line-passed weights to the weights array for processing below */
+     if (!Strings.isNullOrEmpty(joshuaConfiguration.weight_overwrite)) {
+       // Trim first so that leading whitespace does not produce an empty first token.
+       String overwrite = joshuaConfiguration.weight_overwrite.trim();
+       String[] tokens = overwrite.isEmpty() ? new String[0] : overwrite.split("\\s+");
+
+       // The list is consumed in pairs; a dangling name would otherwise fail with an
+       // ArrayIndexOutOfBoundsException half-way through.
+       if (tokens.length % 2 != 0) {
+         throw new IllegalArgumentException(String.format(
+             "Weight overrides must be a list of 'name value' pairs, but got '%s'",
+             joshuaConfiguration.weight_overwrite));
+       }
+
+       for (int i = 0; i < tokens.length; i += 2) {
+         String feature = tokens[i];
+         float value = Float.parseFloat(tokens[i+1]);
+
+         if (joshuaConfiguration.moses)
+           feature = demoses(feature);
+
+         joshuaConfiguration.weights.add(String.format("%s %s", feature, tokens[i+1]));
+         LOG.info("COMMAND LINE WEIGHT: {} -> {}", feature, value);
+       }
+     }
+
+     /* Read the weights found in the config file */
+     for (String pairStr: joshuaConfiguration.weights) {
+       String pair[] = pairStr.split("\\s+");
+
+       /* Sanity check for old-style unsupported feature invocations. */
+       if (pair.length != 2) {
+         StringBuilder errMsg = new StringBuilder();
+         errMsg.append("FATAL: Invalid feature weight line found in config file.\n");
+         errMsg.append(String.format("The line was '%s'\n", pairStr));
+         errMsg.append("You might be using an old version of the config file that is no longer supported\n");
+         errMsg.append("Check joshua-decoder.org or email joshua_support@googlegroups.com for help\n");
+         errMsg.append("Code = " + 17);
+         throw new RuntimeException(errMsg.toString());
+       }
+
+       weights.set(pair[0], Float.parseFloat(pair[1]));
+     }
+
+     LOG.info("Read {} weights ({} of them dense)", weights.size(), DENSE_FEATURE_NAMES.size());
+
+     // Do this before loading the grammars and the LM.
+     this.featureFunctions = new ArrayList<FeatureFunction>();
+
+     // Initialize and load grammars. This must happen first, since the vocab gets defined by
+     // the packed grammar (if any)
+     this.initializeTranslationGrammars();
+     LOG.info("Grammar loading took: {} seconds.",
+         (System.currentTimeMillis() - pre_load_time) / 1000);
+
+     // Initialize the features: requires that LM model has been initialized.
+     this.initializeFeatureFunctions();
+
+     // This is mostly for compatibility with the Moses tuning script
+     if (joshuaConfiguration.show_weights_and_quit) {
+       for (int i = 0; i < DENSE_FEATURE_NAMES.size(); i++) {
+         String name = DENSE_FEATURE_NAMES.get(i);
+         if (joshuaConfiguration.moses)
+           System.out.println(String.format("%s= %.5f", mosesize(name), weights.getDense(i)));
+         else
+           System.out.println(String.format("%s %.5f", name, weights.getDense(i)));
+       }
+       System.exit(0);
+     }
+
+     // Sort the TM grammars (needed to do cube pruning)
+     if (joshuaConfiguration.amortized_sorting) {
+       LOG.info("Grammar sorting happening lazily on-demand.");
+     } else {
+       long pre_sort_time = System.currentTimeMillis();
+       for (Grammar grammar : this.grammars) {
+         grammar.sortGrammar(this.featureFunctions);
+       }
+       LOG.info("Grammar sorting took {} seconds.",
+           (System.currentTimeMillis() - pre_sort_time) / 1000);
+     }
+
+     // Create the threads
+     for (int i = 0; i < joshuaConfiguration.num_parallel_decoders; i++) {
+       this.threadPool.put(new DecoderThread(this.grammars, Decoder.weights,
+           this.featureFunctions, joshuaConfiguration));
+     }
+   } catch (IOException | InterruptedException e) {
+     if (e instanceof InterruptedException) {
+       // Do not lose the interrupt: restore the flag for whoever called us.
+       Thread.currentThread().interrupt();
+     }
+     LOG.warn(e.getMessage(), e);
+   }
+
+   return this;
+ }
+
+ /**
+  * Initializes the translation grammars listed in the configuration, or a dummy glue grammar
+  * when none are listed. Afterwards the custom phrase table is added, then (for lattice
+  * decoding) an epsilon-deleting grammar, and finally one PhraseModel feature function is
+  * created for each distinct grammar owner.
+  *
+  * @throws IOException if a grammar cannot be read
+  * @throws IllegalArgumentException if a tm line has no arguments after its type
+  */
+ private void initializeTranslationGrammars() throws IOException {
+
+   if (joshuaConfiguration.tms.size() > 0) {
+
+     // collect packedGrammars to check if they use a shared vocabulary
+     final List<PackedGrammar> packed_grammars = new ArrayList<>();
+
+     // tm = {thrax/hiero,packed,samt,moses} OWNER LIMIT FILE
+     for (String tmLine : joshuaConfiguration.tms) {
+
+       int firstSpace = tmLine.indexOf(' ');
+       if (firstSpace < 0) {
+         throw new IllegalArgumentException(String.format(
+             "Invalid tm line '%s': expected a grammar type followed by its arguments", tmLine));
+       }
+
+       String type = tmLine.substring(0, firstSpace);
+       String[] args = tmLine.substring(firstSpace).trim().split("\\s+");
+       HashMap<String, String> parsedArgs = FeatureFunction.parseArgs(args);
+
+       String owner = parsedArgs.get("owner");
+       int span_limit = Integer.parseInt(parsedArgs.get("maxspan"));
+       String path = parsedArgs.get("path");
+
+       Grammar grammar = null;
+       if (! type.equals("moses") && ! type.equals("phrase")) {
+         if (new File(path).isDirectory()) {
+           try {
+             PackedGrammar packed_grammar = new PackedGrammar(path, span_limit, owner, type, joshuaConfiguration);
+             packed_grammars.add(packed_grammar);
+             grammar = packed_grammar;
+           } catch (FileNotFoundException e) {
+             // Carry the explanation along with the cause instead of discarding it.
+             String msg = String.format("Couldn't load packed grammar from '%s'. ", path)
+                 + "Perhaps it doesn't exist, or it may be an old packed file format.";
+             throw new RuntimeException(msg, e);
+           }
+         } else {
+           // thrax, hiero, samt
+           grammar = new MemoryBasedBatchGrammar(type, path, owner,
+               joshuaConfiguration.default_non_terminal, span_limit, joshuaConfiguration);
+         }
+
+       } else {
+         // A phrase table switches the decoder to the stack search algorithm.
+         joshuaConfiguration.search_algorithm = "stack";
+         grammar = new PhraseTable(path, owner, type, joshuaConfiguration);
+       }
+
+       this.grammars.add(grammar);
+     }
+
+     checkSharedVocabularyChecksumsForPackedGrammars(packed_grammars);
+
+   } else {
+     LOG.warn("no grammars supplied! Supplying dummy glue grammar.");
+     MemoryBasedBatchGrammar glueGrammar = new MemoryBasedBatchGrammar("glue", joshuaConfiguration);
+     glueGrammar.setSpanLimit(-1);
+     glueGrammar.addGlueRules(featureFunctions);
+     this.grammars.add(glueGrammar);
+   }
+
+   /* Add the grammar for custom entries */
+   this.customPhraseTable = new PhraseTable(null, "custom", "phrase", joshuaConfiguration);
+   this.grammars.add(this.customPhraseTable);
+
+   /* Create an epsilon-deleting grammar */
+   if (joshuaConfiguration.lattice_decoding) {
+     LOG.info("Creating an epsilon-deleting grammar");
+     MemoryBasedBatchGrammar latticeGrammar = new MemoryBasedBatchGrammar("lattice", joshuaConfiguration);
+     latticeGrammar.setSpanLimit(-1);
+     HieroFormatReader reader = new HieroFormatReader();
+
+     String goalNT = FormatUtils.cleanNonTerminal(joshuaConfiguration.goal_symbol);
+     String defaultNT = FormatUtils.cleanNonTerminal(joshuaConfiguration.default_non_terminal);
+
+     // The pattern has three placeholders, so exactly three arguments are passed; the
+     // resulting string is the same as with the surplus arguments that used to be here.
+     String ruleString = String.format("[%s] ||| [%s,1] <eps> ||| [%s,1] ||| ", goalNT, goalNT,
+         defaultNT);
+
+     Rule rule = reader.parseLine(ruleString);
+     latticeGrammar.addRule(rule);
+     rule.estimateRuleCost(featureFunctions);
+
+     this.grammars.add(latticeGrammar);
+   }
+
+   /* Now create a feature function for each owner */
+   HashSet<String> ownersSeen = new HashSet<String>();
+
+   for (Grammar grammar: this.grammars) {
+     String owner = Vocabulary.word(grammar.getOwner());
+     if (! ownersSeen.contains(owner)) {
+       this.featureFunctions.add(new PhraseModel(weights, new String[] { "tm", "-owner", owner },
+           joshuaConfiguration, grammar));
+       ownersSeen.add(owner);
+     }
+   }
+
+   LOG.info("Memory used {} MB",
+       ((Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory()) / 1000000.0));
+ }
+
+ /**
+  * Verifies that all packed grammars were built with the same vocabulary, by comparing the
+  * vocabulary checksum of each grammar with the checksum of the grammar before it.
+  *
+  * @param packed_grammars the packed grammars to compare
+  * @throws RuntimeException if two grammars report different vocabulary checksums
+  */
+ private static void checkSharedVocabularyChecksumsForPackedGrammars(final List<PackedGrammar> packed_grammars) {
+   String reference = "";
+   for (PackedGrammar packed : packed_grammars) {
+     final String current = packed.computeVocabularyChecksum();
+
+     // An empty reference means that nothing has been recorded yet to compare against.
+     if (!reference.isEmpty() && !current.equals(reference)) {
+       throw new RuntimeException(
+           "Trying to load multiple packed grammars with different vocabularies!" +
+           "Have you packed them jointly?");
+     }
+
+     reference = current;
+   }
+ }
+
+ /*
+  * This function reads the weights for the model. Feature names and their weights are listed one
+  * per line in the following format:
+  *
+  * FEATURE_NAME WEIGHT
+  *
+  * Empty lines, lines starting with "#" or "//", and lines with a single token are skipped.
+  * Decoder.weights is replaced with a fresh FeatureVector even when no file name is given.
+  * An IOException while reading is rethrown wrapped in a RuntimeException.
+  */
+ private void readWeights(String fileName) {
+   Decoder.weights = new FeatureVector();
+
+   if (fileName == null || fileName.equals(""))
+     return;
+
+   try {
+     LineReader lineReader = new LineReader(fileName);
+
+     try {
+       for (String line : lineReader) {
+         // Collapse runs of whitespace and drop it at both ends, so that indented lines and
+         // whitespace-only lines are handled like their unindented / empty counterparts.
+         line = line.replaceAll("\\s+", " ").trim();
+
+         if (line.equals("") || line.startsWith("#") || line.startsWith("//")
+             || line.indexOf(' ') == -1)
+           continue;
+
+         String tokens[] = line.split("\\s+");
+         String feature = tokens[0];
+         float value = Float.parseFloat(tokens[1]);
+
+         // Kludge for compatibility with Moses tuners
+         if (joshuaConfiguration.moses) {
+           feature = demoses(feature);
+         }
+
+         weights.increment(feature, value);
+       }
+     } finally {
+       // Release the underlying file handle whether or not parsing succeeded.
+       lineReader.close();
+     }
+   } catch (IOException ioe) {
+     throw new RuntimeException(ioe);
+   }
+   LOG.info("Read {} weights from file '{}'", weights.size(), fileName);
+ }
+
+ /**
+  * Converts a Moses-style feature name to the Joshua form: a name ending in "=" loses its "="
+  * characters, "OOV_Penalty" becomes "OOVPenalty", and names starting with "tm-" or "lm-" get
+  * their hyphens replaced with underscores.
+  *
+  * @param feature the feature name as written for Moses
+  * @return the corresponding Joshua feature name
+  */
+ private String demoses(String feature) {
+   String name = feature.endsWith("=") ? feature.replace("=", "") : feature;
+
+   if (name.equals("OOV_Penalty")) {
+     return "OOVPenalty";
+   }
+   if (name.startsWith("tm-") || name.startsWith("lm-")) {
+     return name.replace("-", "_");
+   }
+   return name;
+ }
+
+ /**
+ * Feature functions are instantiated with a line of the form
+ *
+ * <pre>
- * feature_function = FEATURE OPTIONS
++ * FEATURE OPTIONS
+ * </pre>
+ *
+ * Weights for features are listed separately.
+ *
+ * @throws IOException
+ *
+ */
+ private void initializeFeatureFunctions() throws IOException {
+
+ for (String featureLine : joshuaConfiguration.features) {
- // feature-function = NAME args
++ // line starts with NAME, followed by args
+ // 1. create new class named NAME, pass it config, weights, and the args
+
- // Get rid of the leading crap.
- featureLine = featureLine.replaceFirst("^feature_function\\s*=\\s*", "");
-
+ String fields[] = featureLine.split("\\s+");
+ String featureName = fields[0];
++
+ try {
++
+ // getClass() returns null when no class is found under any of its search packages; the
+ // NullPointerException that follows is caught below and rethrown as a RuntimeException.
+ Class<?> clas = getClass(featureName);
+ // Feature functions are created reflectively through their
+ // (FeatureVector, String[], JoshuaConfiguration) constructor.
+ Constructor<?> constructor = clas.getConstructor(FeatureVector.class,
+ String[].class, JoshuaConfiguration.class);
- this.featureFunctions.add((FeatureFunction) constructor.newInstance(weights, fields, joshuaConfiguration));
++ FeatureFunction feature = (FeatureFunction) constructor.newInstance(weights, fields, joshuaConfiguration);
++ this.featureFunctions.add(feature);
++
+ } catch (Exception e) {
- e.printStackTrace();
- throw new RuntimeException("* FATAL: could not find a feature '" + featureName + "'");
++ throw new RuntimeException(String.format("Unable to instantiate feature function '%s'!", featureLine), e);
+ }
+ }
+
+ // Log every feature function in the list, not only the ones created in the loop above.
+ for (FeatureFunction feature : featureFunctions) {
+ LOG.info("FEATURE: {}", feature.logString());
-
+ }
+
+ weights.registerDenseFeatures(featureFunctions);
+ }
+
+ /**
+  * Looks up a feature function class by name. Each predefined package is tried in order, first
+  * with the name as given and then with the suffix "FF" appended; the first class that loads
+  * is returned.
+  *
+  * @param featureName the simple class name of the feature function, with or without "FF"
+  * @return the class found in one of the search paths, or {@code null} if none was found
+  */
+ private Class<?> getClass(String featureName) {
+   String[] packages = { "org.apache.joshua.decoder.ff", "org.apache.joshua.decoder.ff.lm", "org.apache.joshua.decoder.ff.phrase" };
+   String[] suffixes = { "", "FF" };
+
+   for (String path : packages) {
+     for (String suffix : suffixes) {
+       try {
+         return Class.forName(path + "." + featureName + suffix);
+       } catch (ClassNotFoundException ignored) {
+         // not under this name: fall through to the next candidate
+       }
+     }
+   }
+   return null;
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5c0d5388/src/main/java/org/apache/joshua/decoder/JoshuaConfiguration.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/decoder/JoshuaConfiguration.java
index 5acfd7e,0000000..dd7bafb
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/decoder/JoshuaConfiguration.java
+++ b/src/main/java/org/apache/joshua/decoder/JoshuaConfiguration.java
@@@ -1,712 -1,0 +1,712 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder;
+
+import static org.apache.joshua.util.FormatUtils.cleanNonTerminal;
+import static org.apache.joshua.util.FormatUtils.ensureNonTerminalBrackets;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.util.ArrayList;
+import java.util.Collections;
+
+import org.apache.joshua.decoder.ff.StatefulFF;
+import org.apache.joshua.decoder.ff.fragmentlm.Tree;
+import org.apache.joshua.util.FormatUtils;
+import org.apache.joshua.util.Regex;
+import org.apache.joshua.util.io.LineReader;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Configuration file for Joshua decoder.
+ *
+ * When adding new features to Joshua, any new configurable parameters should be added to this
+ * class.
+ *
+ * @author Zhifei Li, zhifei.work@gmail.com
+ * @author Matt Post post@cs.jhu.edu
+ */
+public class JoshuaConfiguration {
+
+ private static final Logger LOG = LoggerFactory.getLogger(JoshuaConfiguration.class);
+
+ // whether to construct a StructuredTranslation object for each request instead of
+ // printing to stdout. Used when the Decoder is used from Java directly.
+ public Boolean use_structured_output = false;
+
+ // If set to true, Joshua will lowercase the input, creating an annotation that marks the
+ // original case
+ public boolean lowercase = false;
+
+ // If set to true, Joshua will recapitalize the output by projecting the case from aligned
+ // source-side words
+ public boolean project_case = false;
+
+ // List of grammar files to read
+ public ArrayList<String> tms = new ArrayList<String>();
+
+ // A rule cache for commonly used tries to avoid excess object allocations
+ // Testing shows there's up to ~95% hit rate when cache size is 5000 Trie nodes.
+ public Integer cachedRuleSize = new Integer(5000);
+
+ /*
+ * The file to read the weights from (part of the sparse features implementation). Weights can
+ * also just be listed in the main config file.
+ */
+ public String weights_file = "";
+ // Default symbols. The symbol here should be enclosed in square brackets.
+ public String default_non_terminal = FormatUtils.ensureNonTerminalBrackets("X");
+ public String goal_symbol = FormatUtils.ensureNonTerminalBrackets("GOAL");
+
+ /*
+ * A list of OOV symbols in the form
+ *
+ * [X1] weight [X2] weight [X3] weight ...
+ *
+ * where the [X] symbols are nonterminals and the weights are weights. For each OOV word w in the
+ * input sentence, Joshua will create rules of the form
+ *
+ * X1 -> w (weight)
+ *
+ * If this is empty, an unweighted default_non_terminal is used.
+ */
+ public class OOVItem implements Comparable<OOVItem> {
+   /* The label of this entry. */
+   public String label;
+
+   /* The weight attached to the label. */
+   public float weight;
+
+   OOVItem(String l, float w) {
+     label = l;
+     weight = w;
+   }
+
+   /**
+    * Orders items by descending weight, so that sorting puts the heaviest item first.
+    * Float.compare gives a total order, including for NaN weights, which plain comparisons
+    * with the relational operators would all report as equal.
+    *
+    * @param other the item to compare with
+    * @return a negative value if this item has the larger weight, a positive value if it has
+    *         the smaller weight, and zero if the weights are equal
+    */
+   @Override
+   public int compareTo(OOVItem other) {
+     return Float.compare(other.weight, weight);
+   }
+ }
+
+ public ArrayList<OOVItem> oovList = null;
+
+ /*
+ * Whether to segment OOVs into a lattice
+ */
+ public boolean segment_oovs = false;
+
+ /*
+ * Enable lattice decoding.
+ */
+ public boolean lattice_decoding = false;
+
+ /*
+ * If false, sorting of the complete grammar is done at load time. If true, grammar tries are not
+ * sorted till they are first accessed. Amortized sorting means you get your first translation
+ * much, much quicker (good for debugging), but that per-sentence decoding is a bit slower.
+ */
+ public boolean amortized_sorting = true;
+ // syntax-constrained decoding
+ public boolean constrain_parse = false;
+
+ public boolean use_pos_labels = false;
+
+ // oov-specific
+ public boolean true_oovs_only = false;
+
+ /* Dynamic sentence-level filtering. */
+ public boolean filter_grammar = false;
+
+ /* The cube pruning pop limit. Set to 0 for exhaustive pruning. */
+ public int pop_limit = 100;
+
+ /* Maximum sentence length. Sentences longer than this are truncated. */
+ public int maxlen = 200;
+
+ /*
+ * N-best configuration.
+ */
+ // Make sure output strings in the n-best list are unique.
+ public boolean use_unique_nbest = true;
+
+ /* Include the phrasal alignments in the output (not word-level alignments at the moment). */
+ public boolean include_align_index = false;
+
+ /* The number of hypotheses to output by default. */
+ public int topN = 1;
+
+ /**
+ * This string describes the format of each line of output from the decoder (i.e., the
+ * translations). The string can include arbitrary text and also variables. The following
+ * variables are available:
+ *
+ * <pre>
+ * - %i the 0-indexed sentence number
+ * - %e the source string
+ * - %s the translated sentence
+ * - %S the translated sentence with some basic capitalization and denormalization
+ * - %t the synchronous derivation
+ * - %f the list of feature values (as name=value pairs)
+ * - %c the model cost
+ * - %w the weight vector
+ * - %a the alignments between source and target words (currently unimplemented)
+ * - %d a verbose, many-line version of the derivation
+ * </pre>
+ */
+ public String outputFormat = "%i ||| %s ||| %f ||| %c";
+
+ /* The number of decoding threads to use (-threads). */
+ public int num_parallel_decoders = 1;
+
+ // disk hg
+ public String hypergraphFilePattern = "";
+
+ /*
+ * When true, _OOV is appended to all words that are passed through (useful for something like
+ * transliteration on the target side).
+ */
+ public boolean mark_oovs = false;
+
+ /* Enables synchronous parsing. */
+ public boolean parse = false; // perform synchronous parsing
+
+
+ /* A list of the feature functions. */
+ public ArrayList<String> features = new ArrayList<String>();
+
+ /* A list of weights found in the main config file (instead of in a separate weights file) */
+ public ArrayList<String> weights = new ArrayList<String>();
+
+ /* Determines whether to expect JSON input or plain lines */
+ public enum INPUT_TYPE { plain, json };
+ public INPUT_TYPE input_type = INPUT_TYPE.plain;
+
+ /* Type of server. Not sure we need to keep the regular TCP one around. */
+ public enum SERVER_TYPE { none, TCP, HTTP };
+ public SERVER_TYPE server_type = SERVER_TYPE.TCP;
+
+ /* If set, Joshua will start a (multi-threaded, per "threads") TCP/IP server on this port. */
+ public int server_port = 0;
+
+ /*
+ * Whether to do forest rescoring. If set to true, the references are expected on STDIN along with
+ * the input sentences in the following format:
+ *
+ * input sentence ||| ||| reference1 ||| reference2 ...
+ *
+ * (The second field is reserved for the output sentence for alignment and forced decoding).
+ */
+
+ public boolean rescoreForest = false;
+ public float rescoreForestWeight = 10.0f;
+
+ /*
+ * Location of fragment mapping file, which maps flattened SCFG rules to their internal
+ * representation.
+ */
+ public String fragmentMapFile = null;
+
+ /*
+ * Whether to use soft syntactic constraint decoding /fuzzy matching, which allows that any
+ * nonterminal may be substituted for any other nonterminal (except for OOV and GOAL)
+ */
+ public boolean fuzzy_matching = false;
+
+ public static final String SOFT_SYNTACTIC_CONSTRAINT_DECODING_PROPERTY_NAME = "fuzzy_matching";
+
+ /***
+ * Phrase-based decoding parameters.
+ */
+
+ /* The search algorithm: currently either "cky" or "stack" */
+ public String search_algorithm = "cky";
+
+ /* The distortion limit */
+ public int reordering_limit = 8;
+
+ /* The number of target sides considered for each source side (after sorting by model weight) */
+ public int num_translation_options = 20;
+
+ /* If true, decode using a dot chart (standard CKY+); if false, use the much more efficient
+ * version of Sennrich (SSST 2014)
+ */
+ public boolean use_dot_chart = true;
+
+ /* Moses compatibility */
+ public boolean moses = false;
+
+ /* If true, just print out the weights found in the config file, and exit. */
+ public boolean show_weights_and_quit = false;
+
+ /* Read input from a file (Moses compatible flag) */
+ public String input_file = null;
+
+ /* Write n-best output to this file */
+ public String n_best_file = null;
+
+ /* Whether to look at source side for special annotations */
+ public boolean source_annotations = false;
+
+ /* Weights overridden from the command line */
+ public String weight_overwrite = "";
+
+ /**
+ * This method resets the state of JoshuaConfiguration back to the state after initialization.
+ * This is useful when for example making different calls to the decoder within the same java
+ * program, which otherwise leads to potential errors due to inconsistent state as a result of
+ * loading the configuration multiple times without resetting etc.
+ *
+ * This leads to the insight that in fact it may be an even better idea to refactor the code and
+ * make JoshuaConfiguration an object that is created and passed as an argument, rather than a
+ * shared static object. This is just a suggestion for the next step.
+ *
+ */
+ public void reset() {
+ LOG.info("Resetting the JoshuaConfiguration to its defaults ...");
+ LOG.info("\n\tResetting the StatefullFF global state index ...");
+ LOG.info("\n\t...done");
+ StatefulFF.resetGlobalStateIndex();
+ tms = new ArrayList<String>();
+ weights_file = "";
+ default_non_terminal = "[X]";
+ oovList = new ArrayList<OOVItem>();
+ oovList.add(new OOVItem(default_non_terminal, 1.0f));
+ goal_symbol = "[GOAL]";
+ amortized_sorting = true;
+ constrain_parse = false;
+ use_pos_labels = false;
+ true_oovs_only = false;
+ filter_grammar = false;
+ pop_limit = 100;
+ maxlen = 200;
+ use_unique_nbest = false;
+ include_align_index = false;
+ topN = 1;
+ outputFormat = "%i ||| %s ||| %f ||| %c";
+ num_parallel_decoders = 1;
+ hypergraphFilePattern = "";
+ mark_oovs = false;
+ // oracleFile = null;
+ parse = false; // perform synchronous parsing
+ features = new ArrayList<String>();
+ weights = new ArrayList<String>();
+ server_port = 0;
+
+ reordering_limit = 8;
+ num_translation_options = 20;
+ LOG.info("...done");
+ }
+
+ // ===============================================================
+ // Methods
+ // ===============================================================
+
+ /**
+ * To process command-line options, we write them to a file that looks like the config file, and
+ * then call readConfigFile() on it. It would be more general to define a class that sits on a
+ * stream and knows how to chop it up, but this was quicker to implement.
+ *
+ * @param options string array of command line options
+ */
+ public void processCommandLineOptions(String[] options) {
+ try {
+ File tmpFile = File.createTempFile("options", null, null);
+ PrintWriter out = new PrintWriter(new FileWriter(tmpFile));
+
+ for (int i = 0; i < options.length; i++) {
+ String key = options[i].substring(1);
+ if (i + 1 == options.length || options[i + 1].startsWith("-")) {
+ // if this is the last item, or if the next item
+ // is another flag, then this is a boolean flag
+ out.println(key + " = true");
+
+ } else {
+ out.print(key + " =");
+ while (i + 1 < options.length && ! options[i + 1].startsWith("-")) {
+ out.print(String.format(" %s", options[i + 1]));
+ i++;
+ }
+ out.println();
+ }
+ }
+ out.close();
+ this.readConfigFile(tmpFile.getCanonicalPath());
+
+ tmpFile.delete();
+
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ public void readConfigFile(String configFile) throws IOException {
+
+ LineReader configReader = new LineReader(configFile, false);
+ try {
+ for (String line : configReader) {
+ line = line.trim(); // .toLowerCase();
+
+ if (Regex.commentOrEmptyLine.matches(line))
+ continue;
+
+ /*
+ * There are two kinds of substantive (non-comment, non-blank) lines: parameters and feature
+ * values. Parameters match the pattern "key = value"; all other substantive lines are
+ * interpreted as features.
+ */
+
+ if (line.indexOf("=") != -1) { // parameters; (not feature function)
+ String[] fds = Regex.equalsWithSpaces.split(line, 2);
+ if (fds.length < 2) {
+ LOG.warn("skipping config file line '{}'", line);
+ continue;
+ }
+
+ String parameter = normalize_key(fds[0]);
+
+ if (parameter.equals(normalize_key("lm"))) {
+ /* This is deprecated. This supports old LM lines of the form
+ *
+ * lm = berkeleylm 5 false false 100 lm.gz
+ *
+ * LMs are now loaded as general feature functions, so we transform that to either
+ *
- * feature-function = LanguageModel -lm_order 5 -lm_type berkeleylm -lm_file lm.gz
++ * LanguageModel -lm_order 5 -lm_type berkeleylm -lm_file lm.gz
+ *
+ * If the line were state minimizing:
+ *
+ * lm = kenlm 5 true false 100 lm.gz
+ *
- * feature-function = StateMinimizingLanguageModel -lm_order 5 -lm_file lm.gz
++ * StateMinimizingLanguageModel -lm_order 5 -lm_file lm.gz
+ */
+
+ String[] tokens = fds[1].split("\\s+");
+ if (tokens[2].equals("true"))
- features.add(String.format("feature_function = StateMinimizingLanguageModel -lm_type kenlm -lm_order %s -lm_file %s",
++ features.add(String.format("StateMinimizingLanguageModel -lm_type kenlm -lm_order %s -lm_file %s",
+ tokens[1], tokens[5]));
+ else
- features.add(String.format("feature_function = LanguageModel -lm_type %s -lm_order %s -lm_file %s",
++ features.add(String.format("LanguageModel -lm_type %s -lm_order %s -lm_file %s",
+ tokens[0], tokens[1], tokens[5]));
+
+ } else if (parameter.equals(normalize_key("tm"))) {
+ /* If found, convert old format:
+ * tm = TYPE OWNER MAXSPAN PATH
+ * to new format
+ * tm = TYPE -owner OWNER -maxspan MAXSPAN -path PATH
+ */
+ String tmLine = fds[1];
+
+ String[] tokens = fds[1].split("\\s+");
+ if (! tokens[1].startsWith("-")) { // old format
+ tmLine = String.format("%s -owner %s -maxspan %s -path %s", tokens[0], tokens[1], tokens[2], tokens[3]);
+ LOG.warn("Converting deprecated TM line from '{}' -> '{}'", fds[1], tmLine);
+ }
+ tms.add(tmLine);
+
+ } else if (parameter.equals("v")) {
+ Decoder.VERBOSE = Integer.parseInt(fds[1]);
+
+ } else if (parameter.equals(normalize_key("parse"))) {
+ parse = Boolean.parseBoolean(fds[1]);
+ LOG.debug("parse: {}", parse);
+
+ } else if (parameter.equals(normalize_key("dump-hypergraph"))) {
+ hypergraphFilePattern = fds[1].trim();
+ LOG.debug(" hypergraph dump file format: {}", hypergraphFilePattern);
+
+ } else if (parameter.equals(normalize_key("oov-list"))) {
+ if (new File(fds[1]).exists()) {
+ oovList = new ArrayList<OOVItem>();
+ try {
+ File file = new File(fds[1]);
+ BufferedReader br = new BufferedReader(new FileReader(file));
+ try {
+ String str = br.readLine();
+ while (str != null) {
+ String[] tokens = str.trim().split("\\s+");
+
+ oovList.add(new OOVItem(FormatUtils.ensureNonTerminalBrackets(tokens[0]),
+ (float) Math.log(Float.parseFloat(tokens[1]))));
+
+ str = br.readLine();
+ }
+ br.close();
+ } catch(IOException e){
+ System.out.println(e);
+ }
+ } catch(IOException e){
+ System.out.println(e);
+ }
+ Collections.sort(oovList);
+
+ } else {
+ String[] tokens = fds[1].trim().split("\\s+");
+ if (tokens.length % 2 != 0) {
+ throw new RuntimeException(String.format("* FATAL: invalid format for '%s'", fds[0]));
+ }
+ oovList = new ArrayList<OOVItem>();
+
+ for (int i = 0; i < tokens.length; i += 2)
+ oovList.add(new OOVItem(FormatUtils.ensureNonTerminalBrackets(tokens[i]),
+ (float) Math.log(Float.parseFloat(tokens[i + 1]))));
+
+ Collections.sort(oovList);
+ }
+
+ } else if (parameter.equals(normalize_key("lattice-decoding"))) {
+ lattice_decoding = true;
+
+ } else if (parameter.equals(normalize_key("segment-oovs"))) {
+ segment_oovs = true;
+ lattice_decoding = true;
+
+ } else if (parameter.equals(normalize_key("default-non-terminal"))) {
+ default_non_terminal = ensureNonTerminalBrackets(cleanNonTerminal(fds[1].trim()));
+ LOG.debug("default_non_terminal: {}", default_non_terminal);
+
+ } else if (parameter.equals(normalize_key("goal-symbol"))) {
+ goal_symbol = ensureNonTerminalBrackets(cleanNonTerminal(fds[1].trim()));
+ LOG.debug("goalSymbol: {}", goal_symbol);
+
+ } else if (parameter.equals(normalize_key("weights-file"))) {
+ weights_file = fds[1];
+
+ } else if (parameter.equals(normalize_key("constrain_parse"))) {
+ constrain_parse = Boolean.parseBoolean(fds[1]);
+
+ } else if (parameter.equals(normalize_key("true_oovs_only"))) {
+ true_oovs_only = Boolean.parseBoolean(fds[1]);
+
+ } else if (parameter.equals(normalize_key("filter-grammar"))) {
+ filter_grammar = Boolean.parseBoolean(fds[1]);
+
+ } else if (parameter.equals(normalize_key("amortize"))) {
+ amortized_sorting = Boolean.parseBoolean(fds[1]);
+
+ } else if (parameter.equals(normalize_key("use_pos_labels"))) {
+ use_pos_labels = Boolean.parseBoolean(fds[1]);
+
+ } else if (parameter.equals(normalize_key("use_unique_nbest"))) {
+ use_unique_nbest = Boolean.valueOf(fds[1]);
+ LOG.debug("use_unique_nbest: {}", use_unique_nbest);
+
+ } else if (parameter.equals(normalize_key("output-format"))) {
+ outputFormat = fds[1];
+ LOG.debug("output-format: {}", outputFormat);
+
+ } else if (parameter.equals(normalize_key("include_align_index"))) {
+ include_align_index = Boolean.valueOf(fds[1]);
+ LOG.debug("include_align_index: {}", include_align_index);
+
+ } else if (parameter.equals(normalize_key("top_n"))) {
+ topN = Integer.parseInt(fds[1]);
+ LOG.debug("topN: {}", topN);
+
+ } else if (parameter.equals(normalize_key("num_parallel_decoders"))
+ || parameter.equals(normalize_key("threads"))) {
+ num_parallel_decoders = Integer.parseInt(fds[1]);
+ if (num_parallel_decoders <= 0) {
+ throw new IllegalArgumentException(
+ "Must specify a positive number for num_parallel_decoders");
+ }
+ LOG.debug("num_parallel_decoders: {}", num_parallel_decoders);
+
+ } else if (parameter.equals(normalize_key("mark_oovs"))) {
+ mark_oovs = Boolean.valueOf(fds[1]);
+ LOG.debug("mark_oovs: {}", mark_oovs);
+
+ } else if (parameter.equals(normalize_key("pop-limit"))) {
+ pop_limit = Integer.parseInt(fds[1]);
+ LOG.info("pop-limit: {}", pop_limit);
+
+ } else if (parameter.equals(normalize_key("input-type"))) {
+ if (fds[1].equals("json")) {
+ input_type = INPUT_TYPE.json;
+ } else if (fds[1].equals("plain")) {
+ input_type = INPUT_TYPE.plain;
+ } else {
+ throw new RuntimeException(String.format("* FATAL: invalid server type '%s'", fds[1]));
+ }
+ LOG.info(" input-type: {}", input_type);
+
+ } else if (parameter.equals(normalize_key("server-type"))) {
+ if (fds[1].toLowerCase().equals("tcp"))
+ server_type = SERVER_TYPE.TCP;
+ else if (fds[1].toLowerCase().equals("http"))
+ server_type = SERVER_TYPE.HTTP;
+
+ LOG.info(" server-type: {}", server_type);
+
+ } else if (parameter.equals(normalize_key("server-port"))) {
+ server_port = Integer.parseInt(fds[1]);
+ LOG.info(" server-port: {}", server_port);
+
+ } else if (parameter.equals(normalize_key("rescore-forest"))) {
+ rescoreForest = true;
+ LOG.info(" rescore-forest: {}", rescoreForest);
+
+ } else if (parameter.equals(normalize_key("rescore-forest-weight"))) {
+ rescoreForestWeight = Float.parseFloat(fds[1]);
+ LOG.info(" rescore-forest-weight: {}", rescoreForestWeight);
+
+ } else if (parameter.equals(normalize_key("maxlen"))) {
+ // reset the maximum length
+ maxlen = Integer.parseInt(fds[1]);
+
+ } else if (parameter.equals("c") || parameter.equals("config")) {
+ // this was used to send in the config file, just ignore it
+ ;
+
+ } else if (parameter.equals(normalize_key("feature-function"))) {
+ // add the feature to the list of features for later processing
- features.add("feature_function = " + fds[1]);
++ features.add(fds[1]);
+
+ } else if (parameter.equals(normalize_key("maxlen"))) {
+ // reset the maximum length (unreachable: "maxlen" is already handled by an earlier branch)
+ maxlen = Integer.parseInt(fds[1]);
+
+ } else if (parameter
+ .equals(normalize_key(SOFT_SYNTACTIC_CONSTRAINT_DECODING_PROPERTY_NAME))) {
+ fuzzy_matching = Boolean.parseBoolean(fds[1]);
+ LOG.debug("fuzzy_matching: {}", fuzzy_matching);
+
+ } else if (parameter.equals(normalize_key("fragment-map"))) {
+ fragmentMapFile = fds[1];
+ Tree.readMapping(fragmentMapFile);
+
+ /** PHRASE-BASED PARAMETERS **/
+ } else if (parameter.equals(normalize_key("search"))) {
+ search_algorithm = fds[1];
+
+ if (!search_algorithm.equals("cky") && !search_algorithm.equals("stack")) {
+ throw new RuntimeException(
+ "-search must be one of 'stack' (for phrase-based decoding) " +
+ "or 'cky' (for hierarchical / syntactic decoding)");
+ }
+
+ if (search_algorithm.equals("cky") && include_align_index) {
+ throw new RuntimeException(
+ "include_align_index is currently not supported with cky search");
+ }
+
+ } else if (parameter.equals(normalize_key("reordering-limit"))) {
+ reordering_limit = Integer.parseInt(fds[1]);
+
+ } else if (parameter.equals(normalize_key("num-translation-options"))) {
+ num_translation_options = Integer.parseInt(fds[1]);
+
+ } else if (parameter.equals(normalize_key("no-dot-chart"))) {
+ use_dot_chart = false;
+
+ } else if (parameter.equals(normalize_key("moses"))) {
+ moses = true; // triggers some Moses-specific compatibility options
+
+ } else if (parameter.equals(normalize_key("show-weights"))) {
+ show_weights_and_quit = true;
+
+ } else if (parameter.equals(normalize_key("n-best-list"))) {
+ // for Moses compatibility
+ String[] tokens = fds[1].split("\\s+");
+ n_best_file = tokens[0];
+ if (tokens.length > 1)
+ topN = Integer.parseInt(tokens[1]);
+
+ } else if (parameter.equals(normalize_key("input-file"))) {
+ // for Moses compatibility
+ input_file = fds[1];
+
+ } else if (parameter.equals(normalize_key("weight-file"))) {
+ // for Moses, ignore
+
+ } else if (parameter.equals(normalize_key("weight-overwrite"))) {
+ weight_overwrite = fds[1];
+
+ } else if (parameter.equals(normalize_key("source-annotations"))) {
+ // Check source sentence
+ source_annotations = true;
+
+ } else if (parameter.equals(normalize_key("cached-rules-size"))) {
+ // Size setting for cached rules
+ cachedRuleSize = Integer.parseInt(fds[1]);
+ } else if (parameter.equals(normalize_key("lowercase"))) {
+ lowercase = true;
+
+ } else if (parameter.equals(normalize_key("project-case"))) {
+ project_case = true;
+
+ } else {
+
+ if (parameter.equals(normalize_key("use-sent-specific-tm"))
+ || parameter.equals(normalize_key("add-combined-cost"))
+ || parameter.equals(normalize_key("use-tree-nbest"))
+ || parameter.equals(normalize_key("use-kenlm"))
+ || parameter.equals(normalize_key("useCubePrune"))
+ || parameter.equals(normalize_key("useBeamAndThresholdPrune"))
+ || parameter.equals(normalize_key("regexp-grammar"))) {
+ LOG.warn("ignoring deprecated parameter '{}'", fds[0]);
+
+ } else {
+ throw new RuntimeException("FATAL: unknown configuration parameter '" + fds[0] + "'");
+ }
+ }
+
+ LOG.info(" {} = '{}'", normalize_key(fds[0]), fds[1]);
+
+ } else {
+ /*
+ * Lines that don't have an equals sign and are not blank lines, empty lines, or comments,
+ * are feature values, which can be present in this file
+ */
+
+ weights.add(line);
+ }
+ }
+ } finally {
+ configReader.close();
+ }
+ }
+
+ /**
+ * Checks for invalid variable configurations
+ */
+ public void sanityCheck() {
+ }
+
+ /**
+ * Normalizes parameter names by removing underscores and hyphens and lowercasing. This defines
+ * equivalence classes on external use of parameter names, permitting arbitrary_under_scores and
+ * camelCasing in parameter names without forcing the user to memorize them all. Here are some
+ * examples of equivalent ways to refer to parameter names:
+ * <pre>
+ * {pop-limit, poplimit, PopLimit, popLimit, pop_lim_it} {lmfile, lm-file, LM-FILE, lm_file}
+ * </pre>
+ *
+ * @param text the string to be normalized
+ * @return normalized key
+ *
+ */
+ public static String normalize_key(String text) {
+ return text.replaceAll("[-_]", "").toLowerCase();
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5c0d5388/src/main/java/org/apache/joshua/decoder/ff/OOVPenalty.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/decoder/ff/OOVPenalty.java
index 69584dd,0000000..e53e19f
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/decoder/ff/OOVPenalty.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/OOVPenalty.java
@@@ -1,117 -1,0 +1,118 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.JoshuaConfiguration.OOVItem;
+import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.hypergraph.HGNode;
+import org.apache.joshua.decoder.segment_file.Sentence;
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.chart_parser.SourcePath;
+
+/**
+ * This feature is fired when an out-of-vocabulary word (with respect to the translation model) is
+ * entered into the chart. OOVs work in the following manner: for each word in the input that is OOV
+ * with respect to the translation model, we create a rule that pushes that word through
+ * untranslated (the suffix "_OOV" can optionally be appended according to the runtime parameter
+ * "mark-oovs"). These rules are all stored in a grammar whose owner is "oov". The OOV feature
+ * function template then fires the "OOVPenalty" feature whenever it is asked to score an OOV rule.
+ *
+ * @author Matt Post post@cs.jhu.edu
+ */
+public class OOVPenalty extends StatelessFF {
- private int ownerID = -1;
++ private final int ownerID;
+
+ /* The default value returned for OOVs. Can be overridden with -oov-list */
- private float defaultValue = -100f;
- private HashMap<Integer,Float> oovWeights = null;
++ private final float defaultValue = -100f;
++ private final HashMap<Integer,Float> oovWeights;
+
+ public OOVPenalty(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+ super(weights, "OOVPenalty", args, config);
+
+ ownerID = Vocabulary.id("oov");
+ oovWeights = new HashMap<Integer,Float>();
+
- if (config.oovList != null)
- for (OOVItem item: config.oovList)
++ if (config.oovList != null) {
++ for (OOVItem item: config.oovList) {
+ oovWeights.put(Vocabulary.id(item.label), item.weight);
++ }
++ }
+ }
+
+ @Override
+ public ArrayList<String> reportDenseFeatures(int index) {
+ denseFeatureIndex = index;
+
- ArrayList<String> names = new ArrayList<String>();
++ ArrayList<String> names = new ArrayList<>(1);
+ names.add(name);
+ return names;
+ }
+
+ /**
+ * OOV rules cover exactly one word, and such rules belong to a grammar whose owner is "oov". Each
+ * OOV fires the OOVPenalty feature with a value of 1, so the cost is simply the weight, which was
+ * cached when the feature was created.
+ */
+ @Override
+ public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+ Sentence sentence, Accumulator acc) {
+
+ if (rule != null && this.ownerID == rule.getOwner()) {
- // acc.add(name, getValue(rule.getLHS()));
+ acc.add(denseFeatureIndex, getValue(rule.getLHS()));
+ }
+
+ return null;
+ }
+
+ /**
+ * It's important for the OOV feature to contribute to the rule's estimated cost, so that OOV
+ * rules (which are added for all words, not just ones without translation options) get sorted
+ * to the bottom during cube pruning.
+ *
+ * Important! estimateCost returns the *weighted* feature value.
+ */
+ @Override
+ public float estimateCost(Rule rule, Sentence sentence) {
+ if (rule != null && this.ownerID == rule.getOwner())
+ return weights.getDense(denseFeatureIndex) * getValue(rule.getLHS());
+ return 0.0f;
+ }
+
+ private float getValue(int lhs) {
+ return oovWeights.containsKey(lhs) ? oovWeights.get(lhs) : defaultValue;
+ }
+
+ @Override
+ public double estimateLogP(Rule rule, int sentID) {
+ // TODO Auto-generated method stub
+ return 0;
+ }
+
+ @Override
+ public double getWeight() {
+ // TODO Auto-generated method stub
+ return 0;
+ }
+}
[5/5] incubator-joshua git commit: Merge branch 'JOSHUA-PR21' into
JOSHUA-252
Posted by mj...@apache.org.
Merge branch 'JOSHUA-PR21' into JOSHUA-252
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/8793c45d
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/8793c45d
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/8793c45d
Branch: refs/heads/JOSHUA-252
Commit: 8793c45d783c09db89c775536029092a8d322083
Parents: 9e70266 5c0d538
Author: Matt Post <po...@cs.jhu.edu>
Authored: Tue May 31 15:39:13 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Tue May 31 15:39:13 2016 -0400
----------------------------------------------------------------------
lib/ivy.xml | 17 +++
src/joshua/decoder/ff/LexicalFeatures.java | 131 +++++++++++++++++++
.../org/apache/joshua/corpus/Vocabulary.java | 13 +-
.../java/org/apache/joshua/decoder/Decoder.java | 17 ++-
.../joshua/decoder/JoshuaConfiguration.java | 10 +-
.../apache/joshua/decoder/ff/OOVPenalty.java | 15 ++-
.../org/apache/joshua/decoder/ff/RuleFF.java | 109 +++++++++------
.../apache/joshua/decoder/ff/RuleLength.java | 13 +-
.../org/apache/joshua/decoder/ff/RuleShape.java | 67 +++++++---
.../apache/joshua/decoder/ff/WordPenalty.java | 10 +-
.../lm/berkeley_lm/LMGrammarBerkeleyTest.java | 2 +-
.../system/MultithreadedTranslationTests.java | 2 +-
.../system/StructuredTranslationTest.java | 2 +-
13 files changed, 314 insertions(+), 94 deletions(-)
----------------------------------------------------------------------