You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/05/31 19:39:21 UTC

[1/5] incubator-joshua git commit: Added Sparse lexical feature function. Revised various other sparse feature functions to avoid String formatting. Expensive feature functions now use an LRU cache to avoid re-calculation of feature hashes for commonly used rules.

Repository: incubator-joshua
Updated Branches:
  refs/heads/JOSHUA-252 9e7026665 -> 8793c45d7


Added Sparse lexical feature function. Revised various other sparse feature functions to avoid String formatting. Expensive feature functions now use an LRU cache to avoid re-calculation of feature hashes for commonly used rules. Also cleaned up the feature string parsing a little bit.


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/25a92cbc
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/25a92cbc
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/25a92cbc

Branch: refs/heads/JOSHUA-252
Commit: 25a92cbca7c3a11c1d99c3e71686aea9874e0133
Parents: fadc285
Author: Felix Hieber <fh...@amazon.com>
Authored: Sat Apr 30 09:35:10 2016 -0700
Committer: Felix Hieber <fh...@amazon.com>
Committed: Mon May 30 11:44:51 2016 +0200

----------------------------------------------------------------------
 src/joshua/corpus/Vocabulary.java               |  13 +-
 src/joshua/decoder/Decoder.java                 |  29 ++--
 src/joshua/decoder/JoshuaConfiguration.java     |  10 +-
 src/joshua/decoder/ff/LexicalFeatures.java      | 131 +++++++++++++++++++
 src/joshua/decoder/ff/OOVPenalty.java           |  15 ++-
 src/joshua/decoder/ff/RuleFF.java               | 110 ++++++++++------
 src/joshua/decoder/ff/RuleLength.java           |  13 +-
 src/joshua/decoder/ff/RuleShape.java            |  66 +++++++---
 src/joshua/decoder/ff/WordPenalty.java          |  10 +-
 .../lm/berkeley_lm/LMGrammarBerkeleyTest.java   |   2 +-
 .../system/MultithreadedTranslationTests.java   |   2 +-
 .../system/StructuredTranslationTest.java       |   2 +-
 12 files changed, 301 insertions(+), 102 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/25a92cbc/src/joshua/corpus/Vocabulary.java
----------------------------------------------------------------------
diff --git a/src/joshua/corpus/Vocabulary.java b/src/joshua/corpus/Vocabulary.java
index 74f6a47..2193629 100644
--- a/src/joshua/corpus/Vocabulary.java
+++ b/src/joshua/corpus/Vocabulary.java
@@ -205,10 +205,17 @@ public class Vocabulary {
   }
 
   public static String getWords(int[] ids) {
-    if (ids.length == 0) return "";
+    return getWords(ids, " ");
+  }
+  
+  public static String getWords(int[] ids, final String separator) {
+    if (ids.length == 0) {
+      return "";
+    }
     StringBuilder sb = new StringBuilder();
-    for (int i = 0; i < ids.length - 1; i++)
-      sb.append(word(ids[i])).append(" ");
+    for (int i = 0; i < ids.length - 1; i++) {
+      sb.append(word(ids[i])).append(separator);
+    }
     return sb.append(word(ids[ids.length - 1])).toString();
   }
 

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/25a92cbc/src/joshua/decoder/Decoder.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/Decoder.java b/src/joshua/decoder/Decoder.java
index 22ed8b9..97ac9aa 100644
--- a/src/joshua/decoder/Decoder.java
+++ b/src/joshua/decoder/Decoder.java
@@ -20,7 +20,7 @@ package joshua.decoder;
 
 import static joshua.decoder.ff.FeatureVector.DENSE_FEATURE_NAMES;
 
-import java.io.BufferedWriter;	
+import java.io.BufferedWriter;
 import java.io.File;
 import java.io.IOException;
 import java.io.OutputStream;
@@ -34,8 +34,6 @@ import java.util.List;
 import java.util.concurrent.ArrayBlockingQueue;
 import java.util.concurrent.BlockingQueue;
 
-import com.google.common.base.Strings;
-
 import joshua.corpus.Vocabulary;
 import joshua.decoder.ff.FeatureVector;
 import joshua.decoder.JoshuaConfiguration.INPUT_TYPE;
@@ -59,6 +57,8 @@ import joshua.util.FormatUtils;
 import joshua.util.Regex;
 import joshua.util.io.LineReader;
 
+import com.google.common.base.Strings;
+
 /**
  * This class handles decoder initialization and the complication introduced by multithreading.
  * 
@@ -914,7 +914,7 @@ public class Decoder {
    * Feature functions are instantiated with a line of the form
    * 
    * <pre>
-   *   feature_function = FEATURE OPTIONS
+   *   FEATURE OPTIONS
    * </pre>
    * 
    * Weights for features are listed separately.
@@ -926,31 +926,26 @@ public class Decoder {
   private void initializeFeatureFunctions() throws IOException {
 
     for (String featureLine : joshuaConfiguration.features) {
-      // feature-function = NAME args
+      // line starts with NAME, followed by args
       // 1. create new class named NAME, pass it config, weights, and the args
 
-      // Get rid of the leading crap.
-      featureLine = featureLine.replaceFirst("^feature_function\\s*=\\s*", "");
-
       String fields[] = featureLine.split("\\s+");
       String featureName = fields[0];
+      
       try {
+        
         Class<?> clas = getClass(featureName);
         Constructor<?> constructor = clas.getConstructor(FeatureVector.class,
             String[].class, JoshuaConfiguration.class);
-        this.featureFunctions.add((FeatureFunction) constructor.newInstance(weights, fields, joshuaConfiguration));
+        FeatureFunction feature = (FeatureFunction) constructor.newInstance(weights, fields, joshuaConfiguration);
+        Decoder.LOG(1, String.format("FEATURE: %s", feature.logString()));
+        this.featureFunctions.add(feature);
+        
       } catch (Exception e) {
-        e.printStackTrace();
-        System.err.println("* FATAL: could not find a feature '" + featureName + "'");
-        System.exit(1);
+        throw new RuntimeException(String.format("Unable to instantiate feature function '%s'!", featureLine), e); 
       }
     }
 
-    for (FeatureFunction feature : featureFunctions) {
-      Decoder.LOG(1, String.format("FEATURE: %s", feature.logString()));
-      
-    }
-
     weights.registerDenseFeatures(featureFunctions);
   }
 

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/25a92cbc/src/joshua/decoder/JoshuaConfiguration.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/JoshuaConfiguration.java b/src/joshua/decoder/JoshuaConfiguration.java
index c874534..05197e5 100644
--- a/src/joshua/decoder/JoshuaConfiguration.java
+++ b/src/joshua/decoder/JoshuaConfiguration.java
@@ -390,21 +390,21 @@ public class JoshuaConfiguration {
              * 
              * LMs are now loaded as general feature functions, so we transform that to either
              * 
-             *   feature-function = LanguageModel -lm_order 5 -lm_type berkeleylm -lm_file lm.gz
+             *   LanguageModel -lm_order 5 -lm_type berkeleylm -lm_file lm.gz
              * 
              * If the line were state minimizing:
              * 
              *   lm = kenlm 5 true false 100 lm.gz
              *              
-             * feature-function = StateMinimizingLanguageModel -lm_order 5 -lm_file lm.gz
+             * StateMinimizingLanguageModel -lm_order 5 -lm_file lm.gz
              */
             
             String[] tokens = fds[1].split("\\s+");
             if (tokens[2].equals("true"))
-              features.add(String.format("feature_function = StateMinimizingLanguageModel -lm_type kenlm -lm_order %s -lm_file %s",
+              features.add(String.format("StateMinimizingLanguageModel -lm_type kenlm -lm_order %s -lm_file %s",
                   tokens[1], tokens[5]));
             else
-              features.add(String.format("feature_function = LanguageModel -lm_type %s -lm_order %s -lm_file %s",
+              features.add(String.format("LanguageModel -lm_type %s -lm_order %s -lm_file %s",
                   tokens[0], tokens[1], tokens[5]));
 
           } else if (parameter.equals(normalize_key("tm"))) {
@@ -582,7 +582,7 @@ public class JoshuaConfiguration {
 
           } else if (parameter.equals(normalize_key("feature-function"))) {
             // add the feature to the list of features for later processing
-            features.add("feature_function = " + fds[1]);
+            features.add(fds[1]);
 
           } else if (parameter.equals(normalize_key("maxlen"))) {
             // add the feature to the list of features for later processing

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/25a92cbc/src/joshua/decoder/ff/LexicalFeatures.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/LexicalFeatures.java b/src/joshua/decoder/ff/LexicalFeatures.java
new file mode 100644
index 0000000..128df87
--- /dev/null
+++ b/src/joshua/decoder/ff/LexicalFeatures.java
@@ -0,0 +1,131 @@
+package joshua.decoder.ff;
+
+import static com.google.common.cache.CacheBuilder.newBuilder;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import joshua.corpus.Vocabulary;
+import joshua.decoder.JoshuaConfiguration;
+import joshua.decoder.chart_parser.SourcePath;
+import joshua.decoder.ff.state_maintenance.DPState;
+import joshua.decoder.ff.tm.Rule;
+import joshua.decoder.hypergraph.HGNode;
+import joshua.decoder.segment_file.Sentence;
+
+import com.google.common.cache.Cache;
+
+/**
+ *  Lexical alignment features denoting alignments, deletions, and insertions.
+ */
+public class LexicalFeatures extends StatelessFF {
+  
+  private final boolean useAlignments;
+  private final boolean useDeletions;
+  private final boolean useInsertions;
+  
+  private static final String NAME = "LexicalFeatures";
+  // value to fire for features
+  private static final int VALUE = 1;
+  //whether this feature is restricted to a certain grammar/owner
+  private final boolean ownerRestriction;
+  // the grammar/owner this feature is restricted to fire
+  private final int owner;
+  // Strings separating words
+  private static final String SEPARATOR = "~";
+  
+  private final Cache<Rule, List<String>> featureCache;
+  
+  public LexicalFeatures(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+    super(weights, NAME, args, config);
+    
+    ownerRestriction = (parsedArgs.containsKey("owner")) ? true : false;
+    owner = ownerRestriction ? Vocabulary.id(parsedArgs.get("owner")) : 0;
+    
+    useAlignments = parsedArgs.containsKey("alignments");
+    useDeletions = parsedArgs.containsKey("deletions");
+    useInsertions = parsedArgs.containsKey("insertions");
+    
+    // initialize cache
+    if (parsedArgs.containsKey("cacheSize")) {
+      featureCache = newBuilder().maximumSize(Integer.parseInt(parsedArgs.get("cacheSize"))).build();
+    } else {
+      featureCache = newBuilder().maximumSize(config.cachedRuleSize).build();
+    }
+  }
+
+  @Override
+  public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+      Sentence sentence, Accumulator acc) {
+    
+    if (ownerRestriction && rule.getOwner() != owner) {
+      return null;
+    }
+
+    List<String> featureNames = featureCache.getIfPresent(rule);
+    if (featureNames == null) {
+      featureNames = getFeatures(rule);
+      featureCache.put(rule, featureNames);
+    }
+    for (String feature : featureNames) {
+      acc.add(feature, VALUE);
+    }
+    
+    return null;
+  }
+  
+  /**
+   * Obtains the feature ids for the given rule.
+   * @param rule
+   * @return String representing the feature name.s
+   */
+  private List<String> getFeatures(final Rule rule) {
+    final List<String> result = new ArrayList<>();
+    
+    byte[] alignments = rule.getAlignment();
+    if (alignments == null) {
+      return result;
+    }
+    int[] sourceWords = rule.getFrench();
+    int[] targetWords = rule.getEnglish();
+    
+    // sourceAligned & targetAligned indicate whether an index is covered by alignments
+    boolean[] sourceAligned = new boolean[sourceWords.length];
+    boolean[] targetAligned = new boolean[targetWords.length];
+    
+    // translations: aligned words
+    for (int i = 0; i < alignments.length; i+=2) {
+      byte sourceIndex = alignments[i];
+      byte targetIndex = alignments[i + 1];
+      sourceAligned[sourceIndex] = true;
+      targetAligned[targetIndex] = true;
+      if (useAlignments) {
+        result.add(
+            "T:" + 
+            Vocabulary.word(sourceWords[sourceIndex]) + 
+            SEPARATOR + 
+            Vocabulary.word(targetWords[targetIndex]));
+      }
+    }
+    
+    // deletions: unaligned source words
+    if (useDeletions) {
+      for (int i = 0; i < sourceAligned.length; i++) {
+        if (!sourceAligned[i] && !Vocabulary.nt(sourceWords[i])) {
+          result.add("D:" + Vocabulary.word(sourceWords[i]));
+        }
+      }
+    }
+    
+    // insertions: unaligned target words
+    if (useInsertions) {
+      for (int i = 0; i < targetAligned.length; i++) {
+        if (useInsertions && !targetAligned[i] && !Vocabulary.nt(targetWords[i])) {
+          result.add("I:" + Vocabulary.word(targetWords[i]));
+        }
+      }
+    }
+    
+    return result;
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/25a92cbc/src/joshua/decoder/ff/OOVPenalty.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/OOVPenalty.java b/src/joshua/decoder/ff/OOVPenalty.java
index 6a06548..47a83ef 100644
--- a/src/joshua/decoder/ff/OOVPenalty.java
+++ b/src/joshua/decoder/ff/OOVPenalty.java
@@ -42,11 +42,11 @@ import joshua.decoder.chart_parser.SourcePath;
  * @author Matt Post <po...@cs.jhu.edu>
  */
 public class OOVPenalty extends StatelessFF {
-  private int ownerID = -1;
+  private final int ownerID;
   
   /* The default value returned for OOVs. Can be overridden with -oov-list */
-  private float defaultValue = -100f;
-  private HashMap<Integer,Float> oovWeights = null;
+  private final float defaultValue = -100f;
+  private final HashMap<Integer,Float> oovWeights;
 
   public OOVPenalty(FeatureVector weights, String[] args, JoshuaConfiguration config) {
     super(weights, "OOVPenalty", args, config);
@@ -54,16 +54,18 @@ public class OOVPenalty extends StatelessFF {
     ownerID = Vocabulary.id("oov");
     oovWeights = new HashMap<Integer,Float>();
     
-    if (config.oovList != null)
-      for (OOVItem item: config.oovList) 
+    if (config.oovList != null) {
+      for (OOVItem item: config.oovList) { 
         oovWeights.put(Vocabulary.id(item.label), item.weight);
+      }
+    }
   }
   
   @Override
   public ArrayList<String> reportDenseFeatures(int index) {
     denseFeatureIndex = index;
     
-    ArrayList<String> names = new ArrayList<String>();
+    ArrayList<String> names = new ArrayList<>(1);
     names.add(name);
     return names;
   }
@@ -78,7 +80,6 @@ public class OOVPenalty extends StatelessFF {
       Sentence sentence, Accumulator acc) {
     
     if (rule != null && this.ownerID == rule.getOwner()) {
-//      acc.add(name, getValue(rule.getLHS()));
       acc.add(denseFeatureIndex, getValue(rule.getLHS()));
     }
 

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/25a92cbc/src/joshua/decoder/ff/RuleFF.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/RuleFF.java b/src/joshua/decoder/ff/RuleFF.java
index 9fb7d3e..48e4340 100644
--- a/src/joshua/decoder/ff/RuleFF.java
+++ b/src/joshua/decoder/ff/RuleFF.java
@@ -18,6 +18,9 @@
  */
 package joshua.decoder.ff;
 
+import static com.google.common.cache.CacheBuilder.newBuilder;
+import static joshua.corpus.Vocabulary.getWords;
+
 import java.util.List;
 
 import joshua.corpus.Vocabulary;
@@ -28,61 +31,94 @@ import joshua.decoder.ff.tm.Rule;
 import joshua.decoder.hypergraph.HGNode;
 import joshua.decoder.segment_file.Sentence;
 
+import com.google.common.cache.Cache;
+
 /**
- *  This feature just counts rules that are used. You can restrict it with a number of flags:
- * 
- *   -owner OWNER
- *    Only count rules owned by OWNER
- *   -target|-source
- *    Only count the target or source side (plus the LHS)
- *
- * TODO: add an option to separately provide a list of rule counts, restrict to counts above a threshold. 
+ *  This feature fires for rule ids.
+ *  Firing can be restricted to rules from a certain owner, and rule ids
+ *  can be generated from source side and/or target side. 
  */
 public class RuleFF extends StatelessFF {
 
   private enum Sides { SOURCE, TARGET, BOTH };
   
-  private int owner = 0;
-  private Sides sides = Sides.BOTH;
+  private static final String NAME = "RuleFF";
+  // value to fire for features
+  private static final int VALUE = 1;
+  // whether this feature is restricted to a certain grammar/owner
+  private final boolean ownerRestriction;
+  // the grammar/owner this feature is restricted to fire
+  private final int owner;
+  // what part of the rule should be extracted;
+  private final Sides sides;
+  // Strings separating words and rule sides 
+  private static final String SEPARATOR = "~";
+  private static final String SIDES_SEPARATOR = "->";
+  
+  private final Cache<Rule, String> featureCache;
   
   public RuleFF(FeatureVector weights, String[] args, JoshuaConfiguration config) {
-    super(weights, "RuleFF", args, config);
+    super(weights, NAME, args, config);
+    
+    ownerRestriction = (parsedArgs.containsKey("owner")) ? true : false;
+    owner = ownerRestriction ? Vocabulary.id(parsedArgs.get("owner")) : 0;
     
-    owner = Vocabulary.id(parsedArgs.get("owner"));
-    if (parsedArgs.containsKey("source"))
-      sides = Sides.SOURCE;
-    else if (parsedArgs.containsKey("target"))
-      sides = Sides.TARGET;
+    if (parsedArgs.containsKey("sides")) {
+      final String sideValue = parsedArgs.get("sides");
+      if (sideValue.equalsIgnoreCase("source")) {
+        sides = Sides.SOURCE;
+      } else if (sideValue.equalsIgnoreCase("target")) {
+        sides = Sides.TARGET;
+      } else if (sideValue.equalsIgnoreCase("both")){
+        sides = Sides.BOTH;
+      } else {
+        throw new RuntimeException("Unknown side value.");
+      }
+    } else {
+      sides = Sides.BOTH;
+    }
+    
+    // initialize cache
+    if (parsedArgs.containsKey("cacheSize")) {
+      featureCache = newBuilder().maximumSize(Integer.parseInt(parsedArgs.get("cacheSize"))).build();
+    } else {
+      featureCache = newBuilder().maximumSize(config.cachedRuleSize).build();
+    }
   }
 
   @Override
   public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
       Sentence sentence, Accumulator acc) {
-
-    if (owner > 0 && rule.getOwner() == owner) {
-      String ruleString = getRuleString(rule);
-      acc.add(ruleString, 1);
+    
+    if (ownerRestriction && rule.getOwner() != owner) {
+      return null;
     }
 
+    String featureName = featureCache.getIfPresent(rule);
+    if (featureName == null) {
+      featureName = getRuleString(rule);
+      featureCache.put(rule, featureName);
+    }
+    acc.add(featureName, VALUE);
+    
     return null;
   }
-
-  private String getRuleString(Rule rule) {
-    String ruleString = "";
-    switch(sides) {
-    case BOTH:
-      ruleString = String.format("%s  %s  %s", Vocabulary.word(rule.getLHS()), rule.getFrenchWords(),
-          rule.getEnglishWords());
-      break;
-
-    case SOURCE:
-      ruleString = String.format("%s  %s", Vocabulary.word(rule.getLHS()), rule.getFrenchWords());
-      break;
-
-    case TARGET:
-      ruleString = String.format("%s  %s", Vocabulary.word(rule.getLHS()), rule.getEnglishWords());
-      break;
+  
+  /**
+   * Obtains the feature id for the given rule.
+   * @param rule
+   * @return String representing the feature name.s
+   */
+  private String getRuleString(final Rule rule) {
+    final StringBuilder sb = new StringBuilder(Vocabulary.word(rule.getLHS()))
+      .append(SIDES_SEPARATOR);
+    if (sides == Sides.SOURCE || sides == Sides.BOTH) {
+      sb.append(getWords(rule.getFrench(), SEPARATOR));
+    }
+    sb.append(SIDES_SEPARATOR);
+    if (sides == Sides.TARGET || sides == Sides.BOTH) {
+      sb.append(getWords(rule.getEnglish(), SEPARATOR));
     }
-    return ruleString.replaceAll("[ =]", "~");
+    return sb.toString();
   }
 }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/25a92cbc/src/joshua/decoder/ff/RuleLength.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/RuleLength.java b/src/joshua/decoder/ff/RuleLength.java
index 645905a..ce02282 100644
--- a/src/joshua/decoder/ff/RuleLength.java
+++ b/src/joshua/decoder/ff/RuleLength.java
@@ -32,6 +32,8 @@ import joshua.decoder.segment_file.Sentence;
  * source side, its target side, and a feature that pairs them.
  */
 public class RuleLength extends StatelessFF {
+  
+  private static final int VALUE = 1;
 
   public RuleLength(FeatureVector weights, String[] args, JoshuaConfiguration config) {
     super(weights, "RuleLength", args, config);
@@ -40,12 +42,11 @@ public class RuleLength extends StatelessFF {
   @Override
   public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
       Sentence sentence, Accumulator acc) {
-    int sourceLen = rule.getFrench().length;
-    int targetLen = rule.getEnglish().length;
-    acc.add(String.format("%s_sourceLength%d", name, sourceLen), 1);
-    acc.add(String.format("%s_targetLength%d", name, targetLen), 1);
-    acc.add(String.format("%s_pairLength%d-%d", name, sourceLen, targetLen), 1);
-
+    int sourceLength = rule.getFrench().length;
+    int targetLength = rule.getEnglish().length;
+    acc.add(name + "_source" + sourceLength, VALUE);
+    acc.add(name + "_target" + sourceLength, VALUE);
+    acc.add(name + "_sourceTarget" + sourceLength + "-" + targetLength, VALUE);
     return null;
   }
 }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/25a92cbc/src/joshua/decoder/ff/RuleShape.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/RuleShape.java b/src/joshua/decoder/ff/RuleShape.java
index e243528..3bd10a8 100644
--- a/src/joshua/decoder/ff/RuleShape.java
+++ b/src/joshua/decoder/ff/RuleShape.java
@@ -20,6 +20,7 @@ package joshua.decoder.ff;
 
 import java.util.List;
 
+import joshua.corpus.Vocabulary;
 import joshua.decoder.JoshuaConfiguration;
 import joshua.decoder.chart_parser.SourcePath;
 import joshua.decoder.ff.state_maintenance.DPState;
@@ -36,38 +37,63 @@ public class RuleShape extends StatelessFF {
     super(weights, "RuleShape", args, config);
   }
 
-  private int gettype(int id) {
-    if (id < 0)
-      return -1;
-    return 1;
+  private enum WordType {
+    N("N"), T("x"), P("+");
+    private final String string;
+    private boolean repeats;
+
+    private WordType(final String string) {
+      this.string = string;
+      this.repeats = false;
+    }
+    
+    private void setRepeats() {
+      repeats = true;
+    }
+
+    @Override
+    public String toString() {
+      if (repeats) {
+        return this.string + "+";
+      }
+      return this.string;
+    }
+  }
+
+  private WordType getWordType(int id) {
+    if (Vocabulary.nt(id)) {
+      return WordType.N;
+    } else {
+      return WordType.T;
+    }
   }
   
-  private String pattern(int[] ids) {
-    StringBuilder pattern = new StringBuilder();
-    int curtype = gettype(ids[0]);
-    int curcount = 1;
+  /**
+   * Returns a String describing the rule pattern.
+   */
+  private String getRulePattern(int[] ids) {
+    final StringBuilder pattern = new StringBuilder();
+    WordType currentType = getWordType(ids[0]);
     for (int i = 1; i < ids.length; i++) {
-      if (gettype(ids[i]) != curtype) {
-        pattern.append(String.format("%s%s_", curtype < 0 ? "N" : "x", curcount > 1 ? "+" : ""));
-        curtype = gettype(ids[i]);
-        curcount = 1;
+      if (getWordType(ids[i]) != currentType) {
+        pattern.append(currentType.toString());
+        currentType = getWordType(ids[i]);
       } else {
-        curcount++;
+        currentType.setRepeats();
       }
     }
-    pattern.append(String.format("%s%s_", curtype < 0 ? "N" : "x", curcount > 1 ? "+" : ""));
+    pattern.append(currentType.toString());
     return pattern.toString();
   }
   
   @Override
   public DPState compute(Rule rule, List<HGNode> tailNodes, int i_, int j, SourcePath sourcePath,
       Sentence sentence, Accumulator acc) {
-    String sourceShape = pattern(rule.getFrench());
-    String targetShape = pattern(rule.getEnglish());
-    acc.add(String.format("%s_source_%s", name, sourceShape), 1);
-    acc.add(String.format("%s_target_%s", name, targetShape), 1);
-    acc.add(String.format("%s_both_%s__%s", name, sourceShape, targetShape), 1);
-
+    final String sourceShape = getRulePattern(rule.getFrench());
+    final String targetShape = getRulePattern(rule.getEnglish());
+    acc.add(name + "_source_" + sourceShape, 1);
+    acc.add(name + "_target_" + sourceShape, 1);
+    acc.add(name + "_sourceTarget_" + sourceShape + "_" + targetShape, 1);
     return null;
   }
 }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/25a92cbc/src/joshua/decoder/ff/WordPenalty.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/WordPenalty.java b/src/joshua/decoder/ff/WordPenalty.java
index 583b59c..d72a4e6 100644
--- a/src/joshua/decoder/ff/WordPenalty.java
+++ b/src/joshua/decoder/ff/WordPenalty.java
@@ -37,12 +37,15 @@ import joshua.decoder.segment_file.Sentence;
 public final class WordPenalty extends StatelessFF {
 
   private float OMEGA = -(float) Math.log10(Math.E); // -0.435
+  private final boolean isCky;
 
   public WordPenalty(final FeatureVector weights, String[] args, JoshuaConfiguration config) {
     super(weights, "WordPenalty", args, config);
 
     if (parsedArgs.containsKey("value"))
       OMEGA = Float.parseFloat(parsedArgs.get("value"));
+    
+    isCky = config.search_algorithm.equals("cky");
   }
 
   @Override
@@ -52,10 +55,9 @@ public final class WordPenalty extends StatelessFF {
     if (rule != null) {
       // TODO: this is an inefficient way to do this. Find a better way to not apply this rule
       // to start and stop glue rules when phrase-based decoding.
-      if (config.search_algorithm.equals("cky") 
-          || (rule != Hypothesis.BEGIN_RULE && rule != Hypothesis.END_RULE))
-        // acc.add(name, OMEGA * (rule.getEnglish().length - rule.getArity()));
+      if (isCky || (rule != Hypothesis.BEGIN_RULE && rule != Hypothesis.END_RULE)) {
         acc.add(denseFeatureIndex, OMEGA * (rule.getEnglish().length - rule.getArity()));
+      }
     }
       
     return null;
@@ -64,7 +66,7 @@ public final class WordPenalty extends StatelessFF {
   @Override
   public ArrayList<String> reportDenseFeatures(int index) {
     denseFeatureIndex = index;
-    ArrayList<String> names = new ArrayList<String>();
+    ArrayList<String> names = new ArrayList<>(1);
     names.add(name);
     return names;
   }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/25a92cbc/tst/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java
----------------------------------------------------------------------
diff --git a/tst/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java b/tst/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java
index 6e0d90f..0a29646 100644
--- a/tst/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java
+++ b/tst/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java
@@ -49,7 +49,7 @@ public class LMGrammarBerkeleyTest {
   public void verifyLM() {
     joshuaConfig = new JoshuaConfiguration();
     joshuaConfig.processCommandLineOptions(OPTIONS);
-    joshuaConfig.features.add("feature_function = LanguageModel -lm_type berkeleylm -lm_order 2 -lm_file " + lmFile);
+    joshuaConfig.features.add("LanguageModel -lm_type berkeleylm -lm_order 2 -lm_file " + lmFile);
     decoder = new Decoder(joshuaConfig, null);
     String translation = decode(INPUT).toString();
     assertEquals(lmFile, "tm_glue_0=2.000 lm_0=-7.153\n", translation);

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/25a92cbc/tst/joshua/system/MultithreadedTranslationTests.java
----------------------------------------------------------------------
diff --git a/tst/joshua/system/MultithreadedTranslationTests.java b/tst/joshua/system/MultithreadedTranslationTests.java
index f438ccd..220bced 100644
--- a/tst/joshua/system/MultithreadedTranslationTests.java
+++ b/tst/joshua/system/MultithreadedTranslationTests.java
@@ -64,7 +64,7 @@ public class MultithreadedTranslationTests {
     joshuaConfig.tms.add("thrax -owner glue -maxspan -1 -path resources/grammar.glue");
     joshuaConfig.goal_symbol = "[GOAL]";
     joshuaConfig.default_non_terminal = "[X]";
-    joshuaConfig.features.add("feature_function = OOVPenalty");
+    joshuaConfig.features.add("OOVPenalty");
     joshuaConfig.weights.add("tm_pt_0 1");
     joshuaConfig.weights.add("tm_pt_1 1");
     joshuaConfig.weights.add("tm_pt_2 1");

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/25a92cbc/tst/joshua/system/StructuredTranslationTest.java
----------------------------------------------------------------------
diff --git a/tst/joshua/system/StructuredTranslationTest.java b/tst/joshua/system/StructuredTranslationTest.java
index 0608a65..249eabf 100644
--- a/tst/joshua/system/StructuredTranslationTest.java
+++ b/tst/joshua/system/StructuredTranslationTest.java
@@ -85,7 +85,7 @@ public class StructuredTranslationTest {
     joshuaConfig.tms.add("thrax -owner glue -maxspan -1 -path resources/grammar.glue");
     joshuaConfig.goal_symbol = "[GOAL]";
     joshuaConfig.default_non_terminal = "[X]";
-    joshuaConfig.features.add("feature_function = OOVPenalty");
+    joshuaConfig.features.add("OOVPenalty");
     joshuaConfig.weights.add("tm_pt_0 1");
     joshuaConfig.weights.add("tm_pt_1 1");
     joshuaConfig.weights.add("tm_pt_2 1");


[2/5] incubator-joshua git commit: revert change to ivy.xml

Posted by mj...@apache.org.
revert change to ivy.xml


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/5591c676
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/5591c676
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/5591c676

Branch: refs/heads/JOSHUA-252
Commit: 5591c6769c162e3268243aa3324c367c6ba9c945
Parents: 25a92cb
Author: Felix Hieber <fh...@amazon.com>
Authored: Mon May 30 11:54:53 2016 +0200
Committer: Felix Hieber <fh...@amazon.com>
Committed: Mon May 30 11:56:00 2016 +0200

----------------------------------------------------------------------
 lib/ivy.xml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5591c676/lib/ivy.xml
----------------------------------------------------------------------
diff --git a/lib/ivy.xml b/lib/ivy.xml
index 66034c6..d41595d 100644
--- a/lib/ivy.xml
+++ b/lib/ivy.xml
@@ -1,11 +1,12 @@
 <ivy-module version="2.0">
   <info organisation="joshua" module="joshua"/>
   <dependencies>
+    <dependency org="net.sourceforge.ant-doxygen" name="ant-doxygen" rev="1.6.1" />
     <dependency org="net.sf.jung" name="jung-algorithms" rev="2.0"/>
     <dependency org="net.sf.jung" name="jung-api" rev="2.0"/>
     <dependency org="net.sf.jung" name="jung-graph-impl" rev="2.0"/>
     <dependency org="net.sf.jung" name="jung-visualization" rev="2.0"/>
-    <dependency org="org.apache.commons" name="commons-cli" rev="1.3.1"/>
+    <dependency org="org.apache.commons" name="commons-cli" rev="1.2"/>
     <dependency org="org.testng" name="testng" rev="6.7"/>
     <dependency org="junit"  name="junit" rev="4.10" />
     <dependency org="net.sourceforge.collections" name="collections-generic" rev="4.01"/>


[3/5] incubator-joshua git commit: Merge branch 'sparse' of https://github.com/fhieber/incubator-joshua into JOSHUA-PR21

Posted by mj...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5c0d5388/src/main/java/org/apache/joshua/decoder/ff/RuleFF.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/decoder/ff/RuleFF.java
index bc6d67b,0000000..20f91ee
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/decoder/ff/RuleFF.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/RuleFF.java
@@@ -1,100 -1,0 +1,135 @@@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *  http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing,
 + * software distributed under the License is distributed on an
 + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 + * KIND, either express or implied.  See the License for the
 + * specific language governing permissions and limitations
 + * under the License.
 + */
 +package org.apache.joshua.decoder.ff;
 +
++import static com.google.common.cache.CacheBuilder.newBuilder;
++
 +import java.util.List;
 +
 +import org.apache.joshua.corpus.Vocabulary;
 +import org.apache.joshua.decoder.JoshuaConfiguration;
 +import org.apache.joshua.decoder.chart_parser.SourcePath;
 +import org.apache.joshua.decoder.ff.state_maintenance.DPState;
 +import org.apache.joshua.decoder.ff.tm.Rule;
 +import org.apache.joshua.decoder.hypergraph.HGNode;
 +import org.apache.joshua.decoder.segment_file.Sentence;
 +
++import com.google.common.cache.Cache;
++
 +/**
-  *  This feature just counts rules that are used. You can restrict it with a number of flags:
-  * 
-  *   -owner OWNER
-  *    Only count rules owned by OWNER
-  *   -target|-source
-  *    Only count the target or source side (plus the LHS)
-  *
-  * TODO: add an option to separately provide a list of rule counts, restrict to counts above a threshold. 
++ *  This feature fires for rule ids.
++ *  Firing can be restricted to rules from a certain owner, and rule ids
++ *  can be generated from source side and/or target side. 
 + */
 +public class RuleFF extends StatelessFF {
 +
 +  private enum Sides { SOURCE, TARGET, BOTH };
 +  
-   private int owner = 0;
-   private Sides sides = Sides.BOTH;
++  private static final String NAME = "RuleFF";
++  // value to fire for features
++  private static final int VALUE = 1;
++  // whether this feature is restricted to a certain grammar/owner
++  private final boolean ownerRestriction;
++  // the grammar/owner this feature is restricted to fire
++  private final int owner;
++  // what part of the rule should be extracted;
++  private final Sides sides;
++  // Strings separating words and rule sides 
++  private static final String SEPARATOR = "~";
++  private static final String SIDES_SEPARATOR = "->";
++  
++  private final Cache<Rule, String> featureCache;
 +  
 +  public RuleFF(FeatureVector weights, String[] args, JoshuaConfiguration config) {
-     super(weights, "RuleFF", args, config);
++    super(weights, NAME, args, config);
++    
++    ownerRestriction = (parsedArgs.containsKey("owner")) ? true : false;
++    owner = ownerRestriction ? Vocabulary.id(parsedArgs.get("owner")) : 0;
 +    
-     owner = Vocabulary.id(parsedArgs.get("owner"));
-     if (parsedArgs.containsKey("source"))
-       sides = Sides.SOURCE;
-     else if (parsedArgs.containsKey("target"))
-       sides = Sides.TARGET;
++    if (parsedArgs.containsKey("sides")) {
++      final String sideValue = parsedArgs.get("sides");
++      if (sideValue.equalsIgnoreCase("source")) {
++        sides = Sides.SOURCE;
++      } else if (sideValue.equalsIgnoreCase("target")) {
++        sides = Sides.TARGET;
++      } else if (sideValue.equalsIgnoreCase("both")){
++        sides = Sides.BOTH;
++      } else {
++        throw new RuntimeException("Unknown side value.");
++      }
++    } else {
++      sides = Sides.BOTH;
++    }
++    
++    // initialize cache
++    if (parsedArgs.containsKey("cacheSize")) {
++      featureCache = newBuilder().maximumSize(Integer.parseInt(parsedArgs.get("cacheSize"))).build();
++    } else {
++      featureCache = newBuilder().maximumSize(config.cachedRuleSize).build();
++    }
 +  }
 +
 +  @Override
 +  public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
 +      Sentence sentence, Accumulator acc) {
- 
-     if (owner > 0 && rule.getOwner() == owner) {
-       String ruleString = getRuleString(rule);
-       acc.add(ruleString, 1);
++    
++    if (ownerRestriction && rule.getOwner() != owner) {
++      return null;
 +    }
 +
++    String featureName = featureCache.getIfPresent(rule);
++    if (featureName == null) {
++      featureName = getRuleString(rule);
++      featureCache.put(rule, featureName);
++    }
++    acc.add(featureName, VALUE);
++    
 +    return null;
 +  }
- 
-   private String getRuleString(Rule rule) {
-     String ruleString = "";
-     switch(sides) {
-     case BOTH:
-       ruleString = String.format("%s  %s  %s", Vocabulary.word(rule.getLHS()), rule.getFrenchWords(),
-           rule.getEnglishWords());
-       break;
- 
-     case SOURCE:
-       ruleString = String.format("%s  %s", Vocabulary.word(rule.getLHS()), rule.getFrenchWords());
-       break;
- 
-     case TARGET:
-       ruleString = String.format("%s  %s", Vocabulary.word(rule.getLHS()), rule.getEnglishWords());
-       break;
++  
++  /**
++   * Obtains the feature id for the given rule.
++   * @param rule
 ++   * @return String representing the feature name.
++   */
++  private String getRuleString(final Rule rule) {
++    final StringBuilder sb = new StringBuilder(Vocabulary.word(rule.getLHS()))
++      .append(SIDES_SEPARATOR);
++    if (sides == Sides.SOURCE || sides == Sides.BOTH) {
++      sb.append(Vocabulary.getWords(rule.getFrench(), SEPARATOR));
++    }
++    sb.append(SIDES_SEPARATOR);
++    if (sides == Sides.TARGET || sides == Sides.BOTH) {
++      sb.append(Vocabulary.getWords(rule.getEnglish(), SEPARATOR));
 +    }
-     return ruleString.replaceAll("[ =]", "~");
++    return sb.toString();
 +  }
 +
 +  @Override
 +  public double estimateLogP(Rule rule, int sentID) {
 +    // TODO Auto-generated method stub
 +    return 0;
 +  }
 +
 +  @Override
 +  public double getWeight() {
 +    // TODO Auto-generated method stub
 +    return 0;
 +  }
 +}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5c0d5388/src/main/java/org/apache/joshua/decoder/ff/RuleLength.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/decoder/ff/RuleLength.java
index 59b1c20,0000000..02c520b
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/decoder/ff/RuleLength.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/RuleLength.java
@@@ -1,51 -1,0 +1,52 @@@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *  http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing,
 + * software distributed under the License is distributed on an
 + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 + * KIND, either express or implied.  See the License for the
 + * specific language governing permissions and limitations
 + * under the License.
 + */
 +package org.apache.joshua.decoder.ff;
 +
 +import java.util.List;
 +
 +import org.apache.joshua.decoder.JoshuaConfiguration;
 +import org.apache.joshua.decoder.chart_parser.SourcePath;
 +import org.apache.joshua.decoder.ff.state_maintenance.DPState;
 +import org.apache.joshua.decoder.ff.tm.Rule;
 +import org.apache.joshua.decoder.hypergraph.HGNode;
 +import org.apache.joshua.decoder.segment_file.Sentence;
 +
 +/*
 + * This feature computes three feature templates: a feature indicating the length of the rule's
 + * source side, its target side, and a feature that pairs them.
 + */
 +public abstract class RuleLength extends StatelessFF {
++  
++  private static final int VALUE = 1;
 +
 +  public RuleLength(FeatureVector weights, String[] args, JoshuaConfiguration config) {
 +    super(weights, "RuleLength", args, config);
 +  }
 +
 +  @Override
 +  public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
 +      Sentence sentence, Accumulator acc) {
-     int sourceLen = rule.getFrench().length;
-     int targetLen = rule.getEnglish().length;
-     acc.add(String.format("%s_sourceLength%d", name, sourceLen), 1);
-     acc.add(String.format("%s_targetLength%d", name, targetLen), 1);
-     acc.add(String.format("%s_pairLength%d-%d", name, sourceLen, targetLen), 1);
- 
++    int sourceLength = rule.getFrench().length;
++    int targetLength = rule.getEnglish().length;
++    acc.add(name + "_source" + sourceLength, VALUE);
++    acc.add(name + "_target" + sourceLength, VALUE);
++    acc.add(name + "_sourceTarget" + sourceLength + "-" + targetLength, VALUE);
 +    return null;
 +  }
 +}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5c0d5388/src/main/java/org/apache/joshua/decoder/ff/RuleShape.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/decoder/ff/RuleShape.java
index a514021,0000000..6333701
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/decoder/ff/RuleShape.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/RuleShape.java
@@@ -1,85 -1,0 +1,112 @@@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *  http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing,
 + * software distributed under the License is distributed on an
 + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 + * KIND, either express or implied.  See the License for the
 + * specific language governing permissions and limitations
 + * under the License.
 + */
 +package org.apache.joshua.decoder.ff;
 +
 +import java.util.List;
 +
 +import org.apache.joshua.decoder.JoshuaConfiguration;
 +import org.apache.joshua.decoder.chart_parser.SourcePath;
 +import org.apache.joshua.decoder.ff.state_maintenance.DPState;
 +import org.apache.joshua.decoder.ff.tm.Rule;
 +import org.apache.joshua.decoder.hypergraph.HGNode;
 +import org.apache.joshua.decoder.segment_file.Sentence;
++import org.apache.joshua.util.FormatUtils;
++import org.apache.joshua.corpus.Vocabulary;
 +
 +/*
 + * Implements the RuleShape feature for source, target, and paired source+target sides.
 + */
 +public class RuleShape extends StatelessFF {
 +
 +  public RuleShape(FeatureVector weights, String[] args, JoshuaConfiguration config) {
 +    super(weights, "RuleShape", args, config);
 +  }
 +
-   private int gettype(int id) {
-     if (id < 0)
-       return -1;
-     return 1;
++  private enum WordType {
++    N("N"), T("x"), P("+");
++    private final String string;
++    private boolean repeats;
++
++    private WordType(final String string) {
++      this.string = string;
++      this.repeats = false;
++    }
++    
++    private void setRepeats() {
++      repeats = true;
++    }
++
++    @Override
++    public String toString() {
++      if (repeats) {
++        return this.string + "+";
++      }
++      return this.string;
++    }
++  }
++
++  private WordType getWordType(int id) {
++    if (FormatUtils.isNonterminal(id)) {
++      return WordType.N;
++    } else {
++      return WordType.T;
++    }
 +  }
 +  
-   private String pattern(int[] ids) {
-     StringBuilder pattern = new StringBuilder();
-     int curtype = gettype(ids[0]);
-     int curcount = 1;
++  /**
++   * Returns a String describing the rule pattern.
++   */
++  private String getRulePattern(int[] ids) {
++    final StringBuilder pattern = new StringBuilder();
++    WordType currentType = getWordType(ids[0]);
 +    for (int i = 1; i < ids.length; i++) {
-       if (gettype(ids[i]) != curtype) {
-         pattern.append(String.format("%s%s_", curtype < 0 ? "N" : "x", curcount > 1 ? "+" : ""));
-         curtype = gettype(ids[i]);
-         curcount = 1;
++      if (getWordType(ids[i]) != currentType) {
++        pattern.append(currentType.toString());
++        currentType = getWordType(ids[i]);
 +      } else {
-         curcount++;
++        currentType.setRepeats();
 +      }
 +    }
-     pattern.append(String.format("%s%s_", curtype < 0 ? "N" : "x", curcount > 1 ? "+" : ""));
++    pattern.append(currentType.toString());
 +    return pattern.toString();
 +  }
 +  
 +  @Override
 +  public DPState compute(Rule rule, List<HGNode> tailNodes, int i_, int j, SourcePath sourcePath,
 +      Sentence sentence, Accumulator acc) {
-     String sourceShape = pattern(rule.getFrench());
-     String targetShape = pattern(rule.getEnglish());
-     acc.add(String.format("%s_source_%s", name, sourceShape), 1);
-     acc.add(String.format("%s_target_%s", name, targetShape), 1);
-     acc.add(String.format("%s_both_%s__%s", name, sourceShape, targetShape), 1);
- 
++    final String sourceShape = getRulePattern(rule.getFrench());
++    final String targetShape = getRulePattern(rule.getEnglish());
++    acc.add(name + "_source_" + sourceShape, 1);
++    acc.add(name + "_target_" + sourceShape, 1);
++    acc.add(name + "_sourceTarget_" + sourceShape + "_" + targetShape, 1);
 +    return null;
 +  }
 +
 +  @Override
 +  public double estimateLogP(Rule rule, int sentID) {
 +    // TODO Auto-generated method stub
 +    return 0;
 +  }
 +
 +  @Override
 +  public double getWeight() {
 +    // TODO Auto-generated method stub
 +    return 0;
 +  }
 +}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5c0d5388/src/main/java/org/apache/joshua/decoder/ff/WordPenalty.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/decoder/ff/WordPenalty.java
index 62c889f,0000000..e1f74c2
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/decoder/ff/WordPenalty.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/WordPenalty.java
@@@ -1,90 -1,0 +1,92 @@@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *  http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing,
 + * software distributed under the License is distributed on an
 + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 + * KIND, either express or implied.  See the License for the
 + * specific language governing permissions and limitations
 + * under the License.
 + */
 +package org.apache.joshua.decoder.ff;
 +
 +import java.util.ArrayList;
 +import java.util.List;
 +
 +import org.apache.joshua.decoder.JoshuaConfiguration;
 +import org.apache.joshua.decoder.ff.state_maintenance.DPState;
 +import org.apache.joshua.decoder.ff.tm.Rule;
 +import org.apache.joshua.decoder.chart_parser.SourcePath;
 +import org.apache.joshua.decoder.hypergraph.HGNode;
 +import org.apache.joshua.decoder.phrase.Hypothesis;
 +import org.apache.joshua.decoder.segment_file.Sentence;
 +
 +/**
 + * 
 + * @author Zhifei Li zhifei.work@gmail.com
 + * @author Matt Post post@cs.jhu.edu
 + */
 +public final class WordPenalty extends StatelessFF {
 +
 +  private float OMEGA = -(float) Math.log10(Math.E); // -0.435
++  private final boolean isCky;
 +
 +  public WordPenalty(final FeatureVector weights, String[] args, JoshuaConfiguration config) {
 +    super(weights, "WordPenalty", args, config);
 +
 +    if (parsedArgs.containsKey("value"))
 +      OMEGA = Float.parseFloat(parsedArgs.get("value"));
++    
++    isCky = config.search_algorithm.equals("cky");
 +  }
 +
 +  @Override
 +  public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
 +      Sentence sentence, Accumulator acc) {
 +
 +    if (rule != null) {
 +      // TODO: this is an inefficient way to do this. Find a better way to not apply this rule
 +      // to start and stop glue rules when phrase-based decoding.
-       if (config.search_algorithm.equals("cky") 
-           || (rule != Hypothesis.BEGIN_RULE && rule != Hypothesis.END_RULE))
-         // acc.add(name, OMEGA * (rule.getEnglish().length - rule.getArity()));
++      if (isCky || (rule != Hypothesis.BEGIN_RULE && rule != Hypothesis.END_RULE)) {
 +        acc.add(denseFeatureIndex, OMEGA * (rule.getEnglish().length - rule.getArity()));
++      }
 +    }
 +      
 +    return null;
 +  }
 +
 +  @Override
 +  public ArrayList<String> reportDenseFeatures(int index) {
 +    denseFeatureIndex = index;
-     ArrayList<String> names = new ArrayList<String>();
++    ArrayList<String> names = new ArrayList<>(1);
 +    names.add(name);
 +    return names;
 +  }
 +
 +  @Override
 +  public float estimateCost(Rule rule, Sentence sentence) {
 +    if (rule != null)
 +      return weights.getDense(denseFeatureIndex) * OMEGA * (rule.getEnglish().length - rule.getArity());
 +    return 0.0f;
 +  }
 +
 +  @Override
 +  public double estimateLogP(Rule rule, int sentID) {
 +    // TODO Auto-generated method stub
 +    return 0;
 +  }
 +
 +  @Override
 +  public double getWeight() {
 +    // TODO Auto-generated method stub
 +    return 0;
 +  }
 +}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5c0d5388/src/test/java/org/apache/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java
----------------------------------------------------------------------
diff --cc src/test/java/org/apache/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java
index df73136,0000000..00a6a36
mode 100644,000000..100644
--- a/src/test/java/org/apache/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java
+++ b/src/test/java/org/apache/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java
@@@ -1,79 -1,0 +1,79 @@@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *  http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing,
 + * software distributed under the License is distributed on an
 + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 + * KIND, either express or implied.  See the License for the
 + * specific language governing permissions and limitations
 + * under the License.
 + */
 +package org.apache.joshua.decoder.ff.lm.berkeley_lm;
 +
 +import static org.junit.Assert.assertEquals;
 +
 +import java.util.Arrays;
 +import java.util.List;
 +
 +import org.junit.After;
 +import org.junit.Test;
 +import org.junit.runner.RunWith;
 +import org.junit.runners.Parameterized;
 +import org.junit.runners.Parameterized.Parameters;
 +
 +import org.apache.joshua.decoder.Decoder;
 +import org.apache.joshua.decoder.JoshuaConfiguration;
 +import org.apache.joshua.decoder.Translation;
 +import org.apache.joshua.decoder.segment_file.Sentence;
 +
 +/**
 + * Replacement for test/lm/berkeley/test.sh regression test
 + */
 +@RunWith(value = Parameterized.class)
 +public class LMGrammarBerkeleyTest {
 +
 +  private static final String INPUT = "the chat-rooms";
 +  private static final String[] OPTIONS = "-v 0 -output-format %f".split(" ");
 +  
 +  private JoshuaConfiguration joshuaConfig;
 +  private Decoder decoder;
 +  
 +  @Parameters
 +  public static List<String> lmFiles() {
 +    return Arrays.asList("resources/berkeley_lm/lm", 
 +        "resources/berkeley_lm/lm.gz", 
 +        "resources/berkeley_lm/lm.berkeleylm", 
 +        "resources/berkeley_lm/lm.berkeleylm.gz");
 +  }
 +  
 +  @After
 +  public void tearDown() throws Exception {
 +    decoder.cleanUp();
 +  }
 +  
 +  //TODO @Parameters
 +  public String lmFile;
 +  
 +  @Test
 +  public void verifyLM() {
 +    joshuaConfig = new JoshuaConfiguration();
 +    joshuaConfig.processCommandLineOptions(OPTIONS);
-     joshuaConfig.features.add("feature_function = LanguageModel -lm_type berkeleylm -lm_order 2 -lm_file " + lmFile);
++    joshuaConfig.features.add("LanguageModel -lm_type berkeleylm -lm_order 2 -lm_file " + lmFile);
 +    decoder = new Decoder(joshuaConfig, null);
 +    String translation = decode(INPUT).toString();
 +    assertEquals(lmFile, "tm_glue_0=2.000 lm_0=-7.153\n", translation);
 +  }
 +  
 +  private Translation decode(String input) {
 +    final Sentence sentence = new Sentence(input, 0, joshuaConfig);
 +    return decoder.decode(sentence);
 +  }
 +}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5c0d5388/src/test/java/org/apache/joshua/system/MultithreadedTranslationTests.java
----------------------------------------------------------------------
diff --cc src/test/java/org/apache/joshua/system/MultithreadedTranslationTests.java
index f006363,0000000..c760586
mode 100644,000000..100644
--- a/src/test/java/org/apache/joshua/system/MultithreadedTranslationTests.java
+++ b/src/test/java/org/apache/joshua/system/MultithreadedTranslationTests.java
@@@ -1,164 -1,0 +1,164 @@@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *  http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing,
 + * software distributed under the License is distributed on an
 + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 + * KIND, either express or implied.  See the License for the
 + * specific language governing permissions and limitations
 + * under the License.
 + */
 + package org.apache.joshua.system;
 +
 +import static org.junit.Assert.assertTrue;
 +
 +import java.io.BufferedReader;
 +import java.io.ByteArrayInputStream;
 +import java.io.ByteArrayOutputStream;
 +import java.io.IOException;
 +import java.io.InputStreamReader;
 +import java.nio.charset.Charset;
 +import java.util.ArrayList;
 +
 +import org.apache.joshua.decoder.Decoder;
 +import org.apache.joshua.decoder.JoshuaConfiguration;
 +import org.apache.joshua.decoder.MetaDataException;
 +import org.apache.joshua.decoder.io.TranslationRequestStream;
 +import org.apache.joshua.decoder.segment_file.Sentence;
 +
 +import org.junit.After;
 +import org.junit.Before;
 +import org.junit.Test;
 +
 +/**
 + * Integration test for multithreaded Joshua decoder tests. Grammar used is a
 + * toy packed grammar.
 + *
 + * @author kellens
 + */
 +public class MultithreadedTranslationTests {
 +
 +  private JoshuaConfiguration joshuaConfig = null;
 +  private Decoder decoder = null;
 +  private static final String INPUT = "A K B1 U Z1 Z2 B2 C";
 +  private int previousLogLevel;
 +  private final static long NANO_SECONDS_PER_SECOND = 1_000_000_000;
 +
 +  @Before
 +  public void setUp() throws Exception {
 +    joshuaConfig = new JoshuaConfiguration();
 +    joshuaConfig.search_algorithm = "cky";
 +    joshuaConfig.mark_oovs = false;
 +    joshuaConfig.pop_limit = 100;
 +    joshuaConfig.use_unique_nbest = false;
 +    joshuaConfig.include_align_index = false;
 +    joshuaConfig.topN = 0;
 +    joshuaConfig.tms.add("thrax -owner pt -maxspan 20 -path resources/wa_grammar.packed");
 +    joshuaConfig.tms.add("thrax -owner glue -maxspan -1 -path resources/grammar.glue");
 +    joshuaConfig.goal_symbol = "[GOAL]";
 +    joshuaConfig.default_non_terminal = "[X]";
-     joshuaConfig.features.add("feature_function = OOVPenalty");
++    joshuaConfig.features.add("OOVPenalty");
 +    joshuaConfig.weights.add("tm_pt_0 1");
 +    joshuaConfig.weights.add("tm_pt_1 1");
 +    joshuaConfig.weights.add("tm_pt_2 1");
 +    joshuaConfig.weights.add("tm_pt_3 1");
 +    joshuaConfig.weights.add("tm_pt_4 1");
 +    joshuaConfig.weights.add("tm_pt_5 1");
 +    joshuaConfig.weights.add("tm_glue_0 1");
 +    joshuaConfig.weights.add("OOVPenalty 2");
 +    joshuaConfig.num_parallel_decoders = 500; // This will enable 500 parallel
 +                                              // decoders to run at once.
 +                                              // Useful to help flush out
 +                                              // concurrency errors in
 +                                              // underlying
 +                                              // data-structures.
 +    this.decoder = new Decoder(joshuaConfig, ""); // Second argument
 +                                                  // (configFile)
 +                                                  // is not even used by the
 +                                                  // constructor/initialize.
 +
 +    previousLogLevel = Decoder.VERBOSE;
 +    Decoder.VERBOSE = 0;
 +  }
 +
 +  @After
 +  public void tearDown() throws Exception {
 +    this.decoder.cleanUp();
 +    this.decoder = null;
 +    Decoder.VERBOSE = previousLogLevel;
 +  }
 +
 +
 +
 +  // This test was created specifically to reproduce a multithreaded issue
 +  // related to mapped byte array access in the PackedGrammar getAlignmentArray
 +  // function.
 +
 +  // We'll test the decoding engine using N = 10,000 identical inputs. This
 +  // should be sufficient to induce concurrent data access for many shared
 +  // data structures.
 +
 +  @Test
 +  public void givenPackedGrammar_whenNTranslationsCalledConcurrently_thenReturnNResults() {
 +    // GIVEN
 +
 +    int inputLines = 10000;
 +    joshuaConfig.use_structured_output = true; // Enabled alignments.
 +    StringBuilder sb = new StringBuilder();
 +    for (int i = 0; i < inputLines; i++) {
 +      sb.append(INPUT + "\n");
 +    }
 +
 +    // Append a large string together to simulate N requests to the decoding
 +    // engine.
 +    TranslationRequestStream req = new TranslationRequestStream(
 +        new BufferedReader(new InputStreamReader(new ByteArrayInputStream(sb.toString()
 +        .getBytes(Charset.forName("UTF-8"))))), joshuaConfig);
 +    
 +    ByteArrayOutputStream output = new ByteArrayOutputStream();
 +
 +    // WHEN
 +    // Translate all spans in parallel.
 +    try {
 +      this.decoder.decodeAll(req, output);
 +    } catch (IOException e) {
 +      // TODO Auto-generated catch block
 +      e.printStackTrace();
 +    }
 +    ArrayList<Sentence> translationResults = new ArrayList<Sentence>();
 +
 +
 +    final long translationStartTime = System.nanoTime();
 +    Sentence t;
 +    try {
 +      while ((t = req.next()) != null) {
 +        translationResults.add(t);
 +      }
 +    } catch (MetaDataException e) {
 +      e.printStackTrace();
 +    } finally {
 +      if (output != null) {
 +        try {
 +          output.close();
 +        } catch (IOException e) {
 +          e.printStackTrace();
 +        }
 +      }
 +    }
 +
 +    final long translationEndTime = System.nanoTime();
 +    final double pipelineLoadDurationInSeconds = (translationEndTime - translationStartTime) / ((double)NANO_SECONDS_PER_SECOND);
 +    System.err.println(String.format("%.2f seconds", pipelineLoadDurationInSeconds));
 +
 +    // THEN
 +    assertTrue(translationResults.size() == inputLines);
 +  }
 +}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5c0d5388/src/test/java/org/apache/joshua/system/StructuredTranslationTest.java
----------------------------------------------------------------------
diff --cc src/test/java/org/apache/joshua/system/StructuredTranslationTest.java
index a78a4a1,0000000..69412e2
mode 100644,000000..100644
--- a/src/test/java/org/apache/joshua/system/StructuredTranslationTest.java
+++ b/src/test/java/org/apache/joshua/system/StructuredTranslationTest.java
@@@ -1,272 -1,0 +1,272 @@@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *  http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing,
 + * software distributed under the License is distributed on an
 + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 + * KIND, either express or implied.  See the License for the
 + * specific language governing permissions and limitations
 + * under the License.
 + */
 +package org.apache.joshua.system;
 +
 +import static java.util.Arrays.asList;
 +import static org.junit.Assert.assertEquals;
 +import static org.junit.Assert.assertTrue;
 +
 +import java.util.HashMap;
 +import java.util.List;
 +import java.util.Map;
 +
 +import org.apache.joshua.decoder.Decoder;
 +import org.apache.joshua.decoder.JoshuaConfiguration;
 +import org.apache.joshua.decoder.StructuredTranslation;
 +import org.apache.joshua.decoder.Translation;
 +import org.apache.joshua.decoder.segment_file.Sentence;
 +
 +import org.junit.After;
 +import org.junit.Before;
 +import org.junit.Test;
 +
 +/**
 + * Integration test for the complete Joshua decoder using a toy grammar that translates
 + * a bunch of capital letters to lowercase letters. Rules in the test grammar
 + * drop and generate additional words and simulate reordering of rules, so that
 + * proper extraction of word alignments and other information from the decoder
 + * can be tested.
 + * 
 + * @author fhieber
 + */
 +public class StructuredTranslationTest {
 +
 +  private JoshuaConfiguration joshuaConfig = null;
 +  private Decoder decoder = null;
 +  private static final String INPUT = "A K B1 U Z1 Z2 B2 C";
 +  private static final String EXPECTED_TRANSLATION = "a b n1 u z c1 k1 k2 k3 n1 n2 n3 c2";
 +  private static final List<String> EXPECTED_TRANSLATED_TOKENS = asList(EXPECTED_TRANSLATION.split("\\s+"));
 +  private static final String EXPECTED_WORD_ALIGNMENT_STRING = "0-0 2-1 6-1 3-3 4-4 5-4 7-5 1-6 1-7 1-8 7-12";
 +  private static final List<List<Integer>> EXPECTED_WORD_ALIGNMENT = asList(
 +      asList(0), asList(2, 6), asList(), asList(3),
 +      asList(4, 5), asList(7), asList(1),
 +      asList(1), asList(1), asList(), asList(),
 +      asList(), asList(7));
 +  private static final double EXPECTED_SCORE = -17.0;
 +  private static final Map<String,Float> EXPECTED_FEATURES = new HashMap<>();
 +  private static final int EXPECTED_NBEST_LIST_SIZE = 8;
 +  static {
 +    EXPECTED_FEATURES.put("tm_glue_0", 1.0f);
 +    EXPECTED_FEATURES.put("tm_pt_0", -3.0f);
 +    EXPECTED_FEATURES.put("tm_pt_1", -3.0f);
 +    EXPECTED_FEATURES.put("tm_pt_2", -3.0f);
 +    EXPECTED_FEATURES.put("tm_pt_3", -3.0f);
 +    EXPECTED_FEATURES.put("tm_pt_4", -3.0f);
 +    EXPECTED_FEATURES.put("tm_pt_5", -3.0f);
 +    EXPECTED_FEATURES.put("OOV", 7.0f);
 +  }
 +
 +  @Before
 +  public void setUp() throws Exception {
 +    joshuaConfig = new JoshuaConfiguration();
 +    joshuaConfig.search_algorithm = "cky";
 +    joshuaConfig.mark_oovs = false;
 +    joshuaConfig.pop_limit = 100;
 +    joshuaConfig.use_unique_nbest = false;
 +    joshuaConfig.include_align_index = false;
 +    joshuaConfig.topN = 0;
 +    joshuaConfig.tms.add("thrax -owner pt -maxspan 20 -path resources/wa_grammar");
 +    joshuaConfig.tms.add("thrax -owner glue -maxspan -1 -path resources/grammar.glue");
 +    joshuaConfig.goal_symbol = "[GOAL]";
 +    joshuaConfig.default_non_terminal = "[X]";
-     joshuaConfig.features.add("feature_function = OOVPenalty");
++    joshuaConfig.features.add("OOVPenalty");
 +    joshuaConfig.weights.add("tm_pt_0 1");
 +    joshuaConfig.weights.add("tm_pt_1 1");
 +    joshuaConfig.weights.add("tm_pt_2 1");
 +    joshuaConfig.weights.add("tm_pt_3 1");
 +    joshuaConfig.weights.add("tm_pt_4 1");
 +    joshuaConfig.weights.add("tm_pt_5 1");
 +    joshuaConfig.weights.add("tm_glue_0 1");
 +    joshuaConfig.weights.add("OOVPenalty 1");
 +    decoder = new Decoder(joshuaConfig, ""); // second argument (configFile
 +                                             // is not even used by the
 +                                             // constructor/initialize)
 +  }
 +
 +  @After
 +  public void tearDown() throws Exception {
 +    decoder.cleanUp();
 +    decoder = null;
 +  }
 +
 +  private Translation decode(String input) {
 +    Sentence sentence = new Sentence(input, 0, joshuaConfig);
 +    return decoder.decode(sentence);
 +  }
 +  
 +  @Test
 +  public void givenInput_whenRegularOutputFormat_thenExpectedOutput() {
 +    // GIVEN
 +    joshuaConfig.use_structured_output = false;
 +    joshuaConfig.outputFormat = "%s | %a ";
 +    
 +    // WHEN
 +    final String translation = decode(INPUT).toString().trim();
 +    
 +    // THEN
 +    assertEquals(EXPECTED_TRANSLATION + " | " + EXPECTED_WORD_ALIGNMENT_STRING, translation);
 +  }
 +  
 +  @Test
 +  public void givenInput_whenRegularOutputFormatWithTopN1_thenExpectedOutput() {
 +    // GIVEN
 +    joshuaConfig.use_structured_output = false;
 +    joshuaConfig.outputFormat = "%s | %e | %a | %c";
 +    joshuaConfig.topN = 1;
 +    
 +    // WHEN
 +    final String translation = decode(INPUT).toString().trim();
 +    
 +    // THEN
 +    assertEquals(EXPECTED_TRANSLATION + " | " + INPUT + " | " + EXPECTED_WORD_ALIGNMENT_STRING + String.format(" | %.3f", EXPECTED_SCORE),
 +        translation);
 +  }
 +
 +  @Test
 +  public void givenInput_whenStructuredOutputFormatWithTopN0_thenExpectedOutput() {
 +    // GIVEN
 +    joshuaConfig.use_structured_output = true;
 +    joshuaConfig.topN = 0;
 +    
 +    // WHEN
 +    final Translation translation = decode(INPUT);
 +    final StructuredTranslation structuredTranslation = translation.getStructuredTranslations().get(0);
 +    final String translationString = structuredTranslation.getTranslationString();
 +    final List<String> translatedTokens = structuredTranslation.getTranslationTokens();
 +    final float translationScore = structuredTranslation.getTranslationScore();
 +    final List<List<Integer>> wordAlignment = structuredTranslation.getTranslationWordAlignments();
 +    final Map<String,Float> translationFeatures = structuredTranslation.getTranslationFeatures();
 +    
 +    // THEN
 +    assertTrue(translation.getStructuredTranslations().size() == 1);
 +    assertEquals(EXPECTED_TRANSLATION, translationString);
 +    assertEquals(EXPECTED_TRANSLATED_TOKENS, translatedTokens);
 +    assertEquals(EXPECTED_SCORE, translationScore, 0.00001);
 +    assertEquals(EXPECTED_WORD_ALIGNMENT, wordAlignment);
 +    assertEquals(wordAlignment.size(), translatedTokens.size());
 +    assertEquals(EXPECTED_FEATURES.entrySet(), translationFeatures.entrySet());
 +  }
 +  
 +  @Test
 +  public void givenInput_whenStructuredOutputFormatWithTopN1_thenExpectedOutput() {
 +    // GIVEN
 +    joshuaConfig.use_structured_output = true;
 +    joshuaConfig.topN = 1;
 +    
 +    // WHEN
 +    final Translation translation = decode(INPUT);
 +    final List<StructuredTranslation> structuredTranslations = translation.getStructuredTranslations();
 +    final StructuredTranslation structuredTranslation = structuredTranslations.get(0);
 +    final String translationString = structuredTranslation.getTranslationString();
 +    final List<String> translatedTokens = structuredTranslation.getTranslationTokens();
 +    final float translationScore = structuredTranslation.getTranslationScore();
 +    final List<List<Integer>> wordAlignment = structuredTranslation.getTranslationWordAlignments();
 +    final Map<String,Float> translationFeatures = structuredTranslation.getTranslationFeatures();
 +    
 +    // THEN
 +    assertTrue(structuredTranslations.size() == 1);
 +    assertEquals(EXPECTED_TRANSLATION, translationString);
 +    assertEquals(EXPECTED_TRANSLATED_TOKENS, translatedTokens);
 +    assertEquals(EXPECTED_SCORE, translationScore, 0.00001);
 +    assertEquals(EXPECTED_WORD_ALIGNMENT, wordAlignment);
 +    assertEquals(wordAlignment.size(), translatedTokens.size());
 +    assertEquals(EXPECTED_FEATURES.entrySet(), translationFeatures.entrySet());
 +  }
 +  
 +  @Test
 +  public void givenInput_whenStructuredOutputFormatWithKBest_thenExpectedOutput() {
 +    // GIVEN
 +    joshuaConfig.use_structured_output = true;
 +    joshuaConfig.topN = 100;
 +    
 +    // WHEN
 +    final Translation translation = decode(INPUT);
 +    final List<StructuredTranslation> structuredTranslations = translation.getStructuredTranslations();
 +    final StructuredTranslation viterbiTranslation = structuredTranslations.get(0);
 +    final StructuredTranslation lastKBest = structuredTranslations.get(structuredTranslations.size() - 1);
 +    
 +    // THEN
 +    assertEquals(structuredTranslations.size(), EXPECTED_NBEST_LIST_SIZE);
 +    assertTrue(structuredTranslations.size() > 1);
 +    assertEquals(EXPECTED_TRANSLATION, viterbiTranslation.getTranslationString());
 +    assertEquals(EXPECTED_TRANSLATED_TOKENS, viterbiTranslation.getTranslationTokens());
 +    assertEquals(EXPECTED_SCORE, viterbiTranslation.getTranslationScore(), 0.00001);
 +    assertEquals(EXPECTED_WORD_ALIGNMENT, viterbiTranslation.getTranslationWordAlignments());
 +    assertEquals(EXPECTED_FEATURES.entrySet(), viterbiTranslation.getTranslationFeatures().entrySet());
 +    // last entry in KBEST is all input words untranslated, should have 8 OOVs.
 +    assertEquals(INPUT, lastKBest.getTranslationString());
 +    assertEquals(-800.0, lastKBest.getTranslationFeatures().get("OOVPenalty"), 0.0001);
 +    
 +  }
 +  
 +  @Test
 +  public void givenEmptyInput_whenStructuredOutputFormat_thenEmptyOutput() {
 +    // GIVEN
 +    joshuaConfig.use_structured_output = true;
 +    
 +    // WHEN
 +    final Translation translation = decode("");
 +    final StructuredTranslation structuredTranslation = translation.getStructuredTranslations().get(0);
 +    final String translationString = structuredTranslation.getTranslationString();
 +    final List<String> translatedTokens = structuredTranslation.getTranslationTokens();
 +    final float translationScore = structuredTranslation.getTranslationScore();
 +    final List<List<Integer>> wordAlignment = structuredTranslation.getTranslationWordAlignments();
 +    
 +    // THEN
 +    assertEquals("", translationString);
 +    assertTrue(translatedTokens.isEmpty());
 +    assertEquals(0, translationScore, 0.00001);
 +    assertTrue(wordAlignment.isEmpty());
 +  }
 +  
 +  @Test
 +  public void givenOOVInput_whenStructuredOutputFormat_thenOOVOutput() {
 +    // GIVEN
 +    joshuaConfig.use_structured_output = true;
 +    final String input = "gabarbl";
 +    
 +    // WHEN
 +    final Translation translation = decode(input);
 +    final StructuredTranslation structuredTranslation = translation.getStructuredTranslations().get(0);
 +    final String translationString = structuredTranslation.getTranslationString();
 +    final List<String> translatedTokens = structuredTranslation.getTranslationTokens();
 +    final float translationScore = structuredTranslation.getTranslationScore();
 +    final List<List<Integer>> wordAlignment = structuredTranslation.getTranslationWordAlignments();
 +    
 +    // THEN
 +    assertEquals(input, translationString);
 +    assertTrue(translatedTokens.contains(input));
 +    assertEquals(-99.0, translationScore, 0.00001);
 +    assertTrue(wordAlignment.contains(asList(0)));
 +  }
 +  
 +  @Test
 +  public void givenEmptyInput_whenRegularOutputFormat_thenNewlineOutput() {
 +    // GIVEN
 +    joshuaConfig.use_structured_output = false;
 +    
 +    // WHEN
 +    final Translation translation = decode("");
 +    final String translationString = translation.toString();
 +    
 +    // THEN
 +    assertEquals("\n", translationString);
 +  }
 +
 +}



[4/5] incubator-joshua git commit: Merge branch 'sparse' of https://github.com/fhieber/incubator-joshua into JOSHUA-PR21

Posted by mj...@apache.org.
Merge branch 'sparse' of https://github.com/fhieber/incubator-joshua into JOSHUA-PR21

# Conflicts:
#	lib/ivy.xml
#	src/main/java/org/apache/joshua/decoder/Decoder.java
#	src/main/java/org/apache/joshua/decoder/ff/RuleLength.java
#	src/main/java/org/apache/joshua/decoder/ff/RuleShape.java


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/5c0d5388
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/5c0d5388
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/5c0d5388

Branch: refs/heads/JOSHUA-252
Commit: 5c0d5388ae7a76538337bf89bd6ac9a04d2c6dff
Parents: 9e70266 5591c67
Author: Matt Post <po...@cs.jhu.edu>
Authored: Tue May 31 15:39:04 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Tue May 31 15:39:04 2016 -0400

----------------------------------------------------------------------
 lib/ivy.xml                                     |  17 +++
 src/joshua/decoder/ff/LexicalFeatures.java      | 131 +++++++++++++++++++
 .../org/apache/joshua/corpus/Vocabulary.java    |  13 +-
 .../java/org/apache/joshua/decoder/Decoder.java |  17 ++-
 .../joshua/decoder/JoshuaConfiguration.java     |  10 +-
 .../apache/joshua/decoder/ff/OOVPenalty.java    |  15 ++-
 .../org/apache/joshua/decoder/ff/RuleFF.java    | 109 +++++++++------
 .../apache/joshua/decoder/ff/RuleLength.java    |  13 +-
 .../org/apache/joshua/decoder/ff/RuleShape.java |  67 +++++++---
 .../apache/joshua/decoder/ff/WordPenalty.java   |  10 +-
 .../lm/berkeley_lm/LMGrammarBerkeleyTest.java   |   2 +-
 .../system/MultithreadedTranslationTests.java   |   2 +-
 .../system/StructuredTranslationTest.java       |   2 +-
 13 files changed, 314 insertions(+), 94 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5c0d5388/src/main/java/org/apache/joshua/corpus/Vocabulary.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/corpus/Vocabulary.java
index 8416e4a,0000000..f1bf53d
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/corpus/Vocabulary.java
+++ b/src/main/java/org/apache/joshua/corpus/Vocabulary.java
@@@ -1,295 -1,0 +1,302 @@@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *  http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing,
 + * software distributed under the License is distributed on an
 + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 + * KIND, either express or implied.  See the License for the
 + * specific language governing permissions and limitations
 + * under the License.
 + */
 +package org.apache.joshua.corpus;
 +
 +import java.io.BufferedInputStream;
 +import java.io.BufferedOutputStream;
 +import java.io.DataInputStream;
 +import java.io.DataOutputStream;
 +import java.io.Externalizable;
 +import java.io.File;
 +import java.io.FileInputStream;
 +import java.io.FileOutputStream;
 +import java.io.IOException;
 +import java.io.ObjectInput;
 +import java.io.ObjectOutput;
 +import java.util.ArrayList;
 +import java.util.HashMap;
 +import java.util.List;
 +import java.util.Map;
 +import java.util.concurrent.locks.StampedLock;
 +
 +import org.apache.joshua.decoder.Decoder;
 +import org.apache.joshua.decoder.ff.lm.NGramLanguageModel;
 +import org.apache.joshua.util.FormatUtils;
 +import org.slf4j.Logger;
 +import org.slf4j.LoggerFactory;
 +
 +/**
 + * Static singular vocabulary class.
 + * Supports (de-)serialization into a vocabulary file.
 + *
 + * @author Juri Ganitkevitch
 + */
 +
 +public class Vocabulary implements Externalizable {
 +
 +  private static final Logger LOG = LoggerFactory.getLogger(Vocabulary.class);
 +  private final static ArrayList<NGramLanguageModel> LMs = new ArrayList<>();
 +
 +  private static List<String> idToString;
 +  private static Map<String, Integer> stringToId;
 +  private static final StampedLock lock = new StampedLock();
 +
 +  static final int UNKNOWN_ID = 0;
 +  static final String UNKNOWN_WORD = "<unk>";
 +
 +  public static final String START_SYM = "<s>";
 +  public static final String STOP_SYM = "</s>";
 +
 +  static {
 +    clear();
 +  }
 +
 +  public static boolean registerLanguageModel(NGramLanguageModel lm) {
 +    long lock_stamp = lock.writeLock();
 +    try {
 +      // Store the language model.
 +      LMs.add(lm);
 +      // Notify it of all the existing words.
 +      boolean collision = false;
 +      for (int i = idToString.size() - 1; i > 0; i--)
 +        collision = collision || lm.registerWord(idToString.get(i), i);
 +      return collision;
 +    } finally {
 +      lock.unlockWrite(lock_stamp);
 +    }
 +  }
 +
 +  /**
 +   * Reads a vocabulary from file. This deletes any additions to the vocabulary made prior to
 +   * reading the file.
 +   *
 +   * @param vocab_file path to a vocabulary file
 +   * @return Returns true if vocabulary was read without mismatches or collisions.
 +   * @throws IOException of the file cannot be found or read properly
 +   */
 +  public static boolean read(final File vocab_file) throws IOException {
 +    DataInputStream vocab_stream =
 +        new DataInputStream(new BufferedInputStream(new FileInputStream(vocab_file)));
 +    int size = vocab_stream.readInt();
 +    LOG.info("Read {} entries from the vocabulary", size);
 +    clear();
 +    for (int i = 0; i < size; i++) {
 +      int id = vocab_stream.readInt();
 +      String token = vocab_stream.readUTF();
 +      if (id != Math.abs(id(token))) {
 +        vocab_stream.close();
 +        return false;
 +      }
 +    }
 +    vocab_stream.close();
 +    return (size + 1 == idToString.size());
 +  }
 +
 +  public static void write(String file_name) throws IOException {
 +    long lock_stamp =lock.readLock();
 +    try {
 +      File vocab_file = new File(file_name);
 +      DataOutputStream vocab_stream =
 +          new DataOutputStream(new BufferedOutputStream(new FileOutputStream(vocab_file)));
 +      vocab_stream.writeInt(idToString.size() - 1);
 +      LOG.info("Writing vocabulary: {} tokens", idToString.size() - 1);
 +      for (int i = 1; i < idToString.size(); i++) {
 +        vocab_stream.writeInt(i);
 +        vocab_stream.writeUTF(idToString.get(i));
 +      }
 +      vocab_stream.close();
 +    }
 +    finally{
 +      lock.unlockRead(lock_stamp);
 +    }
 +  }
 +
 +  /**
 +   * Get the id of the token if it already exists, new id is created otherwise.
 +   *
 +   * TODO: currently locks for every call. Separate constant (frozen) ids from
 +   * changing (e.g. OOV) ids. Constant ids could be immutable -&gt; no locking.
 +   * Alternatively: could we use ConcurrentHashMap to not have to lock if
 +   * actually contains it and only lock for modifications?
 +   * 
 +   * @param token a token to obtain an id for
 +   * @return the token id
 +   */
 +  public static int id(String token) {
 +    // First attempt an optimistic read
 +    long attempt_read_lock = lock.tryOptimisticRead();
 +    if (stringToId.containsKey(token)) {
 +      int resultId = stringToId.get(token);
 +      if (lock.validate(attempt_read_lock)) {
 +        return resultId;
 +      }
 +    }
 +
 +    // The optimistic read failed, try a read with a stamped read lock
 +    long read_lock_stamp = lock.readLock();
 +    try {
 +      if (stringToId.containsKey(token)) {
 +        return stringToId.get(token);
 +      }
 +    } finally {
 +      lock.unlockRead(read_lock_stamp);
 +    }
 +
 +    // Looks like the id we want is not there, let's get a write lock and add it
 +    long write_lock_stamp = lock.writeLock();
 +    try {
 +      if (stringToId.containsKey(token)) {
 +        return stringToId.get(token);
 +      }
 +      int id = idToString.size() * (FormatUtils.isNonterminal(token) ? -1 : 1);
 +
 +      // register this (token,id) mapping with each language
 +      // model, so that they can map it to their own private
 +      // vocabularies
 +      for (NGramLanguageModel lm : LMs)
 +        lm.registerWord(token, Math.abs(id));
 +
 +      idToString.add(token);
 +      stringToId.put(token, id);
 +      return id;
 +    } finally {
 +      lock.unlockWrite(write_lock_stamp);
 +    }
 +  }
 +
 +  public static boolean hasId(int id) {
 +    long lock_stamp = lock.readLock();
 +    try {
 +      id = Math.abs(id);
 +      return (id < idToString.size());
 +    }
 +    finally{
 +      lock.unlockRead(lock_stamp);
 +    }
 +  }
 +
 +  public static int[] addAll(String sentence) {
 +    return addAll(sentence.split("\\s+"));
 +  }
 +
 +  public static int[] addAll(String[] tokens) {
 +    int[] ids = new int[tokens.length];
 +    for (int i = 0; i < tokens.length; i++)
 +      ids[i] = id(tokens[i]);
 +    return ids;
 +  }
 +
 +  public static String word(int id) {
 +    long lock_stamp = lock.readLock();
 +    try {
 +      id = Math.abs(id);
 +      return idToString.get(id);
 +    }
 +    finally{
 +      lock.unlockRead(lock_stamp);
 +    }
 +  }
 +
 +  public static String getWords(int[] ids) {
-     if (ids.length == 0) return "";
++    return getWords(ids, " ");
++  }
++  
++  public static String getWords(int[] ids, final String separator) {
++    if (ids.length == 0) {
++      return "";
++    }
 +    StringBuilder sb = new StringBuilder();
-     for (int i = 0; i < ids.length - 1; i++)
-       sb.append(word(ids[i])).append(" ");
++    for (int i = 0; i < ids.length - 1; i++) {
++      sb.append(word(ids[i])).append(separator);
++    }
 +    return sb.append(word(ids[ids.length - 1])).toString();
 +  }
 +
 +  public static String getWords(final Iterable<Integer> ids) {
 +    StringBuilder sb = new StringBuilder();
 +    for (int id : ids)
 +      sb.append(word(id)).append(" ");
 +    return sb.deleteCharAt(sb.length() - 1).toString();
 +  }
 +
 +  public static int getUnknownId() {
 +    return UNKNOWN_ID;
 +  }
 +
 +  public static String getUnknownWord() {
 +    return UNKNOWN_WORD;
 +  }
 +
 +  public static int size() {
 +    long lock_stamp = lock.readLock();
 +    try {
 +      return idToString.size();
 +    } finally {
 +      lock.unlockRead(lock_stamp);
 +    }
 +  }
 +
 +  public static synchronized int getTargetNonterminalIndex(int id) {
 +    return FormatUtils.getNonterminalIndex(word(id));
 +  }
 +
 +  /**
 +   * Clears the vocabulary and initializes it with an unknown word. Registered
 +   * language models are left unchanged.
 +   */
 +  public static void clear() {
 +    long lock_stamp = lock.writeLock();
 +    try {
 +      idToString = new ArrayList<String>();
 +      stringToId = new HashMap<String, Integer>();
 +
 +      idToString.add(UNKNOWN_ID, UNKNOWN_WORD);
 +      stringToId.put(UNKNOWN_WORD, UNKNOWN_ID);
 +    } finally {
 +      lock.unlockWrite(lock_stamp);
 +    }
 +  }
 +
 +  public static void unregisterLanguageModels() {
 +    LMs.clear();
 +  }
 +
 +  @Override
 +  public void writeExternal(ObjectOutput out) throws IOException {
 +    // TODO Auto-generated method stub
 +
 +  }
 +
 +  @Override
 +  public void readExternal(ObjectInput in)
 +      throws IOException, ClassNotFoundException {
 +    // TODO Auto-generated method stub
 +
 +  }
 +
 +  @Override
 +  public boolean equals(Object o) {
 +    if(getClass() == o.getClass()) {
 +      return true;
 +    } else {
 +      return false;
 +    }
 +  }
 +
 +}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5c0d5388/src/main/java/org/apache/joshua/decoder/Decoder.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/decoder/Decoder.java
index 8535b11,0000000..6fa5eb8
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/decoder/Decoder.java
+++ b/src/main/java/org/apache/joshua/decoder/Decoder.java
@@@ -1,975 -1,0 +1,974 @@@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *  http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing,
 + * software distributed under the License is distributed on an
 + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 + * KIND, either express or implied.  See the License for the
 + * specific language governing permissions and limitations
 + * under the License.
 + */
 +package org.apache.joshua.decoder;
 +
 +import static org.apache.joshua.decoder.ff.FeatureVector.DENSE_FEATURE_NAMES;
 +
 +import java.io.BufferedWriter;
 +import java.io.File;
 +import java.io.IOException;
 +import java.io.OutputStream;
 +import java.io.FileNotFoundException;
 +import java.io.FileWriter;
 +import java.lang.reflect.Constructor;
 +import java.util.ArrayList;
 +import java.util.HashMap;
 +import java.util.HashSet;
 +import java.util.List;
 +import java.util.concurrent.ArrayBlockingQueue;
 +import java.util.concurrent.BlockingQueue;
 +
 +import com.google.common.base.Strings;
 +
 +import org.apache.joshua.corpus.Vocabulary;
 +import org.apache.joshua.decoder.ff.FeatureVector;
 +import org.apache.joshua.decoder.JoshuaConfiguration.INPUT_TYPE;
 +import org.apache.joshua.decoder.JoshuaConfiguration.SERVER_TYPE;
 +import org.apache.joshua.decoder.ff.FeatureFunction;
 +import org.apache.joshua.decoder.ff.PhraseModel;
 +import org.apache.joshua.decoder.ff.StatefulFF;
 +import org.apache.joshua.decoder.ff.lm.LanguageModelFF;
 +import org.apache.joshua.decoder.ff.tm.Grammar;
 +import org.apache.joshua.decoder.ff.tm.Rule;
 +import org.apache.joshua.decoder.ff.tm.Trie;
 +import org.apache.joshua.decoder.ff.tm.format.HieroFormatReader;
 +import org.apache.joshua.decoder.ff.tm.hash_based.MemoryBasedBatchGrammar;
 +import org.apache.joshua.decoder.ff.tm.packed.PackedGrammar;
 +import org.apache.joshua.decoder.io.JSONMessage;
 +import org.apache.joshua.decoder.io.TranslationRequestStream;
 +import org.apache.joshua.decoder.phrase.PhraseTable;
 +import org.apache.joshua.decoder.segment_file.Sentence;
 +import org.apache.joshua.util.FileUtility;
 +import org.apache.joshua.util.FormatUtils;
 +import org.apache.joshua.util.Regex;
 +import org.apache.joshua.util.io.LineReader;
 +import org.slf4j.Logger;
 +import org.slf4j.LoggerFactory;
 +
 +/**
 + * This class handles decoder initialization and the complication introduced by multithreading.
 + *
 + * After initialization, the main entry point to the Decoder object is
 + * decodeAll(TranslationRequest), which returns a set of Translation objects wrapped in an iterable
 + * Translations object. It is important that we support multithreading both (a) across the sentences
 + * within a request and (b) across requests, in a round-robin fashion. This is done by maintaining a
 + * fixed sized concurrent thread pool. When a new request comes in, a RequestParallelizer thread is
 + * launched. This object iterates over the request's sentences, obtaining a thread from the
 + * thread pool, and using that thread to decode the sentence. If a decoding thread is not available,
 + * it will block until one is in a fair (FIFO) manner. RequestParallelizer thereby permits intra-request
 + * parallelization by separating out reading the input stream from processing the translated sentences,
 + * but also ensures that round-robin parallelization occurs, since RequestParallelizer uses the
 + * thread pool before translating each request.
 + *
 + * A decoding thread is handled by DecoderThread and launched from DecoderThreadRunner. The purpose
 + * of the runner is to record where to place the translated sentence when it is done (i.e., which
 + * Translations object). Translations itself is an iterator whose next() call blocks until the next
 + * translation is available.
 + *
 + * @author Matt Post post@cs.jhu.edu
 + * @author Zhifei Li, zhifei.work@gmail.com
 + * @author wren ng thornton wren@users.sourceforge.net
 + * @author Lane Schwartz dowobeha@users.sourceforge.net
 + */
 +public class Decoder {
 +
 +  private static final Logger LOG = LoggerFactory.getLogger(Decoder.class);
 +
 +  private final JoshuaConfiguration joshuaConfiguration;
 +
 +  public JoshuaConfiguration getJoshuaConfiguration() {
 +    return joshuaConfiguration;
 +  }
 +
 +  /*
 +   * Many of these objects themselves are global objects. We pass them in when constructing other
 +   * objects, so that they all share pointers to the same object. This is good because it reduces
 +   * overhead, but it can be problematic because of unseen dependencies (for example, in the
 +   * Vocabulary shared by language model, translation grammar, etc).
 +   */
 +  private List<Grammar> grammars;
 +  private ArrayList<FeatureFunction> featureFunctions;
 +  private PhraseTable customPhraseTable;
 +
 +  /* The feature weights. */
 +  public static FeatureVector weights;
 +
 +  public static int VERBOSE = 1;
 +
 +  private BlockingQueue<DecoderThread> threadPool = null;
 +
 +  // ===============================================================
 +  // Constructors
 +  // ===============================================================
 +
 +  /**
 +   * Constructor method that creates a new decoder using the specified configuration file.
 +   *
 +   * @param joshuaConfiguration a populated {@link org.apache.joshua.decoder.JoshuaConfiguration}
 +   * @param configFile name of configuration file.
 +   */
 +  public Decoder(JoshuaConfiguration joshuaConfiguration, String configFile) {
 +    this(joshuaConfiguration);
 +    this.initialize(configFile);
 +  }
 +
 +  /**
 +   * Factory method that creates a new decoder using the specified configuration file.
 +   *
 +   * @param configFile Name of configuration file.
 +   * @return a configured {@link org.apache.joshua.decoder.Decoder}
 +   */
 +  public static Decoder createDecoder(String configFile) {
 +    JoshuaConfiguration joshuaConfiguration = new JoshuaConfiguration();
 +    return new Decoder(joshuaConfiguration, configFile);
 +  }
 +
 +  /**
 +   * Constructs an uninitialized decoder for use in testing.
 +   * <p>
 +   * This method is private because it should only ever be called by the
 +   * {@link #getUninitalizedDecoder()} method to provide an uninitialized decoder for use in
 +   * testing.
 +   */
 +  private Decoder(JoshuaConfiguration joshuaConfiguration) {
 +    this.joshuaConfiguration = joshuaConfiguration;
 +    this.grammars = new ArrayList<Grammar>();
 +    this.threadPool = new ArrayBlockingQueue<DecoderThread>(
 +        this.joshuaConfiguration.num_parallel_decoders, true);
 +    this.customPhraseTable = null;
 +  }
 +
 +  /**
 +   * Gets an uninitialized decoder for use in testing.
 +   * <p>
 +   * This method is called by unit tests or any outside packages (e.g., MERT) relying on the
 +   * decoder.
 +   * @param joshuaConfiguration a {@link org.apache.joshua.decoder.JoshuaConfiguration} object
 +   * @return an uninitialized decoder for use in testing
 +   */
 +  static public Decoder getUninitalizedDecoder(JoshuaConfiguration joshuaConfiguration) {
 +    return new Decoder(joshuaConfiguration);
 +  }
 +
 +  // ===============================================================
 +  // Public Methods
 +  // ===============================================================
 +
 +  /**
 +   * This class is responsible for getting sentences from the TranslationRequest and procuring a
 +   * DecoderThreadRunner to translate it. Each call to decodeAll(TranslationRequest) launches a
 +   * thread that will read the request's sentences, obtain a DecoderThread to translate them, and
 +   * then place the Translation in the appropriate place.
 +   *
 +   * @author Matt Post <po...@cs.jhu.edu>
 +   *
 +   */
 +  private class RequestParallelizer extends Thread {
 +    /* Source of sentences to translate. */
 +    private final TranslationRequestStream request;
 +
 +    /* Where to put translated sentences. */
 +    private final Translations response;
 +
 +    /* Sometimes we need to communicate with the client even when we didn't get a new sentence
 +     * (e.g., metadata)
 +     */
 +    private OutputStream out;
 +
 +    RequestParallelizer(TranslationRequestStream request, Translations response, OutputStream out) {
 +      this.request = request;
 +      this.response = response;
 +      this.out = out;
 +    }
 +
 +    @Override
 +    public void run() {
 +      /*
 +       * Repeatedly get an input sentence, wait for a DecoderThread, and then start a new thread to
 +       * translate the sentence. We start a new thread (via DecoderRunnerThread) as opposed to
 +       * blocking, so that the RequestHandler can go on to the next sentence in this request, which
 +       * allows parallelization across the sentences of the request.
 +       */
 +      for (;;) {
 +        Sentence sentence = null;
 +        try {
 +          sentence = request.next();
 +
 +        } catch (MetaDataException meta) {
 +          try {
 +            handleMetadata(meta);
 +          } catch (IOException e) {
 +            e.printStackTrace();
 +          }
 +
 +          continue;
 +        }
 +
 +        if (sentence == null) {
 +          response.finish();
 +          break;
 +        }
 +
 +        // This will block until a DecoderThread becomes available.
 +        DecoderThread thread = Decoder.this.getThread();
 +        new DecoderThreadRunner(thread, sentence, response).start();
 +      }
 +    }
 +
 +    /**
 +     * When metadata is found on the input, it needs to be processed. That is done here. Sometimes
 +     * this involves returning data to the client.
 +     *
 +     * @param meta
 +     * @throws IOException
 +     */
 +    private void handleMetadata(MetaDataException meta) throws IOException {
 +      if (meta.type().equals("set_weight")) {
 +        // Change a decoder weight
 +        String[] tokens = meta.tokens();
 +        if (tokens.length != 3) {
 +          LOG.error("weight change requires three tokens");
 +        } else {
 +          float old_weight = Decoder.weights.getWeight(tokens[1]);
 +          Decoder.weights.set(tokens[1], Float.parseFloat(tokens[2]));
 +          LOG.error("@set_weight: {} {} -> {}", tokens[1], old_weight,
 +              Decoder.weights.getWeight(tokens[1]));
 +        }
 +
 +        // TODO: return a JSON object with this weight or all weights
 +        out.write("".getBytes());
 +
 +      } else if (meta.type().equals("get_weight")) {
 +        // TODO: add to JSON object, send back
 +
 +        String[] tokens = meta.tokens();
 +
 +        LOG.error("{} = {}", tokens[1], Decoder.weights.getWeight(tokens[1]));
 +
 +        out.write("".getBytes());
 +
 +      } else if (meta.type().equals("add_rule")) {
 +        String tokens[] = meta.tokens(" \\|\\|\\| ");
 +
 +        if (tokens.length != 2) {
 +          LOG.error("* INVALID RULE '{}'", meta);
 +          out.write("bad rule".getBytes());
 +          return;
 +        }
 +
 +        Rule rule = new HieroFormatReader().parseLine(
 +            String.format("[X] ||| [X,1] %s ||| [X,1] %s ||| custom=1", tokens[0], tokens[1]));
 +        Decoder.this.customPhraseTable.addRule(rule);
 +        rule.estimateRuleCost(featureFunctions);
 +        LOG.info("Added custom rule {}", formatRule(rule));
 +
 +        String response = String.format("Added rule %s", formatRule(rule));
 +        out.write(response.getBytes());
 +
 +      } else if (meta.type().equals("list_rules")) {
 +
 +        JSONMessage message = new JSONMessage();
 +
 +        // Walk the grammar trie
 +        ArrayList<Trie> nodes = new ArrayList<Trie>();
 +        nodes.add(customPhraseTable.getTrieRoot());
 +
 +        while (nodes.size() > 0) {
 +          Trie trie = nodes.remove(0);
 +
 +          if (trie == null)
 +            continue;
 +
 +          if (trie.hasRules()) {
 +            for (Rule rule: trie.getRuleCollection().getRules()) {
 +              message.addRule(formatRule(rule));
 +            }
 +          }
 +
 +          if (trie.getExtensions() != null)
 +            nodes.addAll(trie.getExtensions());
 +        }
 +
 +        out.write(message.toString().getBytes());
 +
 +      } else if (meta.type().equals("remove_rule")) {
 +        // Remove a rule from a custom grammar, if present
 +        String[] tokens = meta.tokenString().split(" \\|\\|\\| ");
 +        if (tokens.length != 2) {
 +          out.write(String.format("Invalid delete request: '%s'", meta.tokenString()).getBytes());
 +          return;
 +        }
 +
 +        // Search for the rule in the trie
 +        int nt_i = Vocabulary.id(joshuaConfiguration.default_non_terminal);
 +        Trie trie = customPhraseTable.getTrieRoot().match(nt_i);
 +
 +        for (String word: tokens[0].split("\\s+")) {
 +          int id = Vocabulary.id(word);
 +          Trie nextTrie = trie.match(id);
 +          if (nextTrie != null)
 +            trie = nextTrie;
 +        }
 +
 +        if (trie.hasRules()) {
 +          Rule matched = null;
 +          for (Rule rule: trie.getRuleCollection().getRules()) {
 +            String target = rule.getEnglishWords();
 +            target = target.substring(target.indexOf(' ') + 1);
 +
 +            if (tokens[1].equals(target)) {
 +              matched = rule;
 +              break;
 +            }
 +          }
 +          trie.getRuleCollection().getRules().remove(matched);
 +          out.write(String.format("Removed rule %s", formatRule(matched)).getBytes());
 +          return;
 +        }
 +
 +        out.write(String.format("No such rule %s", meta.tokenString()).getBytes());
 +      }
 +    }
 +
 +    /**
 +     * Strips the nonterminals from the lefthand side of the rule.
 +     *
 +     * @param rule
 +     * @return
 +     */
 +    private String formatRule(Rule rule) {
 +      String ruleString = "";
 +      boolean first = true;
 +      for (int word: rule.getFrench()) {
 +        if (!first)
 +          ruleString += " " + Vocabulary.word(word);
 +        first = false;
 +      }
 +
 +      ruleString += " |||"; // space will get added with first English word
 +      first = true;
 +      for (int word: rule.getEnglish()) {
 +        if (!first)
 +          ruleString += " " + Vocabulary.word(word);
 +        first = false;
 +      }
 +
 +      // strip off the leading space
 +      return ruleString.substring(1);
 +    }
 +  }
 +
 +  /**
 +   * Retrieve a thread from the thread pool, blocking until one is available. The blocking occurs in
 +   * a fair fashion (i.e,. FIFO across requests).
 +   *
 +   * @return a thread that can be used for decoding.
 +   */
 +  public DecoderThread getThread() {
 +    try {
 +      return threadPool.take();
 +    } catch (InterruptedException e) {
 +      // TODO Auto-generated catch block
 +      e.printStackTrace();
 +    }
 +    return null;
 +  }
 +
 +  /**
 +   * This class handles running a DecoderThread (which takes care of the actual translation of an
 +   * input Sentence, returning a Translation object when its done). This is done in a thread so as
 +   * not to tie up the RequestHandler that launched it, freeing it to go on to the next sentence in
 +   * the TranslationRequest, in turn permitting parallelization across the sentences of a request.
 +   *
 +   * When the decoder thread is finished, the Translation object is placed in the correct place in
 +   * the corresponding Translations object that was returned to the caller of
 +   * Decoder.decodeAll(TranslationRequest).
 +   *
 +   * @author Matt Post <po...@cs.jhu.edu>
 +   */
 +  private class DecoderThreadRunner extends Thread {
 +
 +    private final DecoderThread decoderThread;
 +    private final Sentence sentence;
 +    private final Translations translations;
 +
 +    DecoderThreadRunner(DecoderThread thread, Sentence sentence, Translations translations) {
 +      this.decoderThread = thread;
 +      this.sentence = sentence;
 +      this.translations = translations;
 +    }
 +
 +    @Override
 +    public void run() {
 +      /*
 +       * Use the thread to translate the sentence. Then record the translation with the
 +       * corresponding Translations object, and return the thread to the pool.
 +       */
 +      try {
 +        Translation translation = decoderThread.translate(this.sentence);
 +        translations.record(translation);
 +
 +        /*
 +         * This is crucial! It's what makes the thread available for the next sentence to be
 +         * translated.
 +         */
 +        threadPool.put(decoderThread);
 +      } catch (Exception e) {
 +        throw new RuntimeException(String.format(
 +            "Input %d: FATAL UNCAUGHT EXCEPTION: %s", sentence.id(), e.getMessage()), e);
 +        //        translations.record(new Translation(sentence, null, featureFunctions, joshuaConfiguration));
 +      }
 +    }
 +  }
 +
 +  /**
 +   * This function is the main entry point into the decoder. It translates all the sentences in a
 +   * (possibly boundless) set of input sentences. Each request launches its own thread to read the
 +   * sentences of the request.
 +   *
 +   * @param request the populated {@link org.apache.joshua.decoder.io.TranslationRequestStream}
 +   * @param out an appropriate {@link java.io.OutputStream} to write results to
 +   * @throws IOException if there is an error with the input stream or writing the output
 +   */
 +  public void decodeAll(TranslationRequestStream request, OutputStream out) throws IOException {
 +    Translations translations = new Translations(request);
 +
 +    /* Start a thread to handle requests on the input stream */
 +    new RequestParallelizer(request, translations, out).start();
 +
 +    // Create the n-best output stream
 +    FileWriter nbest_out = null;
 +    if (joshuaConfiguration.n_best_file != null)
 +      nbest_out = new FileWriter(joshuaConfiguration.n_best_file);
 +
 +    for (;;) {
 +      Translation translation = translations.next();
 +      if (translation == null)
 +        break;
 +
 +      if (joshuaConfiguration.input_type == INPUT_TYPE.json || joshuaConfiguration.server_type == SERVER_TYPE.HTTP) {
 +        JSONMessage message = JSONMessage.buildMessage(translation);
 +        out.write(message.toString().getBytes());
 +
 +      } else {
 +        /**
 +         * We need to munge the feature value outputs in order to be compatible with Moses tuners.
 +         * Whereas Joshua writes to STDOUT whatever is specified in the `output-format` parameter,
 +         * Moses expects the simple translation on STDOUT and the n-best list in a file with a fixed
 +         * format.
 +         */
 +        String text;
 +        if (joshuaConfiguration.moses) {
 +          text = translation.toString().replaceAll("=", "= ");
 +          // Write the complete formatted string to STDOUT
 +          if (joshuaConfiguration.n_best_file != null)
 +            nbest_out.write(text);
 +
 +          // Extract just the translation and output that to STDOUT
 +          text = text.substring(0,  text.indexOf('\n'));
 +          String[] fields = text.split(" \\|\\|\\| ");
 +          text = fields[1] + "\n";
 +
 +        } else {
 +          text = translation.toString();
 +        }
 +
 +        out.write(text.getBytes());
 +      }
 +      out.flush();
 +    }
 +
 +    if (joshuaConfiguration.n_best_file != null)
 +      nbest_out.close();
 +  }
 +
 +
 +  /**
 +   * We can also just decode a single sentence.
 +   *
 +   * @param sentence {@link org.apache.joshua.lattice.Lattice} input
 +   * @return the sentence {@link org.apache.joshua.decoder.Translation}
 +   */
 +  public Translation decode(Sentence sentence) {
 +    // Get a thread.
 +
 +    try {
 +      DecoderThread thread = threadPool.take();
 +      Translation translation = thread.translate(sentence);
 +      threadPool.put(thread);
 +
 +      return translation;
 +
 +    } catch (InterruptedException e) {
 +      e.printStackTrace();
 +    }
 +
 +    return null;
 +  }
 +
 +  /**
 +   * Clean shutdown of Decoder, resetting all
 +   * static variables, such that any other instance of Decoder
 +   * afterwards gets a fresh start.
 +   */
 +  public void cleanUp() {
 +    // shut down DecoderThreads
 +    for (DecoderThread thread : threadPool) {
 +      try {
 +        thread.join();
 +      } catch (InterruptedException e) {
 +        e.printStackTrace();
 +      }
 +    }
 +    resetGlobalState();
 +  }
 +
 +  public static void resetGlobalState() {
 +    // clear/reset static variables
 +    DENSE_FEATURE_NAMES.clear();
 +    Vocabulary.clear();
 +    Vocabulary.unregisterLanguageModels();
 +    LanguageModelFF.resetLmIndex();
 +    StatefulFF.resetGlobalStateIndex();
 +  }
 +
 +  public static void writeConfigFile(double[] newWeights, String template, String outputFile,
 +      String newDiscriminativeModel) {
 +    try {
 +      int columnID = 0;
 +
 +      BufferedWriter writer = FileUtility.getWriteFileStream(outputFile);
 +      LineReader reader = new LineReader(template);
 +      try {
 +        for (String line : reader) {
 +          line = line.trim();
 +          if (Regex.commentOrEmptyLine.matches(line) || line.indexOf("=") != -1) {
 +            // comment, empty line, or parameter lines: just copy
 +            writer.write(line);
 +            writer.newLine();
 +
 +          } else { // models: replace the weight
 +            String[] fds = Regex.spaces.split(line);
 +            StringBuffer newSent = new StringBuffer();
 +            if (!Regex.floatingNumber.matches(fds[fds.length - 1])) {
 +              throw new IllegalArgumentException("last field is not a number; the field is: "
 +                  + fds[fds.length - 1]);
 +            }
 +
 +            if (newDiscriminativeModel != null && "discriminative".equals(fds[0])) {
 +              newSent.append(fds[0]).append(' ');
 +              newSent.append(newDiscriminativeModel).append(' ');// change the
 +              // file name
 +              for (int i = 2; i < fds.length - 1; i++) {
 +                newSent.append(fds[i]).append(' ');
 +              }
 +            } else {// regular
 +              for (int i = 0; i < fds.length - 1; i++) {
 +                newSent.append(fds[i]).append(' ');
 +              }
 +            }
 +            if (newWeights != null)
 +              newSent.append(newWeights[columnID++]);// change the weight
 +            else
 +              newSent.append(fds[fds.length - 1]);// do not change
 +
 +            writer.write(newSent.toString());
 +            writer.newLine();
 +          }
 +        }
 +      } finally {
 +        reader.close();
 +        writer.close();
 +      }
 +
 +      if (newWeights != null && columnID != newWeights.length) {
 +        throw new IllegalArgumentException("number of models does not match number of weights");
 +      }
 +
 +    } catch (IOException e) {
 +      e.printStackTrace();
 +    }
 +  }
 +
 +  // ===============================================================
 +  // Initialization Methods
 +  // ===============================================================
 +
 +  /**
 +   * Moses requires the pattern .*_.* for sparse features, and prohibits underscores in dense features. 
 +   * This conforms to that pattern. We assume non-conforming dense features start with tm_ or lm_,
 +   * and the only sparse feature that needs converting is OOVPenalty.
 +   *
 +   * @param feature
 +   * @return the feature in Moses format
 +   */
 +  private String mosesize(String feature) {
 +    if (joshuaConfiguration.moses) {
 +      if (feature.startsWith("tm_") || feature.startsWith("lm_"))
 +        return feature.replace("_", "-");
 +    }
 +
 +    return feature;
 +  }
 +
 +  /**
 +   * Initialize all parts of the JoshuaDecoder.
 +   *
 +   * @param configFile File containing configuration options
 +   * @return An initialized decoder
 +   */
 +  public Decoder initialize(String configFile) {
 +    try {
 +
 +      long pre_load_time = System.currentTimeMillis();
 +
 +      /* Weights can be listed in a separate file (denoted by parameter "weights-file") or directly
 +       * in the Joshua config file. Config file values take precedent.
 +       */
 +      this.readWeights(joshuaConfiguration.weights_file);
 +      
 +      
 +      /* Add command-line-passed weights to the weights array for processing below */
 +      if (!Strings.isNullOrEmpty(joshuaConfiguration.weight_overwrite)) {
 +        String[] tokens = joshuaConfiguration.weight_overwrite.split("\\s+");
 +        for (int i = 0; i < tokens.length; i += 2) {
 +          String feature = tokens[i];
 +          float value = Float.parseFloat(tokens[i+1]);
 +
 +          if (joshuaConfiguration.moses)
 +            feature = demoses(feature);
 +
 +          joshuaConfiguration.weights.add(String.format("%s %s", feature, tokens[i+1]));
 +          LOG.info("COMMAND LINE WEIGHT: {} -> {}", feature, value);
 +        }
 +      }
 +
 +      /* Read the weights found in the config file */
 +      for (String pairStr: joshuaConfiguration.weights) {
 +        String pair[] = pairStr.split("\\s+");
 +
 +        /* Sanity check for old-style unsupported feature invocations. */
 +        if (pair.length != 2) {
 +          StringBuilder errMsg = new StringBuilder();
 +          errMsg.append("FATAL: Invalid feature weight line found in config file.\n");
 +          errMsg.append(String.format("The line was '%s'\n", pairStr));
 +          errMsg.append("You might be using an old version of the config file that is no longer supported\n");
 +          errMsg.append("Check joshua-decoder.org or email joshua_support@googlegroups.com for help\n");
 +          errMsg.append("Code = " + 17);
 +          throw new RuntimeException(errMsg.toString());
 +        }
 +
 +        weights.set(pair[0], Float.parseFloat(pair[1]));
 +      }
 +
 +      LOG.info("Read {} weights ({} of them dense)", weights.size(), DENSE_FEATURE_NAMES.size());
 +
 +      // Do this before loading the grammars and the LM.
 +      this.featureFunctions = new ArrayList<FeatureFunction>();
 +
 +      // Initialize and load grammars. This must happen first, since the vocab gets defined by
 +      // the packed grammar (if any)
 +      this.initializeTranslationGrammars();
 +      LOG.info("Grammar loading took: {} seconds.",
 +          (System.currentTimeMillis() - pre_load_time) / 1000);
 +
 +      // Initialize the features: requires that LM model has been initialized.
 +      this.initializeFeatureFunctions();
 +
 +      // This is mostly for compatibility with the Moses tuning script
 +      if (joshuaConfiguration.show_weights_and_quit) {
 +        for (int i = 0; i < DENSE_FEATURE_NAMES.size(); i++) {
 +          String name = DENSE_FEATURE_NAMES.get(i);
 +          if (joshuaConfiguration.moses)
 +            System.out.println(String.format("%s= %.5f", mosesize(name), weights.getDense(i)));
 +          else
 +            System.out.println(String.format("%s %.5f", name, weights.getDense(i)));
 +        }
 +        System.exit(0);
 +      }
 +
 +      // Sort the TM grammars (needed to do cube pruning)
 +      if (joshuaConfiguration.amortized_sorting) {
 +        LOG.info("Grammar sorting happening lazily on-demand.");
 +      } else {
 +        long pre_sort_time = System.currentTimeMillis();
 +        for (Grammar grammar : this.grammars) {
 +          grammar.sortGrammar(this.featureFunctions);
 +        }
 +        LOG.info("Grammar sorting took {} seconds.",
 +            (System.currentTimeMillis() - pre_sort_time) / 1000);
 +      }
 +
 +      // Create the threads
 +      for (int i = 0; i < joshuaConfiguration.num_parallel_decoders; i++) {
 +        this.threadPool.put(new DecoderThread(this.grammars, Decoder.weights,
 +            this.featureFunctions, joshuaConfiguration));
 +      }
 +    } catch (IOException | InterruptedException e) {
 +      LOG.warn(e.getMessage(), e);
 +    }
 +
 +    return this;
 +  }
 +
 +  /**
 +   * Initializes translation grammars. Retained for backward compatibility.
 +   *
 +   * @param ownersSeen Records which PhraseModelFF's have been instantiated (one is needed for each
 +   *          owner)
 +   * @throws IOException
 +   */
 +  private void initializeTranslationGrammars() throws IOException {
 +
 +    if (joshuaConfiguration.tms.size() > 0) {
 +
 +      // collect packedGrammars to check if they use a shared vocabulary
 +      final List<PackedGrammar> packed_grammars = new ArrayList<>();
 +
 +      // tm = {thrax/hiero,packed,samt,moses} OWNER LIMIT FILE
 +      for (String tmLine : joshuaConfiguration.tms) {
 +
 +        String type = tmLine.substring(0,  tmLine.indexOf(' '));
 +        String[] args = tmLine.substring(tmLine.indexOf(' ')).trim().split("\\s+");
 +        HashMap<String, String> parsedArgs = FeatureFunction.parseArgs(args);
 +
 +        String owner = parsedArgs.get("owner");
 +        int span_limit = Integer.parseInt(parsedArgs.get("maxspan"));
 +        String path = parsedArgs.get("path");
 +
 +        Grammar grammar = null;
 +        if (! type.equals("moses") && ! type.equals("phrase")) {
 +          if (new File(path).isDirectory()) {
 +            try {
 +              PackedGrammar packed_grammar = new PackedGrammar(path, span_limit, owner, type, joshuaConfiguration);
 +              packed_grammars.add(packed_grammar);
 +              grammar = packed_grammar;
 +            } catch (FileNotFoundException e) {
 +              String msg = String.format("Couldn't load packed grammar from '%s'", path)
 +                  + "Perhaps it doesn't exist, or it may be an old packed file format.";
 +              throw new RuntimeException(e);
 +            }
 +          } else {
 +            // thrax, hiero, samt
 +            grammar = new MemoryBasedBatchGrammar(type, path, owner,
 +                joshuaConfiguration.default_non_terminal, span_limit, joshuaConfiguration);
 +          }
 +
 +        } else {
 +
 +          int maxSourceLen = parsedArgs.containsKey("max-source-len")
 +              ? Integer.parseInt(parsedArgs.get("max-source-len"))
 +              : -1;
 +
 +          joshuaConfiguration.search_algorithm = "stack";
 +          grammar = new PhraseTable(path, owner, type, joshuaConfiguration);
 +        }
 +
 +        this.grammars.add(grammar);
 +      }
 +
 +      checkSharedVocabularyChecksumsForPackedGrammars(packed_grammars);
 +
 +    } else {
 +      LOG.warn("no grammars supplied!  Supplying dummy glue grammar.");
 +      MemoryBasedBatchGrammar glueGrammar = new MemoryBasedBatchGrammar("glue", joshuaConfiguration);
 +      glueGrammar.setSpanLimit(-1);
 +      glueGrammar.addGlueRules(featureFunctions);
 +      this.grammars.add(glueGrammar);
 +    }
 +    
 +    /* Add the grammar for custom entries */
 +    this.customPhraseTable = new PhraseTable(null, "custom", "phrase", joshuaConfiguration);
 +    this.grammars.add(this.customPhraseTable);
 +    
 +    /* Create an epsilon-deleting grammar */
 +    if (joshuaConfiguration.lattice_decoding) {
 +      LOG.info("Creating an epsilon-deleting grammar");
 +      MemoryBasedBatchGrammar latticeGrammar = new MemoryBasedBatchGrammar("lattice", joshuaConfiguration);
 +      latticeGrammar.setSpanLimit(-1);
 +      HieroFormatReader reader = new HieroFormatReader();
 +
 +      String goalNT = FormatUtils.cleanNonTerminal(joshuaConfiguration.goal_symbol);
 +      String defaultNT = FormatUtils.cleanNonTerminal(joshuaConfiguration.default_non_terminal);
 +
 +      //FIXME: too many arguments
 +      String ruleString = String.format("[%s] ||| [%s,1] <eps> ||| [%s,1] ||| ", goalNT, goalNT, defaultNT,
 +          goalNT, defaultNT);
 +
 +      Rule rule = reader.parseLine(ruleString);
 +      latticeGrammar.addRule(rule);
 +      rule.estimateRuleCost(featureFunctions);
 +
 +      this.grammars.add(latticeGrammar);
 +    }
 +
 +    /* Now create a feature function for each owner */
 +    HashSet<String> ownersSeen = new HashSet<String>();
 +
 +    for (Grammar grammar: this.grammars) {
 +      String owner = Vocabulary.word(grammar.getOwner());
 +      if (! ownersSeen.contains(owner)) {
 +        this.featureFunctions.add(new PhraseModel(weights, new String[] { "tm", "-owner", owner },
 +            joshuaConfiguration, grammar));
 +        ownersSeen.add(owner);
 +      }
 +    }
 +
 +    LOG.info("Memory used {} MB",
 +        ((Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory()) / 1000000.0));
 +  }
 +
 +  /**
 +   * Checks if multiple packedGrammars have the same vocabulary by comparing their vocabulary file checksums.
 +   */
 +  private static void checkSharedVocabularyChecksumsForPackedGrammars(final List<PackedGrammar> packed_grammars) {
 +    String previous_checksum = "";
 +    for (PackedGrammar grammar : packed_grammars) {
 +      final String checksum = grammar.computeVocabularyChecksum();
 +      if (previous_checksum.isEmpty()) {
 +        previous_checksum = checksum;
 +      } else {
 +        if (!checksum.equals(previous_checksum)) {
 +          throw new RuntimeException(
 +              "Trying to load multiple packed grammars with different vocabularies!" +
 +                  "Have you packed them jointly?");
 +        }
 +        previous_checksum = checksum;
 +      }
 +    }
 +  }
 +
 +  /*
 +   * This function reads the weights for the model. Feature names and their weights are listed one
 +   * per line in the following format:
 +   * 
 +   * FEATURE_NAME WEIGHT
 +   */
 +  private void readWeights(String fileName) {
 +    Decoder.weights = new FeatureVector();
 +
 +    if (fileName.equals(""))
 +      return;
 +
 +    try {
 +      LineReader lineReader = new LineReader(fileName);
 +
 +      for (String line : lineReader) {
 +        line = line.replaceAll("\\s+", " ");
 +
 +        if (line.equals("") || line.startsWith("#") || line.startsWith("//")
 +            || line.indexOf(' ') == -1)
 +          continue;
 +
 +        String tokens[] = line.split("\\s+");
 +        String feature = tokens[0];
 +        Float value = Float.parseFloat(tokens[1]);
 +
 +        // Kludge for compatibility with Moses tuners
 +        if (joshuaConfiguration.moses) {
 +          feature = demoses(feature);
 +        }
 +
 +        weights.increment(feature, value);
 +      }
 +    } catch (IOException ioe) {
 +      throw new RuntimeException(ioe);
 +    }
 +    LOG.info("Read {} weights from file '{}'", weights.size(), fileName);
 +  }
 +
 +  private String demoses(String feature) {
 +    if (feature.endsWith("="))
 +      feature = feature.replace("=", "");
 +    if (feature.equals("OOV_Penalty"))
 +      feature = "OOVPenalty";
 +    else if (feature.startsWith("tm-") || feature.startsWith("lm-"))
 +      feature = feature.replace("-",  "_");
 +    return feature;
 +  }
 +
 +  /**
 +   * Feature functions are instantiated with a line of the form
 +   *
 +   * <pre>
-    *   feature_function = FEATURE OPTIONS
++   *   FEATURE OPTIONS
 +   * </pre>
 +   *
 +   * Weights for features are listed separately.
 +   *
 +   * @throws IOException
 +   *
 +   */
 +  private void initializeFeatureFunctions() throws IOException {
 +
 +    for (String featureLine : joshuaConfiguration.features) {
-       // feature-function = NAME args
++      // line starts with NAME, followed by args
 +      // 1. create new class named NAME, pass it config, weights, and the args
 +
-       // Get rid of the leading crap.
-       featureLine = featureLine.replaceFirst("^feature_function\\s*=\\s*", "");
- 
 +      String fields[] = featureLine.split("\\s+");
 +      String featureName = fields[0];
++      
 +      try {
++        
 +        Class<?> clas = getClass(featureName);
 +        Constructor<?> constructor = clas.getConstructor(FeatureVector.class,
 +            String[].class, JoshuaConfiguration.class);
-         this.featureFunctions.add((FeatureFunction) constructor.newInstance(weights, fields, joshuaConfiguration));
++        FeatureFunction feature = (FeatureFunction) constructor.newInstance(weights, fields, joshuaConfiguration);
++        this.featureFunctions.add(feature);
++        
 +      } catch (Exception e) {
-         e.printStackTrace();
-         throw new RuntimeException("* FATAL: could not find a feature '" + featureName + "'");
++        throw new RuntimeException(String.format("Unable to instantiate feature function '%s'!", featureLine), e); 
 +      }
 +    }
 +
 +    for (FeatureFunction feature : featureFunctions) {
 +      LOG.info("FEATURE: {}", feature.logString());
- 
 +    }
 +
 +    weights.registerDenseFeatures(featureFunctions);
 +  }
 +
 +  /**
 +   * Searches a list of predefined paths for classes, and returns the first one found. Meant for
 +   * instantiating feature functions.
 +   *
 +   * @param name
 +   * @return the class, found in one of the search paths
 +   * @throws ClassNotFoundException
 +   */
 +  private Class<?> getClass(String featureName) {
 +    Class<?> clas = null;
 +
 +    String[] packages = { "org.apache.joshua.decoder.ff", "org.apache.joshua.decoder.ff.lm", "org.apache.joshua.decoder.ff.phrase" };
 +    for (String path : packages) {
 +      try {
 +        clas = Class.forName(String.format("%s.%s", path, featureName));
 +        break;
 +      } catch (ClassNotFoundException e) {
 +        try {
 +          clas = Class.forName(String.format("%s.%sFF", path, featureName));
 +          break;
 +        } catch (ClassNotFoundException e2) {
 +          // do nothing
 +        }
 +      }
 +    }
 +    return clas;
 +  }
 +}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5c0d5388/src/main/java/org/apache/joshua/decoder/JoshuaConfiguration.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/decoder/JoshuaConfiguration.java
index 5acfd7e,0000000..dd7bafb
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/decoder/JoshuaConfiguration.java
+++ b/src/main/java/org/apache/joshua/decoder/JoshuaConfiguration.java
@@@ -1,712 -1,0 +1,712 @@@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *  http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing,
 + * software distributed under the License is distributed on an
 + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 + * KIND, either express or implied.  See the License for the
 + * specific language governing permissions and limitations
 + * under the License.
 + */
 +package org.apache.joshua.decoder;
 +
 +import static org.apache.joshua.util.FormatUtils.cleanNonTerminal;
 +import static org.apache.joshua.util.FormatUtils.ensureNonTerminalBrackets;
 +
 +import java.io.File;
 +import java.io.FileWriter;
 +import java.io.IOException;
 +import java.io.PrintWriter;
 +import java.io.BufferedReader;
 +import java.io.FileReader;
 +import java.util.ArrayList;
 +import java.util.Collections;
 +
 +import org.apache.joshua.decoder.ff.StatefulFF;
 +import org.apache.joshua.decoder.ff.fragmentlm.Tree;
 +import org.apache.joshua.util.FormatUtils;
 +import org.apache.joshua.util.Regex;
 +import org.apache.joshua.util.io.LineReader;
 +import org.slf4j.Logger;
 +import org.slf4j.LoggerFactory;
 +
 +/**
 + * Configuration file for Joshua decoder.
 + *
 + * When adding new features to Joshua, any new configurable parameters should be added to this
 + * class.
 + *
 + * @author Zhifei Li, zhifei.work@gmail.com
 + * @author Matt Post post@cs.jhu.edu
 + */
 +public class JoshuaConfiguration {
 +
 +  private static final Logger LOG = LoggerFactory.getLogger(JoshuaConfiguration.class);
 +
 +  // whether to construct a StructuredTranslation object for each request instead of
 +  // printing to stdout. Used when the Decoder is used from Java directly.
 +  public Boolean use_structured_output = false;
 +
 +  // If set to true, Joshua will lowercase the input, creating an annotation that marks the
 +  // original case
 +  public boolean lowercase = false;
 +
 +  // If set to true, Joshua will recapitalize the output by projecting the case from aligned
 +  // source-side words
 +  public boolean project_case = false;
 +
 +  // List of grammar files to read
 +  public ArrayList<String> tms = new ArrayList<String>();
 +
 +  // A rule cache for commonly used tries to avoid excess object allocations
 +  // Testing shows there's up to ~95% hit rate when cache size is 5000 Trie nodes.
 +  public Integer cachedRuleSize = new Integer(5000);
 +
 +  /*
 +   * The file to read the weights from (part of the sparse features implementation). Weights can
 +   * also just be listed in the main config file.
 +   */
 +  public String weights_file = "";
 +  // Default symbols. The symbol here should be enclosed in square brackets.
 +  public String default_non_terminal = FormatUtils.ensureNonTerminalBrackets("X");
 +  public String goal_symbol = FormatUtils.ensureNonTerminalBrackets("GOAL");
 +
 +  /*
 +   * A list of OOV symbols in the form
 +   *
 +   * [X1] weight [X2] weight [X3] weight ...
 +   *
 +   * where the [X] symbols are nonterminals and the weights are weights. For each OOV word w in the
 +   * input sentence, Joshua will create rules of the form
 +   *
 +   * X1 -> w (weight)
 +   *
 +   * If this is empty, an unweighted default_non_terminal is used.
 +   */
 +  public class OOVItem implements Comparable<OOVItem> {
 +    public String label;
 +
 +    public float weight;
 +
 +    OOVItem(String l, float w) {
 +      label = l;
 +      weight = w;
 +    }
 +    @Override
 +    public int compareTo(OOVItem other) {
 +      if (weight > other.weight)
 +        return -1;
 +      else if (weight < other.weight)
 +        return 1;
 +      return 0;
 +    }
 +  }
 +
 +  public ArrayList<OOVItem> oovList = null;
 +
 +  /*
 +   * Whether to segment OOVs into a lattice
 +   */
 +  public boolean segment_oovs = false;
 +
 +  /*
 +   * Enable lattice decoding.
 +   */
 +  public boolean lattice_decoding = false;
 +
 +  /*
 +   * If false, sorting of the complete grammar is done at load time. If true, grammar tries are not
 +   * sorted till they are first accessed. Amortized sorting means you get your first translation
 +   * much, much quicker (good for debugging), but that per-sentence decoding is a bit slower.
 +   */
 +  public boolean amortized_sorting = true;
 +  // syntax-constrained decoding
 +  public boolean constrain_parse = false;
 +
 +  public boolean use_pos_labels = false;
 +
 +  // oov-specific
 +  public boolean true_oovs_only = false;
 +
 +  /* Dynamic sentence-level filtering. */
 +  public boolean filter_grammar = false;
 +
 +  /* The cube pruning pop limit. Set to 0 for exhaustive pruning. */
 +  public int pop_limit = 100;
 +
 +  /* Maximum sentence length. Sentences longer than this are truncated. */
 +  public int maxlen = 200;
 +
 +  /*
 +   * N-best configuration.
 +   */
 +  // Make sure output strings in the n-best list are unique.
 +  public boolean use_unique_nbest = true;
 +
 +  /* Include the phrasal alignments in the output (not word-level alignmetns at the moment). */
 +  public boolean include_align_index = false;
 +
 +  /* The number of hypotheses to output by default. */
 +  public int topN = 1;
 +
 +  /**
 +   * This string describes the format of each line of output from the decoder (i.e., the
 +   * translations). The string can include arbitrary text and also variables. The following
 +   * variables are available:
 +   *
 +   * <pre>
 +   * - %i the 0-indexed sentence number
 +   * - %e the source string %s the translated sentence
 +   * - %S the translated sentence with some basic capitalization and denormalization
 +   * - %t the synchronous derivation
 +   * - %f the list of feature values (as name=value pairs)
 +   * - %c the model cost
 +   * - %w the weight vector
 +   * - %a the alignments between source and target words (currently unimplemented)
 +   * - %d a verbose, many-line version of the derivation
 +   * </pre>
 +   */
 +  public String outputFormat = "%i ||| %s ||| %f ||| %c";
 +
 +  /* The number of decoding threads to use (-threads). */
 +  public int num_parallel_decoders = 1;
 +
 +  // disk hg
 +  public String hypergraphFilePattern = "";
 +
 +  /*
 +   * When true, _OOV is appended to all words that are passed through (useful for something like
 +   * transliteration on the target side
 +   */
 +  public boolean mark_oovs = false;
 +
 +  /* Enables synchronous parsing. */
 +  public boolean parse = false; // perform synchronous parsing
 +
 +
 +  /* A list of the feature functions. */
 +  public ArrayList<String> features = new ArrayList<String>();
 +
 +  /* A list of weights found in the main config file (instead of in a separate weights file) */
 +  public ArrayList<String> weights = new ArrayList<String>();
 +
 +  /* Determines whether to expect JSON input or plain lines */
 +  public enum INPUT_TYPE { plain, json };
 +  public INPUT_TYPE input_type = INPUT_TYPE.plain;
 +
 +  /* Type of server. Not sure we need to keep the regular TCP one around. */
 +  public enum SERVER_TYPE { none, TCP, HTTP };
 +  public SERVER_TYPE server_type = SERVER_TYPE.TCP;
 +
 +  /* If set, Joshua will start a (multi-threaded, per "threads") TCP/IP server on this port. */
 +  public int server_port = 0;
 +
 +  /*
 +   * Whether to do forest rescoring. If set to true, the references are expected on STDIN along with
 +   * the input sentences in the following format:
 +   * 
 +   * input sentence ||| ||| reference1 ||| reference2 ...
 +   * 
 +   * (The second field is reserved for the output sentence for alignment and forced decoding).
 +   */
 +
 +  public boolean rescoreForest = false;
 +  public float rescoreForestWeight = 10.0f;
 +
 +  /*
 +   * Location of fragment mapping file, which maps flattened SCFG rules to their internal
 +   * representation.
 +   */
 +  public String fragmentMapFile = null;
 +
 +  /*
 +   * Whether to use soft syntactic constraint decoding /fuzzy matching, which allows that any
 +   * nonterminal may be substituted for any other nonterminal (except for OOV and GOAL)
 +   */
 +  public boolean fuzzy_matching = false;
 +
 +  public static final String SOFT_SYNTACTIC_CONSTRAINT_DECODING_PROPERTY_NAME = "fuzzy_matching";
 +
 +  /***
 +   * Phrase-based decoding parameters.
 +   */
 +  
 +  /* The search algorithm: currently either "cky" or "stack" */
 +  public String search_algorithm = "cky";
 +
 +  /* The distortion limit */
 +  public int reordering_limit = 8;
 +
 +  /* The number of target sides considered for each source side (after sorting by model weight) */
 +  public int num_translation_options = 20;
 +
 +  /* If true, decode using a dot chart (standard CKY+); if false, use the much more efficient
 +   * version of Sennrich (SSST 2014)
 +   */
 +  public boolean use_dot_chart = true;
 +
 +  /* Moses compatibility */
 +  public boolean moses = false;
 +
 +  /* If true, just print out the weights found in the config file, and exit. */
 +  public boolean show_weights_and_quit = false;
 +
 +  /* Read input from a file (Moses compatible flag) */
 +  public String input_file = null;
 +
 +  /* Write n-best output to this file */
 +  public String n_best_file = null;
 +
 +  /* Whether to look at source side for special annotations */
 +  public boolean source_annotations = false;
 +
 +  /* Weights overridden from the command line */
 +  public String weight_overwrite = "";
 +
 +  /**
 +   * This method resets the state of JoshuaConfiguration back to the state after initialization.
 +   * This is useful when for example making different calls to the decoder within the same java
 +   * program, which otherwise leads to potential errors due to inconsistent state as a result of
 +   * loading the configuration multiple times without resetting etc.
 +   *
 +   * This leads to the insight that in fact it may be an even better idea to refactor the code and
 +   * make JoshuaConfiguration an object that is is created and passed as an argument, rather than a
 +   * shared static object. This is just a suggestion for the next step.
 +   *
 +   */
 +  public void reset() {
 +    LOG.info("Resetting the JoshuaConfiguration to its defaults ...");
 +    LOG.info("\n\tResetting the StatefullFF global state index ...");
 +    LOG.info("\n\t...done");
 +    StatefulFF.resetGlobalStateIndex();
 +    tms = new ArrayList<String>();
 +    weights_file = "";
 +    default_non_terminal = "[X]";
 +    oovList = new ArrayList<OOVItem>();
 +    oovList.add(new OOVItem(default_non_terminal, 1.0f));
 +    goal_symbol = "[GOAL]";
 +    amortized_sorting = true;
 +    constrain_parse = false;
 +    use_pos_labels = false;
 +    true_oovs_only = false;
 +    filter_grammar = false;
 +    pop_limit = 100;
 +    maxlen = 200;
 +    use_unique_nbest = false;
 +    include_align_index = false;
 +    topN = 1;
 +    outputFormat = "%i ||| %s ||| %f ||| %c";
 +    num_parallel_decoders = 1;
 +    hypergraphFilePattern = "";
 +    mark_oovs = false;
 +    // oracleFile = null;
 +    parse = false; // perform synchronous parsing
 +    features = new ArrayList<String>();
 +    weights = new ArrayList<String>();
 +    server_port = 0;
 +
 +    reordering_limit = 8;
 +    num_translation_options = 20;
 +    LOG.info("...done");
 +  }
 +
 +  // ===============================================================
 +  // Methods
 +  // ===============================================================
 +
 +  /**
 +   * To process command-line options, we write them to a file that looks like the config file, and
 +   * then call readConfigFile() on it. It would be more general to define a class that sits on a
 +   * stream and knows how to chop it up, but this was quicker to implement.
 +   * 
 +   * @param options string array of command line options
 +   */
 +  public void processCommandLineOptions(String[] options) {
 +    try {
 +      File tmpFile = File.createTempFile("options", null, null);
 +      PrintWriter out = new PrintWriter(new FileWriter(tmpFile));
 +
 +      for (int i = 0; i < options.length; i++) {
 +        String key = options[i].substring(1);
 +        if (i + 1 == options.length || options[i + 1].startsWith("-")) {
 +          // if this is the last item, or if the next item
 +          // is another flag, then this is a boolean flag
 +          out.println(key + " = true");
 +
 +        } else {
 +          out.print(key + " =");
 +          while (i + 1 < options.length && ! options[i + 1].startsWith("-")) {
 +            out.print(String.format(" %s", options[i + 1]));
 +            i++;
 +          }
 +          out.println();
 +        }
 +      }
 +      out.close();
 +      this.readConfigFile(tmpFile.getCanonicalPath());
 +
 +      tmpFile.delete();
 +
 +    } catch (IOException e) {
 +      throw new RuntimeException(e);
 +    }
 +  }
 +
 +  public void readConfigFile(String configFile) throws IOException {
 +
 +    LineReader configReader = new LineReader(configFile, false);
 +    try {
 +      for (String line : configReader) {
 +        line = line.trim(); // .toLowerCase();
 +
 +        if (Regex.commentOrEmptyLine.matches(line))
 +          continue;
 +
 +        /*
 +         * There are two kinds of substantive (non-comment, non-blank) lines: parameters and feature
 +         * values. Parameters match the pattern "key = value"; all other substantive lines are
 +         * interpreted as features.
 +         */
 +
 +        if (line.indexOf("=") != -1) { // parameters; (not feature function)
 +          String[] fds = Regex.equalsWithSpaces.split(line, 2);
 +          if (fds.length < 2) {
 +            LOG.warn("skipping config file line '{}'", line);
 +            continue;
 +          }
 +
 +          String parameter = normalize_key(fds[0]);
 +
 +          if (parameter.equals(normalize_key("lm"))) {
 +            /* This is deprecated. This support old LM lines of the form
 +             * 
 +             *   lm = berkeleylm 5 false false 100 lm.gz
 +             * 
 +             * LMs are now loaded as general feature functions, so we transform that to either
 +             * 
-              *   feature-function = LanguageModel -lm_order 5 -lm_type berkeleylm -lm_file lm.gz
++             *   LanguageModel -lm_order 5 -lm_type berkeleylm -lm_file lm.gz
 +             * 
 +             * If the line were state minimizing:
 +             * 
 +             *   lm = kenlm 5 true false 100 lm.gz
 +             *              
-              * feature-function = StateMinimizingLanguageModel -lm_order 5 -lm_file lm.gz
++             * StateMinimizingLanguageModel -lm_order 5 -lm_file lm.gz
 +             */
 +
 +            String[] tokens = fds[1].split("\\s+");
 +            if (tokens[2].equals("true"))
-               features.add(String.format("feature_function = StateMinimizingLanguageModel -lm_type kenlm -lm_order %s -lm_file %s",
++              features.add(String.format("StateMinimizingLanguageModel -lm_type kenlm -lm_order %s -lm_file %s",
 +                  tokens[1], tokens[5]));
 +            else
-               features.add(String.format("feature_function = LanguageModel -lm_type %s -lm_order %s -lm_file %s",
++              features.add(String.format("LanguageModel -lm_type %s -lm_order %s -lm_file %s",
 +                  tokens[0], tokens[1], tokens[5]));
 +
 +          } else if (parameter.equals(normalize_key("tm"))) {
 +            /* If found, convert old format:
 +             *   tm = TYPE OWNER MAXSPAN PATH
 +             * to new format
 +             *   tm = TYPE -owner OWNER -maxspan MAXSPAN -path PATH    
 +             */
 +            String tmLine = fds[1];
 +
 +            String[] tokens = fds[1].split("\\s+");
 +            if (! tokens[1].startsWith("-")) { // old format
 +              tmLine = String.format("%s -owner %s -maxspan %s -path %s", tokens[0], tokens[1], tokens[2], tokens[3]);
 +              LOG.warn("Converting deprecated TM line from '{}' -> '{}'", fds[1], tmLine);
 +            }
 +            tms.add(tmLine);
 +
 +          } else if (parameter.equals("v")) {
 +            Decoder.VERBOSE = Integer.parseInt(fds[1]);
 +
 +          } else if (parameter.equals(normalize_key("parse"))) {
 +            parse = Boolean.parseBoolean(fds[1]);
 +            LOG.debug("parse: {}", parse);
 +
 +          } else if (parameter.equals(normalize_key("dump-hypergraph"))) {
 +            hypergraphFilePattern = fds[1].trim();
 +            LOG.debug("  hypergraph dump file format: {}", hypergraphFilePattern);
 +
 +          } else if (parameter.equals(normalize_key("oov-list"))) {
 +            if (new File(fds[1]).exists()) {
 +              oovList = new ArrayList<OOVItem>();
 +              try {
 +                File file = new File(fds[1]);
 +                BufferedReader br = new BufferedReader(new FileReader(file));
 +                try {
 +                  String str = br.readLine();
 +                  while (str != null) {
 +                    String[] tokens = str.trim().split("\\s+");
 +
 +                    oovList.add(new OOVItem(FormatUtils.ensureNonTerminalBrackets(tokens[0]),
 +                            (float) Math.log(Float.parseFloat(tokens[1]))));
 +
 +                    str = br.readLine();
 +                  }
 +                  br.close();
 +                } catch(IOException e){
 +                  System.out.println(e);
 +                }
 +              } catch(IOException e){
 +                System.out.println(e);
 +              }
 +              Collections.sort(oovList);
 +
 +            } else {
 +              String[] tokens = fds[1].trim().split("\\s+");
 +              if (tokens.length % 2 != 0) {
 +                throw new RuntimeException(String.format("* FATAL: invalid format for '%s'", fds[0]));
 +              }
 +              oovList = new ArrayList<OOVItem>();
 +
 +              for (int i = 0; i < tokens.length; i += 2)
 +                oovList.add(new OOVItem(FormatUtils.ensureNonTerminalBrackets(tokens[i]),
 +                    (float) Math.log(Float.parseFloat(tokens[i + 1]))));
 +
 +              Collections.sort(oovList);
 +            }
 +
 +          } else if (parameter.equals(normalize_key("lattice-decoding"))) {
 +            lattice_decoding = true;
 +
 +          } else if (parameter.equals(normalize_key("segment-oovs"))) {
 +            segment_oovs = true;
 +            lattice_decoding = true;
 +
 +          } else if (parameter.equals(normalize_key("default-non-terminal"))) {
 +            default_non_terminal = ensureNonTerminalBrackets(cleanNonTerminal(fds[1].trim()));
 +            LOG.debug("default_non_terminal: {}", default_non_terminal);
 +
 +          } else if (parameter.equals(normalize_key("goal-symbol"))) {
 +            goal_symbol = ensureNonTerminalBrackets(cleanNonTerminal(fds[1].trim()));
 +            LOG.debug("goalSymbol: {}", goal_symbol);
 +
 +          } else if (parameter.equals(normalize_key("weights-file"))) {
 +            weights_file = fds[1];
 +
 +          } else if (parameter.equals(normalize_key("constrain_parse"))) {
 +            constrain_parse = Boolean.parseBoolean(fds[1]);
 +
 +          } else if (parameter.equals(normalize_key("true_oovs_only"))) {
 +            true_oovs_only = Boolean.parseBoolean(fds[1]);
 +
 +          } else if (parameter.equals(normalize_key("filter-grammar"))) {
 +            filter_grammar = Boolean.parseBoolean(fds[1]);
 +
 +          } else if (parameter.equals(normalize_key("amortize"))) {
 +            amortized_sorting = Boolean.parseBoolean(fds[1]);
 +
 +          } else if (parameter.equals(normalize_key("use_pos_labels"))) {
 +            use_pos_labels = Boolean.parseBoolean(fds[1]);
 +
 +          } else if (parameter.equals(normalize_key("use_unique_nbest"))) {
 +            use_unique_nbest = Boolean.valueOf(fds[1]);
 +            LOG.debug("use_unique_nbest: {}", use_unique_nbest);
 +
 +          } else if (parameter.equals(normalize_key("output-format"))) {
 +            outputFormat = fds[1];
 +            LOG.debug("output-format: {}", outputFormat);
 +
 +          } else if (parameter.equals(normalize_key("include_align_index"))) {
 +            include_align_index = Boolean.valueOf(fds[1]);
 +            LOG.debug("include_align_index: {}", include_align_index);
 +
 +          } else if (parameter.equals(normalize_key("top_n"))) {
 +            topN = Integer.parseInt(fds[1]);
 +            LOG.debug("topN: {}", topN);
 +
 +          } else if (parameter.equals(normalize_key("num_parallel_decoders"))
 +              || parameter.equals(normalize_key("threads"))) {
 +            num_parallel_decoders = Integer.parseInt(fds[1]);
 +            if (num_parallel_decoders <= 0) {
 +              throw new IllegalArgumentException(
 +                  "Must specify a positive number for num_parallel_decoders");
 +            }
 +            LOG.debug("num_parallel_decoders: {}", num_parallel_decoders);
 +
 +          } else if (parameter.equals(normalize_key("mark_oovs"))) {
 +            mark_oovs = Boolean.valueOf(fds[1]);
 +            LOG.debug("mark_oovs: {}", mark_oovs);
 +
 +          } else if (parameter.equals(normalize_key("pop-limit"))) {
 +            pop_limit = Integer.parseInt(fds[1]);
 +            LOG.info("pop-limit: {}", pop_limit);
 +
 +          } else if (parameter.equals(normalize_key("input-type"))) {
 +            if (fds[1].equals("json")) {
 +              input_type = INPUT_TYPE.json;
 +            } else if (fds[1].equals("plain")) {
 +              input_type = INPUT_TYPE.plain;
 +            } else {
 +              throw new RuntimeException(String.format("* FATAL: invalid server type '%s'", fds[1]));
 +            }
 +            LOG.info("    input-type: {}", input_type);
 +
 +          } else if (parameter.equals(normalize_key("server-type"))) {
 +            if (fds[1].toLowerCase().equals("tcp"))
 +              server_type = SERVER_TYPE.TCP;
 +            else if (fds[1].toLowerCase().equals("http"))
 +              server_type = SERVER_TYPE.HTTP;
 +
 +            LOG.info("    server-type: {}", server_type);
 +
 +          } else if (parameter.equals(normalize_key("server-port"))) {
 +            server_port = Integer.parseInt(fds[1]);
 +            LOG.info("    server-port: {}", server_port);
 +
 +          } else if (parameter.equals(normalize_key("rescore-forest"))) {
 +            rescoreForest = true;
 +            LOG.info("    rescore-forest: {}", rescoreForest);
 +
 +          } else if (parameter.equals(normalize_key("rescore-forest-weight"))) {
 +            rescoreForestWeight = Float.parseFloat(fds[1]);
 +            LOG.info("    rescore-forest-weight: {}", rescoreForestWeight);
 +
 +          } else if (parameter.equals(normalize_key("maxlen"))) {
 +            // reset the maximum length
 +            maxlen = Integer.parseInt(fds[1]);
 +
 +          } else if (parameter.equals("c") || parameter.equals("config")) {
 +            // this was used to send in the config file, just ignore it
 +            ;
 +
 +          } else if (parameter.equals(normalize_key("feature-function"))) {
 +            // add the feature to the list of features for later processing
-             features.add("feature_function = " + fds[1]);
++            features.add(fds[1]);
 +
 +          } else if (parameter.equals(normalize_key("maxlen"))) {
 +            // add the feature to the list of features for later processing
 +            maxlen = Integer.parseInt(fds[1]);
 +
 +          } else if (parameter
 +              .equals(normalize_key(SOFT_SYNTACTIC_CONSTRAINT_DECODING_PROPERTY_NAME))) {
 +            fuzzy_matching = Boolean.parseBoolean(fds[1]);
 +            LOG.debug("fuzzy_matching: {}", fuzzy_matching);
 +
 +          } else if (parameter.equals(normalize_key("fragment-map"))) {
 +            fragmentMapFile = fds[1];
 +            Tree.readMapping(fragmentMapFile);
 +
 +            /** PHRASE-BASED PARAMETERS **/
 +          } else if (parameter.equals(normalize_key("search"))) {
 +            search_algorithm = fds[1];
 +
 +            if (!search_algorithm.equals("cky") && !search_algorithm.equals("stack")) {
 +              throw new RuntimeException(
 +                  "-search must be one of 'stack' (for phrase-based decoding) " +
 +                      "or 'cky' (for hierarchical / syntactic decoding)");
 +            }
 +
 +            if (search_algorithm.equals("cky") && include_align_index) {
 +              throw new RuntimeException(
 +                  "include_align_index is currently not supported with cky search");
 +            }
 +
 +          } else if (parameter.equals(normalize_key("reordering-limit"))) {
 +            reordering_limit = Integer.parseInt(fds[1]);
 +
 +          } else if (parameter.equals(normalize_key("num-translation-options"))) {
 +            num_translation_options = Integer.parseInt(fds[1]);
 +
 +          } else if (parameter.equals(normalize_key("no-dot-chart"))) {
 +            use_dot_chart = false;
 +
 +          } else if (parameter.equals(normalize_key("moses"))) {
 +            moses = true; // triggers some Moses-specific compatibility options
 +
 +          } else if (parameter.equals(normalize_key("show-weights"))) {
 +            show_weights_and_quit = true;
 +
 +          } else if (parameter.equals(normalize_key("n-best-list"))) {
 +            // for Moses compatibility
 +            String[] tokens = fds[1].split("\\s+");
 +            n_best_file = tokens[0];
 +            if (tokens.length > 1)
 +              topN = Integer.parseInt(tokens[1]);
 +
 +          } else if (parameter.equals(normalize_key("input-file"))) {
 +            // for Moses compatibility
 +            input_file = fds[1];
 +
 +          } else if (parameter.equals(normalize_key("weight-file"))) {
 +            // for Moses, ignore
 +
 +          } else if (parameter.equals(normalize_key("weight-overwrite"))) {
 +            weight_overwrite = fds[1];
 +
 +          } else if (parameter.equals(normalize_key("source-annotations"))) {
 +            // Check source sentence
 +            source_annotations = true;
 +
 +          } else if (parameter.equals(normalize_key("cached-rules-size"))) {
 +            // Check source sentence
 +            cachedRuleSize = Integer.parseInt(fds[1]);
 +          } else if (parameter.equals(normalize_key("lowercase"))) {
 +            lowercase = true;
 +
 +          } else if (parameter.equals(normalize_key("project-case"))) {
 +            project_case = true;
 +
 +          } else {
 +
 +            if (parameter.equals(normalize_key("use-sent-specific-tm"))
 +                || parameter.equals(normalize_key("add-combined-cost"))
 +                || parameter.equals(normalize_key("use-tree-nbest"))
 +                || parameter.equals(normalize_key("use-kenlm"))
 +                || parameter.equals(normalize_key("useCubePrune"))
 +                || parameter.equals(normalize_key("useBeamAndThresholdPrune"))
 +                || parameter.equals(normalize_key("regexp-grammar"))) {
 +              LOG.warn("ignoring deprecated parameter '{}'", fds[0]);
 +
 +            } else {
 +              throw new RuntimeException("FATAL: unknown configuration parameter '" + fds[0] + "'");
 +            }
 +          }
 +
 +          LOG.info("    {} = '{}'", normalize_key(fds[0]), fds[1]);
 +
 +        } else {
 +          /*
 +           * Lines that don't have an equals sign and are not blank lines, empty lines, or comments,
 +           * are feature values, which can be present in this file
 +           */
 +
 +          weights.add(line);
 +        }
 +      }
 +    } finally {
 +      configReader.close();
 +    }
 +  }
 +
 +  /**
 +   * Checks for invalid variable configurations
 +   */
 +  public void sanityCheck() {
 +  }
 +
 +  /**
 +   * Normalizes parameter names by removing underscores and hyphens and lowercasing. This defines
 +   * equivalence classes on external use of parameter names, permitting arbitrary_under_scores and
 +   * camelCasing in paramter names without forcing the user to memorize them all. Here are some
 +   * examples of equivalent ways to refer to parameter names:
 +   * <pre>
 +   * {pop-limit, poplimit, PopLimit, popLimit, pop_lim_it} {lmfile, lm-file, LM-FILE, lm_file}
 +   * </pre>
 +   * 
 +   * @param text the string to be normalized
 +   * @return normalized key
 +   * 
 +   */
 +  public static String normalize_key(String text) {
 +    return text.replaceAll("[-_]", "").toLowerCase();
 +  }
 +}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5c0d5388/src/main/java/org/apache/joshua/decoder/ff/OOVPenalty.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/decoder/ff/OOVPenalty.java
index 69584dd,0000000..e53e19f
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/decoder/ff/OOVPenalty.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/OOVPenalty.java
@@@ -1,117 -1,0 +1,118 @@@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *  http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing,
 + * software distributed under the License is distributed on an
 + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 + * KIND, either express or implied.  See the License for the
 + * specific language governing permissions and limitations
 + * under the License.
 + */
 +package org.apache.joshua.decoder.ff;
 +
 +import java.util.ArrayList;
 +import java.util.HashMap;
 +import java.util.List;
 +
 +import org.apache.joshua.decoder.JoshuaConfiguration;
 +import org.apache.joshua.decoder.JoshuaConfiguration.OOVItem;
 +import org.apache.joshua.decoder.ff.state_maintenance.DPState;
 +import org.apache.joshua.decoder.ff.tm.Rule;
 +import org.apache.joshua.decoder.hypergraph.HGNode;
 +import org.apache.joshua.decoder.segment_file.Sentence;
 +import org.apache.joshua.corpus.Vocabulary;
 +import org.apache.joshua.decoder.chart_parser.SourcePath;
 +
 +/**
 + * This feature is fired when an out-of-vocabulary word (with respect to the translation model) is
 + * entered into the chart. OOVs work in the following manner: for each word in the input that is OOV
 + * with respect to the translation model, we create a rule that pushes that word through
 + * untranslated (the suffix "_OOV" can optionally be appended according to the runtime parameter
 + * "mark-oovs") . These rules are all stored in a grammar whose owner is "oov". The OOV feature
 + * function template then fires the "OOVPenalty" feature whenever it is asked to score an OOV rule.
 + * 
 + * @author Matt Post post@cs.jhu.edu
 + */
 +public class OOVPenalty extends StatelessFF {
-   private int ownerID = -1;
++  private final int ownerID;
 +  
 +  /* The default value returned for OOVs. Can be overridden with -oov-list */
-   private float defaultValue = -100f;
-   private HashMap<Integer,Float> oovWeights = null;
++  private final float defaultValue = -100f;
++  private final HashMap<Integer,Float> oovWeights;
 +
 +  public OOVPenalty(FeatureVector weights, String[] args, JoshuaConfiguration config) {
 +    super(weights, "OOVPenalty", args, config);
 +
 +    ownerID = Vocabulary.id("oov");
 +    oovWeights = new HashMap<Integer,Float>();
 +    
-     if (config.oovList != null)
-       for (OOVItem item: config.oovList) 
++    if (config.oovList != null) {
++      for (OOVItem item: config.oovList) { 
 +        oovWeights.put(Vocabulary.id(item.label), item.weight);
++      }
++    }
 +  }
 +  
 +  @Override
 +  public ArrayList<String> reportDenseFeatures(int index) {
 +    denseFeatureIndex = index;
 +    
-     ArrayList<String> names = new ArrayList<String>();
++    ArrayList<String> names = new ArrayList<>(1);
 +    names.add(name);
 +    return names;
 +  }
 +
 +  /**
 +   * OOV rules cover exactly one word, and such rules belong to a grammar whose owner is "oov". Each
 +   * OOV fires the OOVPenalty feature with a value of 1, so the cost is simply the weight, which was
 +   * cached when the feature was created.
 +   */
 +  @Override
 +  public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
 +      Sentence sentence, Accumulator acc) {
 +    
 +    if (rule != null && this.ownerID == rule.getOwner()) {
- //      acc.add(name, getValue(rule.getLHS()));
 +      acc.add(denseFeatureIndex, getValue(rule.getLHS()));
 +    }
 +
 +    return null;
 +  }
 +  
 +  /**
 +   * It's important for the OOV feature to contribute to the rule's estimated cost, so that OOV
 +   * rules (which are added for all words, not just ones without translation options) get sorted
 +   * to the bottom during cube pruning.
 +   * 
 +   * Important! estimateCost returns the *weighted* feature value.
 +   */
 +  @Override
 +  public float estimateCost(Rule rule, Sentence sentence) {
 +    if (rule != null && this.ownerID == rule.getOwner())
 +      return weights.getDense(denseFeatureIndex) * getValue(rule.getLHS());
 +    return 0.0f;
 +  }
 +  
 +  private float getValue(int lhs) {
 +    return oovWeights.containsKey(lhs) ? oovWeights.get(lhs) : defaultValue;
 +  }
 +
 +  @Override
 +  public double estimateLogP(Rule rule, int sentID) {
 +    // TODO Auto-generated method stub
 +    return 0;
 +  }
 +
 +  @Override
 +  public double getWeight() {
 +    // TODO Auto-generated method stub
 +    return 0;
 +  }
 +}


[5/5] incubator-joshua git commit: Merge branch 'JOSHUA-PR21' into JOSHUA-252

Posted by mj...@apache.org.
Merge branch 'JOSHUA-PR21' into JOSHUA-252


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/8793c45d
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/8793c45d
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/8793c45d

Branch: refs/heads/JOSHUA-252
Commit: 8793c45d783c09db89c775536029092a8d322083
Parents: 9e70266 5c0d538
Author: Matt Post <po...@cs.jhu.edu>
Authored: Tue May 31 15:39:13 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Tue May 31 15:39:13 2016 -0400

----------------------------------------------------------------------
 lib/ivy.xml                                     |  17 +++
 src/joshua/decoder/ff/LexicalFeatures.java      | 131 +++++++++++++++++++
 .../org/apache/joshua/corpus/Vocabulary.java    |  13 +-
 .../java/org/apache/joshua/decoder/Decoder.java |  17 ++-
 .../joshua/decoder/JoshuaConfiguration.java     |  10 +-
 .../apache/joshua/decoder/ff/OOVPenalty.java    |  15 ++-
 .../org/apache/joshua/decoder/ff/RuleFF.java    | 109 +++++++++------
 .../apache/joshua/decoder/ff/RuleLength.java    |  13 +-
 .../org/apache/joshua/decoder/ff/RuleShape.java |  67 +++++++---
 .../apache/joshua/decoder/ff/WordPenalty.java   |  10 +-
 .../lm/berkeley_lm/LMGrammarBerkeleyTest.java   |   2 +-
 .../system/MultithreadedTranslationTests.java   |   2 +-
 .../system/StructuredTranslationTest.java       |   2 +-
 13 files changed, 314 insertions(+), 94 deletions(-)
----------------------------------------------------------------------