You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/06/01 02:52:07 UTC
[72/94] [abbrv] incubator-joshua git commit: Added Sparse lexical feature function. Revised various other sparse feature functions to avoid String formatting. Expensive feature functions now use an LRU cache to avoid re-calculation of feature hashes for

Added Sparse lexical feature function. Revised various other sparse feature functions to avoid String formatting. Expensive feature functions now use an LRU cache to avoid re-calculation of feature hashes for commonly used rules. Also cleaned up the feature string parsing a little bit.


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/25a92cbc
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/25a92cbc
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/25a92cbc

Branch: refs/heads/master
Commit: 25a92cbca7c3a11c1d99c3e71686aea9874e0133
Parents: fadc285
Author: Felix Hieber <fh...@amazon.com>
Authored: Sat Apr 30 09:35:10 2016 -0700
Committer: Felix Hieber <fh...@amazon.com>
Committed: Mon May 30 11:44:51 2016 +0200

----------------------------------------------------------------------
 src/joshua/corpus/Vocabulary.java               |  13 +-
 src/joshua/decoder/Decoder.java                 |  29 ++--
 src/joshua/decoder/JoshuaConfiguration.java     |  10 +-
 src/joshua/decoder/ff/LexicalFeatures.java      | 131 +++++++++++++++++++
 src/joshua/decoder/ff/OOVPenalty.java           |  15 ++-
 src/joshua/decoder/ff/RuleFF.java               | 110 ++++++++++------
 src/joshua/decoder/ff/RuleLength.java           |  13 +-
 src/joshua/decoder/ff/RuleShape.java            |  66 +++++++---
 src/joshua/decoder/ff/WordPenalty.java          |  10 +-
 .../lm/berkeley_lm/LMGrammarBerkeleyTest.java   |   2 +-
 .../system/MultithreadedTranslationTests.java   |   2 +-
 .../system/StructuredTranslationTest.java       |   2 +-
 12 files changed, 301 insertions(+), 102 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/25a92cbc/src/joshua/corpus/Vocabulary.java
----------------------------------------------------------------------
diff --git a/src/joshua/corpus/Vocabulary.java b/src/joshua/corpus/Vocabulary.java
index 74f6a47..2193629 100644
--- a/src/joshua/corpus/Vocabulary.java
+++ b/src/joshua/corpus/Vocabulary.java
@@ -205,10 +205,17 @@ public class Vocabulary {
   }
 
   public static String getWords(int[] ids) {
-    if (ids.length == 0) return "";
+    return getWords(ids, " ");
+  }
+  
+  public static String getWords(int[] ids, final String separator) {
+    if (ids.length == 0) {
+      return "";
+    }
     StringBuilder sb = new StringBuilder();
-    for (int i = 0; i < ids.length - 1; i++)
-      sb.append(word(ids[i])).append(" ");
+    for (int i = 0; i < ids.length - 1; i++) {
+      sb.append(word(ids[i])).append(separator);
+    }
     return sb.append(word(ids[ids.length - 1])).toString();
   }
 

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/25a92cbc/src/joshua/decoder/Decoder.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/Decoder.java b/src/joshua/decoder/Decoder.java
index 22ed8b9..97ac9aa 100644
--- a/src/joshua/decoder/Decoder.java
+++ b/src/joshua/decoder/Decoder.java
@@ -20,7 +20,7 @@ package joshua.decoder;
 
 import static joshua.decoder.ff.FeatureVector.DENSE_FEATURE_NAMES;
 
-import java.io.BufferedWriter;	
+import java.io.BufferedWriter;
 import java.io.File;
 import java.io.IOException;
 import java.io.OutputStream;
@@ -34,8 +34,6 @@ import java.util.List;
 import java.util.concurrent.ArrayBlockingQueue;
 import java.util.concurrent.BlockingQueue;
 
-import com.google.common.base.Strings;
-
 import joshua.corpus.Vocabulary;
 import joshua.decoder.ff.FeatureVector;
 import joshua.decoder.JoshuaConfiguration.INPUT_TYPE;
@@ -59,6 +57,8 @@ import joshua.util.FormatUtils;
 import joshua.util.Regex;
 import joshua.util.io.LineReader;
 
+import com.google.common.base.Strings;
+
 /**
  * This class handles decoder initialization and the complication introduced by multithreading.
  * 
@@ -914,7 +914,7 @@ public class Decoder {
    * Feature functions are instantiated with a line of the form
    * 
    * <pre>
-   *   feature_function = FEATURE OPTIONS
+   *   FEATURE OPTIONS
    * </pre>
    * 
    * Weights for features are listed separately.
@@ -926,31 +926,26 @@ public class Decoder {
   private void initializeFeatureFunctions() throws IOException {
 
     for (String featureLine : joshuaConfiguration.features) {
-      // feature-function = NAME args
+      // line starts with NAME, followed by args
       // 1. create new class named NAME, pass it config, weights, and the args
 
-      // Get rid of the leading crap.
-      featureLine = featureLine.replaceFirst("^feature_function\\s*=\\s*", "");
-
       String fields[] = featureLine.split("\\s+");
       String featureName = fields[0];
+      
       try {
+        
         Class<?> clas = getClass(featureName);
         Constructor<?> constructor = clas.getConstructor(FeatureVector.class,
             String[].class, JoshuaConfiguration.class);
-        this.featureFunctions.add((FeatureFunction) constructor.newInstance(weights, fields, joshuaConfiguration));
+        FeatureFunction feature = (FeatureFunction) constructor.newInstance(weights, fields, joshuaConfiguration);
+        Decoder.LOG(1, String.format("FEATURE: %s", feature.logString()));
+        this.featureFunctions.add(feature);
+        
       } catch (Exception e) {
-        e.printStackTrace();
-        System.err.println("* FATAL: could not find a feature '" + featureName + "'");
-        System.exit(1);
+        throw new RuntimeException(String.format("Unable to instantiate feature function '%s'!", featureLine), e); 
       }
     }
 
-    for (FeatureFunction feature : featureFunctions) {
-      Decoder.LOG(1, String.format("FEATURE: %s", feature.logString()));
-      
-    }
-
     weights.registerDenseFeatures(featureFunctions);
   }
 

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/25a92cbc/src/joshua/decoder/JoshuaConfiguration.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/JoshuaConfiguration.java b/src/joshua/decoder/JoshuaConfiguration.java
index c874534..05197e5 100644
--- a/src/joshua/decoder/JoshuaConfiguration.java
+++ b/src/joshua/decoder/JoshuaConfiguration.java
@@ -390,21 +390,21 @@ public class JoshuaConfiguration {
              * 
              * LMs are now loaded as general feature functions, so we transform that to either
              * 
-             *   feature-function = LanguageModel -lm_order 5 -lm_type berkeleylm -lm_file lm.gz
+             *   LanguageModel -lm_order 5 -lm_type berkeleylm -lm_file lm.gz
              * 
              * If the line were state minimizing:
              * 
              *   lm = kenlm 5 true false 100 lm.gz
              *              
-             * feature-function = StateMinimizingLanguageModel -lm_order 5 -lm_file lm.gz
+             * StateMinimizingLanguageModel -lm_order 5 -lm_file lm.gz
              */
             
             String[] tokens = fds[1].split("\\s+");
             if (tokens[2].equals("true"))
-              features.add(String.format("feature_function = StateMinimizingLanguageModel -lm_type kenlm -lm_order %s -lm_file %s",
+              features.add(String.format("StateMinimizingLanguageModel -lm_type kenlm -lm_order %s -lm_file %s",
                   tokens[1], tokens[5]));
             else
-              features.add(String.format("feature_function = LanguageModel -lm_type %s -lm_order %s -lm_file %s",
+              features.add(String.format("LanguageModel -lm_type %s -lm_order %s -lm_file %s",
                   tokens[0], tokens[1], tokens[5]));
 
           } else if (parameter.equals(normalize_key("tm"))) {
@@ -582,7 +582,7 @@ public class JoshuaConfiguration {
 
           } else if (parameter.equals(normalize_key("feature-function"))) {
             // add the feature to the list of features for later processing
-            features.add("feature_function = " + fds[1]);
+            features.add(fds[1]);
 
           } else if (parameter.equals(normalize_key("maxlen"))) {
             // add the feature to the list of features for later processing

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/25a92cbc/src/joshua/decoder/ff/LexicalFeatures.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/LexicalFeatures.java b/src/joshua/decoder/ff/LexicalFeatures.java
new file mode 100644
index 0000000..128df87
--- /dev/null
+++ b/src/joshua/decoder/ff/LexicalFeatures.java
@@ -0,0 +1,131 @@
+package joshua.decoder.ff;
+
+import static com.google.common.cache.CacheBuilder.newBuilder;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import joshua.corpus.Vocabulary;
+import joshua.decoder.JoshuaConfiguration;
+import joshua.decoder.chart_parser.SourcePath;
+import joshua.decoder.ff.state_maintenance.DPState;
+import joshua.decoder.ff.tm.Rule;
+import joshua.decoder.hypergraph.HGNode;
+import joshua.decoder.segment_file.Sentence;
+
+import com.google.common.cache.Cache;
+
+/**
+ *  Lexical alignment features denoting alignments, deletions, and insertions.
+ */
+public class LexicalFeatures extends StatelessFF {
+  
+  private final boolean useAlignments;
+  private final boolean useDeletions;
+  private final boolean useInsertions;
+  
+  private static final String NAME = "LexicalFeatures";
+  // value to fire for features
+  private static final int VALUE = 1;
+  //whether this feature is restricted to a certain grammar/owner
+  private final boolean ownerRestriction;
+  // the grammar/owner this feature is restricted to fire
+  private final int owner;
+  // Strings separating words
+  private static final String SEPARATOR = "~";
+  
+  private final Cache<Rule, List<String>> featureCache;
+  
+  public LexicalFeatures(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+    super(weights, NAME, args, config);
+    
+    ownerRestriction = (parsedArgs.containsKey("owner")) ? true : false;
+    owner = ownerRestriction ? Vocabulary.id(parsedArgs.get("owner")) : 0;
+    
+    useAlignments = parsedArgs.containsKey("alignments");
+    useDeletions = parsedArgs.containsKey("deletions");
+    useInsertions = parsedArgs.containsKey("insertions");
+    
+    // initialize cache
+    if (parsedArgs.containsKey("cacheSize")) {
+      featureCache = newBuilder().maximumSize(Integer.parseInt(parsedArgs.get("cacheSize"))).build();
+    } else {
+      featureCache = newBuilder().maximumSize(config.cachedRuleSize).build();
+    }
+  }
+
+  @Override
+  public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+      Sentence sentence, Accumulator acc) {
+    
+    if (ownerRestriction && rule.getOwner() != owner) {
+      return null;
+    }
+
+    List<String> featureNames = featureCache.getIfPresent(rule);
+    if (featureNames == null) {
+      featureNames = getFeatures(rule);
+      featureCache.put(rule, featureNames);
+    }
+    for (String feature : featureNames) {
+      acc.add(feature, VALUE);
+    }
+    
+    return null;
+  }
+  
+  /**
+   * Obtains the feature ids for the given rule.
+   * @param rule
+   * @return list of Strings representing the feature names.
+   */
+  private List<String> getFeatures(final Rule rule) {
+    final List<String> result = new ArrayList<>();
+    
+    byte[] alignments = rule.getAlignment();
+    if (alignments == null) {
+      return result;
+    }
+    int[] sourceWords = rule.getFrench();
+    int[] targetWords = rule.getEnglish();
+    
+    // sourceAligned & targetAligned indicate whether an index is covered by alignments
+    boolean[] sourceAligned = new boolean[sourceWords.length];
+    boolean[] targetAligned = new boolean[targetWords.length];
+    
+    // translations: aligned words
+    for (int i = 0; i < alignments.length; i+=2) {
+      byte sourceIndex = alignments[i];
+      byte targetIndex = alignments[i + 1];
+      sourceAligned[sourceIndex] = true;
+      targetAligned[targetIndex] = true;
+      if (useAlignments) {
+        result.add(
+            "T:" + 
+            Vocabulary.word(sourceWords[sourceIndex]) + 
+            SEPARATOR + 
+            Vocabulary.word(targetWords[targetIndex]));
+      }
+    }
+    
+    // deletions: unaligned source words
+    if (useDeletions) {
+      for (int i = 0; i < sourceAligned.length; i++) {
+        if (!sourceAligned[i] && !Vocabulary.nt(sourceWords[i])) {
+          result.add("D:" + Vocabulary.word(sourceWords[i]));
+        }
+      }
+    }
+    
+    // insertions: unaligned target words
+    if (useInsertions) {
+      for (int i = 0; i < targetAligned.length; i++) {
+        if (useInsertions && !targetAligned[i] && !Vocabulary.nt(targetWords[i])) {
+          result.add("I:" + Vocabulary.word(targetWords[i]));
+        }
+      }
+    }
+    
+    return result;
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/25a92cbc/src/joshua/decoder/ff/OOVPenalty.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/OOVPenalty.java b/src/joshua/decoder/ff/OOVPenalty.java
index 6a06548..47a83ef 100644
--- a/src/joshua/decoder/ff/OOVPenalty.java
+++ b/src/joshua/decoder/ff/OOVPenalty.java
@@ -42,11 +42,11 @@ import joshua.decoder.chart_parser.SourcePath;
  * @author Matt Post <po...@cs.jhu.edu>
  */
 public class OOVPenalty extends StatelessFF {
-  private int ownerID = -1;
+  private final int ownerID;
   
   /* The default value returned for OOVs. Can be overridden with -oov-list */
-  private float defaultValue = -100f;
-  private HashMap<Integer,Float> oovWeights = null;
+  private final float defaultValue = -100f;
+  private final HashMap<Integer,Float> oovWeights;
 
   public OOVPenalty(FeatureVector weights, String[] args, JoshuaConfiguration config) {
     super(weights, "OOVPenalty", args, config);
@@ -54,16 +54,18 @@ public class OOVPenalty extends StatelessFF {
     ownerID = Vocabulary.id("oov");
     oovWeights = new HashMap<Integer,Float>();
     
-    if (config.oovList != null)
-      for (OOVItem item: config.oovList) 
+    if (config.oovList != null) {
+      for (OOVItem item: config.oovList) { 
         oovWeights.put(Vocabulary.id(item.label), item.weight);
+      }
+    }
   }
   
   @Override
   public ArrayList<String> reportDenseFeatures(int index) {
     denseFeatureIndex = index;
     
-    ArrayList<String> names = new ArrayList<String>();
+    ArrayList<String> names = new ArrayList<>(1);
     names.add(name);
     return names;
   }
@@ -78,7 +80,6 @@ public class OOVPenalty extends StatelessFF {
       Sentence sentence, Accumulator acc) {
     
     if (rule != null && this.ownerID == rule.getOwner()) {
-//      acc.add(name, getValue(rule.getLHS()));
       acc.add(denseFeatureIndex, getValue(rule.getLHS()));
     }
 

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/25a92cbc/src/joshua/decoder/ff/RuleFF.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/RuleFF.java b/src/joshua/decoder/ff/RuleFF.java
index 9fb7d3e..48e4340 100644
--- a/src/joshua/decoder/ff/RuleFF.java
+++ b/src/joshua/decoder/ff/RuleFF.java
@@ -18,6 +18,9 @@
  */
 package joshua.decoder.ff;
 
+import static com.google.common.cache.CacheBuilder.newBuilder;
+import static joshua.corpus.Vocabulary.getWords;
+
 import java.util.List;
 
 import joshua.corpus.Vocabulary;
@@ -28,61 +31,94 @@ import joshua.decoder.ff.tm.Rule;
 import joshua.decoder.hypergraph.HGNode;
 import joshua.decoder.segment_file.Sentence;
 
+import com.google.common.cache.Cache;
+
 /**
- *  This feature just counts rules that are used. You can restrict it with a number of flags:
- * 
- *   -owner OWNER
- *    Only count rules owned by OWNER
- *   -target|-source
- *    Only count the target or source side (plus the LHS)
- *
- * TODO: add an option to separately provide a list of rule counts, restrict to counts above a threshold. 
+ *  This feature fires for rule ids.
+ *  Firing can be restricted to rules from a certain owner, and rule ids
+ *  can be generated from source side and/or target side. 
  */
 public class RuleFF extends StatelessFF {
 
   private enum Sides { SOURCE, TARGET, BOTH };
   
-  private int owner = 0;
-  private Sides sides = Sides.BOTH;
+  private static final String NAME = "RuleFF";
+  // value to fire for features
+  private static final int VALUE = 1;
+  // whether this feature is restricted to a certain grammar/owner
+  private final boolean ownerRestriction;
+  // the grammar/owner this feature is restricted to fire
+  private final int owner;
+  // what part of the rule should be extracted;
+  private final Sides sides;
+  // Strings separating words and rule sides 
+  private static final String SEPARATOR = "~";
+  private static final String SIDES_SEPARATOR = "->";
+  
+  private final Cache<Rule, String> featureCache;
   
   public RuleFF(FeatureVector weights, String[] args, JoshuaConfiguration config) {
-    super(weights, "RuleFF", args, config);
+    super(weights, NAME, args, config);
+    
+    ownerRestriction = (parsedArgs.containsKey("owner")) ? true : false;
+    owner = ownerRestriction ? Vocabulary.id(parsedArgs.get("owner")) : 0;
     
-    owner = Vocabulary.id(parsedArgs.get("owner"));
-    if (parsedArgs.containsKey("source"))
-      sides = Sides.SOURCE;
-    else if (parsedArgs.containsKey("target"))
-      sides = Sides.TARGET;
+    if (parsedArgs.containsKey("sides")) {
+      final String sideValue = parsedArgs.get("sides");
+      if (sideValue.equalsIgnoreCase("source")) {
+        sides = Sides.SOURCE;
+      } else if (sideValue.equalsIgnoreCase("target")) {
+        sides = Sides.TARGET;
+      } else if (sideValue.equalsIgnoreCase("both")){
+        sides = Sides.BOTH;
+      } else {
+        throw new RuntimeException("Unknown side value.");
+      }
+    } else {
+      sides = Sides.BOTH;
+    }
+    
+    // initialize cache
+    if (parsedArgs.containsKey("cacheSize")) {
+      featureCache = newBuilder().maximumSize(Integer.parseInt(parsedArgs.get("cacheSize"))).build();
+    } else {
+      featureCache = newBuilder().maximumSize(config.cachedRuleSize).build();
+    }
   }
 
   @Override
   public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
       Sentence sentence, Accumulator acc) {
-
-    if (owner > 0 && rule.getOwner() == owner) {
-      String ruleString = getRuleString(rule);
-      acc.add(ruleString, 1);
+    
+    if (ownerRestriction && rule.getOwner() != owner) {
+      return null;
     }
 
+    String featureName = featureCache.getIfPresent(rule);
+    if (featureName == null) {
+      featureName = getRuleString(rule);
+      featureCache.put(rule, featureName);
+    }
+    acc.add(featureName, VALUE);
+    
     return null;
   }
-
-  private String getRuleString(Rule rule) {
-    String ruleString = "";
-    switch(sides) {
-    case BOTH:
-      ruleString = String.format("%s  %s  %s", Vocabulary.word(rule.getLHS()), rule.getFrenchWords(),
-          rule.getEnglishWords());
-      break;
-
-    case SOURCE:
-      ruleString = String.format("%s  %s", Vocabulary.word(rule.getLHS()), rule.getFrenchWords());
-      break;
-
-    case TARGET:
-      ruleString = String.format("%s  %s", Vocabulary.word(rule.getLHS()), rule.getEnglishWords());
-      break;
+  
+  /**
+   * Obtains the feature id for the given rule.
+   * @param rule
+   * @return String representing the feature name.
+   */
+  private String getRuleString(final Rule rule) {
+    final StringBuilder sb = new StringBuilder(Vocabulary.word(rule.getLHS()))
+      .append(SIDES_SEPARATOR);
+    if (sides == Sides.SOURCE || sides == Sides.BOTH) {
+      sb.append(getWords(rule.getFrench(), SEPARATOR));
+    }
+    sb.append(SIDES_SEPARATOR);
+    if (sides == Sides.TARGET || sides == Sides.BOTH) {
+      sb.append(getWords(rule.getEnglish(), SEPARATOR));
     }
-    return ruleString.replaceAll("[ =]", "~");
+    return sb.toString();
   }
 }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/25a92cbc/src/joshua/decoder/ff/RuleLength.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/RuleLength.java b/src/joshua/decoder/ff/RuleLength.java
index 645905a..ce02282 100644
--- a/src/joshua/decoder/ff/RuleLength.java
+++ b/src/joshua/decoder/ff/RuleLength.java
@@ -32,6 +32,8 @@ import joshua.decoder.segment_file.Sentence;
  * source side, its target side, and a feature that pairs them.
  */
 public class RuleLength extends StatelessFF {
+  
+  private static final int VALUE = 1;
 
   public RuleLength(FeatureVector weights, String[] args, JoshuaConfiguration config) {
     super(weights, "RuleLength", args, config);
@@ -40,12 +42,11 @@ public class RuleLength extends StatelessFF {
   @Override
   public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
       Sentence sentence, Accumulator acc) {
-    int sourceLen = rule.getFrench().length;
-    int targetLen = rule.getEnglish().length;
-    acc.add(String.format("%s_sourceLength%d", name, sourceLen), 1);
-    acc.add(String.format("%s_targetLength%d", name, targetLen), 1);
-    acc.add(String.format("%s_pairLength%d-%d", name, sourceLen, targetLen), 1);
-
+    int sourceLength = rule.getFrench().length;
+    int targetLength = rule.getEnglish().length;
+    acc.add(name + "_source" + sourceLength, VALUE);
+    acc.add(name + "_target" + sourceLength, VALUE);
+    acc.add(name + "_sourceTarget" + sourceLength + "-" + targetLength, VALUE);
     return null;
   }
 }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/25a92cbc/src/joshua/decoder/ff/RuleShape.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/RuleShape.java b/src/joshua/decoder/ff/RuleShape.java
index e243528..3bd10a8 100644
--- a/src/joshua/decoder/ff/RuleShape.java
+++ b/src/joshua/decoder/ff/RuleShape.java
@@ -20,6 +20,7 @@ package joshua.decoder.ff;
 
 import java.util.List;
 
+import joshua.corpus.Vocabulary;
 import joshua.decoder.JoshuaConfiguration;
 import joshua.decoder.chart_parser.SourcePath;
 import joshua.decoder.ff.state_maintenance.DPState;
@@ -36,38 +37,63 @@ public class RuleShape extends StatelessFF {
     super(weights, "RuleShape", args, config);
   }
 
-  private int gettype(int id) {
-    if (id < 0)
-      return -1;
-    return 1;
+  private enum WordType {
+    N("N"), T("x"), P("+");
+    private final String string;
+    private boolean repeats;
+
+    private WordType(final String string) {
+      this.string = string;
+      this.repeats = false;
+    }
+    
+    private void setRepeats() {
+      repeats = true;
+    }
+
+    @Override
+    public String toString() {
+      if (repeats) {
+        return this.string + "+";
+      }
+      return this.string;
+    }
+  }
+
+  private WordType getWordType(int id) {
+    if (Vocabulary.nt(id)) {
+      return WordType.N;
+    } else {
+      return WordType.T;
+    }
   }
   
-  private String pattern(int[] ids) {
-    StringBuilder pattern = new StringBuilder();
-    int curtype = gettype(ids[0]);
-    int curcount = 1;
+  /**
+   * Returns a String describing the rule pattern.
+   */
+  private String getRulePattern(int[] ids) {
+    final StringBuilder pattern = new StringBuilder();
+    WordType currentType = getWordType(ids[0]);
     for (int i = 1; i < ids.length; i++) {
-      if (gettype(ids[i]) != curtype) {
-        pattern.append(String.format("%s%s_", curtype < 0 ? "N" : "x", curcount > 1 ? "+" : ""));
-        curtype = gettype(ids[i]);
-        curcount = 1;
+      if (getWordType(ids[i]) != currentType) {
+        pattern.append(currentType.toString());
+        currentType = getWordType(ids[i]);
       } else {
-        curcount++;
+        currentType.setRepeats();
       }
     }
-    pattern.append(String.format("%s%s_", curtype < 0 ? "N" : "x", curcount > 1 ? "+" : ""));
+    pattern.append(currentType.toString());
     return pattern.toString();
   }
   
   @Override
   public DPState compute(Rule rule, List<HGNode> tailNodes, int i_, int j, SourcePath sourcePath,
       Sentence sentence, Accumulator acc) {
-    String sourceShape = pattern(rule.getFrench());
-    String targetShape = pattern(rule.getEnglish());
-    acc.add(String.format("%s_source_%s", name, sourceShape), 1);
-    acc.add(String.format("%s_target_%s", name, targetShape), 1);
-    acc.add(String.format("%s_both_%s__%s", name, sourceShape, targetShape), 1);
-
+    final String sourceShape = getRulePattern(rule.getFrench());
+    final String targetShape = getRulePattern(rule.getEnglish());
+    acc.add(name + "_source_" + sourceShape, 1);
+    acc.add(name + "_target_" + sourceShape, 1);
+    acc.add(name + "_sourceTarget_" + sourceShape + "_" + targetShape, 1);
     return null;
   }
 }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/25a92cbc/src/joshua/decoder/ff/WordPenalty.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/WordPenalty.java b/src/joshua/decoder/ff/WordPenalty.java
index 583b59c..d72a4e6 100644
--- a/src/joshua/decoder/ff/WordPenalty.java
+++ b/src/joshua/decoder/ff/WordPenalty.java
@@ -37,12 +37,15 @@ import joshua.decoder.segment_file.Sentence;
 public final class WordPenalty extends StatelessFF {
 
   private float OMEGA = -(float) Math.log10(Math.E); // -0.435
+  private final boolean isCky;
 
   public WordPenalty(final FeatureVector weights, String[] args, JoshuaConfiguration config) {
     super(weights, "WordPenalty", args, config);
 
     if (parsedArgs.containsKey("value"))
       OMEGA = Float.parseFloat(parsedArgs.get("value"));
+    
+    isCky = config.search_algorithm.equals("cky");
   }
 
   @Override
@@ -52,10 +55,9 @@ public final class WordPenalty extends StatelessFF {
     if (rule != null) {
       // TODO: this is an inefficient way to do this. Find a better way to not apply this rule
       // to start and stop glue rules when phrase-based decoding.
-      if (config.search_algorithm.equals("cky") 
-          || (rule != Hypothesis.BEGIN_RULE && rule != Hypothesis.END_RULE))
-        // acc.add(name, OMEGA * (rule.getEnglish().length - rule.getArity()));
+      if (isCky || (rule != Hypothesis.BEGIN_RULE && rule != Hypothesis.END_RULE)) {
         acc.add(denseFeatureIndex, OMEGA * (rule.getEnglish().length - rule.getArity()));
+      }
     }
       
     return null;
@@ -64,7 +66,7 @@ public final class WordPenalty extends StatelessFF {
   @Override
   public ArrayList<String> reportDenseFeatures(int index) {
     denseFeatureIndex = index;
-    ArrayList<String> names = new ArrayList<String>();
+    ArrayList<String> names = new ArrayList<>(1);
     names.add(name);
     return names;
   }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/25a92cbc/tst/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java
----------------------------------------------------------------------
diff --git a/tst/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java b/tst/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java
index 6e0d90f..0a29646 100644
--- a/tst/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java
+++ b/tst/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java
@@ -49,7 +49,7 @@ public class LMGrammarBerkeleyTest {
   public void verifyLM() {
     joshuaConfig = new JoshuaConfiguration();
     joshuaConfig.processCommandLineOptions(OPTIONS);
-    joshuaConfig.features.add("feature_function = LanguageModel -lm_type berkeleylm -lm_order 2 -lm_file " + lmFile);
+    joshuaConfig.features.add("LanguageModel -lm_type berkeleylm -lm_order 2 -lm_file " + lmFile);
     decoder = new Decoder(joshuaConfig, null);
     String translation = decode(INPUT).toString();
     assertEquals(lmFile, "tm_glue_0=2.000 lm_0=-7.153\n", translation);

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/25a92cbc/tst/joshua/system/MultithreadedTranslationTests.java
----------------------------------------------------------------------
diff --git a/tst/joshua/system/MultithreadedTranslationTests.java b/tst/joshua/system/MultithreadedTranslationTests.java
index f438ccd..220bced 100644
--- a/tst/joshua/system/MultithreadedTranslationTests.java
+++ b/tst/joshua/system/MultithreadedTranslationTests.java
@@ -64,7 +64,7 @@ public class MultithreadedTranslationTests {
     joshuaConfig.tms.add("thrax -owner glue -maxspan -1 -path resources/grammar.glue");
     joshuaConfig.goal_symbol = "[GOAL]";
     joshuaConfig.default_non_terminal = "[X]";
-    joshuaConfig.features.add("feature_function = OOVPenalty");
+    joshuaConfig.features.add("OOVPenalty");
     joshuaConfig.weights.add("tm_pt_0 1");
     joshuaConfig.weights.add("tm_pt_1 1");
     joshuaConfig.weights.add("tm_pt_2 1");

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/25a92cbc/tst/joshua/system/StructuredTranslationTest.java
----------------------------------------------------------------------
diff --git a/tst/joshua/system/StructuredTranslationTest.java b/tst/joshua/system/StructuredTranslationTest.java
index 0608a65..249eabf 100644
--- a/tst/joshua/system/StructuredTranslationTest.java
+++ b/tst/joshua/system/StructuredTranslationTest.java
@@ -85,7 +85,7 @@ public class StructuredTranslationTest {
     joshuaConfig.tms.add("thrax -owner glue -maxspan -1 -path resources/grammar.glue");
     joshuaConfig.goal_symbol = "[GOAL]";
     joshuaConfig.default_non_terminal = "[X]";
-    joshuaConfig.features.add("feature_function = OOVPenalty");
+    joshuaConfig.features.add("OOVPenalty");
     joshuaConfig.weights.add("tm_pt_0 1");
     joshuaConfig.weights.add("tm_pt_1 1");
     joshuaConfig.weights.add("tm_pt_2 1");