You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/06/23 18:46:09 UTC
[58/60] incubator-joshua git commit: Merge branch 'maven-multi-module' of https://github.com/logogin/incubator-joshua into maven-multi-module

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/1bb8a203/joshua-core/src/main/java/org/apache/joshua/decoder/ff/OOVPenalty.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/OOVPenalty.java
index 0000000,5278172..92ee740
mode 000000,100644..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/OOVPenalty.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/OOVPenalty.java
@@@ -1,0 -1,106 +1,108 @@@
+ /*
+  * Licensed to the Apache Software Foundation (ASF) under one
+  * or more contributor license agreements.  See the NOTICE file
+  * distributed with this work for additional information
+  * regarding copyright ownership.  The ASF licenses this file
+  * to you under the Apache License, Version 2.0 (the
+  * "License"); you may not use this file except in compliance
+  * with the License.  You may obtain a copy of the License at
+  *
+  *  http://www.apache.org/licenses/LICENSE-2.0
+  *
+  * Unless required by applicable law or agreed to in writing,
+  * software distributed under the License is distributed on an
+  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  * KIND, either express or implied.  See the License for the
+  * specific language governing permissions and limitations
+  * under the License.
+  */
+ package org.apache.joshua.decoder.ff;
+ 
+ import java.util.ArrayList;
+ import java.util.HashMap;
+ import java.util.List;
+ 
+ import org.apache.joshua.decoder.JoshuaConfiguration;
+ import org.apache.joshua.decoder.JoshuaConfiguration.OOVItem;
+ import org.apache.joshua.decoder.ff.state_maintenance.DPState;
++import org.apache.joshua.decoder.ff.tm.OwnerId;
++import org.apache.joshua.decoder.ff.tm.OwnerMap;
+ import org.apache.joshua.decoder.ff.tm.Rule;
+ import org.apache.joshua.decoder.hypergraph.HGNode;
+ import org.apache.joshua.decoder.segment_file.Sentence;
+ import org.apache.joshua.corpus.Vocabulary;
+ import org.apache.joshua.decoder.chart_parser.SourcePath;
+ 
+ /**
+  * This feature is fired when an out-of-vocabulary word (with respect to the translation model) is
+  * entered into the chart. OOVs work in the following manner: for each word in the input that is OOV
+  * with respect to the translation model, we create a rule that pushes that word through
+  * untranslated (the suffix "_OOV" can optionally be appended according to the runtime parameter
+  * "mark-oovs") . These rules are all stored in a grammar whose owner is "oov". The OOV feature
+  * function template then fires the "OOVPenalty" feature whenever it is asked to score an OOV rule.
+  * 
+  * @author Matt Post post@cs.jhu.edu
+  */
+ public class OOVPenalty extends StatelessFF {
 -  private final int ownerID;
++  private final OwnerId ownerID;
+   
+   /* The default value returned for OOVs. Can be overridden with -oov-list */
+   private final float defaultValue = -100f;
+   private final HashMap<Integer,Float> oovWeights;
+ 
+   public OOVPenalty(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+     super(weights, "OOVPenalty", args, config);
+ 
 -    ownerID = Vocabulary.id("oov");
++    ownerID = OwnerMap.register("oov");
+     oovWeights = new HashMap<Integer,Float>();
+     
+     if (config.oovList != null) {
+       for (OOVItem item: config.oovList) { 
+         oovWeights.put(Vocabulary.id(item.label), item.weight);
+       }
+     }
+   }
+   
+   @Override
+   public ArrayList<String> reportDenseFeatures(int index) {
+     denseFeatureIndex = index;
+     
+     ArrayList<String> names = new ArrayList<>(1);
+     names.add(name);
+     return names;
+   }
+ 
+   /**
+    * OOV rules cover exactly one word, and such rules belong to a grammar whose owner is "oov". Each
+    * OOV fires the OOVPenalty feature with a value of 1, so the cost is simply the weight, which was
+    * cached when the feature was created.
+    */
+   @Override
+   public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+       Sentence sentence, Accumulator acc) {
+     
 -    if (rule != null && this.ownerID == rule.getOwner()) {
++    if (rule != null && this.ownerID.equals(rule.getOwner())) {
+       acc.add(denseFeatureIndex, getValue(rule.getLHS()));
+     }
+ 
+     return null;
+   }
+   
+   /**
+    * It's important for the OOV feature to contribute to the rule's estimated cost, so that OOV
+    * rules (which are added for all words, not just ones without translation options) get sorted
+    * to the bottom during cube pruning.
+    * 
+    * Important! estimateCost returns the *weighted* feature value.
+    */
+   @Override
+   public float estimateCost(Rule rule, Sentence sentence) {
 -    if (rule != null && this.ownerID == rule.getOwner())
++    if (rule != null && this.ownerID.equals(rule.getOwner()))
+       return weights.getDense(denseFeatureIndex) * getValue(rule.getLHS());
+     return 0.0f;
+   }
+   
+   private float getValue(int lhs) {
+     return oovWeights.containsKey(lhs) ? oovWeights.get(lhs) : defaultValue;
+   }
+ }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/1bb8a203/joshua-core/src/main/java/org/apache/joshua/decoder/ff/PhraseModel.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/PhraseModel.java
index 0000000,2324292..7ae3dbc
mode 000000,100644..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/PhraseModel.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/PhraseModel.java
@@@ -1,0 -1,135 +1,134 @@@
+ /*
+  * Licensed to the Apache Software Foundation (ASF) under one
+  * or more contributor license agreements.  See the NOTICE file
+  * distributed with this work for additional information
+  * regarding copyright ownership.  The ASF licenses this file
+  * to you under the Apache License, Version 2.0 (the
+  * "License"); you may not use this file except in compliance
+  * with the License.  You may obtain a copy of the License at
+  *
+  *  http://www.apache.org/licenses/LICENSE-2.0
+  *
+  * Unless required by applicable law or agreed to in writing,
+  * software distributed under the License is distributed on an
+  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  * KIND, either express or implied.  See the License for the
+  * specific language governing permissions and limitations
+  * under the License.
+  */
+ package org.apache.joshua.decoder.ff;
+ 
+ import java.util.ArrayList;
+ import java.util.List;
+ 
 -import org.apache.joshua.corpus.Vocabulary;
+ import org.apache.joshua.decoder.JoshuaConfiguration;
+ import org.apache.joshua.decoder.chart_parser.SourcePath;
+ import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+ import org.apache.joshua.decoder.ff.tm.Grammar;
++import org.apache.joshua.decoder.ff.tm.OwnerId;
++import org.apache.joshua.decoder.ff.tm.OwnerMap;
+ import org.apache.joshua.decoder.ff.tm.Rule;
+ import org.apache.joshua.decoder.hypergraph.HGNode;
+ import org.apache.joshua.decoder.segment_file.Sentence;
+ 
+ /**
+  * This feature handles the list of features that are found with grammar rules in the grammar file.
+  * dense features that may be associated with the rules in a grammar file. The feature names of
+  * these dense rules are a function of the phrase model owner. When the feature is loaded, it
+  * queries the weights for the set of features that are active for this grammar, storing them in an
+  * array.
+  * 
+  * @author Matt Post post@cs.jhu.edu
+  * @author Zhifei Li zhifei.work@gmail.com
+  */
+ 
+ public class PhraseModel extends StatelessFF {
+ 
+   /* The owner of the grammar. */
 -  private int ownerID;
 -  private String owner;
++  private final OwnerId ownerID;
++  private final String owner;
+ 
+   private float[] phrase_weights = null;
+ 
+   public PhraseModel(FeatureVector weights, String[] args, JoshuaConfiguration config, Grammar g) {
+     super(weights, "tm_", args, config);
+ 
 -    String owner = parsedArgs.get("owner");
 -    this.name = String.format("tm_%s", owner);
++    // Store the owner and name
++    this.owner = parsedArgs.get("owner");
++    this.ownerID = OwnerMap.register(owner);
++    this.name = String.format("tm_%s", this.owner);
+ 
+     /*
+      * Determine the number of features by querying the example grammar that was passed in.
+      */
+     phrase_weights = new float[g.getNumDenseFeatures()];
 -//    System.err.println(String.format("GOT %d FEATURES FOR %s", g.getNumDenseFeatures(), owner));
+     for (int i = 0; i < phrase_weights.length; i++)
+       phrase_weights[i] = weights.getSparse(String.format("tm_%s_%d", owner, i));
 -
 -    // Store the owner.
 -    this.owner = owner;
 -    this.ownerID = Vocabulary.id(owner);
++    
+   }
+ 
+   /**
+    * Just register a single weight, tm_OWNER, and use that to set its precomputed cost
+    */
+   @Override
+   public ArrayList<String> reportDenseFeatures(int index) {
+     denseFeatureIndex = index;
+ 
+     ArrayList<String> names = new ArrayList<String>();
+     for (int i = 0; i < phrase_weights.length; i++)
+       names.add(String.format("tm_%s_%d", owner, i));
+     return names;
+   }
+ 
+   /**
+    * Estimates the cost of applying this rule, which is just the score of the precomputable feature
+    * functions.
+    */
+   @Override
+   public float estimateCost(final Rule rule, Sentence sentence) {
+ 
 -    if (rule != null && rule.getOwner() == ownerID) {
++    if (rule != null && rule.getOwner().equals(ownerID)) {
+       if (rule.getPrecomputableCost() <= Float.NEGATIVE_INFINITY)
+         rule.setPrecomputableCost(phrase_weights, weights);
+ 
+       return rule.getPrecomputableCost();
+     }
+ 
+     return 0.0f;
+   }
+ 
+   /**
+    * Just chain to computeFeatures(rule), since this feature doesn't use the sourcePath or sentID. *
+    */
+   @Override
+   public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+       Sentence sentence, Accumulator acc) {
+ 
 -    if (rule != null && rule.getOwner() == ownerID) {
++    if (rule != null && rule.getOwner().equals(ownerID)) {
+       /*
+        * Here, we peak at the Accumulator object. If it's asking for scores, then we don't bother to
+        * add each feature, but rather compute the inner product and add *that*. This is totally
+        * cheating; the Accumulator is supposed to be a generic object. But without this cheat
+        */
+       if (rule.getPrecomputableCost() <= Float.NEGATIVE_INFINITY) {
+         // float score = rule.getFeatureVector().innerProduct(weights);
+         rule.setPrecomputableCost(phrase_weights, weights);
+       }
+       
+ //      System.err.println(String.format("RULE = %s / %f", rule.getEnglishWords(), rule.getPrecomputableCost()));
+       for (int k = 0; k < phrase_weights.length; k++) {
+ //        System.err.println(String.format("k = %d, denseFeatureIndex = %d, owner = %s, ownerID = %d", k, denseFeatureIndex, owner, ownerID));
+         acc.add(k + denseFeatureIndex, rule.getDenseFeature(k));
+       }
+       
+       for (String key: rule.getFeatureVector().keySet())
+         acc.add(key, rule.getFeatureVector().getSparse(key));
+     }
+ 
+     return null;
+   }
+ 
+   public String toString() {
 -    return name + " " + Vocabulary.word(ownerID);
++    return name;
+   }
+ }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/1bb8a203/joshua-core/src/main/java/org/apache/joshua/decoder/ff/PhrasePenalty.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/PhrasePenalty.java
index 0000000,3c38e60..9eecd0c
mode 000000,100644..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/PhrasePenalty.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/PhrasePenalty.java
@@@ -1,0 -1,86 +1,87 @@@
+ /*
+  * Licensed to the Apache Software Foundation (ASF) under one
+  * or more contributor license agreements.  See the NOTICE file
+  * distributed with this work for additional information
+  * regarding copyright ownership.  The ASF licenses this file
+  * to you under the Apache License, Version 2.0 (the
+  * "License"); you may not use this file except in compliance
+  * with the License.  You may obtain a copy of the License at
+  *
+  *  http://www.apache.org/licenses/LICENSE-2.0
+  *
+  * Unless required by applicable law or agreed to in writing,
+  * software distributed under the License is distributed on an
+  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  * KIND, either express or implied.  See the License for the
+  * specific language governing permissions and limitations
+  * under the License.
+  */
+ package org.apache.joshua.decoder.ff;
+ 
+ import java.util.ArrayList;
 -import java.util.List;	
++import java.util.List;
+ 
 -import org.apache.joshua.corpus.Vocabulary;
+ import org.apache.joshua.decoder.JoshuaConfiguration;
+ import org.apache.joshua.decoder.chart_parser.SourcePath;
+ import org.apache.joshua.decoder.ff.state_maintenance.DPState;
++import org.apache.joshua.decoder.ff.tm.OwnerId;
++import org.apache.joshua.decoder.ff.tm.OwnerMap;
+ import org.apache.joshua.decoder.ff.tm.Rule;
+ import org.apache.joshua.decoder.hypergraph.HGNode;
+ import org.apache.joshua.decoder.phrase.Hypothesis;
+ import org.apache.joshua.decoder.segment_file.Sentence;
+ 
+ /**
+  *  This feature just counts rules that are used. You can restrict it with a number of flags:
+  * 
+  *   -owner OWNER
+  *    Only count rules owned by OWNER
+  *   -target|-source
+  *    Only count the target or source side (plus the LHS)
+  *
+  * TODO: add an option to separately provide a list of rule counts, restrict to counts above a threshold. 
+  */
+ public class PhrasePenalty extends StatelessFF {
+ 
 -  private int owner = 0;
++  private final OwnerId owner;
+   private float value = 1.0f;
+   
+   public PhrasePenalty(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+     super(weights, "PhrasePenalty", args, config);
+     if (parsedArgs.containsKey("owner"))
 -      this.owner = Vocabulary.id(parsedArgs.get("owner"));
++      this.owner = OwnerMap.register(parsedArgs.get("owner"));
+     else // default
 -      this.owner = Vocabulary.id("pt"); 
++      this.owner = OwnerMap.register("pt"); 
+   }
+ 
+   @Override
+   public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+       Sentence sentence, Accumulator acc) {
+ 
+     if (rule != null && rule != Hypothesis.BEGIN_RULE && rule != Hypothesis.END_RULE 
 -        && (owner == 0 || rule.getOwner() == owner))
++        && (rule.getOwner().equals(owner)))
+       acc.add(denseFeatureIndex, value);
+ 
+     return null;
+   }
+     
+   @Override
+   public ArrayList<String> reportDenseFeatures(int index) {
+     denseFeatureIndex = index;
+     ArrayList<String> names = new ArrayList<String>();
+     names.add(name);
+     return names;
+   }
+   
+   /**
+    * Returns the *weighted* estimate.
+    * 
+    */
+   @Override
+   public float estimateCost(Rule rule, Sentence sentence) {
+     if (rule != null && rule != Hypothesis.BEGIN_RULE && rule != Hypothesis.END_RULE 
 -        && (owner == 0 || rule.getOwner() == owner))
++        && (rule.getOwner().equals(owner)))
+       return weights.getDense(denseFeatureIndex) * value;
+     return 0.0f;
+   }
+ }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/1bb8a203/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleCountBin.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleCountBin.java
index 0000000,5ba0c66..3ffbf65
mode 000000,100644..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleCountBin.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleCountBin.java
@@@ -1,0 -1,74 +1,77 @@@
+ /*
+  * Licensed to the Apache Software Foundation (ASF) under one
+  * or more contributor license agreements.  See the NOTICE file
+  * distributed with this work for additional information
+  * regarding copyright ownership.  The ASF licenses this file
+  * to you under the Apache License, Version 2.0 (the
+  * "License"); you may not use this file except in compliance
+  * with the License.  You may obtain a copy of the License at
+  *
+  *  http://www.apache.org/licenses/LICENSE-2.0
+  *
+  * Unless required by applicable law or agreed to in writing,
+  * software distributed under the License is distributed on an
+  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  * KIND, either express or implied.  See the License for the
+  * specific language governing permissions and limitations
+  * under the License.
+  */
+ package org.apache.joshua.decoder.ff;
+ 
+ import java.util.List;
+ 
 -import org.apache.joshua.corpus.Vocabulary;
+ import org.apache.joshua.decoder.JoshuaConfiguration;
+ import org.apache.joshua.decoder.chart_parser.SourcePath;
+ import org.apache.joshua.decoder.ff.state_maintenance.DPState;
++import org.apache.joshua.decoder.ff.tm.OwnerId;
++import org.apache.joshua.decoder.ff.tm.OwnerMap;
+ import org.apache.joshua.decoder.ff.tm.Rule;
+ import org.apache.joshua.decoder.hypergraph.HGNode;
+ import org.apache.joshua.decoder.segment_file.Sentence;
+ import org.slf4j.Logger;
+ import org.slf4j.LoggerFactory;
+ 
+ /*
+  * This feature computes a bin for the rule and activates a feature for it. It requires access to
+  * the index of the RarityPenalty field, from which the rule count can be computed.
+  */
+ public class RuleCountBin extends StatelessFF {
+ 
+   private static final Logger LOG = LoggerFactory.getLogger(RuleCountBin.class);
+   private int field = -1;
++  private final OwnerId owner;
+ 
+   public RuleCountBin(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+     super(weights, "RuleCountBin", args, config);
++    owner = OwnerMap.register("pt");
+ 
+     field = Integer.parseInt(parsedArgs.get("field"));
+   }
+ 
+   @Override
+   public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+       Sentence sentence, Accumulator acc) {
+ 
 -    if (rule.getOwner() != Vocabulary.id("pt"))
++    if (rule.getOwner().equals(owner))
+       return null;
+     
+     float rarityPenalty = -rule.getFeatureVector().getSparse(String.format("tm_pt_%d", field));
+     int count = (int) (1.0 - Math.log(rarityPenalty));
+ 
+     String feature = "RuleCountBin_inf";
+ 
+     int[] bins = { 1, 2, 4, 8, 16, 32, 64, 128, 1000, 10000 };
+     for (int k : bins) {
+       if (count <= k) {
+         feature = String.format("RuleCountBin_%d", k);
+         break;
+       }
+     }
+ 
+     LOG.debug("RuleCountBin({}) = {} ==> {}", rarityPenalty, count, feature);
+     
+     acc.add(feature, 1.0f);
+ 
+     return null;
+   }
+ }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/1bb8a203/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleFF.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleFF.java
index 0000000,909e481..308d38a
mode 000000,100644..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleFF.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleFF.java
@@@ -1,0 -1,123 +1,126 @@@
+ /*
+  * Licensed to the Apache Software Foundation (ASF) under one
+  * or more contributor license agreements.  See the NOTICE file
+  * distributed with this work for additional information
+  * regarding copyright ownership.  The ASF licenses this file
+  * to you under the Apache License, Version 2.0 (the
+  * "License"); you may not use this file except in compliance
+  * with the License.  You may obtain a copy of the License at
+  *
+  *  http://www.apache.org/licenses/LICENSE-2.0
+  *
+  * Unless required by applicable law or agreed to in writing,
+  * software distributed under the License is distributed on an
+  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  * KIND, either express or implied.  See the License for the
+  * specific language governing permissions and limitations
+  * under the License.
+  */
+ package org.apache.joshua.decoder.ff;
+ 
+ import static com.google.common.cache.CacheBuilder.newBuilder;
++import static org.apache.joshua.decoder.ff.tm.OwnerMap.UNKNOWN_OWNER_ID;
+ 
+ import java.util.List;
+ 
+ import org.apache.joshua.corpus.Vocabulary;
+ import org.apache.joshua.decoder.JoshuaConfiguration;
+ import org.apache.joshua.decoder.chart_parser.SourcePath;
+ import org.apache.joshua.decoder.ff.state_maintenance.DPState;
++import org.apache.joshua.decoder.ff.tm.OwnerId;
++import org.apache.joshua.decoder.ff.tm.OwnerMap;
+ import org.apache.joshua.decoder.ff.tm.Rule;
+ import org.apache.joshua.decoder.hypergraph.HGNode;
+ import org.apache.joshua.decoder.segment_file.Sentence;
+ 
+ import com.google.common.cache.Cache;
+ 
+ /**
+  *  This feature fires for rule ids.
+  *  Firing can be restricted to rules from a certain owner, and rule ids
+  *  can be generated from source side and/or target side. 
+  */
+ public class RuleFF extends StatelessFF {
+ 
+   private enum Sides { SOURCE, TARGET, BOTH };
+   
+   private static final String NAME = "RuleFF";
+   // value to fire for features
+   private static final int VALUE = 1;
+   // whether this feature is restricted to a certain grammar/owner
+   private final boolean ownerRestriction;
+   // the grammar/owner this feature is restricted to fire
 -  private final int owner;
++  private final OwnerId owner;
+   // what part of the rule should be extracted;
+   private final Sides sides;
+   // Strings separating words and rule sides 
+   private static final String SEPARATOR = "~";
+   private static final String SIDES_SEPARATOR = "->";
+   
+   private final Cache<Rule, String> featureCache;
+   
+   public RuleFF(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+     super(weights, NAME, args, config);
+     
+     ownerRestriction = (parsedArgs.containsKey("owner")) ? true : false;
 -    owner = ownerRestriction ? Vocabulary.id(parsedArgs.get("owner")) : 0;
++    owner = ownerRestriction ? OwnerMap.register(parsedArgs.get("owner")) : UNKNOWN_OWNER_ID;
+     
+     if (parsedArgs.containsKey("sides")) {
+       final String sideValue = parsedArgs.get("sides");
+       if (sideValue.equalsIgnoreCase("source")) {
+         sides = Sides.SOURCE;
+       } else if (sideValue.equalsIgnoreCase("target")) {
+         sides = Sides.TARGET;
+       } else if (sideValue.equalsIgnoreCase("both")){
+         sides = Sides.BOTH;
+       } else {
+         throw new RuntimeException("Unknown side value.");
+       }
+     } else {
+       sides = Sides.BOTH;
+     }
+     
+     // initialize cache
+     if (parsedArgs.containsKey("cacheSize")) {
+       featureCache = newBuilder().maximumSize(Integer.parseInt(parsedArgs.get("cacheSize"))).build();
+     } else {
+       featureCache = newBuilder().maximumSize(config.cachedRuleSize).build();
+     }
+   }
+ 
+   @Override
+   public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+       Sentence sentence, Accumulator acc) {
+     
 -    if (ownerRestriction && rule.getOwner() != owner) {
++    if (ownerRestriction && !rule.getOwner().equals(owner)) {
+       return null;
+     }
+ 
+     String featureName = featureCache.getIfPresent(rule);
+     if (featureName == null) {
+       featureName = getRuleString(rule);
+       featureCache.put(rule, featureName);
+     }
+     acc.add(featureName, VALUE);
+     
+     return null;
+   }
+   
+   /**
+    * Obtains the feature id for the given rule.
+    * @param rule
+    * @return String representing the feature name.s
+    */
+   private String getRuleString(final Rule rule) {
+     final StringBuilder sb = new StringBuilder(Vocabulary.word(rule.getLHS()))
+       .append(SIDES_SEPARATOR);
+     if (sides == Sides.SOURCE || sides == Sides.BOTH) {
+       sb.append(Vocabulary.getWords(rule.getFrench(), SEPARATOR));
+     }
+     sb.append(SIDES_SEPARATOR);
+     if (sides == Sides.TARGET || sides == Sides.BOTH) {
+       sb.append(Vocabulary.getWords(rule.getEnglish(), SEPARATOR));
+     }
+     return sb.toString();
+   }
+ }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/1bb8a203/joshua-core/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/FragmentLMFF.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/FragmentLMFF.java
index 0000000,fa4c4af..861cf35
mode 000000,100644..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/FragmentLMFF.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/FragmentLMFF.java
@@@ -1,0 -1,365 +1,368 @@@
+ /*
+  * Licensed to the Apache Software Foundation (ASF) under one
+  * or more contributor license agreements.  See the NOTICE file
+  * distributed with this work for additional information
+  * regarding copyright ownership.  The ASF licenses this file
+  * to you under the Apache License, Version 2.0 (the
+  * "License"); you may not use this file except in compliance
+  * with the License.  You may obtain a copy of the License at
+  *
+  *  http://www.apache.org/licenses/LICENSE-2.0
+  *
+  * Unless required by applicable law or agreed to in writing,
+  * software distributed under the License is distributed on an
+  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  * KIND, either express or implied.  See the License for the
+  * specific language governing permissions and limitations
+  * under the License.
+  */
+ package org.apache.joshua.decoder.ff.fragmentlm;
+ 
+ import java.io.IOException;
+ import java.util.ArrayList;
+ import java.util.Collection;
+ import java.util.Collections;
+ import java.util.HashMap;
+ import java.util.List;
+ import java.util.Stack;
+ 
+ import org.apache.joshua.decoder.JoshuaConfiguration;
+ import org.apache.joshua.decoder.chart_parser.SourcePath;
+ import org.apache.joshua.decoder.ff.FeatureVector;
+ import org.apache.joshua.decoder.ff.StatefulFF;
+ import org.apache.joshua.decoder.ff.state_maintenance.DPState;
++import org.apache.joshua.decoder.ff.tm.OwnerId;
++import org.apache.joshua.decoder.ff.tm.OwnerMap;
+ import org.apache.joshua.decoder.ff.tm.Rule;
+ import org.apache.joshua.decoder.ff.tm.format.HieroFormatReader;
+ import org.apache.joshua.decoder.hypergraph.HGNode;
+ import org.apache.joshua.decoder.hypergraph.HyperEdge;
+ import org.apache.joshua.decoder.segment_file.Sentence;
+ import org.slf4j.Logger;
+ import org.slf4j.LoggerFactory;
+ 
+ /**
+  * <p>Feature function that reads in a list of language model fragments and matches them against the
+  * hypergraph. This allows for language model fragment "glue" features, which fire when LM fragments
+  * (supplied as input) are assembled. These LM fragments are presumably useful in ensuring
+  * grammaticality and can be independent of the translation model fragments.</p>
+  * 
+  * <p>Usage: in the Joshua Configuration file, put</p>
+  * 
+  * <code>feature-function = FragmentLM -lm LM_FRAGMENTS_FILE -map RULE_FRAGMENTS_MAP_FILE</code>
+  * 
+  * <p>LM_FRAGMENTS_FILE is a pointer to a file containing a list of fragments that it should look for.
+  * The format of the file is one fragment per line in PTB format, e.g.:</p>
+  * 
+  * <code>(S NP (VP (VBD said) SBAR) (. .))</code>
+  * 
+  * <p>RULE_FRAGMENTS_MAP_FILE points to a file that maps fragments to the flattened SCFG rule format
+  * that Joshua uses. This mapping is necessary because Joshua's rules have been flattened, meaning
+  * that their internal structure has been removed, yet this structure is needed for matching LM
+  * fragments. The format of the file is</p>
+  * 
+  * <code>FRAGMENT ||| RULE-TARGET-SIDE</code>
+  * 
+  * <p>for example,</p>
+  * 
+  * <code>(S (NP (DT the) (NN man)) VP .) ||| the man [VP,1] [.,2] (SBAR (IN that) (S (NP (PRP he)) (VP
+  * (VBD was) (VB done)))) ||| that he was done (VP (VBD said) SBAR) ||| said SBAR</code>
+  * 
+  * @author Matt Post post@cs.jhu.edu
+  */
+ public class FragmentLMFF extends StatefulFF {
+ 
+   private static final Logger LOG = LoggerFactory.getLogger(FragmentLMFF.class);
+ 
+   /*
+    * When building a fragment from a rule rooted in the hypergraph, this parameter determines how
+    * deep we'll go. Smaller values mean less hypergraph traversal but may also limit the LM
+    * fragments that can be fired.
+    */
+   private int BUILD_DEPTH = 1;
+ 
+   /*
+    * The maximum depth of a fragment, defined as the longest path from the fragment root to any of
+    * its leaves.
+    */
+   private int MAX_DEPTH = 0;
+ 
+   /*
+    * This is the minimum depth for lexicalized LM fragments. This allows you to easily exclude small
+    * depth-one fragments that may be overfit to the training data. A depth of 1 (the default) does
+    * not exclude any fragments.
+    */
+   private int MIN_LEX_DEPTH = 1;
+ 
+   /*
+    * Set to true to activate meta-features.
+    */
+   private boolean OPTS_DEPTH = false;
+ 
+   /*
+    * This contains a list of the language model fragments, indexed by LHS.
+    */
+   private HashMap<String, ArrayList<Tree>> lmFragments = null;
+ 
+   private int numFragments = 0;
+ 
+   /* The location of the file containing the language model fragments */
+   private String fragmentLMFile = "";
+ 
+   /**
+    * @param weights a {@link org.apache.joshua.decoder.ff.FeatureVector} with weights
+    * @param args arguments passed to the feature function
+    * @param config the {@link org.apache.joshua.decoder.JoshuaConfiguration}
+    */
+   public FragmentLMFF(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+     super(weights, "FragmentLMFF", args, config);
+ 
+     lmFragments = new HashMap<String, ArrayList<Tree>>();
+ 
+     fragmentLMFile = parsedArgs.get("lm");
+     BUILD_DEPTH = Integer.parseInt(parsedArgs.get("build-depth"));
+     MAX_DEPTH = Integer.parseInt(parsedArgs.get("max-depth"));
+     MIN_LEX_DEPTH = Integer.parseInt(parsedArgs.get("min-lex-depth"));
+ 
+     /* Read in the language model fragments */
+     try {
+       Collection<Tree> trees = PennTreebankReader.readTrees(fragmentLMFile);
+       for (Tree fragment : trees) {
+         addLMFragment(fragment);
+ 
+         // System.err.println(String.format("Read fragment: %s",
+         // lmFragments.get(lmFragments.size()-1)));
+       }
+     } catch (IOException e) {
+       throw new RuntimeException(String.format("* WARNING: couldn't read fragment LM file '%s'",
+           fragmentLMFile), e);
+     }
+     LOG.info("FragmentLMFF: Read {} LM fragments from '{}'", numFragments, fragmentLMFile);
+   }
+ 
+   /**
+    * Add the provided fragment to the language model, subject to some filtering.
+    * 
+    * @param fragment a {@link org.apache.joshua.decoder.ff.fragmentlm.Tree} fragment
+    */
+   public void addLMFragment(Tree fragment) {
+     if (lmFragments == null)
+       return;
+ 
+     int fragmentDepth = fragment.getDepth();
+ 
+     if (MAX_DEPTH != 0 && fragmentDepth > MAX_DEPTH) {
+       LOG.warn("Skipping fragment {} (depth {} > {})", fragment, fragmentDepth, MAX_DEPTH);
+       return;
+     }
+ 
+     if (MIN_LEX_DEPTH > 1 && fragment.isLexicalized() && fragmentDepth < MIN_LEX_DEPTH) {
+       LOG.warn("Skipping fragment {} (lex depth {} < {})", fragment, fragmentDepth, MIN_LEX_DEPTH);
+       return;
+     }
+ 
+     if (lmFragments.get(fragment.getRule()) == null) {
+       lmFragments.put(fragment.getRule(), new ArrayList<Tree>());
+     }
+     lmFragments.get(fragment.getRule()).add(fragment);
+     numFragments++;
+   }
+   
+   /**
+    * This function computes the features that fire when the current rule is applied. The features
+    * that fire are any LM fragments that match the fragment associated with the current rule. LM
+    * fragments may recurse over the tail nodes, following 1-best backpointers until the fragment
+    * either matches or fails.
+    * 
+    * @param rule {@link org.apache.joshua.decoder.ff.tm.Rule} to be utilized within computation
+    * @param tailNodes {@link java.util.List} of {@link org.apache.joshua.decoder.hypergraph.HGNode} tail nodes
+    * @param i todo
+    * @param j todo
+    * @param sourcePath information about a path taken through the source {@link org.apache.joshua.lattice.Lattice}
+    * @param sentence {@link org.apache.joshua.lattice.Lattice} input
+    * @param acc {@link org.apache.joshua.decoder.ff.FeatureFunction.Accumulator} object permitting generalization of feature computation
+    * @return the new dynamic programming state (null for stateless features)
+    */
+   @Override
+   public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath, 
+       Sentence sentence, Accumulator acc) {
+ 
+     /*
+      * Get the fragment associated with the target side of this rule.
+      * 
+      * This could be done more efficiently. For example, just build the tree fragment once and then
+      * pattern match against it. This would circumvent having to build the tree possibly once every
+      * time you try to apply a rule.
+      */
+     Tree baseTree = Tree.buildTree(rule, tailNodes, BUILD_DEPTH);
+ 
+     Stack<Tree> nodeStack = new Stack<Tree>();
+     nodeStack.add(baseTree);
+     while (!nodeStack.empty()) {
+       Tree tree = nodeStack.pop();
+       if (tree == null)
+         continue;
+ 
+       if (lmFragments.get(tree.getRule()) != null) {
+         for (Tree fragment : lmFragments.get(tree.getRule())) {
+ //           System.err.println(String.format("Does\n  %s match\n  %s??\n  -> %s", fragment, tree,
+ //           match(fragment, tree)));
+ 
+           if (fragment.getLabel() == tree.getLabel() && match(fragment, tree)) {
+ //             System.err.println(String.format("  FIRING: matched %s against %s", fragment, tree));
+             acc.add(fragment.escapedString(), 1);
+             if (OPTS_DEPTH)
+               if (fragment.isLexicalized())
+                 acc.add(String.format("FragmentFF_lexdepth%d", fragment.getDepth()), 1);
+               else
+                 acc.add(String.format("FragmentFF_depth%d", fragment.getDepth()), 1);
+           }
+         }
+       }
+ 
+       // We also need to try matching rules against internal nodes of the fragment corresponding to
+       // this
+       // rule
+       if (tree.getChildren() != null)
+         for (Tree childNode : tree.getChildren()) {
+           if (!childNode.isBoundary())
+             nodeStack.add(childNode);
+         }
+     }
+ 
+     return new FragmentState(baseTree);
+   }
+ 
+   /**
+    * Matches the fragment against the (possibly partially-built) tree. Assumption
+    * 
+    * @param fragment the language model fragment
+    * @param tree the tree to match against (expanded from the hypergraph)
+    * @return
+    */
+   private boolean match(Tree fragment, Tree tree) {
+     // System.err.println(String.format("MATCH(%s,%s)", fragment, tree));
+ 
+     /* Make sure the root labels match. */
+     if (fragment.getLabel() != tree.getLabel()) {
+       return false;
+     }
+ 
+     /* Same number of kids? */
+     List<Tree> fkids = fragment.getChildren();
+     if (fkids.size() > 0) {
+       List<Tree> tkids = tree.getChildren();
+       if (fkids.size() != tkids.size()) {
+         return false;
+       }
+ 
+       /* Do the kids match on all labels? */
+       for (int i = 0; i < fkids.size(); i++)
+         if (fkids.get(i).getLabel() != tkids.get(i).getLabel())
+           return false;
+ 
+       /* Recursive match. */
+       for (int i = 0; i < fkids.size(); i++) {
+         if (!match(fkids.get(i), tkids.get(i)))
+           return false;
+       }
+     }
+ 
+     return true;
+   }
+ 
+   @Override
+   public DPState computeFinal(HGNode tailNodes, int i, int j, SourcePath sourcePath, Sentence sentence,
+       Accumulator acc) {
+     // TODO Auto-generated method stub
+     return null;
+   }
+ 
+   @Override
+   public float estimateFutureCost(Rule rule, DPState state, Sentence sentence) {
+     // TODO Auto-generated method stub
+     return 0;
+   }
+ 
+   @Override
+   public float estimateCost(Rule rule, Sentence sentence) {
+     // TODO Auto-generated method stub
+     return 0;
+   }
+   
+   public static void main(String[] args) {
+     /* Add an LM fragment, then create a dummy multi-level hypergraph to match the fragment against. */
+     // FragmentLMFF fragmentLMFF = new FragmentLMFF(new FeatureVector(), (StateComputer) null, "");
+     FragmentLMFF fragmentLMFF = new FragmentLMFF(new FeatureVector(),
+         new String[] {"-lm", "test/fragments.txt", "-map", "test/mapping.txt"}, null);
+   
+     Tree fragment = Tree.fromString("(S NP (VP (VBD \"said\") SBAR) (. \".\"))");
+   
+     Rule ruleS = new HieroFormatReader()
+         .parseLine("[S] ||| the man [VP,1] [.,2] ||| the man [VP,1] [.,2] ||| 0");
+     Rule ruleVP = new HieroFormatReader()
+         .parseLine("[VP] ||| said [SBAR,1] ||| said [SBAR,1] ||| 0");
+     Rule ruleSBAR = new HieroFormatReader()
+         .parseLine("[SBAR] ||| that he was done ||| that he was done ||| 0");
+     Rule rulePERIOD = new HieroFormatReader().parseLine("[.] ||| . ||| . ||| 0");
+   
 -    ruleS.setOwner(0);
 -    ruleVP.setOwner(0);
 -    ruleSBAR.setOwner(0);
 -    rulePERIOD.setOwner(0);
++    final OwnerId owner = OwnerMap.register("0");
++    ruleS.setOwner(owner);
++    ruleVP.setOwner(owner);
++    ruleSBAR.setOwner(owner);
++    rulePERIOD.setOwner(owner);
+   
+     HyperEdge edgeSBAR = new HyperEdge(ruleSBAR, 0.0f, 0.0f, null, (SourcePath) null);
+   
+     HGNode nodeSBAR = new HGNode(3, 7, ruleSBAR.getLHS(), null, edgeSBAR, 0.0f);
+     ArrayList<HGNode> tailNodesVP = new ArrayList<HGNode>();
+     Collections.addAll(tailNodesVP, nodeSBAR);
+     HyperEdge edgeVP = new HyperEdge(ruleVP, 0.0f, 0.0f, tailNodesVP, (SourcePath) null);
+     HGNode nodeVP = new HGNode(2, 7, ruleVP.getLHS(), null, edgeVP, 0.0f);
+   
+     HyperEdge edgePERIOD = new HyperEdge(rulePERIOD, 0.0f, 0.0f, null, (SourcePath) null);
+     HGNode nodePERIOD = new HGNode(7, 8, rulePERIOD.getLHS(), null, edgePERIOD, 0.0f);
+   
+     ArrayList<HGNode> tailNodes = new ArrayList<HGNode>();
+     Collections.addAll(tailNodes, nodeVP, nodePERIOD);
+   
+     Tree tree = Tree.buildTree(ruleS, tailNodes, 1);
+     boolean matched = fragmentLMFF.match(fragment, tree);
+     LOG.info("Does\n  {} match\n  {}??\n  -> {}", fragment, tree, matched);
+   }
+ 
+   /**
+    * Maintains a state pointer used by KenLM to implement left-state minimization. 
+    * 
+    * @author Matt Post post@cs.jhu.edu
+    * @author Juri Ganitkevitch juri@cs.jhu.edu
+    */
+   public class FragmentState extends DPState {
+ 
+     private Tree tree = null;
+ 
+     public FragmentState(Tree tree) {
+       this.tree = tree;
+     }
+ 
+     /**
+      * Every tree is unique.
+      * 
+      * Some savings could be had here if we grouped together items with the same string.
+      */
+     @Override
+     public int hashCode() {
+       return tree.hashCode();
+     }
+ 
+     @Override
+     public boolean equals(Object other) {
+       return (other instanceof FragmentState && this == other);
+     }
+ 
+     @Override
+     public String toString() {
+       return String.format("[FragmentState %s]", tree);
+     }
+   }
+ }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/1bb8a203/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/LanguageModelFF.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/LanguageModelFF.java
index 0000000,3fea410..9388ed7
mode 000000,100644..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/LanguageModelFF.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/LanguageModelFF.java
@@@ -1,0 -1,527 +1,495 @@@
+ /*
+  * Licensed to the Apache Software Foundation (ASF) under one
+  * or more contributor license agreements.  See the NOTICE file
+  * distributed with this work for additional information
+  * regarding copyright ownership.  The ASF licenses this file
+  * to you under the Apache License, Version 2.0 (the
+  * "License"); you may not use this file except in compliance
+  * with the License.  You may obtain a copy of the License at
+  *
+  *  http://www.apache.org/licenses/LICENSE-2.0
+  *
+  * Unless required by applicable law or agreed to in writing,
+  * software distributed under the License is distributed on an
+  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  * KIND, either express or implied.  See the License for the
+  * specific language governing permissions and limitations
+  * under the License.
+  */
+ package org.apache.joshua.decoder.ff.lm;
+ 
+ import java.io.IOException;
+ import java.util.ArrayList;
+ import java.util.Arrays;
 -import java.util.HashMap;
+ import java.util.LinkedList;
+ import java.util.List;
+ 
 -import com.google.common.primitives.Ints;
 -
+ import org.apache.joshua.corpus.Vocabulary;
+ import org.apache.joshua.decoder.JoshuaConfiguration;
+ import org.apache.joshua.decoder.Support;
+ import org.apache.joshua.decoder.chart_parser.SourcePath;
+ import org.apache.joshua.decoder.ff.FeatureVector;
+ import org.apache.joshua.decoder.ff.StatefulFF;
+ import org.apache.joshua.decoder.ff.lm.berkeley_lm.LMGrammarBerkeley;
 -import org.apache.joshua.decoder.ff.lm.KenLM;
+ import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+ import org.apache.joshua.decoder.ff.state_maintenance.NgramDPState;
+ import org.apache.joshua.decoder.ff.tm.Rule;
+ import org.apache.joshua.decoder.hypergraph.HGNode;
+ import org.apache.joshua.decoder.segment_file.Sentence;
+ import org.apache.joshua.util.FormatUtils;
+ import org.slf4j.Logger;
+ import org.slf4j.LoggerFactory;
+ 
++import com.google.common.annotations.VisibleForTesting;
++import com.google.common.primitives.Ints;
++
+ /**
+  * This class performs the following:
+  * <ol>
+  * <li>Gets the additional LM score due to combinations of small items into larger ones by using
+  * rules</li>
+  * <li>Gets the LM state</li>
+  * <li>Gets the left-side LM state estimation score</li>
+  * </ol>
 - * 
++ *
+  * @author Matt Post post@cs.jhu.edu
+  * @author Juri Ganitkevitch juri@cs.jhu.edu
+  * @author Zhifei Li, zhifei.work@gmail.com
+  */
+ public class LanguageModelFF extends StatefulFF {
+ 
 -  private static final Logger LOG = LoggerFactory.getLogger(LanguageModelFF.class);
++  static final Logger LOG = LoggerFactory.getLogger(LanguageModelFF.class);
+ 
+   public static int LM_INDEX = 0;
+   private int startSymbolId;
+ 
+   /**
+    * N-gram language model. We assume the language model is in ARPA format for equivalent state:
 -   * 
++   *
+    * <ol>
+    * <li>We assume it is a backoff lm, and high-order ngram implies low-order ngram; absense of
+    * low-order ngram implies high-order ngram</li>
+    * <li>For a ngram, existence of backoffweight =&gt; existence a probability Two ways of dealing with
+    * low counts:
+    * <ul>
+    * <li>SRILM: don't multiply zeros in for unknown words</li>
+    * <li>Pharaoh: cap at a minimum score exp(-10), including unknown words</li>
+    * </ul>
+    * </li>
+    * </ol>
+    */
+   protected NGramLanguageModel languageModel;
+ 
+   /**
+    * We always use this order of ngram, though the LMGrammar may provide higher order probability.
+    */
+   protected final int ngramOrder;
+ 
+   /*
+    * We cache the weight of the feature since there is only one.
+    */
+   protected float weight;
+   protected String type;
+   protected String path;
+ 
+   /* Whether this is a class-based LM */
 -  private boolean isClassLM;
++  protected boolean isClassLM;
+   private ClassMap classMap;
+ 
 -  protected class ClassMap {
 -
 -    private final int OOV_id = Vocabulary.getUnknownId();
 -    private HashMap<Integer, Integer> classMap;
 -
 -    public ClassMap(String file_name) throws IOException {
 -      this.classMap = new HashMap<Integer, Integer>();
 -      read(file_name);
 -    }
 -
 -    public int getClassID(int wordID) {
 -      return this.classMap.getOrDefault(wordID, OOV_id);
 -    }
 -
 -    /**
 -     * Reads a class map from file.
 -     * 
 -     * @param file_name
 -     * @throws IOException
 -     */
 -    private void read(String file_name) throws IOException {
 -
 -      int lineno = 0;
 -      for (String line: new org.apache.joshua.util.io.LineReader(file_name, false)) {
 -        lineno++;
 -        String[] lineComp = line.trim().split("\\s+");
 -        try {
 -          this.classMap.put(Vocabulary.id(lineComp[0]), Vocabulary.id(lineComp[1]));
 -        } catch (java.lang.ArrayIndexOutOfBoundsException e) {
 -          LOG.warn("bad vocab line #{} '{}'", lineno, line);
 -          LOG.warn(e.getMessage(), e);
 -        }
 -      }
 -    }
 -
 -  }
 -
+   public LanguageModelFF(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+     super(weights, String.format("lm_%d", LanguageModelFF.LM_INDEX++), args, config);
+ 
+     this.type = parsedArgs.get("lm_type");
 -    this.ngramOrder = Integer.parseInt(parsedArgs.get("lm_order")); 
++    this.ngramOrder = Integer.parseInt(parsedArgs.get("lm_order"));
+     this.path = parsedArgs.get("lm_file");
+ 
 -    if (parsedArgs.containsKey("class_map"))
 -      try {
 -        this.isClassLM = true;
 -        this.classMap = new ClassMap(parsedArgs.get("class_map"));
 -      } catch (IOException e) {
 -        // TODO Auto-generated catch block
 -        e.printStackTrace();
 -      }
++    if (parsedArgs.containsKey("class_map")) {
++      this.isClassLM = true;
++      this.classMap = new ClassMap(parsedArgs.get("class_map"));
++    }
+ 
+     // The dense feature initialization hasn't happened yet, so we have to retrieve this as sparse
+     this.weight = weights.getSparse(name);
+ 
+     initializeLM();
+   }
+ 
+   @Override
+   public ArrayList<String> reportDenseFeatures(int index) {
+     denseFeatureIndex = index;
+ 
 -    ArrayList<String> names = new ArrayList<String>();
++    final ArrayList<String> names = new ArrayList<String>(1);
+     names.add(name);
+     return names;
+   }
+ 
+   /**
+    * Initializes the underlying language model.
+    */
+   protected void initializeLM() {
+     if (type.equals("kenlm")) {
+       this.languageModel = new KenLM(ngramOrder, path);
+ 
+     } else if (type.equals("berkeleylm")) {
+       this.languageModel = new LMGrammarBerkeley(ngramOrder, path);
+ 
+     } else {
+       String msg = String.format("* FATAL: Invalid backend lm_type '%s' for LanguageModel", type)
+           + "*        Permissible values for 'lm_type' are 'kenlm' and 'berkeleylm'";
+       throw new RuntimeException(msg);
+     }
+ 
+     Vocabulary.registerLanguageModel(this.languageModel);
+     Vocabulary.id(config.default_non_terminal);
+ 
+     startSymbolId = Vocabulary.id(Vocabulary.START_SYM);
+   }
+ 
+   public NGramLanguageModel getLM() {
+     return this.languageModel;
+   }
+ 
++  public boolean isClassLM() {
++  	return this.isClassLM;
++  }
++
+   public String logString() {
 -    if (languageModel != null)
 -      return String.format("%s, order %d (weight %.3f)", name, languageModel.getOrder(), weight);
 -    else
 -      return "WHOA";
++    return String.format("%s, order %d (weight %.3f), classLm=%s", name, languageModel.getOrder(), weight, isClassLM);
+   }
+ 
+   /**
 -   * Computes the features incurred along this edge. Note that these features are unweighted costs
 -   * of the feature; they are the feature cost, not the model cost, or the inner product of them.
++   * Computes the features incurred along this edge. Note that these features
++   * are unweighted costs of the feature; they are the feature cost, not the
++   * model cost, or the inner product of them.
+    */
+   @Override
 -  public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
 -      Sentence sentence, Accumulator acc) {
 -
 -    NgramDPState newState = null;
 -    if (rule != null) {
 -      if (config.source_annotations) {
 -        // Get source side annotations and project them to the target side
 -        newState = computeTransition(getTags(rule, i, j, sentence), tailNodes, acc);
 -      }
 -      else {
 -        if (this.isClassLM) {
 -          // Use a class language model
 -          // Return target side classes
 -          newState = computeTransition(getClasses(rule), tailNodes, acc);
 -        }
 -        else {
 -          // Default LM 
 -          newState = computeTransition(rule.getEnglish(), tailNodes, acc);
 -        }
 -      }
++  public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j,
++    SourcePath sourcePath, Sentence sentence, Accumulator acc) {
+ 
++    if (rule == null) {
++      return null;
+     }
+ 
 -    return newState;
++    int[] words;
++    if (config.source_annotations) {
++      // get source side annotations and project them to the target side
++      words = getTags(rule, i, j, sentence);
++    } else {
++      words = getRuleIds(rule);
++    }
++
++    return computeTransition(words, tailNodes, acc);
++
++	}
++
++  /**
++   * Retrieve ids from rule. These are either simply the rule ids on the target
++   * side, their corresponding class map ids, or the configured source-side
++   * annotation tags.
++   */
++  @VisibleForTesting
++  public int[] getRuleIds(final Rule rule) {
++    if (this.isClassLM) {
++      // map words to class ids
++      return getClasses(rule);
++    }
++    // Regular LM: use rule word ids
++    return rule.getEnglish();
+   }
+ 
+   /**
+    * Input sentences can be tagged with information specific to the language model. This looks for
+    * such annotations by following a word's alignments back to the source words, checking for
+    * annotations, and replacing the surface word if such annotations are found.
+    * @param rule the {@link org.apache.joshua.decoder.ff.tm.Rule} to use
+    * @param begin todo
+    * @param end todo
+    * @param sentence {@link org.apache.joshua.lattice.Lattice} input
+    * @return todo
+    */
+   protected int[] getTags(Rule rule, int begin, int end, Sentence sentence) {
+     /* Very important to make a copy here, so the original rule is not modified */
+     int[] tokens = Arrays.copyOf(rule.getEnglish(), rule.getEnglish().length);
+     byte[] alignments = rule.getAlignment();
+ 
+     //    System.err.println(String.format("getTags() %s", rule.getRuleString()));
+ 
+     /* For each target-side token, project it to each of its source-language alignments. If any of those
+      * are annotated, take the first annotation and quit.
+      */
+     if (alignments != null) {
+       for (int i = 0; i < tokens.length; i++) {
+         if (tokens[i] > 0) { // skip nonterminals
+           for (int j = 0; j < alignments.length; j += 2) {
+             if (alignments[j] == i) {
+               String annotation = sentence.getAnnotation((int)alignments[i] + begin, "class");
+               if (annotation != null) {
 -                //                System.err.println(String.format("  word %d source %d abs %d annotation %d/%s", 
++                //                System.err.println(String.format("  word %d source %d abs %d annotation %d/%s",
+                 //                    i, alignments[i], alignments[i] + begin, annotation, Vocabulary.word(annotation)));
+                 tokens[i] = Vocabulary.id(annotation);
+                 break;
+               }
+             }
+           }
+         }
+       }
+     }
+ 
+     return tokens;
+   }
+ 
 -  /** 
 -   * Sets the class map if this is a class LM 
++  /**
++   * Sets the class map if this is a class LM
+    * @param fileName a string path to a file
+    * @throws IOException if there is an error reading the input file
+    */
+   public void setClassMap(String fileName) throws IOException {
+     this.classMap = new ClassMap(fileName);
+   }
+ 
+   /**
+    * Replace each word in a rule with the target side classes.
+    * @param rule {@link org.apache.joshua.decoder.ff.tm.Rule} to use when obtaining tokens
+    * @return int[] of tokens
+    */
+   protected int[] getClasses(Rule rule) {
+     if (this.classMap == null) {
+       throw new RuntimeException("The class map is not set. Cannot use the class LM ");
+     }
+     /* Very important to make a copy here, so the original rule is not modified */
+     int[] tokens = Arrays.copyOf(rule.getEnglish(), rule.getEnglish().length);
+     for (int i = 0; i < tokens.length; i++) {
+       if (tokens[i] > 0 ) {
+         tokens[i] = this.classMap.getClassID(tokens[i]);
+       }
+     }
+     return tokens;
+   }
+ 
+   @Override
+   public DPState computeFinal(HGNode tailNode, int i, int j, SourcePath sourcePath, Sentence sentence,
+       Accumulator acc) {
+     return computeFinalTransition((NgramDPState) tailNode.getDPState(stateIndex), acc);
+   }
+ 
+   /**
+    * This function computes all the complete n-grams found in the rule, as well as the incomplete
+    * n-grams on the left-hand side.
+    */
+   @Override
+   public float estimateCost(Rule rule, Sentence sentence) {
+ 
+     float estimate = 0.0f;
+     boolean considerIncompleteNgrams = true;
+ 
 -    int[] enWords = rule.getEnglish();
++    int[] enWords = getRuleIds(rule);
+ 
+     List<Integer> words = new ArrayList<Integer>();
+     boolean skipStart = (enWords[0] == startSymbolId);
+ 
+     /*
+      * Move through the words, accumulating language model costs each time we have an n-gram (n >=
+      * 2), and resetting the series of words when we hit a nonterminal.
+      */
+     for (int c = 0; c < enWords.length; c++) {
+       int currentWord = enWords[c];
+       if (FormatUtils.isNonterminal(currentWord)) {
+         estimate += scoreChunkLogP(words, considerIncompleteNgrams, skipStart);
+         words.clear();
+         skipStart = false;
+       } else {
+         words.add(currentWord);
+       }
+     }
+     estimate += scoreChunkLogP(words, considerIncompleteNgrams, skipStart);
+ 
+     return weight * estimate;
+   }
+ 
+   /**
+    * Estimates the future cost of a rule. For the language model feature, this is the sum of the
+    * costs of the leftmost k-grams, k = [1..n-1].
+    */
+   @Override
+   public float estimateFutureCost(Rule rule, DPState currentState, Sentence sentence) {
+     NgramDPState state = (NgramDPState) currentState;
+ 
+     float estimate = 0.0f;
+     int[] leftContext = state.getLeftLMStateWords();
+ 
+     if (null != leftContext) {
+       boolean skipStart = true;
+       if (leftContext[0] != startSymbolId) {
+         skipStart = false;
+       }
+       estimate += scoreChunkLogP(leftContext, true, skipStart);
+     }
+     return weight * estimate;
+   }
+ 
+   /**
+    * Compute the cost of a rule application. The cost of applying a rule is computed by determining
+    * the n-gram costs for all n-grams created by this rule application, and summing them. N-grams
+    * are created when (a) terminal words in the rule string are followed by a nonterminal (b)
+    * terminal words in the rule string are preceded by a nonterminal (c) we encounter adjacent
+    * nonterminals. In all of these situations, the corresponding boundary words of the node in the
+    * hypergraph represented by the nonterminal must be retrieved.
 -   * 
++   *
+    * IMPORTANT: only complete n-grams are scored. This means that hypotheses with fewer words
+    * than the complete n-gram state remain *unscored*. This fact adds a lot of complication to the
+    * code, including the use of the computeFinal* family of functions, which correct this fact for
+    * sentences that are too short on the final transition.
+    */
+   private NgramDPState computeTransition(int[] enWords, List<HGNode> tailNodes, Accumulator acc) {
+ 
+     int[] current = new int[this.ngramOrder];
+     int[] shadow = new int[this.ngramOrder];
+     int ccount = 0;
+     float transitionLogP = 0.0f;
+     int[] left_context = null;
+ 
+     for (int c = 0; c < enWords.length; c++) {
+       int curID = enWords[c];
+ 
+       if (FormatUtils.isNonterminal(curID)) {
+         int index = -(curID + 1);
+ 
+         NgramDPState state = (NgramDPState) tailNodes.get(index).getDPState(stateIndex);
+         int[] left = state.getLeftLMStateWords();
+         int[] right = state.getRightLMStateWords();
+ 
+         // Left context.
+         for (int i = 0; i < left.length; i++) {
+           current[ccount++] = left[i];
+ 
+           if (left_context == null && ccount == this.ngramOrder - 1)
+             left_context = Arrays.copyOf(current, ccount);
+ 
+           if (ccount == this.ngramOrder) {
+             // Compute the current word probability, and remove it.
+             float prob = this.languageModel.ngramLogProbability(current, this.ngramOrder);
+             //            System.err.println(String.format("-> prob(%s) = %f", Vocabulary.getWords(current), prob));
+             transitionLogP += prob;
+             System.arraycopy(current, 1, shadow, 0, this.ngramOrder - 1);
+             int[] tmp = current;
+             current = shadow;
+             shadow = tmp;
+             --ccount;
+           }
+         }
+         System.arraycopy(right, 0, current, ccount - right.length, right.length);
+       } else { // terminal words
+         current[ccount++] = curID;
+ 
+         if (left_context == null && ccount == this.ngramOrder - 1)
+           left_context = Arrays.copyOf(current, ccount);
+ 
+         if (ccount == this.ngramOrder) {
+           // Compute the current word probability, and remove it.s
+           float prob = this.languageModel.ngramLogProbability(current, this.ngramOrder);
+           //          System.err.println(String.format("-> prob(%s) = %f", Vocabulary.getWords(current), prob));
+           transitionLogP += prob;
+           System.arraycopy(current, 1, shadow, 0, this.ngramOrder - 1);
+           int[] tmp = current;
+           current = shadow;
+           shadow = tmp;
+           --ccount;
+         }
+       }
+     }
+     //    acc.add(name, transitionLogP);
+     acc.add(denseFeatureIndex, transitionLogP);
+ 
+     if (left_context != null) {
+       return new NgramDPState(left_context, Arrays.copyOfRange(current, ccount - this.ngramOrder
+           + 1, ccount));
+     } else {
+       int[] context = Arrays.copyOf(current, ccount);
+       return new NgramDPState(context, context);
+     }
+   }
+ 
+   /**
+    * This function differs from regular transitions because we incorporate the cost of incomplete
+    * left-hand ngrams, as well as including the start- and end-of-sentence markers (if they were
+    * requested when the object was created).
 -   * 
++   *
+    * @param state the dynamic programming state
+    * @return the final transition probability (including incomplete n-grams)
+    */
+   private NgramDPState computeFinalTransition(NgramDPState state, Accumulator acc) {
+ 
+     //    System.err.println(String.format("LanguageModel::computeFinalTransition()"));
+ 
+     float res = 0.0f;
+     LinkedList<Integer> currentNgram = new LinkedList<Integer>();
+     int[] leftContext = state.getLeftLMStateWords();
+     int[] rightContext = state.getRightLMStateWords();
+ 
+     for (int i = 0; i < leftContext.length; i++) {
+       int t = leftContext[i];
+       currentNgram.add(t);
+ 
+       if (currentNgram.size() >= 2) { // start from bigram
+         float prob = this.languageModel.ngramLogProbability(Support.toArray(currentNgram),
+             currentNgram.size());
+         res += prob;
+       }
+       if (currentNgram.size() == this.ngramOrder)
+         currentNgram.removeFirst();
+     }
+ 
+     // Tell the accumulator
+     //    acc.add(name, res);
+     acc.add(denseFeatureIndex, res);
+ 
+     // State is the same
+     return new NgramDPState(leftContext, rightContext);
+   }
+ 
+ 
+   /**
+    * Compatibility method for {@link #scoreChunkLogP(int[], boolean, boolean)}
+    */
+   private float scoreChunkLogP(List<Integer> words, boolean considerIncompleteNgrams,
+       boolean skipStart) {
+     return scoreChunkLogP(Ints.toArray(words), considerIncompleteNgrams, skipStart);
+   }
+ 
+   /**
+    * This function is basically a wrapper for NGramLanguageModel::sentenceLogProbability(). It
+    * computes the probability of a phrase ("chunk"), using lower-order n-grams for the first n-1
+    * words.
 -   * 
++   *
+    * @param words
+    * @param considerIncompleteNgrams
+    * @param skipStart
+    * @return the phrase log probability
+    */
+   private float scoreChunkLogP(int[] words, boolean considerIncompleteNgrams,
+       boolean skipStart) {
+ 
+     float score = 0.0f;
+     if (words.length > 0) {
+       int startIndex;
+       if (!considerIncompleteNgrams) {
+         startIndex = this.ngramOrder;
+       } else if (skipStart) {
+         startIndex = 2;
+       } else {
+         startIndex = 1;
+       }
+       score = this.languageModel.sentenceLogProbability(words, this.ngramOrder, startIndex);
+     }
+ 
+     return score;
+   }
+ 
+   /**
+    * Public method to set LM_INDEX back to 0.
+    * Required if multiple instances of the JoshuaDecoder live in the same JVM.
+    */
+   public static void resetLmIndex() {
+     LM_INDEX = 0;
+   }
+ }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/1bb8a203/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java
index 0000000,533365c..88dc647
mode 000000,100644..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java
@@@ -1,0 -1,202 +1,193 @@@
+ /*
+  * Licensed to the Apache Software Foundation (ASF) under one
+  * or more contributor license agreements.  See the NOTICE file
+  * distributed with this work for additional information
+  * regarding copyright ownership.  The ASF licenses this file
+  * to you under the Apache License, Version 2.0 (the
+  * "License"); you may not use this file except in compliance
+  * with the License.  You may obtain a copy of the License at
+  *
+  *  http://www.apache.org/licenses/LICENSE-2.0
+  *
+  * Unless required by applicable law or agreed to in writing,
+  * software distributed under the License is distributed on an
+  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  * KIND, either express or implied.  See the License for the
+  * specific language governing permissions and limitations
+  * under the License.
+  */
+ package org.apache.joshua.decoder.ff.lm;
+ 
 -import java.util.ArrayList;
++import static org.apache.joshua.util.FormatUtils.isNonterminal;
++
+ import java.util.List;
+ import java.util.concurrent.ConcurrentHashMap;
+ 
+ import org.apache.joshua.corpus.Vocabulary;
+ import org.apache.joshua.decoder.JoshuaConfiguration;
+ import org.apache.joshua.decoder.chart_parser.SourcePath;
+ import org.apache.joshua.decoder.ff.FeatureVector;
 -import org.apache.joshua.decoder.ff.lm.KenLM;
+ import org.apache.joshua.decoder.ff.lm.KenLM.StateProbPair;
+ import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+ import org.apache.joshua.decoder.ff.state_maintenance.KenLMState;
+ import org.apache.joshua.decoder.ff.tm.Rule;
+ import org.apache.joshua.decoder.hypergraph.HGNode;
+ import org.apache.joshua.decoder.segment_file.Sentence;
 -import org.apache.joshua.util.FormatUtils;
+ 
+ /**
+  * Wrapper for KenLM LMs with left-state minimization. We inherit from the regular
 - * 
++ *
+  * @author Matt Post post@cs.jhu.edu
+  * @author Juri Ganitkevitch juri@cs.jhu.edu
+  */
+ public class StateMinimizingLanguageModel extends LanguageModelFF {
+ 
+   // maps from sentence numbers to KenLM-side pools used to allocate state
+   private static final ConcurrentHashMap<Integer, Long> poolMap = new ConcurrentHashMap<Integer, Long>();
+ 
+   public StateMinimizingLanguageModel(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+     super(weights, args, config);
+     this.type = "kenlm";
+     if (parsedArgs.containsKey("lm_type") && ! parsedArgs.get("lm_type").equals("kenlm")) {
+       String msg = "* FATAL: StateMinimizingLanguageModel only supports 'kenlm' lm_type backend"
+           + "*        Remove lm_type from line or set to 'kenlm'";
+       throw new RuntimeException(msg);
+     }
+   }
 -  
 -  @Override
 -  public ArrayList<String> reportDenseFeatures(int index) {
 -    denseFeatureIndex = index;
 -    
 -    ArrayList<String> names = new ArrayList<String>();
 -    names.add(name);
 -    return names;
 -  }
+ 
+   /**
+    * Initializes the underlying language model.
+    */
+   @Override
+   public void initializeLM() {
 -    
++
+     // Override type (only KenLM supports left-state minimization)
+     this.languageModel = new KenLM(ngramOrder, path);
+ 
+     Vocabulary.registerLanguageModel(this.languageModel);
+     Vocabulary.id(config.default_non_terminal);
 -    
++
+   }
 -  
++
+   /**
 -   * Estimates the cost of a rule. We override here since KenLM can do it more efficiently
 -   * than the default {@link LanguageModelFF} class.
 -   *    
 -   * Most of this function implementation is redundant with compute().
++   * Estimates the cost of a rule. We override here since KenLM can do it more
++   * efficiently than the default {@link LanguageModelFF} class.
+    */
+   @Override
+   public float estimateCost(Rule rule, Sentence sentence) {
 -    
 -    int[] ruleWords = rule.getEnglish();
 -
 -    // The IDs we'll pass to KenLM
 -    long[] words = new long[ruleWords.length];
+ 
 -    for (int x = 0; x < ruleWords.length; x++) {
 -      int id = ruleWords[x];
++    int[] ruleWords = getRuleIds(rule);
+ 
 -      if (FormatUtils.isNonterminal(id)) {
 -        // For the estimate, we can just mark negative values
 -        words[x] = -1;
++    // map to ken lm ids
++    final long[] words = mapToKenLmIds(ruleWords, null, true);
+ 
 -      } else {
 -        // Terminal: just add it
 -        words[x] = id;
 -      }
 -    }
 -    
+     // Get the probability of applying the rule and the new state
+     return weight * ((KenLM) languageModel).estimateRule(words);
+   }
 -  
++
+   /**
+    * Computes the features incurred along this edge. Note that these features are unweighted costs
+    * of the feature; they are the feature cost, not the model cost, or the inner product of them.
+    */
+   @Override
+   public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+       Sentence sentence, Accumulator acc) {
+ 
 -    int[] ruleWords = config.source_annotations 
 -        ? getTags(rule, i, j, sentence)
 -        : rule.getEnglish();
 -
 -    // The IDs we'll pass to KenLM
 -    long[] words = new long[ruleWords.length];
++    if (rule == null ) {
++      return null;
++    }
+ 
 -    for (int x = 0; x < ruleWords.length; x++) {
 -      int id = ruleWords[x];
++    int[] ruleWords;
++    if (config.source_annotations) {
++      // get source side annotations and project them to the target side
++      ruleWords = getTags(rule, i, j, sentence);
++    } else {
++      ruleWords = getRuleIds(rule);
++    }
+ 
 -      if (FormatUtils.isNonterminal(id)) {
 -        // Nonterminal: retrieve the KenLM long that records the state
 -        int index = -(id + 1);
 -        KenLMState state = (KenLMState) tailNodes.get(index).getDPState(stateIndex);
 -        words[x] = -state.getState();
++     // map to ken lm ids
++    final long[] words = mapToKenLmIds(ruleWords, tailNodes, false);
+ 
 -      } else {
 -        // Terminal: just add it
 -        words[x] = id;
 -      }
 -    }
 -    
 -    int sentID = sentence.id();
++    final int sentID = sentence.id();
+     // Since sentId is unique across threads, next operations are safe, but not atomic!
+     if (!poolMap.containsKey(sentID)) {
+       poolMap.put(sentID, KenLM.createPool());
+     }
+ 
+     // Get the probability of applying the rule and the new state
 -    StateProbPair pair = ((KenLM) languageModel).probRule(words, poolMap.get(sentID));
++    final StateProbPair pair = ((KenLM) languageModel).probRule(words, poolMap.get(sentID));
+ 
+     // Record the prob
 -//    acc.add(name, pair.prob);
+     acc.add(denseFeatureIndex, pair.prob);
+ 
+     // Return the state
+     return pair.state;
+   }
+ 
+   /**
++   * Maps given array of word/class ids to KenLM ids. For estimating cost and computing,
++   * state retrieval differs slightly.
++   */
++  private long[] mapToKenLmIds(int[] ids, List<HGNode> tailNodes, boolean isOnlyEstimate) {
++    // The IDs we will to KenLM
++    long[] kenIds = new long[ids.length];
++    for (int x = 0; x < ids.length; x++) {
++      int id = ids[x];
++
++      if (isNonterminal(id)) {
++
++        if (isOnlyEstimate) {
++          // For the estimate, we can just mark negative values
++          kenIds[x] = -1;
++        } else {
++          // Nonterminal: retrieve the KenLM long that records the state
++          int index = -(id + 1);
++          final KenLMState state = (KenLMState) tailNodes.get(index).getDPState(stateIndex);
++          kenIds[x] = -state.getState();
++        }
++
++      } else {
++        // Terminal: just add it
++        kenIds[x] = id;
++      }
++    }
++    return kenIds;
++  }
++
++  /**
+    * Destroys the pool created to allocate state for this sentence. Called from the
+    * {@link org.apache.joshua.decoder.Translation} class after outputting the sentence or k-best list. Hosting
+    * this map here in KenLMFF statically allows pools to be shared across KenLM instances.
 -   * 
++   *
+    * @param sentId a key in the poolmap table to destroy
+    */
+   public void destroyPool(int sentId) {
+     if (poolMap.containsKey(sentId))
+       KenLM.destroyPool(poolMap.get(sentId));
+     poolMap.remove(sentId);
+   }
+ 
+   /**
+    * This function differs from regular transitions because we incorporate the cost of incomplete
+    * left-hand ngrams, as well as including the start- and end-of-sentence markers (if they were
+    * requested when the object was created).
 -   * 
++   *
+    * KenLM already includes the prefix probabilities (of shorter n-grams on the left-hand side), so
+    * there's nothing that needs to be done.
+    */
+   @Override
+   public DPState computeFinal(HGNode tailNode, int i, int j, SourcePath sourcePath, Sentence sentence,
+       Accumulator acc) {
 -
 -    // KenLMState state = (KenLMState) tailNode.getDPState(getStateIndex());
 -
 -    // This is unnecessary
 -    // acc.add(name, 0.0f);
 -
+     // The state is the same since no rule was applied
+     return new KenLMState();
+   }
+ 
+   /**
+    * KenLM probs already include the prefix probabilities (they are substracted out when merging
+    * states), so this doesn't need to do anything.
+    */
+   @Override
+   public float estimateFutureCost(Rule rule, DPState currentState, Sentence sentence) {
+     return 0.0f;
+   }
+ }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/1bb8a203/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/AbstractGrammar.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/AbstractGrammar.java
index 0000000,6c512bd..bc66331
mode 000000,100644..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/AbstractGrammar.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/AbstractGrammar.java
@@@ -1,0 -1,228 +1,229 @@@
+ /*
+  * Licensed to the Apache Software Foundation (ASF) under one
+  * or more contributor license agreements.  See the NOTICE file
+  * distributed with this work for additional information
+  * regarding copyright ownership.  The ASF licenses this file
+  * to you under the Apache License, Version 2.0 (the
+  * "License"); you may not use this file except in compliance
+  * with the License.  You may obtain a copy of the License at
+  *
+  *  http://www.apache.org/licenses/LICENSE-2.0
+  *
+  * Unless required by applicable law or agreed to in writing,
+  * software distributed under the License is distributed on an
+  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  * KIND, either express or implied.  See the License for the
+  * specific language governing permissions and limitations
+  * under the License.
+  */
+ package org.apache.joshua.decoder.ff.tm;
+ 
+ import java.util.Arrays;
+ import java.util.HashSet;
+ import java.util.List;
+ 
+ import org.apache.joshua.corpus.Vocabulary;
+ import org.apache.joshua.decoder.JoshuaConfiguration;
+ import org.apache.joshua.decoder.ff.FeatureFunction;
+ import org.apache.joshua.decoder.phrase.PhraseTable;
+ import org.apache.joshua.decoder.segment_file.Token;
+ import org.apache.joshua.lattice.Arc;
+ import org.apache.joshua.lattice.Lattice;
+ import org.apache.joshua.lattice.Node;
 -
+ import org.slf4j.Logger;
+ import org.slf4j.LoggerFactory;
+ 
++import cern.colt.Arrays;
++
+ /**
+  * Partial implementation of the <code>Grammar</code> interface that provides logic for sorting a
+  * grammar.
+  * <p>
+  * <em>Note</em>: New classes implementing the <code>Grammar</code> interface should probably
+  * inherit from this class, unless a specific sorting technique different from that implemented by
+  * this class is required.
+  *
+  * @author Zhifei Li
+  * @author Lane Schwartz
+  * @author Matt Post post@cs.jhu.edu
+  */
+ public abstract class AbstractGrammar implements Grammar {
+ 
+   /** Logger for this class. */
+   private static final Logger LOG = LoggerFactory.getLogger(AbstractGrammar.class);
+   /**
+    * Indicates whether the rules in this grammar have been sorted based on the latest feature
+    * function values.
+    */
+   protected boolean sorted = false;
+ 
+   /*
+    * The grammar's owner, used to determine which weights are applicable to the dense features found
+    * within.
+    */
 -  protected int owner = -1;
 -
++  protected final OwnerId owner;
++  
+   /*
+    * The maximum length of a source-side phrase. Mostly used by the phrase-based decoder.
+    */
+   protected int maxSourcePhraseLength = -1;
+ 
+     /**
+    * Returns the longest source phrase read.
+    *
+    * @return the longest source phrase read (nonterminal + terminal symbols).
+    */
+   @Override
+   public int getMaxSourcePhraseLength() {
+     return maxSourcePhraseLength;
+   }
+ 
+   @Override
 -  public int getOwner() {
++  public OwnerId getOwner() {
+     return owner;
+   }
++  
++  public int getSpanLimit() {
++    return spanLimit;
++  }
+ 
 -  /* The maximum span of the input this rule can be applied to. */
 -  protected int spanLimit = 1;
++  /* The maximum span of the input this grammar rules can be applied to. */
++  protected final int spanLimit;
+ 
 -  protected JoshuaConfiguration joshuaConfiguration;
++  protected final JoshuaConfiguration joshuaConfiguration;
+ 
+   /**
 -   * Constructs an empty, unsorted grammar.
 -   *
++   * Creates an empty, unsorted grammar with given owner and spanlimit
++   * 
+    * @see Grammar#isSorted()
+    * @param config a {@link org.apache.joshua.decoder.JoshuaConfiguration} object
+    */
 -  public AbstractGrammar(JoshuaConfiguration config) {
 -    this.joshuaConfiguration = config;
++  public AbstractGrammar(final String owner, final JoshuaConfiguration config, final int spanLimit) {
+     this.sorted = false;
 -  }
 -
 -  public AbstractGrammar(int owner, int spanLimit) {
 -    this.sorted = false;
 -    this.owner = owner;
++    this.owner = OwnerMap.register(owner);
++    this.joshuaConfiguration = config;
+     this.spanLimit = spanLimit;
+   }
+ 
+   public static final int OOV_RULE_ID = 0;
+ 
+   /**
+    * Cube-pruning requires that the grammar be sorted based on the latest feature functions. To
+    * avoid synchronization, this method should be called before multiple threads are initialized for
+    * parallel decoding
+    * @param models {@link java.util.List} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s
+    */
+   public void sortGrammar(List<FeatureFunction> models) {
+     Trie root = getTrieRoot();
+     if (root != null) {
+       sort(root, models);
+       setSorted(true);
+     }
+   }
+ 
+   /* See Javadoc comments for Grammar interface. */
+   public boolean isSorted() {
+     return sorted;
+   }
+ 
+   /**
+    * Sets the flag indicating whether this grammar is sorted.
+    * <p>
+    * This method is called by {@link org.apache.joshua.decoder.ff.tm.AbstractGrammar#sortGrammar(List)}
+    * to indicate that the grammar has been sorted.</p>
+    *
+    * <p>Its scope is protected so that child classes that override <code>sortGrammar</code> will also
+    * be able to call this method to indicate that the grammar has been sorted.</p>
+    *
+    * @param sorted set to true if the grammar is sorted
+    */
+   protected void setSorted(boolean sorted) {
+     this.sorted = sorted;
+     LOG.debug("This grammar is now sorted: {}",  this);
+   }
+ 
+   /**
+    * Recursively sorts the grammar using the provided feature functions.
+    * <p>
+    * This method first sorts the rules stored at the provided node, then recursively calls itself on
+    * the child nodes of the provided node.
+    *
+    * @param node Grammar node in the <code>Trie</code> whose rules should be sorted.
+    * @param models Feature function models to use during sorting.
+    */
+   private void sort(Trie node, List<FeatureFunction> models) {
+ 
+     if (node != null) {
+       if (node.hasRules()) {
+         RuleCollection rules = node.getRuleCollection();
+         LOG.debug("Sorting node {}", Arrays.toString(rules.getSourceSide()));
+ 
+         /* This causes the rules at this trie node to be sorted */
+         rules.getSortedRules(models);
+ 
+         if (LOG.isDebugEnabled()) {
+           StringBuilder s = new StringBuilder();
+           for (Rule r : rules.getSortedRules(models)) {
+             s.append("\n\t" + r.getLHS() + " ||| " + Arrays.toString(r.getFrench()) + " ||| "
+                 + Arrays.toString(r.getEnglish()) + " ||| " + r.getFeatureVector() + " ||| "
+                 + r.getEstimatedCost() + "  " + r.getClass().getName() + "@"
+                 + Integer.toHexString(System.identityHashCode(r)));
+           }
+           LOG.debug("{}", s);
+         }
+       }
+ 
+       if (node.hasExtensions()) {
+         for (Trie child : node.getExtensions()) {
+           sort(child, models);
+         }
+       } else {
+         LOG.debug("Node has 0 children to extend: {}", node);
+       }
+     }
+   }
+ 
+   // write grammar to disk
+   public void writeGrammarOnDisk(String file) {
+   }
+ 
+   /**
+    * Adds OOV rules for all words in the input lattice to the current grammar. Uses addOOVRule() so that
+    * sub-grammars can define different types of OOV rules if needed (as is used in {@link PhraseTable}).
+    *
+    * @param grammar Grammar in the Trie
+    * @param inputLattice the lattice representing the input sentence
+    * @param featureFunctions a list of feature functions used for scoring
+    * @param onlyTrue determine if word is actual OOV.
+    */
+   public static void addOOVRules(Grammar grammar, Lattice<Token> inputLattice,
+       List<FeatureFunction> featureFunctions, boolean onlyTrue) {
+     /*
+      * Add OOV rules; This should be called after the manual constraints have
+      * been set up.
+      */
+     HashSet<Integer> words = new HashSet<Integer>();
+     for (Node<Token> node : inputLattice) {
+       for (Arc<Token> arc : node.getOutgoingArcs()) {
+         // create a rule, but do not add into the grammar trie
+         // TODO: which grammar should we use to create an OOV rule?
+         int sourceWord = arc.getLabel().getWord();
+         if (sourceWord == Vocabulary.id(Vocabulary.START_SYM)
+             || sourceWord == Vocabulary.id(Vocabulary.STOP_SYM))
+           continue;
+ 
+         // Determine if word is actual OOV.
+         if (onlyTrue && ! Vocabulary.hasId(sourceWord))
+           continue;
+ 
+         words.add(sourceWord);
+       }
+     }
+ 
+     for (int sourceWord: words)
+       grammar.addOOVRules(sourceWord, featureFunctions);
+ 
+     // Sort all the rules (not much to actually do, this just marks it as sorted)
+     grammar.sortGrammar(featureFunctions);
+   }
+ }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/1bb8a203/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Grammar.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Grammar.java
index 0000000,06252a1..8f90d1b
mode 000000,100644..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Grammar.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Grammar.java
@@@ -1,0 -1,120 +1,120 @@@
+ /*
+  * Licensed to the Apache Software Foundation (ASF) under one
+  * or more contributor license agreements.  See the NOTICE file
+  * distributed with this work for additional information
+  * regarding copyright ownership.  The ASF licenses this file
+  * to you under the Apache License, Version 2.0 (the
+  * "License"); you may not use this file except in compliance
+  * with the License.  You may obtain a copy of the License at
+  *
+  *  http://www.apache.org/licenses/LICENSE-2.0
+  *
+  * Unless required by applicable law or agreed to in writing,
+  * software distributed under the License is distributed on an
+  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  * KIND, either express or implied.  See the License for the
+  * specific language governing permissions and limitations
+  * under the License.
+  */
+ package org.apache.joshua.decoder.ff.tm;
+ 
+ import java.util.List;
+ 
+ import org.apache.joshua.decoder.ff.FeatureFunction;
+ 
+ /**
+  * Grammar is a class for wrapping a trie of TrieGrammar in order to store holistic metadata.
+  * 
+  * @author wren ng thornton wren@users.sourceforge.net
+  * @author Zhifei Li, zhifei.work@gmail.com
+  */
+ public interface Grammar {
+ 
+   /**
+    * Gets the root of the <code>Trie</code> backing this grammar.
+    * <p>
+    * <em>Note</em>: This method should run as a small constant-time function.
+    * 
+    * @return the root of the <code>Trie</code> backing this grammar
+    */
+   Trie getTrieRoot();
+ 
+   /**
+    * After calling this method, the rules in this grammar are guaranteed to be sorted based on the
+    * latest feature function values.
+    * <p>
+    * Cube-pruning requires that the grammar be sorted based on the latest feature functions.
+    * 
+    * @param models list of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s
+    */
+   void sortGrammar(List<FeatureFunction> models);
+ 
+   /**
+    * Determines whether the rules in this grammar have been sorted based on the latest feature
+    * function values.
+    * <p>
+    * This method is needed for the cube-pruning algorithm.
+    * 
+    * @return <code>true</code> if the rules in this grammar have been sorted based on the latest
+    *         feature function values, <code>false</code> otherwise
+    */
+   boolean isSorted();
+ 
+   /**
+    * Returns whether this grammar has any valid rules for covering a particular span of a sentence.
+    * Hiero's "glue" grammar will only say True if the span is longer than our span limit, and is
+    * anchored at startIndex==0. Hiero's "regular" grammar will only say True if the span is less
+    * than the span limit. Other grammars, e.g. for rule-based systems, may have different behaviors.
+    * 
+    * @param startIndex Indicates the starting index of a phrase in a source input phrase, or a
+    *          starting node identifier in a source input lattice
+    * @param endIndex Indicates the ending index of a phrase in a source input phrase, or an ending
+    *          node identifier in a source input lattice
+    * @param pathLength Length of the input path in a source input lattice. If a source input phrase
+    *          is used instead of a lattice, this value will likely be ignored by the underlying
+    *          implementation, but would normally be defined as <code>endIndex-startIndex</code>
+    * @return true if there is a rule for this span
+    */
+   boolean hasRuleForSpan(int startIndex, int endIndex, int pathLength);
+ 
+   /**
+    * Gets the number of rules stored in the grammar.
+    * 
+    * @return the number of rules stored in the grammar
+    */
+   int getNumRules();
+   
+   /**
+    * Returns the number of dense features.
+    * 
+    * @return the number of dense features
+    */
+   int getNumDenseFeatures();
+ 
+   /**
+    * Return the grammar's owner.
+    * @return grammar owner
+    */
 -  int getOwner();
++  OwnerId getOwner();
+ 
+   /**
+    * Return the maximum source phrase length (terminals + nonterminals)
+    * @return the maximum source phrase length
+    */
+   int getMaxSourcePhraseLength();
+   
+   /**
+    * Add an OOV rule for the requested word for the grammar.
+    * 
+    * @param word input word to add rules to
+    * @param featureFunctions a {@link java.util.List} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s
+    */
+   void addOOVRules(int word, List<FeatureFunction> featureFunctions);
+   
+   /**
+    * Add a rule to the grammar.
+    *
+    * @param rule the {@link org.apache.joshua.decoder.ff.tm.Rule}
+    */
+   void addRule(Rule rule);
+ }