You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/06/23 18:46:08 UTC

[57/60] incubator-joshua git commit: Merge branch 'maven-multi-module' of https://github.com/logogin/incubator-joshua into maven-multi-module

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/1bb8a203/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Rule.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Rule.java
index 0000000,0e5a4bc..15fbec1
mode 000000,100644..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Rule.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Rule.java
@@@ -1,0 -1,633 +1,635 @@@
+ /*
+  * Licensed to the Apache Software Foundation (ASF) under one
+  * or more contributor license agreements.  See the NOTICE file
+  * distributed with this work for additional information
+  * regarding copyright ownership.  The ASF licenses this file
+  * to you under the Apache License, Version 2.0 (the
+  * "License"); you may not use this file except in compliance
+  * with the License.  You may obtain a copy of the License at
+  *
+  *  http://www.apache.org/licenses/LICENSE-2.0
+  *
+  * Unless required by applicable law or agreed to in writing,
+  * software distributed under the License is distributed on an
+  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  * KIND, either express or implied.  See the License for the
+  * specific language governing permissions and limitations
+  * under the License.
+  */
+ package org.apache.joshua.decoder.ff.tm;
+ 
++import static org.apache.joshua.decoder.ff.tm.OwnerMap.UNKNOWN_OWNER_ID;
++
+ import java.util.ArrayList;
+ import java.util.Arrays;  
+ import java.util.Comparator;
+ import java.util.HashMap;
+ import java.util.List;
+ import java.util.Map;
+ import java.util.regex.Pattern;
+ 
+ import com.google.common.base.Supplier;
+ import com.google.common.base.Suppliers;
+ 
+ import org.apache.joshua.corpus.Vocabulary;
+ import org.apache.joshua.decoder.ff.FeatureFunction;
+ import org.apache.joshua.decoder.ff.FeatureVector;
+ import org.apache.joshua.decoder.segment_file.Sentence;
+ import org.slf4j.Logger;
+ import org.slf4j.LoggerFactory;
+ 
+ /**
+  * This class define the interface for Rule. 
+  * 
+  * All feature scores are interpreted as negative log probabilities, and are therefore negated.
+  * Note that not all features need to be negative log probs, but you should be aware that they
+  * will be negated, so if you want a positive count, it should come in as negative.
+  * 
+  * Normally, the feature score in the rule should be *cost* (i.e., -LogP), so that the feature
+  * weight should be positive
+  * 
+  * @author Zhifei Li, zhifei.work@gmail.com
+  * @author Matt Post post@cs.jhu.edu
+  */
+ public class Rule implements Comparator<Rule>, Comparable<Rule> {
+ 
+   private static final Logger LOG = LoggerFactory.getLogger(Rule.class);
+   private int lhs; // tag of this rule
+   private int[] source; // pointer to the RuleCollection, as all the rules under it share the same
+                          // Source side
+   protected int arity;
+ 
+   // And a string containing the sparse ones
+   //protected final String sparseFeatureString;
+   protected final Supplier<String> sparseFeatureStringSupplier;
+   private final Supplier<FeatureVector> featuresSupplier;
+ 
+   /*
+    * a feature function will be fired for this rule only if the owner of the rule matches the owner
+    * of the feature function
+    */
 -  private int owner = -1;
++  private OwnerId owner = UNKNOWN_OWNER_ID;
+ 
+   /**
+    * This is the cost computed only from the features present with the grammar rule. This cost is
+    * needed to sort the rules in the grammar for cube pruning, but isn't the full cost of applying
+    * the rule (which will include contextual features that can't be computed until the rule is
+    * applied).
+    */
+   private float estimatedCost = Float.NEGATIVE_INFINITY;
+ 
+   private float precomputableCost = Float.NEGATIVE_INFINITY;
+ 
+   private int[] target;
+ 
+   // The alignment string, e.g., 0-0 0-1 1-1 2-1
+   private String alignmentString;
+   private final Supplier<byte[]> alignmentSupplier;
+ 
+   /**
+    * Constructs a new rule using the provided parameters. Rule id for this rule is
+    * undefined. Note that some of the sparse features may be unlabeled, but they cannot be mapped to
+    * their default names ("tm_OWNER_INDEX") until later, when we know the owner of the rule. This is
+    * not known until the rule is actually added to a grammar in Grammar::addRule().
+    * 
+    * Constructor used by other constructors below;
+    * 
+    * @param lhs Left-hand side of the rule.
+    * @param source Source language right-hand side of the rule.
+    * @param target Target language right-hand side of the rule.
+    * @param sparseFeatures Feature value scores for the rule.
+    * @param arity Number of nonterminals in the source language right-hand side.
+    * @param owner todo
+    */
 -  public Rule(int lhs, int[] source, int[] target, String sparseFeatures, int arity, int owner) {
++  public Rule(int lhs, int[] source, int[] target, String sparseFeatures, int arity, OwnerId owner) {
+     this.lhs = lhs;
+     this.source = source;
+     this.arity = arity;
+     this.owner = owner;
+     this.target = target;
+     this.sparseFeatureStringSupplier = Suppliers.memoize(() -> { return sparseFeatures; });
+     this.featuresSupplier = initializeFeatureSupplierFromString();
+     this.alignmentSupplier = initializeAlignmentSupplier();
+   }
+   
+   /**
+    * Constructor used by PackedGrammar's sortRules()
+    * @param lhs todo
+    * @param sourceRhs todo
+    * @param targetRhs todo
+    * @param features todo
+    * @param arity todo
+    * @param owner todo
+    */
 -  public Rule(int lhs, int[] sourceRhs, int[] targetRhs, FeatureVector features, int arity, int owner) {
++  public Rule(int lhs, int[] sourceRhs, int[] targetRhs, FeatureVector features, int arity, OwnerId owner) {
+     this.lhs = lhs;
+     this.source = sourceRhs;
+     this.arity = arity;
+     this.owner = owner;
+     this.target = targetRhs;
+     this.featuresSupplier = Suppliers.memoize(() -> { return features; });
+     this.sparseFeatureStringSupplier = initializeSparseFeaturesStringSupplier();
+     this.alignmentSupplier = initializeAlignmentSupplier();
+   }
+ 
+   /**
+    * Constructor used for SamtFormatReader and GrammarBuilderWalkerFunction's getRuleWithSpans()
 -   * Owner set to -1
++   * Rule is unowned.
+    * @param lhs todo
+    * @param sourceRhs todo
+    * @param targetRhs todo
+    * @param sparseFeatures todo
+    * @param arity todo
+    */
+   public Rule(int lhs, int[] sourceRhs, int[] targetRhs, String sparseFeatures, int arity) {
 -    this(lhs, sourceRhs, targetRhs, sparseFeatures, arity, -1);
++    this(lhs, sourceRhs, targetRhs, sparseFeatures, arity, OwnerMap.UNKNOWN_OWNER_ID);
+   }
+ 
+   /**
+    * Constructor used for addOOVRules(), HieroFormatReader and PhraseRule.
+    * @param lhs todo
+    * @param sourceRhs todo
+    * @param targetRhs todo
+    * @param sparseFeatures todo
+    * @param arity todo
+    * @param alignment todo
+    */
+   public Rule(int lhs, int[] sourceRhs, int[] targetRhs, String sparseFeatures, int arity, String alignment) {
+     this(lhs, sourceRhs, targetRhs, sparseFeatures, arity);
+     this.alignmentString = alignment;
+   }
+   
+   /**
+    * Constructor (implicitly) used by PackedRule
+    */
+   public Rule() {
+     this.lhs = -1;
+     this.sparseFeatureStringSupplier = initializeSparseFeaturesStringSupplier();
+     this.featuresSupplier = initializeFeatureSupplierFromString();
+     this.alignmentSupplier = initializeAlignmentSupplier();
+   }
+ 
+   // ==========================================================================
+   // Lazy loading Suppliers for alignments, feature vector, and feature strings
+   // ==========================================================================
+   
+   private Supplier<byte[]> initializeAlignmentSupplier(){
+     return Suppliers.memoize(() ->{
+       byte[] alignment = null;
+       String alignmentString = getAlignmentString();
+       if (alignmentString != null) {
+         String[] tokens = alignmentString.split("[-\\s]+");
+         alignment = new byte[tokens.length];
+         for (int i = 0; i < tokens.length; i++)
+           alignment[i] = (byte) Short.parseShort(tokens[i]);
+       }
+       return alignment;
+     });
+   }
+   
+   /**
+    * If Rule was constructed with sparseFeatures String, we lazily populate the
+    * FeatureSupplier.
+    */
+   private Supplier<FeatureVector> initializeFeatureSupplierFromString(){
+     return Suppliers.memoize(() ->{
 -      if (owner != -1) {
 -        return new FeatureVector(getFeatureString(), "tm_" + Vocabulary.word(owner) + "_");
++      if (!owner.equals(UNKNOWN_OWNER_ID)) {
++        return new FeatureVector(getFeatureString(), "tm_" + OwnerMap.getOwner(owner) + "_");
+       } else {
+         return new FeatureVector();
+       }
+     });
+   }
+   
+   /**
+    * If Rule was constructed with a FeatureVector, we lazily populate the sparseFeaturesStringSupplier.
+    */
+   private Supplier<String> initializeSparseFeaturesStringSupplier() {
+     return Suppliers.memoize(() -> {
+       return getFeatureVector().toString();
+     });
+   }
+ 
+   // ===============================================================
+   // Attributes
+   // ===============================================================
+ 
+   public void setEnglish(int[] eng) {
+     this.target = eng;
+   }
+ 
+   public int[] getEnglish() {
+     return this.target;
+   }
+ 
+   /**
+    * Two Rules are equal of they have the same LHS, the same source RHS and the same target
+    * RHS.
+    * 
+    * @param o the object to check for equality
+    * @return true if o is the same Rule as this rule, false otherwise
+    */
+   public boolean equals(Object o) {
+     if (!(o instanceof Rule)) {
+       return false;
+     }
+     Rule other = (Rule) o;
+     if (getLHS() != other.getLHS()) {
+       return false;
+     }
+     if (!Arrays.equals(getFrench(), other.getFrench())) {
+       return false;
+     }
+     if (!Arrays.equals(target, other.getEnglish())) {
+       return false;
+     }
+     return true;
+   }
+ 
+   public int hashCode() {
+     // I just made this up. If two rules are equal they'll have the
+     // same hashcode. Maybe someone else can do a better job though?
+     int frHash = Arrays.hashCode(getFrench());
+     int enHash = Arrays.hashCode(target);
+     return frHash ^ enHash ^ getLHS();
+   }
+ 
+   // ===============================================================
+   // Attributes
+   // ===============================================================
+ 
+   public void setArity(int arity) {
+     this.arity = arity;
+   }
+ 
+   public int getArity() {
+     return this.arity;
+   }
+ 
 -  public void setOwner(int owner) {
++  public void setOwner(final OwnerId owner) {
+     this.owner = owner;
+   }
+ 
 -  public int getOwner() {
++  public OwnerId getOwner() {
+     return this.owner;
+   }
+ 
+   public void setLHS(int lhs) {
+     this.lhs = lhs;
+   }
+ 
+   public int getLHS() {
+     return this.lhs;
+   }
+ 
+   public void setFrench(int[] french) {
+     this.source = french;
+   }
+ 
+   public int[] getFrench() {
+     return this.source;
+   }
+ 
+   /**
+    * This function does the work of turning the string version of the sparse features (passed in
+    * when the rule was created) into an actual set of features. This is a bit complicated because we
+    * support intermingled labeled and unlabeled features, where the unlabeled features are mapped to
+    * a default name template of the form "tm_OWNER_INDEX".
+    * 
+    * This function returns the dense (phrasal) features discovered when the rule was loaded. Dense
+    * features are the list of unlabeled features that preceded labeled ones. They can also be
+    * specified as labeled features of the form "tm_OWNER_INDEX", but the former format is preferred.
+    * 
+    * @return the {@link org.apache.joshua.decoder.ff.FeatureVector} for this rule
+    */
+   public FeatureVector getFeatureVector() {
+     return featuresSupplier.get();
+   }
+ 
+   /**
+    * This function returns the estimated cost of a rule, which should have been computed when the
+    * grammar was first sorted via a call to Rule::estimateRuleCost(). This function is a getter
+    * only; it will not compute the value if it has not already been set. It is necessary in addition
+    * to estimateRuleCost(models) because sometimes the value needs to be retrieved from contexts
+    * that do not have access to the feature functions.
+    * 
+    * This function is called by the rule comparator when sorting the grammar. As such it may be
+    * called many times and any implementation of it should be a cached implementation.
+    * 
+    * @return the estimated cost of the rule (a lower bound on the true cost)
+    */
+   public float getEstimatedCost() {
+     return estimatedCost;
+   }
+ 
+   /**
+    * Precomputable costs is the inner product of the weights found on each grammar rule and the
+    * weight vector. This is slightly different from the estimated rule cost, which can include other
+    * features (such as a language model estimate). This getter and setter should also be cached, and
+    * is basically provided to allow the PhraseModel feature to cache its (expensive) computation for
+    * each rule.
+    *
+    * The weights are passed in as dense weights and sparse weights. This allows the dense weight
+    * precomputation to be even faster (since we don't have to query a hash map. 
+    *
+    * @param dense_weights the dense weights from the model
+    * @param weights the sparse weights from the model
+    */
+   public void setPrecomputableCost(float[] dense_weights, FeatureVector weights) {
+     float cost = 0.0f;
+     FeatureVector features = getFeatureVector();
+     for (int i = 0; i < features.getDenseFeatures().size() && i < dense_weights.length; i++) {
+       cost += dense_weights[i] * features.getDense(i);
+     }
+ 
+     for (String key: features.getSparseFeatures().keySet()) {
+       cost += weights.getSparse(key) * features.getSparse(key);
+     }
+     
+     this.precomputableCost = cost;
+   }
+   
+   /**
+    * @return the precomputed model cost of each rule
+    */
+   public float getPrecomputableCost() {
+     return precomputableCost;
+   }
+   
+   public float getDenseFeature(int k) {
+     return getFeatureVector().getDense(k);
+   }
+   
+   /**
+    * This function estimates the cost of a rule, which is used for sorting the rules for cube
+    * pruning. The estimated cost is basically the set of precomputable features (features listed
+    * along with the rule in the grammar file) along with any other estimates that other features
+    * would like to contribute (e.g., a language model estimate). This cost will be a lower bound on
+    * the rule's actual cost.
+    * 
+    * The value of this function is used only for sorting the rules. When the rule is later applied
+    * in context to particular hypernodes, the rule's actual cost is computed.
+    * 
+    * @param models the list of models available to the decoder
+    * @return estimated cost of the rule
+    */
+   public float estimateRuleCost(List<FeatureFunction> models) {
+     if (null == models)
+       return 0.0f;
+ 
+     if (this.estimatedCost <= Float.NEGATIVE_INFINITY) {
+       this.estimatedCost = 0.0f; // weights.innerProduct(computeFeatures());
+ 
+       LOG.debug("estimateCost({} ;; {})", getFrenchWords(), getEnglishWords());
+       for (FeatureFunction ff : models) {
+         float val = ff.estimateCost(this, null);
+         LOG.debug("  FEATURE {} -> {}", ff.getName(), val);
+         this.estimatedCost += val; 
+       }
+     }
+     
+     return estimatedCost;
+   }
+ 
+   // ===============================================================
+   // Methods
+   // ===============================================================
+ 
+   public String toString() {
+     StringBuffer sb = new StringBuffer();
+     sb.append(Vocabulary.word(this.getLHS()));
+     sb.append(" ||| ");
+     sb.append(getFrenchWords());
+     sb.append(" ||| ");
+     sb.append(getEnglishWords());
+     sb.append(" |||");
+     sb.append(" " + getFeatureVector());
+     sb.append(String.format(" ||| est=%.3f", getEstimatedCost()));
+     sb.append(String.format(" pre=%.3f", getPrecomputableCost()));
+     return sb.toString();
+   }
+   
+   /**
+    * Returns a version of the rule suitable for reading in from a text file.
+    * 
+    * @return string version of the rule
+    */
+   public String textFormat() {
+     StringBuffer sb = new StringBuffer();
+     sb.append(Vocabulary.word(this.getLHS()));
+     sb.append(" |||");
+     
+     int nt = 1;
+     for (int i = 0; i < getFrench().length; i++) {
+       if (getFrench()[i] < 0)
+         sb.append(" " + Vocabulary.word(getFrench()[i]).replaceFirst("\\]", String.format(",%d]", nt++)));
+       else
+         sb.append(" " + Vocabulary.word(getFrench()[i]));
+     }
+     sb.append(" |||");
+     nt = 1;
+     for (int i = 0; i < getEnglish().length; i++) {
+       if (getEnglish()[i] < 0)
+         sb.append(" " + Vocabulary.word(getEnglish()[i]).replaceFirst("\\]", String.format(",%d]", nt++)));
+       else
+         sb.append(" " + Vocabulary.word(getEnglish()[i]));
+     }
+     sb.append(" |||");
+     sb.append(" " + getFeatureString());
+     if (getAlignmentString() != null)
+       sb.append(" ||| " + getAlignmentString());
+     return sb.toString();
+   }
+ 
+   public String getFeatureString() {
+     return sparseFeatureStringSupplier.get();
+   }
+ 
+   /**
+    * Returns an alignment as a sequence of integers. The integers at positions i and i+1 are paired,
+    * with position i indexing the source and i+1 the target.
+    * 
+    * @return a byte[] from the {@link com.google.common.base.Supplier}
+    */
+   public byte[] getAlignment() {
+     return this.alignmentSupplier.get();
+   }
+   
+   public String getAlignmentString() {
+     return this.alignmentString;
+   }
+ 
+   /**
+    * The nonterminals on the English side are pointers to the source side nonterminals (-1 and -2),
+    * rather than being directly encoded. These number indicate the correspondence between the
+    * nonterminals on each side, introducing a level of indirection however when we want to resolve
+    * them. So to get the ID, we need to look up the corresponding source side ID.
+    * 
+    * @return The string of English words
+    */
+   public String getEnglishWords() {
+     int[] foreignNTs = getForeignNonTerminals();
+   
+     StringBuilder sb = new StringBuilder();
+     for (Integer index : getEnglish()) {
+       if (index >= 0)
+         sb.append(Vocabulary.word(index) + " ");
+       else
+         sb.append(Vocabulary.word(foreignNTs[-index - 1]).replace("]",
+             String.format(",%d] ", Math.abs(index))));
+     }
+   
+     return sb.toString().trim();
+   }
+ 
+   public boolean isTerminal() {
+     for (int i = 0; i < getEnglish().length; i++)
+       if (getEnglish()[i] < 0)
+         return false;
+   
+     return true;
+   }
+ 
+   /**
+    * Return the French (source) nonterminals as list of Strings
+    * 
+    * @return a list of strings
+    */
+   public int[] getForeignNonTerminals() {
+     int[] nts = new int[getArity()];
+     int index = 0;
+     for (int id : getFrench())
+       if (id < 0)
+         nts[index++] = -id;
+     return nts;
+   }
+   
+   /**
+    * Returns an array of size getArity() containing the source indeces of non terminals.
+    * 
+    * @return an array of size getArity() containing the source indeces of non terminals
+    */
+   public int[] getNonTerminalSourcePositions() {
+     int[] nonTerminalPositions = new int[getArity()];
+     int ntPos = 0;
+     for (int sourceIdx = 0; sourceIdx < getFrench().length; sourceIdx++) {
+       if (getFrench()[sourceIdx] < 0)
+         nonTerminalPositions[ntPos++] = sourceIdx;
+     }
+     return nonTerminalPositions;
+   }
+   
+   /**
+    * Parses the Alignment byte[] into a Map from target to (possibly a list of) source positions.
+    * Used by the WordAlignmentExtractor.
+    * 
+    * @return a {@link java.util.Map} of alignments
+    */
+   public Map<Integer, List<Integer>> getAlignmentMap() {
+     byte[] alignmentArray = getAlignment();
+     Map<Integer, List<Integer>> alignmentMap = new HashMap<Integer, List<Integer>>();
+     if (alignmentArray != null) {
+       for (int alignmentIdx = 0; alignmentIdx < alignmentArray.length; alignmentIdx += 2 ) {
+         int s = alignmentArray[alignmentIdx];
+         int t = alignmentArray[alignmentIdx + 1];
+         List<Integer> values = alignmentMap.get(t);
+         if (values == null)
+           alignmentMap.put(t, values = new ArrayList<Integer>());
+         values.add(s);
+       }
+     }
+     return alignmentMap;
+   }
+ 
+   /**
+    * Return the English (target) nonterminals as list of Strings
+    * 
+    * @return list of strings
+    */
+   public int[] getEnglishNonTerminals() {
+     int[] nts = new int[getArity()];
+     int[] foreignNTs = getForeignNonTerminals();
+     int index = 0;
+   
+     for (int i : getEnglish()) {
+       if (i < 0)
+         nts[index++] = foreignNTs[Math.abs(getEnglish()[i]) - 1];
+     }
+   
+     return nts;
+   }
+ 
+   private int[] getNormalizedEnglishNonterminalIndices() {
+     int[] result = new int[getArity()];
+   
+     int ntIndex = 0;
+     for (Integer index : getEnglish()) {
+       if (index < 0)
+         result[ntIndex++] = -index - 1;
+     }
+   
+     return result;
+   }
+ 
+   public boolean isInverting() {
+     int[] normalizedEnglishNonTerminalIndices = getNormalizedEnglishNonterminalIndices();
+     if (normalizedEnglishNonTerminalIndices.length == 2) {
+       if (normalizedEnglishNonTerminalIndices[0] == 1) {
+         return true;
+       }
+     }
+     return false;
+   }
+ 
+   public String getFrenchWords() {
+     return Vocabulary.getWords(getFrench());
+   }
+ 
+   public static final String NT_REGEX = "\\[[^\\]]+?\\]";
+ 
+   private Pattern getPattern() {
+     String source = getFrenchWords();
+     String pattern = Pattern.quote(source);
+     pattern = pattern.replaceAll(NT_REGEX, "\\\\E.+\\\\Q");
+     pattern = pattern.replaceAll("\\\\Q\\\\E", "");
+     pattern = "(?:^|\\s)" + pattern + "(?:$|\\s)";
+     return Pattern.compile(pattern);
+   }
+ 
+   /**
+    * Matches the string representation of the rule's source side against a sentence
+    * 
+    * @param sentence {@link org.apache.joshua.lattice.Lattice} input
+    * @return true if there is a match
+    */
+   public boolean matches(Sentence sentence) {
+     boolean match = getPattern().matcher(sentence.fullSource()).find();
+     // System.err.println(String.format("match(%s,%s) = %s", Pattern.quote(getFrenchWords()),
+     // sentence.annotatedSource(), match));
+     return match;
+   }
+ 
+   /**
+    * This comparator is used for sorting the rules during cube pruning. An estimate of the cost
+    * of each rule is computed and used to sort. 
+    */
+   public static Comparator<Rule> EstimatedCostComparator = new Comparator<Rule>() {
+     public int compare(Rule rule1, Rule rule2) {
+       float cost1 = rule1.getEstimatedCost();
+       float cost2 = rule2.getEstimatedCost();
+       return Float.compare(cost2,  cost1);
+     }
+   };
+   
+   public int compare(Rule rule1, Rule rule2) {
+     return EstimatedCostComparator.compare(rule1, rule2);
+   }
+ 
+   public int compareTo(Rule other) {
+     return EstimatedCostComparator.compare(this, other);
+   }
+ 
+   public String getRuleString() {
+     return String.format("%s -> %s ||| %s", Vocabulary.word(getLHS()), getFrenchWords(), getEnglishWords());
+   }
+ }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/1bb8a203/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/SentenceFilteredGrammar.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/SentenceFilteredGrammar.java
index 0000000,c952b05..54f68b2
mode 000000,100644..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/SentenceFilteredGrammar.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/SentenceFilteredGrammar.java
@@@ -1,0 -1,366 +1,366 @@@
+ /*
+  * Licensed to the Apache Software Foundation (ASF) under one
+  * or more contributor license agreements.  See the NOTICE file
+  * distributed with this work for additional information
+  * regarding copyright ownership.  The ASF licenses this file
+  * to you under the Apache License, Version 2.0 (the
+  * "License"); you may not use this file except in compliance
+  * with the License.  You may obtain a copy of the License at
+  *
+  *  http://www.apache.org/licenses/LICENSE-2.0
+  *
+  * Unless required by applicable law or agreed to in writing,
+  * software distributed under the License is distributed on an
+  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  * KIND, either express or implied.  See the License for the
+  * specific language governing permissions and limitations
+  * under the License.
+  */
+ package org.apache.joshua.decoder.ff.tm;
+ 
+ import java.util.Collection;
+ import java.util.HashMap;
+ import java.util.Iterator;
+ import java.util.Map.Entry;
+ 
+ import org.apache.joshua.decoder.ff.tm.hash_based.ExtensionIterator;
+ import org.apache.joshua.decoder.ff.tm.hash_based.MemoryBasedBatchGrammar;
+ import org.apache.joshua.decoder.segment_file.Sentence;
+ import org.slf4j.Logger;
+ import org.slf4j.LoggerFactory;
+ 
+ /**
+  * This class implements dynamic sentence-level filtering. This is accomplished with a parallel
+  * trie, a subset of the original trie, that only contains trie paths that are reachable from
+  * traversals of the current sentence.
+  * 
+  * @author Matt Post post@cs.jhu.edu
+  */
+ public class SentenceFilteredGrammar extends MemoryBasedBatchGrammar {
+ 
+   private static final Logger LOG = LoggerFactory.getLogger(SentenceFilteredGrammar.class);
+ 
+   private AbstractGrammar baseGrammar;
+   private SentenceFilteredTrie filteredTrie;
+   private int[] tokens;
+   private Sentence sentence;
+ 
+   /**
+    * Construct a new sentence-filtered grammar. The main work is done in the enclosed trie (obtained
+    * from the base grammar, which contains the complete grammar).
+    * 
+    * @param baseGrammar a new {@link org.apache.joshua.decoder.ff.tm.AbstractGrammar} to populate
+    * @param sentence {@link org.apache.joshua.lattice.Lattice} input
+    */
+   SentenceFilteredGrammar(AbstractGrammar baseGrammar, Sentence sentence) {
 -    super(baseGrammar.joshuaConfiguration);
++    super(OwnerMap.getOwner(baseGrammar.getOwner()), baseGrammar.joshuaConfiguration, baseGrammar.getSpanLimit());
+     this.baseGrammar = baseGrammar;
+     this.sentence = sentence;
+     this.tokens = sentence.getWordIDs();
+ 
+     int origCount = getNumRules(baseGrammar.getTrieRoot());
+     long startTime = System.currentTimeMillis();
+ 
+     /* Filter the rules; returns non-null object */
+     this.filteredTrie = filter(baseGrammar.getTrieRoot());
+     int filteredCount = getNumRules();
+ 
+     float seconds = (System.currentTimeMillis() - startTime) / 1000.0f;
+ 
+     LOG.debug("Sentence-level filtering of sentence {} ({} -> {} rules) in {} seconds",
+         sentence.id(), origCount, filteredCount, seconds);
+   }
+ 
+   @Override
+   public Trie getTrieRoot() {
+     return filteredTrie;
+   }
+ 
+   /**
+    * This function is poorly named: it doesn't mean whether a rule exists in the grammar for the
+    * current span, but whether the grammar is permitted to apply rules to the current span (a
+    * grammar-level parameter). As such we can just chain to the underlying grammar.
+    */
+   @Override
+   public boolean hasRuleForSpan(int startIndex, int endIndex, int pathLength) {
+     return baseGrammar.hasRuleForSpan(startIndex, endIndex, pathLength);
+   }
+ 
+   @Override
+   public int getNumRules() {
+     return getNumRules(getTrieRoot());
+   }
+ 
+   /**
+    * A convenience function that counts the number of rules in a grammar's trie.
+    * 
+    * @param node the {@link org.apache.joshua.decoder.ff.tm.Trie} implementation for which to count rules
+    * @return the number of rules
+    */
+   public int getNumRules(Trie node) {
+     int numRules = 0;
+     if (node != null) {
+       if (node.getRuleCollection() != null)
+         numRules += node.getRuleCollection().getRules().size();
+ 
+       if (node.getExtensions() != null)
+         for (Trie child : node.getExtensions())
+           numRules += getNumRules(child);
+     }
+ 
+     return numRules;
+   }
+ 
+   /**
+    * What is the algorithm?
+    * 
+    * Take the first word of the sentence, and start at the root of the trie. There are two things to
+    * consider: (a) word matches and (b) nonterminal matches.
+    * 
+    * For a word match, simply follow that arc along the trie. We create a parallel arc in our
+    * filtered grammar to represent it. Each arc in the filtered trie knows about its
+    * corresponding/underlying node in the unfiltered grammar trie.
+    * 
+    * A nonterminal is always permitted to match. The question then is how much of the input sentence
+    * we imagine it consumed. The answer is that it could have been any amount. So the recursive call
+    * has to be a set of calls, one each to the next trie node with different lengths of the sentence
+    * remaining.
+    * 
+    * A problem occurs when we have multiple sequential nonterminals. For scope-3 grammars, there can
+    * be four sequential nonterminals (in the case when they are grounded by terminals on both ends
+    * of the nonterminal chain). We'd like to avoid looking at all possible ways to split up the
+    * subsequence, because with respect to filtering rules, they are all the same.
+    * 
+    * We accomplish this with the following restriction: for purposes of grammar filtering, only the
+    * first in a sequence of nonterminal traversals can consume more than one word. Each of the
+    * subsequent ones would have to consume just one word. We then just have to record in the
+    * recursive call whether the last traversal was a nonterminal or not.
+    * 
+    * @param unfilteredTrieRoot todo
+    * @return the root of the filtered trie
+    */
+   private SentenceFilteredTrie filter(Trie unfilteredTrieRoot) {
+     SentenceFilteredTrie filteredTrieRoot = new SentenceFilteredTrie(unfilteredTrieRoot);
+ 
+     // System.err.println(String.format("FILTERING TO SENTENCE\n  %s\n",
+     // Vocabulary.getWords(tokens)));
+ 
+     /*
+      * The root of the trie is where rule applications start, so we simply try all possible
+      * positions in the sentence.
+      */
+     for (int i = 0; i < tokens.length; i++) {
+       filter(i, filteredTrieRoot, false);
+     }
+ 
+     return filteredTrieRoot;
+   }
+ 
+   /**
+    * Matches rules against the sentence. Intelligently handles chains of sequential nonterminals.
+    * Marks arcs that are traversable for this sentence.
+    * 
+    * @param i the position in the sentence to start matching
+    * @param trie the trie node to match against
+    * @param lastWasNT true if the match that brought us here was against a nonterminal
+    */
+   private void filter(int i, SentenceFilteredTrie trieNode, boolean lastWasNT) {
+     if (i >= tokens.length)
+       return;
+ 
+     /* Make sure the underlying unfiltered node has children. */
+     Trie unfilteredTrieNode = trieNode.unfilteredTrieNode;
+     if (unfilteredTrieNode.getChildren() == null) {
+       // trieNode.path.retreat();
+       return;
+     }
+ 
+     /* Match a word */
+     Trie trie = unfilteredTrieNode.match(tokens[i]);
+     if (trie != null) {
+       /*
+        * The current filtered node might already have an arc for this label. If so, retrieve it
+        * (since we still need to follow it); if not, create it.
+        */
+       SentenceFilteredTrie nextFilteredTrie = trieNode.match(tokens[i]);
+       if (nextFilteredTrie == null) {
+         nextFilteredTrie = new SentenceFilteredTrie(trie);
+         trieNode.children.put(tokens[i], nextFilteredTrie);
+       }
+ 
+       /*
+        * Now continue, trying to match the child node against the next position in the sentence. The
+        * third argument records that this match was not against a nonterminal.
+        */
+       filter(i + 1, nextFilteredTrie, false);
+     }
+ 
+     /*
+      * Now we attempt to match nonterminals. Any nonterminal is permitted to match any region of the
+      * sentence, up to the maximum span for that grammar. So we enumerate all children of the
+      * current (unfiltered) trie grammar node, looking for nonterminals (items whose label value is
+      * less than 0), then recurse.
+      * 
+      * There is one subtlely. Adjacent nonterminals in a grammar rule can match a span (i, j) in (j
+      * - i - 1) ways, but for purposes of determining whether a rule fits, this is all wasted
+      * effort. To handle this, we allow the first nonterminal in a sequence to record 1, 2, 3, ...
+      * terminals (up to the grammar's span limit, or the rest of the sentence, whichever is
+      * shorter). Subsequent adjacent nonterminals are permitted to consume only a single terminal.
+      */
+     HashMap<Integer, ? extends Trie> children = unfilteredTrieNode.getChildren();
+     if (children != null) {
+       for (int label : children.keySet()) {
+         if (label < 0) {
+           SentenceFilteredTrie nextFilteredTrie = trieNode.match(label);
+           if (nextFilteredTrie == null) {
+             nextFilteredTrie = new SentenceFilteredTrie(unfilteredTrieNode.match(label));
+             trieNode.children.put(label, nextFilteredTrie);
+           }
+ 
+           /*
+            * Recurse. If the last match was a nonterminal, we can only consume one more token.
+            * 
+            * TODO: This goes too far by looking at the whole sentence; each grammar has a maximum
+            * span limit which should be consulted. What we should be doing is passing the point
+            * where we started matching the current sentence, so we can apply this span limit, which
+            * is easily accessible (baseGrammar.spanLimit).
+            */
+           int maxJ = lastWasNT ? (i + 1) : tokens.length;
+           for (int j = i + 1; j <= maxJ; j++) {
+             filter(j, nextFilteredTrie, true);
+           }
+         }
+       }
+     }
+   }
+ 
+   /**
+    * Alternate filter that uses regular expressions, walking the grammar trie and matching the
+    * source side of each rule collection against the input sentence. Failed matches are discarded,
+    * and trie nodes extending from that position need not be explored.
+    * 
+    * @param unfilteredTrie todo
+    * @return the root of the filtered trie if any rules were retained, otherwise null
+    */
+   @SuppressWarnings("unused")
+   private SentenceFilteredTrie filter_regexp(Trie unfilteredTrie) {
+     SentenceFilteredTrie trie = null;
+ 
+     /* Case 1: keep the trie node if it has a rule collection that matches the sentence */
+     if (unfilteredTrie.hasRules())
+       if (matchesSentence(unfilteredTrie))
+         trie = new SentenceFilteredTrie(unfilteredTrie);
+       else
+         return null;
+ 
+     /* Case 2: keep the trie node if it has children who have valid rule collections */
+     if (unfilteredTrie.hasExtensions())
+       for (Entry<Integer, ? extends Trie> arc : unfilteredTrie.getChildren().entrySet()) {
+         Trie unfilteredChildTrie = arc.getValue();
+         SentenceFilteredTrie nextTrie = filter_regexp(unfilteredChildTrie);
+         if (nextTrie != null) {
+           if (trie == null)
+             trie = new SentenceFilteredTrie(unfilteredTrie);
+           trie.children.put(arc.getKey(), nextTrie);
+         }
+       }
+ 
+     return trie;
+   }
+ 
+   private boolean matchesSentence(Trie childTrie) {
+     Rule rule = childTrie.getRuleCollection().getRules().get(0);
+     return rule.matches(sentence);
+   }
+ 
+   /**
+    * Implements a filtered trie, by sitting on top of a base trie and annotating nodes that match
+    * the given input sentence.
+    * 
+    * @author Matt Post post@cs.jhu.edu
+    * 
+    */
+   public class SentenceFilteredTrie implements Trie {
+ 
+     /* The underlying unfiltered trie node. */
+     private Trie unfilteredTrieNode;
+ 
+     /* The child nodes in the filtered trie. */
+     private HashMap<Integer, SentenceFilteredTrie> children = null;
+ 
+     /**
+      * Constructor.
+      * 
+      * @param unfilteredTrieNode todo
+      */
+     public SentenceFilteredTrie(Trie unfilteredTrieNode) {
+       this.unfilteredTrieNode = unfilteredTrieNode;
+       this.children = new HashMap<Integer, SentenceFilteredTrie>();
+     }
+ 
+     @Override
+     public SentenceFilteredTrie match(int wordID) {
+       if (children != null)
+         return children.get(wordID);
+       return null;
+     }
+ 
+     @Override
+     public boolean hasExtensions() {
+       return children != null;
+     }
+ 
+     @Override
+     public Collection<SentenceFilteredTrie> getExtensions() {
+       if (children != null)
+         return children.values();
+ 
+       return null;
+     }
+ 
+     @Override
+     public HashMap<Integer, SentenceFilteredTrie> getChildren() {
+       return children;
+     }
+ 
+     @Override
+     public boolean hasRules() {
+       // Chain to the underlying unfiltered node.
+       return unfilteredTrieNode.hasRules();
+     }
+ 
+     @Override
+     public RuleCollection getRuleCollection() {
+       // Chain to the underlying unfiltered node, since the rule collection just varies by target
+       // side.
+       return unfilteredTrieNode.getRuleCollection();
+     }
+ 
+     /**
+      * Counts the number of rules.
+      * 
+      * @return the number of rules rooted at this node.
+      */
+     public int getNumRules() {
+       int numRules = 0;
+       if (getTrieRoot() != null)
+         if (getTrieRoot().getRuleCollection() != null)
+           numRules += getTrieRoot().getRuleCollection().getRules().size();
+ 
+       for (SentenceFilteredTrie node : getExtensions())
+         numRules += node.getNumRules();
+ 
+       return numRules;
+     }
+ 
+     @Override
+     public Iterator<Integer> getTerminalExtensionIterator() {
+       return new ExtensionIterator(children, true);
+     }
+ 
+     @Override
+     public Iterator<Integer> getNonterminalExtensionIterator() {
+       return new ExtensionIterator(children, false);
+     }
+   }
+ }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/1bb8a203/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
index 0000000,f346e7a..365102b
mode 000000,100644..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
@@@ -1,0 -1,292 +1,279 @@@
+ /*
+  * Licensed to the Apache Software Foundation (ASF) under one
+  * or more contributor license agreements.  See the NOTICE file
+  * distributed with this work for additional information
+  * regarding copyright ownership.  The ASF licenses this file
+  * to you under the Apache License, Version 2.0 (the
+  * "License"); you may not use this file except in compliance
+  * with the License.  You may obtain a copy of the License at
+  *
+  *  http://www.apache.org/licenses/LICENSE-2.0
+  *
+  * Unless required by applicable law or agreed to in writing,
+  * software distributed under the License is distributed on an
+  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  * KIND, either express or implied.  See the License for the
+  * specific language governing permissions and limitations
+  * under the License.
+  */
+ package org.apache.joshua.decoder.ff.tm.hash_based;
+ 
+ import java.io.IOException;
+ import java.util.ArrayList;
+ import java.util.HashMap;
+ import java.util.List;
+ 
+ import org.apache.joshua.corpus.Vocabulary;
+ import org.apache.joshua.decoder.JoshuaConfiguration;
+ import org.apache.joshua.decoder.JoshuaConfiguration.OOVItem;
+ import org.apache.joshua.decoder.ff.FeatureFunction;
+ import org.apache.joshua.decoder.ff.tm.AbstractGrammar;
++import org.apache.joshua.decoder.ff.tm.OwnerMap;
+ import org.apache.joshua.decoder.ff.tm.Rule;
+ import org.apache.joshua.decoder.ff.tm.GrammarReader;
+ import org.apache.joshua.decoder.ff.tm.Trie;
+ import org.apache.joshua.decoder.ff.tm.format.HieroFormatReader;
+ import org.apache.joshua.decoder.ff.tm.format.MosesFormatReader;
+ import org.apache.joshua.util.FormatUtils;
+ import org.slf4j.Logger;
+ import org.slf4j.LoggerFactory;
+ 
+ /**
+  * This class implements a memory-based bilingual BatchGrammar.
+  * <p>
+  * The rules are stored in a trie. Each trie node has: (1) RuleBin: a list of rules matching the
+  * french sides so far (2) A HashMap of next-layer trie nodes, the next french word used as the key
+  * in HashMap
+  * 
+  * @author Zhifei Li zhifei.work@gmail.com
+  * @author Matt Post post@cs.jhu.edu
+  */
+ public class MemoryBasedBatchGrammar extends AbstractGrammar {
+ 
+   private static final Logger LOG = LoggerFactory.getLogger(MemoryBasedBatchGrammar.class);
+ 
+   // ===============================================================
+   // Instance Fields
+   // ===============================================================
+ 
+   /* The number of rules read. */
+   private int qtyRulesRead = 0;
+ 
+   /* The number of distinct source sides. */
+   private int qtyRuleBins = 0;
+ 
+   private int numDenseFeatures = 0;
+ 
+   /* The trie root. */
 -  private MemoryBasedTrie root = null;
++  private MemoryBasedTrie root = new MemoryBasedTrie();
+ 
+   /* The file containing the grammar. */
+   private String grammarFile;
+ 
+   private GrammarReader<Rule> modelReader;
+ 
+   // ===============================================================
+   // Static Fields
+   // ===============================================================
+ 
+   // ===============================================================
+   // Constructors
+   // ===============================================================
+ 
 -  public MemoryBasedBatchGrammar(JoshuaConfiguration joshuaConfiguration) {
 -    super(joshuaConfiguration);
 -    this.root = new MemoryBasedTrie();
 -    this.joshuaConfiguration = joshuaConfiguration;
 -    setSpanLimit(20);
 -  }
 -
 -  public MemoryBasedBatchGrammar(String owner, JoshuaConfiguration joshuaConfiguration) {
 -    this(joshuaConfiguration);
 -    this.owner = Vocabulary.id(owner);
++  /**
++   * Constructor used by Decoder mostly. Default spanLimit of 20
++   */
++  public MemoryBasedBatchGrammar(String owner, JoshuaConfiguration config, int spanLimit) {
++    super(owner, config, spanLimit);
+   }
+ 
 -  public MemoryBasedBatchGrammar(GrammarReader<Rule> gr, JoshuaConfiguration joshuaConfiguration) {
 -    // this.defaultOwner = Vocabulary.id(defaultOwner);
 -    // this.defaultLHS = Vocabulary.id(defaultLHSSymbol);
 -    this(joshuaConfiguration);
 -    modelReader = gr;
++  /**
++   * Constructor to initialize a GrammarReader (unowned)
++   */
++  public MemoryBasedBatchGrammar(
++      final GrammarReader<Rule> reader, final JoshuaConfiguration config, final int spanLimit) {
++    super(OwnerMap.UNKNOWN_OWNER, config, spanLimit);
++    modelReader = reader;
+   }
+ 
+   public MemoryBasedBatchGrammar(String formatKeyword, String grammarFile, String owner,
+       String defaultLHSSymbol, int spanLimit, JoshuaConfiguration joshuaConfiguration)
+       throws IOException {
+ 
 -    this(joshuaConfiguration);
 -    this.owner = Vocabulary.id(owner);
++    super(owner, joshuaConfiguration, spanLimit);
+     Vocabulary.id(defaultLHSSymbol);
 -    this.spanLimit = spanLimit;
+     this.grammarFile = grammarFile;
+ 
+     // ==== loading grammar
+     this.modelReader = createReader(formatKeyword, grammarFile);
+     if (modelReader != null) {
+       for (Rule rule : modelReader)
+         if (rule != null) {
+           addRule(rule);
+         }
+     } else {
+       LOG.info("Couldn't create a GrammarReader for file {} with format {}",
+           grammarFile, formatKeyword);
+     }
+ 
+     this.printGrammar();
+   }
+ 
+   protected GrammarReader<Rule> createReader(String format, String grammarFile) throws IOException {
+ 
+     if (grammarFile != null) {
+       if ("hiero".equals(format) || "thrax".equals(format)) {
+         return new HieroFormatReader(grammarFile);
+       } else if ("moses".equals(format)) {
+         return new MosesFormatReader(grammarFile);
+       } else {
+         throw new RuntimeException(String.format("* FATAL: unknown grammar format '%s'", format));
+       }
+     }
+     return null;
+   }
+ 
+   // ===============================================================
+   // Methods
+   // ===============================================================
+ 
 -  public void setSpanLimit(int spanLimit) {
 -    this.spanLimit = spanLimit;
 -  }
 -
+   @Override
+   public int getNumRules() {
+     return this.qtyRulesRead;
+   }
+ 
+   /**
+    * if the span covered by the chart bin is greater than the limit, then return false
+    */
+   public boolean hasRuleForSpan(int i, int j, int pathLength) {
+     if (this.spanLimit == -1) { // mono-glue grammar
+       return (i == 0);
+     } else {
+       // System.err.println(String.format("%s HASRULEFORSPAN(%d,%d,%d)/%d = %s",
+       // Vocabulary.word(this.owner), i, j, pathLength, spanLimit, pathLength <= this.spanLimit));
+       return (pathLength <= this.spanLimit);
+     }
+   }
+ 
+   public Trie getTrieRoot() {
+     return this.root;
+   }
+ 
+   /**
+    * Adds a rule to the grammar.
+    */
+   public void addRule(Rule rule) {
+ 
 -    // TODO: Why two increments?
+     this.qtyRulesRead++;
+ 
 -    // if (owner == -1) {
 -    // System.err.println("* FATAL: MemoryBasedBatchGrammar::addRule(): owner not set for grammar");
 -    // System.exit(1);
 -    // }
+     rule.setOwner(owner);
+ 
+     if (numDenseFeatures == 0)
+       numDenseFeatures = rule.getFeatureVector().getDenseFeatures().size();
+ 
+     // === identify the position, and insert the trie nodes as necessary
+     MemoryBasedTrie pos = root;
+     int[] french = rule.getFrench();
+ 
+     maxSourcePhraseLength = Math.max(maxSourcePhraseLength, french.length);
+ 
+     for (int k = 0; k < french.length; k++) {
+       int curSymID = french[k];
+ 
+       /*
+        * Note that the nonTerminal symbol in the french is not cleaned (i.e., will be sth like
+        * [X,1]), but the symbol in the Trie has to be cleaned, so that the match does not care about
+        * the markup (i.e., [X,1] or [X,2] means the same thing, that is X) if
+        * (Vocabulary.nt(french[k])) { curSymID = modelReader.cleanNonTerminal(french[k]); if
+        * (logger.isLoggable(Level.FINEST)) logger.finest("Amended to: " + curSymID); }
+        */
+ 
+       MemoryBasedTrie nextLayer = (MemoryBasedTrie) pos.match(curSymID);
+       if (null == nextLayer) {
+         nextLayer = new MemoryBasedTrie();
+         if (pos.hasExtensions() == false) {
+           pos.childrenTbl = new HashMap<Integer, MemoryBasedTrie>();
+         }
+         pos.childrenTbl.put(curSymID, nextLayer);
+       }
+       pos = nextLayer;
+     }
+ 
+     // === add the rule into the trie node
+     if (!pos.hasRules()) {
+       pos.ruleBin = new MemoryBasedRuleBin(rule.getArity(), rule.getFrench());
+       this.qtyRuleBins++;
+     }
+     pos.ruleBin.addRule(rule);
+   }
+ 
+   protected void printGrammar() {
+     LOG.info("MemoryBasedBatchGrammar: Read {} rules with {} distinct source sides from '{}'",
+         this.qtyRulesRead, this.qtyRuleBins, grammarFile);
+   }
+ 
+   /***
+    * Takes an input word and creates an OOV rule in the current grammar for that word.
+    * 
+    * @param sourceWord integer representation of word
+    * @param featureFunctions {@link java.util.List} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s
+    */
+   @Override
+   public void addOOVRules(int sourceWord, List<FeatureFunction> featureFunctions) {
+ 
+     // TODO: _OOV shouldn't be outright added, since the word might not be OOV for the LM (but now
+     // almost
+     // certainly is)
+     final int targetWord = this.joshuaConfiguration.mark_oovs ? Vocabulary.id(Vocabulary
+         .word(sourceWord) + "_OOV") : sourceWord;
+ 
+     int[] sourceWords = { sourceWord };
+     int[] targetWords = { targetWord };
+     final String oovAlignment = "0-0";
+ 
+     if (this.joshuaConfiguration.oovList != null && this.joshuaConfiguration.oovList.size() != 0) {
+       for (OOVItem item : this.joshuaConfiguration.oovList) {
+         Rule oovRule = new Rule(Vocabulary.id(item.label), sourceWords, targetWords, "", 0,
+             oovAlignment);
+         addRule(oovRule);
+         oovRule.estimateRuleCost(featureFunctions);
+       }
+     } else {
+       int nt_i = Vocabulary.id(this.joshuaConfiguration.default_non_terminal);
+       Rule oovRule = new Rule(nt_i, sourceWords, targetWords, "", 0, oovAlignment);
+       addRule(oovRule);
+       oovRule.estimateRuleCost(featureFunctions);
+     }
+   }
+ 
+   /**
+    * Adds a default set of glue rules.
+    * 
+    * @param featureFunctions an {@link java.util.ArrayList} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s
+    */
+   public void addGlueRules(ArrayList<FeatureFunction> featureFunctions) {
+     HieroFormatReader reader = new HieroFormatReader();
+ 
+     String goalNT = FormatUtils.cleanNonTerminal(joshuaConfiguration.goal_symbol);
+     String defaultNT = FormatUtils.cleanNonTerminal(joshuaConfiguration.default_non_terminal);
+ 
+     String[] ruleStrings = new String[] {
+         String.format("[%s] ||| %s ||| %s ||| 0", goalNT, Vocabulary.START_SYM,
+             Vocabulary.START_SYM),
+         String.format("[%s] ||| [%s,1] [%s,2] ||| [%s,1] [%s,2] ||| -1", goalNT, goalNT, defaultNT,
+             goalNT, defaultNT),
+         String.format("[%s] ||| [%s,1] %s ||| [%s,1] %s ||| 0", goalNT, goalNT,
+             Vocabulary.STOP_SYM, goalNT, Vocabulary.STOP_SYM) };
+ 
+     for (String ruleString : ruleStrings) {
+       Rule rule = reader.parseLine(ruleString);
+       addRule(rule);
+       rule.estimateRuleCost(featureFunctions);
+     }
+   }
+ 
+   @Override
+   public int getNumDenseFeatures() {
+     return numDenseFeatures;
+   }
+ }