You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/06/23 18:45:49 UTC
[38/60] [partial] incubator-joshua git commit: maven multi-module layout 1st commit: moving files into joshua-core

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/buildin_lm/TrieLM.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/buildin_lm/TrieLM.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/buildin_lm/TrieLM.java
new file mode 100644
index 0000000..ccfff46
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/buildin_lm/TrieLM.java
@@ -0,0 +1,331 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.lm.buildin_lm;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.Map;
+import java.util.Scanner;
+
+import org.apache.joshua.corpus.Vocabulary;
+import  org.apache.joshua.decoder.ff.lm.AbstractLM;
+import  org.apache.joshua.decoder.ff.lm.ArpaFile;
+import  org.apache.joshua.decoder.ff.lm.ArpaNgram;
+import  org.apache.joshua.util.Bits;
+import  org.apache.joshua.util.Regex;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Relatively memory-compact language model
+ * stored as a reversed-word-order trie.
+ * <p>
+ * The trie itself represents language model context.
+ * <p>
+ * Conceptually, each node in the trie stores a map 
+ * from conditioning word to log probability.
+ * <p>
+ * Additionally, each node in the trie stores 
+ * the backoff weight for that context.
+ * 
+ * @author Lane Schwartz
+ * @see <a href="http://www.speech.sri.com/projects/srilm/manpages/ngram-discount.7.html">SRILM ngram-discount documentation</a>
+ */
+public class TrieLM extends AbstractLM { //DefaultNGramLanguageModel {
+
+  private static final Logger LOG = LoggerFactory.getLogger(TrieLM.class);
+
+  /**
+   * Node ID for the root node.
+   */
+  private static final int ROOT_NODE_ID = 0;
+
+
+  /** 
+   * Maps from (node id, word id for child) --> node id of child. 
+   */
+  private final Map<Long,Integer> children;
+
+  /**
+   * Maps from (node id, word id for lookup word) --> 
+   * log prob of lookup word given context 
+   * 
+   * (the context is defined by where you are in the tree).
+   */
+  private final Map<Long,Float> logProbs;
+
+  /**
+   * Maps from (node id) --> 
+   * backoff weight for that context 
+   * 
+   * (the context is defined by where you are in the tree).
+   */
+  private final Map<Integer,Float> backoffs;
+
+  public TrieLM(Vocabulary vocab, String file) throws FileNotFoundException {
+    this(new ArpaFile(file,vocab));
+  }
+
+  /**
+   * Constructs a language model object from the specified ARPA file.
+   * 
+   * @param arpaFile input ARPA file
+   * @throws FileNotFoundException if the input file cannot be located
+   */
+  public TrieLM(ArpaFile arpaFile) throws FileNotFoundException {
+    super(arpaFile.getVocab().size(), arpaFile.getOrder());
+
+    int ngramCounts = arpaFile.size();
+    LOG.debug("ARPA file contains {} n-grams", ngramCounts);
+
+    this.children = new HashMap<Long,Integer>(ngramCounts);
+    this.logProbs = new HashMap<Long,Float>(ngramCounts);
+    this.backoffs = new HashMap<Integer,Float>(ngramCounts);
+
+    int nodeCounter = 0;
+
+    int lineNumber = 0;
+    for (ArpaNgram ngram : arpaFile) {
+      lineNumber += 1;
+      if (lineNumber % 100000 == 0){
+        LOG.info("Line: {}", lineNumber);
+      }
+
+      LOG.debug("{}-gram: ({} | {})", ngram.order(), ngram.getWord(),
+          Arrays.toString(ngram.getContext()));
+      int word = ngram.getWord();
+
+      int[] context = ngram.getContext();
+
+      {
+        // Find where the log prob should be stored
+        int contextNodeID = ROOT_NODE_ID;
+        {
+          for (int i=context.length-1; i>=0; i--) {
+            long key = Bits.encodeAsLong(contextNodeID, context[i]);
+            int childID;
+            if (children.containsKey(key)) {
+              childID = children.get(key);
+            } else {
+              childID = ++nodeCounter;
+              LOG.debug("children.put({}:{}, {})", contextNodeID, context[i], childID);
+              children.put(key, childID);
+            }
+            contextNodeID = childID;
+          }
+        }
+
+        // Store the log prob for this n-gram at this node in the trie
+        {
+          long key = Bits.encodeAsLong(contextNodeID, word);
+          float logProb = ngram.getValue();
+          LOG.debug("logProbs.put({}:{}, {}", contextNodeID, word, logProb);
+          this.logProbs.put(key, logProb);
+        }
+      }
+
+      {
+        // Find where the backoff should be stored
+        int backoffNodeID = ROOT_NODE_ID;
+        { 
+          long backoffNodeKey = Bits.encodeAsLong(backoffNodeID, word);
+          int wordChildID;
+          if (children.containsKey(backoffNodeKey)) {
+            wordChildID = children.get(backoffNodeKey);
+          } else {
+            wordChildID = ++nodeCounter;
+            LOG.debug("children.put({}: {}, {})", backoffNodeID, word, wordChildID);
+            children.put(backoffNodeKey, wordChildID);
+          }
+          backoffNodeID = wordChildID;
+
+          for (int i=context.length-1; i>=0; i--) {
+            long key = Bits.encodeAsLong(backoffNodeID, context[i]);
+            int childID;
+            if (children.containsKey(key)) {
+              childID = children.get(key);
+            } else {
+              childID = ++nodeCounter;
+              LOG.debug("children.put({}:{}, {})", backoffNodeID, context[i], childID);
+              children.put(key, childID);
+            }
+            backoffNodeID = childID;
+          }
+        }
+
+        // Store the backoff for this n-gram at this node in the trie
+        {
+          float backoff = ngram.getBackoff();
+          LOG.debug("backoffs.put({}:{}, {})", backoffNodeID, word, backoff);
+          this.backoffs.put(backoffNodeID, backoff);
+        }
+      }
+
+    }
+  }
+
+
+  @Override
+  protected double logProbabilityOfBackoffState_helper(
+      int[] ngram, int order, int qtyAdditionalBackoffWeight
+      ) {
+    throw new UnsupportedOperationException("probabilityOfBackoffState_helper undefined for TrieLM");
+  }
+
+  @Override
+  protected float ngramLogProbability_helper(int[] ngram, int order) {
+
+//    float logProb = (float) -JoshuaConfiguration.lm_ceiling_cost;//Float.NEGATIVE_INFINITY; // log(0.0f)
+    float backoff = 0.0f; // log(1.0f)
+
+    int i = ngram.length - 1;
+    int word = ngram[i];
+    i -= 1;
+
+    int nodeID = ROOT_NODE_ID;
+
+    while (true) {
+
+      {
+        long key = Bits.encodeAsLong(nodeID, word);
+        if (logProbs.containsKey(key)) {
+//          logProb = logProbs.get(key);
+          backoff = 0.0f; // log(0.0f)
+        }
+      }
+
+      if (i < 0) {
+        break;
+      }
+
+      {
+        long key = Bits.encodeAsLong(nodeID, ngram[i]);
+
+        if (children.containsKey(key)) {
+          nodeID = children.get(key);
+
+          backoff += backoffs.get(nodeID);
+
+          i -= 1;
+
+        } else {
+          break;
+        }
+      }
+
+    }
+
+//    double result = logProb + backoff;
+//    if (result < -JoshuaConfiguration.lm_ceiling_cost) {
+//      result = -JoshuaConfiguration.lm_ceiling_cost;
+//    }
+//
+//    return result;
+    return (Float) null;
+  }
+
+  public Map<Long,Integer> getChildren() {
+    return this.children;
+  }
+
+  public static void main(String[] args) throws IOException {
+
+    LOG.info("Constructing ARPA file");
+    ArpaFile arpaFile = new ArpaFile(args[0]);
+
+    LOG.info("Getting symbol table");
+    Vocabulary vocab = arpaFile.getVocab();
+
+    LOG.info("Constructing TrieLM");
+    TrieLM lm = new TrieLM(arpaFile);
+
+    int n = Integer.valueOf(args[2]);
+    LOG.info("N-gram order will be {}", n);
+
+    Scanner scanner = new Scanner(new File(args[1]));
+
+    LinkedList<String> wordList = new LinkedList<String>();
+    LinkedList<String> window = new LinkedList<String>();
+
+    LOG.info("Starting to scan {}", args[1]);
+    while (scanner.hasNext()) {
+
+      LOG.info("Getting next line...");
+      String line = scanner.nextLine();
+      LOG.info("Line: {}", line);
+
+      String[] words = Regex.spaces.split(line);
+      wordList.clear();
+
+      wordList.add("<s>");
+      for (String word : words) {
+        wordList.add(word);
+      }
+      wordList.add("</s>");
+
+      ArrayList<Integer> sentence = new ArrayList<Integer>();
+      //        int[] ids = new int[wordList.size()];
+      for (int i=0, size=wordList.size(); i<size; i++) {
+        sentence.add(vocab.id(wordList.get(i)));
+        //          ids[i] = ;
+      }
+
+
+
+      while (! wordList.isEmpty()) {
+        window.clear();
+
+        {
+          int i=0;
+          for (String word : wordList) {
+            if (i>=n) break;
+            window.add(word);
+            i++;
+          }
+          wordList.remove();
+        }
+
+        {
+          int i=0;
+          int[] wordIDs = new int[window.size()];
+          for (String word : window) {
+            wordIDs[i] = vocab.id(word);
+            i++;
+          }
+
+          LOG.info("logProb {} = {}", window, lm.ngramLogProbability(wordIDs, n));
+        }
+      }
+
+      double logProb = lm.sentenceLogProbability(sentence, n, 2);//.ngramLogProbability(ids, n);
+      double prob = Math.exp(logProb);
+
+      LOG.info("Total logProb = {}", logProb);
+      LOG.info("Total    prob = {}",  prob);
+    }
+
+  }
+
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/buildin_lm/package-info.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/buildin_lm/package-info.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/buildin_lm/package-info.java
new file mode 100644
index 0000000..6c84703
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/buildin_lm/package-info.java
@@ -0,0 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.lm.buildin_lm;
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/package-info.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/package-info.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/package-info.java
new file mode 100644
index 0000000..22da71e
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/package-info.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/**
+ * <p>Provides abstraction and support for the language model 
+ * feature function typically used in hierarchical phrase-based 
+ * decoding for statistical machine translation.</p>
+ * <p>The classes contained within this directory are 
+ * responsible for two tasks: implementing the feature function, 
+ * and representing the language model itself.  The class 
+ * `LanguageModelFF` implements the feature function by exending 
+ * the class `DefaultStatefulFF`.  One of these is instantiated 
+ * for each language model present in the decoder.</p>
+ * <p>The language models themselves are implemented as a 
+ * combination of an interface (`NGramLanguageModel`), a default 
+ * implementation (`DefaultNgramLangaugeModel`), and an abstract
+ * implementation of the default (`AbstractLM`).</p>
+ *
+ * <pre>
+ *  DefaultStatefulFF
+ *  |- LanguageModelFF
+ *
+ *  DefaultNgramLanguageModel implements interface NGramLanguageModel
+ *  |- AbstractLM
+ * </pre>
+ */
+package org.apache.joshua.decoder.ff.lm;

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/package-info.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/package-info.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/package-info.java
new file mode 100644
index 0000000..b0af73e
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/package-info.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/** 
+ * <p>Provides an implementation of the linear feature functions 
+ * typically used in hierarchical phrase-based decoding for 
+ * statistical machine translation.</p>
+ * <p>The following is a note from Juri describing some of the 
+ * functionality of the feature functions interfaces and default 
+ * abstract classes.</p>
+ * <pre>
+ * The equality that I intended for is ff.transitionLogP() =
+ * ff.estimateLogP() + ff.reEstimateTransitionLogP(). The re-estimate
+ * fixes the estimate to be the true transition cost that takes into
+ * account the state. Before decoding the cost of applying a rule is
+ * estimated via estimateLogP() and yields the phrasal feature costs plus
+ * an LM estimate of the cost of the lexical portions of the rule.
+ * transitionLogP() takes rule and state and computes everything from
+ * scratch, whereas reEstimateTransitionLogP() adds in the cost of new
+ * n-grams that result from combining the rule with the LM states and
+ * subtracts out the cost of superfluous less-than-n-grams that were
+ * overridden by the updated cost calculation.
+ * 
+ * Hope this helps.
+ * </pre>
+ */
+package org.apache.joshua.decoder.ff;

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/phrase/Distortion.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/phrase/Distortion.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/phrase/Distortion.java
new file mode 100644
index 0000000..f9e6a29
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/phrase/Distortion.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.phrase;
+
+import java.util.ArrayList;
+import java.util.List;	
+
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.chart_parser.SourcePath;
+import org.apache.joshua.decoder.ff.FeatureVector;
+import org.apache.joshua.decoder.ff.StatelessFF;
+import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.hypergraph.HGNode;
+import org.apache.joshua.decoder.phrase.Hypothesis;
+import org.apache.joshua.decoder.segment_file.Sentence;
+
+public class Distortion extends StatelessFF {
+
+  public Distortion(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+    super(weights, "Distortion", args, config);
+    
+    if (! config.search_algorithm.equals("stack")) {
+      String msg = "* FATAL: Distortion feature only application for phrase-based decoding. "
+          + "Use -search phrase or remove this feature";
+      throw new RuntimeException(msg);
+    }
+  }
+  
+  @Override
+  public ArrayList<String> reportDenseFeatures(int index) {
+    denseFeatureIndex = index;
+    
+    ArrayList<String> names = new ArrayList<String>();
+    names.add(name);
+    return names;
+  }
+
+  @Override
+  public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+      Sentence sentence, Accumulator acc) {
+
+    if (rule != Hypothesis.BEGIN_RULE && rule != Hypothesis.END_RULE) {
+        int start_point = j - rule.getFrench().length + rule.getArity();
+
+        int jump_size = Math.abs(tailNodes.get(0).j - start_point);
+//        acc.add(name, -jump_size);
+        acc.add(denseFeatureIndex, -jump_size); 
+    }
+    
+//    System.err.println(String.format("DISTORTION(%d, %d) from %d = %d", i, j, tailNodes != null ? tailNodes.get(0).j : -1, jump_size));
+
+    return null;
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/similarity/EdgePhraseSimilarityFF.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/similarity/EdgePhraseSimilarityFF.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/similarity/EdgePhraseSimilarityFF.java
new file mode 100644
index 0000000..91af58b
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/similarity/EdgePhraseSimilarityFF.java
@@ -0,0 +1,279 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.similarity;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.PrintWriter;
+import java.net.Socket;
+import java.net.UnknownHostException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import com.google.common.base.Throwables;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.chart_parser.SourcePath;
+import org.apache.joshua.decoder.ff.FeatureVector;
+import org.apache.joshua.decoder.ff.StatefulFF;
+import org.apache.joshua.decoder.ff.SourceDependentFF;
+import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+import org.apache.joshua.decoder.ff.state_maintenance.NgramDPState;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.hypergraph.HGNode;
+import org.apache.joshua.decoder.segment_file.Sentence;
+import org.apache.joshua.util.Cache;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class EdgePhraseSimilarityFF extends StatefulFF implements SourceDependentFF {
+
+  private static final Logger LOG = LoggerFactory.getLogger(EdgePhraseSimilarityFF.class);
+
+  private static Cache<String, Float> cache = new Cache<String, Float>(100000000);
+
+  private String host;
+  private int port;
+
+  private Socket socket;
+  private PrintWriter serverAsk;
+  private BufferedReader serverReply;
+
+  private int[] source;
+
+  private final int MAX_PHRASE_LENGTH = 4;
+  private final int GAP = 0;
+
+  public EdgePhraseSimilarityFF(FeatureVector weights, String[] args, JoshuaConfiguration config) throws NumberFormatException, UnknownHostException, IOException {
+    super(weights, "EdgePhraseSimilarity", args, config);
+
+    this.host = parsedArgs.get("host");
+    this.port = Integer.parseInt(parsedArgs.get("port"));
+
+    initializeConnection();
+  }
+
+  private void initializeConnection() throws NumberFormatException, IOException {
+    LOG.info("Opening connection.");
+    socket = new Socket(host, port);
+    serverAsk = new PrintWriter(socket.getOutputStream(), true);
+    serverReply = new BufferedReader(new InputStreamReader(socket.getInputStream()));
+  }
+
+  @Override
+  public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+      Sentence sentence, Accumulator acc) {
+
+    float value = computeScore(rule, tailNodes);
+    acc.add(name, value);
+
+    // TODO 07/2013: EdgePhraseSimilarity needs to know its order rather than inferring it from tail
+    // nodes.
+    return new NgramDPState(new int[1], new int[1]);
+  }
+  
+  @Override
+  public DPState computeFinal(HGNode tailNode, int i, int j, SourcePath path, Sentence sentence, Accumulator acc) {
+    return null;
+  }
+
+  public float computeScore(Rule rule, List<HGNode> tailNodes) {
+    if (tailNodes == null || tailNodes.isEmpty())
+      return 0;
+
+    // System.err.println("RULE [" + spanStart + ", " + spanEnd + "]: " + rule.toString());
+
+    int[] target = rule.getEnglish();
+    int lm_state_size = 0;
+    for (HGNode node : tailNodes) {
+      NgramDPState state = (NgramDPState) node.getDPState(stateIndex);
+      lm_state_size += state.getLeftLMStateWords().length + state.getRightLMStateWords().length;
+    }
+
+    ArrayList<int[]> batch = new ArrayList<int[]>();
+
+    // Build joined target string.
+    int[] join = new int[target.length + lm_state_size];
+
+    int idx = 0, num_gaps = 1, num_anchors = 0;
+    int[] anchors = new int[rule.getArity() * 2];
+    int[] indices = new int[rule.getArity() * 2];
+    int[] gaps = new int[rule.getArity() + 2];
+    gaps[0] = 0;
+    for (int t = 0; t < target.length; t++) {
+      if (target[t] < 0) {
+        HGNode node = tailNodes.get(-(target[t] + 1));
+        if (t != 0) {
+          indices[num_anchors] = node.i;
+          anchors[num_anchors++] = idx;
+        }
+        NgramDPState state = (NgramDPState) node.getDPState(stateIndex);
+        // System.err.print("LEFT:  ");
+        // for (int w : state.getLeftLMStateWords()) System.err.print(Vocabulary.word(w) + " ");
+        // System.err.println();
+        for (int w : state.getLeftLMStateWords())
+          join[idx++] = w;
+        join[idx++] = GAP;
+        gaps[num_gaps++] = idx;
+        // System.err.print("RIGHT:  ");
+        // for (int w : state.getRightLMStateWords()) System.err.print(Vocabulary.word(w) + " ");
+        // System.err.println();
+        for (int w : state.getRightLMStateWords())
+          join[idx++] = w;
+        if (t != target.length - 1) {
+          indices[num_anchors] = node.j;
+          anchors[num_anchors++] = idx;
+        }
+      } else {
+        join[idx++] = target[t];
+      }
+    }
+    gaps[gaps.length - 1] = join.length + 1;
+
+    // int c = 0;
+    // System.err.print("> ");
+    // for (int k = 0; k < join.length; k++) {
+    // if (c < num_anchors && anchors[c] == k) {
+    // c++;
+    // System.err.print("| ");
+    // }
+    // System.err.print(Vocabulary.word(join[k]) + " ");
+    // }
+    // System.err.println("<");
+
+    int g = 0;
+    for (int a = 0; a < num_anchors; a++) {
+      if (a > 0 && anchors[a - 1] == anchors[a])
+        continue;
+      if (anchors[a] > gaps[g + 1])
+        g++;
+      int left = Math.max(gaps[g], anchors[a] - MAX_PHRASE_LENGTH + 1);
+      int right = Math.min(gaps[g + 1] - 1, anchors[a] + MAX_PHRASE_LENGTH - 1);
+
+      int[] target_phrase = new int[right - left];
+      System.arraycopy(join, left, target_phrase, 0, target_phrase.length);
+      int[] source_phrase = getSourcePhrase(indices[a]);
+
+      if (source_phrase != null && target_phrase.length != 0) {
+        // System.err.println("ANCHOR: " + indices[a]);
+        batch.add(source_phrase);
+        batch.add(target_phrase);
+      }
+    }
+    return getSimilarity(batch);
+  }
+
+  @Override
+  public float estimateFutureCost(Rule rule, DPState currentState, Sentence sentence) {
+    return 0.0f;
+  }
+
+  /**
+   * From SourceDependentFF interface.
+   */
+  @Override
+  public void setSource(Sentence sentence) {
+    if (! sentence.isLinearChain())
+      throw new RuntimeException("EdgePhraseSimilarity not defined for lattices");
+    this.source = sentence.getWordIDs();
+  }
+
+  public EdgePhraseSimilarityFF clone() {
+    try {
+      return new EdgePhraseSimilarityFF(this.weights, args, config);
+    } catch (Exception e) {
+      throw Throwables.propagate(e);
+    }
+  }
+
+  @Override
+  public float estimateCost(Rule rule, Sentence sentence) {
+    return 0.0f;
+  }
+
+  private final int[] getSourcePhrase(int anchor) {
+    int idx;
+    int length = Math.min(anchor, MAX_PHRASE_LENGTH - 1)
+        + Math.min(source.length - anchor, MAX_PHRASE_LENGTH - 1);
+    if (length <= 0)
+      return null;
+    int[] phrase = new int[length];
+    idx = 0;
+    for (int p = Math.max(0, anchor - MAX_PHRASE_LENGTH + 1); p < Math.min(source.length, anchor
+        + MAX_PHRASE_LENGTH - 1); p++)
+      phrase[idx++] = source[p];
+    return phrase;
+  }
+
+  private float getSimilarity(List<int[]> batch) {
+    float similarity = 0.0f;
+    int count = 0;
+    StringBuilder query = new StringBuilder();
+    List<String> to_cache = new ArrayList<String>();
+    query.append("xb");
+    for (int i = 0; i < batch.size(); i += 2) {
+      int[] source = batch.get(i);
+      int[] target = batch.get(i + 1);
+
+      if (Arrays.equals(source, target)) {
+        similarity += 1;
+        count++;
+      } else {
+        String source_string = Vocabulary.getWords(source);
+        String target_string = Vocabulary.getWords(target);
+
+        String both;
+        if (source_string.compareTo(target_string) > 0)
+          both = source_string + " ||| " + target_string;
+        else
+          both = target_string + " ||| " + source_string;
+
+        Float cached = cache.get(both);
+        if (cached != null) {
+          // System.err.println("SIM: " + source_string + " X " + target_string + " = " + cached);
+          similarity += cached;
+          count++;
+        } else {
+          query.append("\t").append(source_string);
+          query.append("\t").append(target_string);
+          to_cache.add(both);
+        }
+      }
+    }
+    if (!to_cache.isEmpty()) {
+      try {
+        serverAsk.println(query.toString());
+        String response = serverReply.readLine();
+        String[] scores = response.split("\\s+");
+        for (int i = 0; i < scores.length; i++) {
+          Float score = Float.parseFloat(scores[i]);
+          cache.put(to_cache.get(i), score);
+          similarity += score;
+          count++;
+        }
+      } catch (Exception e) {
+        return 0;
+      }
+    }
+    return (count == 0 ? 0 : similarity / count);
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/state_maintenance/DPState.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/state_maintenance/DPState.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/state_maintenance/DPState.java
new file mode 100644
index 0000000..e117fde
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/state_maintenance/DPState.java
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.state_maintenance;
+
+/**
+ * Abstract class enforcing explicit implementation of the standard methods.
+ * 
+ * @author Zhifei Li, zhifei.work@gmail.com
+ * @author Juri Ganitkevitch, juri@cs.jhu.edu
+ */
+public abstract class DPState {
+
+  public abstract String toString();
+
+  public abstract int hashCode();
+
+  public abstract boolean equals(Object other);
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/state_maintenance/KenLMState.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/state_maintenance/KenLMState.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/state_maintenance/KenLMState.java
new file mode 100644
index 0000000..4fdc631
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/state_maintenance/KenLMState.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.state_maintenance;
+
+/**
+ * Maintains a state pointer used by KenLM to implement left-state minimization. 
+ * 
+ * @author Matt Post post@cs.jhu.edu
+ * @author Juri Ganitkevitch juri@cs.jhu.edu
+ */
+public class KenLMState extends DPState {
+
+  private long state = 0;
+
+  public KenLMState() {
+  }
+
+  public KenLMState(long stateId) {
+    this.state = stateId;
+  }
+
+  public long getState() {
+    return state;
+  }
+
+  @Override
+  public int hashCode() {
+    return (int) ((getState() >> 32) ^ getState());
+  }
+
+  @Override
+  public boolean equals(Object other) {
+    return (other instanceof KenLMState && this.getState() == ((KenLMState) other).getState());
+  }
+
+  @Override
+  public String toString() {
+    return String.format("[KenLMState %d]", getState());
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/state_maintenance/NgramDPState.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/state_maintenance/NgramDPState.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/state_maintenance/NgramDPState.java
new file mode 100644
index 0000000..b269bd9
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/state_maintenance/NgramDPState.java
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.state_maintenance;
+
+import java.util.Arrays;
+
+import org.apache.joshua.corpus.Vocabulary;
+
+/**
+ * @author Zhifei Li, zhifei.work@gmail.com
+ * @author Juri Ganitkevitch, juri@cs.jhu.edu
+ */
+public class NgramDPState extends DPState {
+
+  private int[] left;
+  private int[] right;
+
+  private int hash = 0;
+
+  public NgramDPState(int[] l, int[] r) {
+    left = l;
+    right = r;
+    assertLengths();
+  }
+
+  public void setLeftLMStateWords(int[] words) {
+    left = words;
+    assertLengths();
+  }
+
+  public int[] getLeftLMStateWords() {
+    return left;
+  }
+
+  public void setRightLMStateWords(int[] words) {
+    right = words;
+    assertLengths();
+  }
+
+  public int[] getRightLMStateWords() {
+    return right;
+  }
+
+  private final void assertLengths() {
+    if (left.length != right.length)
+      throw new RuntimeException("Unequal lengths in left and right state: < "
+          + Vocabulary.getWords(left) + " | " + Vocabulary.getWords(right) + " >");
+  }
+
+  @Override
+  public int hashCode() {
+    if (hash == 0) {
+      hash = 31 + Arrays.hashCode(left);
+      hash = hash * 19 + Arrays.hashCode(right);
+    }
+    return hash;
+  }
+
+  @Override
+  public boolean equals(Object other) {
+    if (other instanceof NgramDPState) {
+      NgramDPState that = (NgramDPState) other;
+      if (this.left.length == that.left.length && this.right.length == that.right.length) {
+        for (int i = 0; i < left.length; ++i)
+          if (this.left[i] != that.left[i] || this.right[i] != that.right[i])
+            return false;
+        return true;
+      }
+    }
+    return false;
+  }
+
+  public String toString() {
+    StringBuilder sb = new StringBuilder();
+    sb.append("<");
+    for (int id : left)
+      sb.append(" " + Vocabulary.word(id));
+    sb.append(" |");
+    for (int id : right)
+      sb.append(" " + Vocabulary.word(id));
+    sb.append(" >");
+    return sb.toString();
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/AbstractGrammar.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/AbstractGrammar.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/AbstractGrammar.java
new file mode 100644
index 0000000..6c512bd
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/AbstractGrammar.java
@@ -0,0 +1,228 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.tm;
+
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.ff.FeatureFunction;
+import org.apache.joshua.decoder.phrase.PhraseTable;
+import org.apache.joshua.decoder.segment_file.Token;
+import org.apache.joshua.lattice.Arc;
+import org.apache.joshua.lattice.Lattice;
+import org.apache.joshua.lattice.Node;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Partial implementation of the <code>Grammar</code> interface that provides logic for sorting a
+ * grammar.
+ * <p>
+ * <em>Note</em>: New classes implementing the <code>Grammar</code> interface should probably
+ * inherit from this class, unless a specific sorting technique different from that implemented by
+ * this class is required.
+ *
+ * @author Zhifei Li
+ * @author Lane Schwartz
+ * @author Matt Post post@cs.jhu.edu
+ */
+public abstract class AbstractGrammar implements Grammar {
+
+  /** Logger for this class. */
+  private static final Logger LOG = LoggerFactory.getLogger(AbstractGrammar.class);
+  /**
+   * Indicates whether the rules in this grammar have been sorted based on the latest feature
+   * function values.
+   */
+  protected boolean sorted = false;
+
+  /*
+   * The grammar's owner, used to determine which weights are applicable to the dense features found
+   * within.
+   */
+  protected int owner = -1;
+
+  /*
+   * The maximum length of a source-side phrase. Mostly used by the phrase-based decoder.
+   */
+  protected int maxSourcePhraseLength = -1;
+
+    /**
+   * Returns the longest source phrase read.
+   *
+   * @return the longest source phrase read (nonterminal + terminal symbols).
+   */
+  @Override
+  public int getMaxSourcePhraseLength() {
+    return maxSourcePhraseLength;
+  }
+
+  @Override
+  public int getOwner() {
+    return owner;
+  }
+
+  /* The maximum span of the input this rule can be applied to. */
+  protected int spanLimit = 1;
+
+  protected JoshuaConfiguration joshuaConfiguration;
+
+  /**
+   * Constructs an empty, unsorted grammar.
+   *
+   * @see Grammar#isSorted()
+   * @param config a {@link org.apache.joshua.decoder.JoshuaConfiguration} object
+   */
+  public AbstractGrammar(JoshuaConfiguration config) {
+    this.joshuaConfiguration = config;
+    this.sorted = false;
+  }
+
+  public AbstractGrammar(int owner, int spanLimit) {
+    this.sorted = false;
+    this.owner = owner;
+    this.spanLimit = spanLimit;
+  }
+
+  public static final int OOV_RULE_ID = 0;
+
+  /**
+   * Cube-pruning requires that the grammar be sorted based on the latest feature functions. To
+   * avoid synchronization, this method should be called before multiple threads are initialized for
+   * parallel decoding
+   * @param models {@link java.util.List} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s
+   */
+  public void sortGrammar(List<FeatureFunction> models) {
+    Trie root = getTrieRoot();
+    if (root != null) {
+      sort(root, models);
+      setSorted(true);
+    }
+  }
+
+  /* See Javadoc comments for Grammar interface. */
+  public boolean isSorted() {
+    return sorted;
+  }
+
+  /**
+   * Sets the flag indicating whether this grammar is sorted.
+   * <p>
+   * This method is called by {@link org.apache.joshua.decoder.ff.tm.AbstractGrammar#sortGrammar(List)}
+   * to indicate that the grammar has been sorted.</p>
+   *
+   * <p>Its scope is protected so that child classes that override <code>sortGrammar</code> will also
+   * be able to call this method to indicate that the grammar has been sorted.</p>
+   *
+   * @param sorted set to true if the grammar is sorted
+   */
+  protected void setSorted(boolean sorted) {
+    this.sorted = sorted;
+    LOG.debug("This grammar is now sorted: {}",  this);
+  }
+
+  /**
+   * Recursively sorts the grammar using the provided feature functions.
+   * <p>
+   * This method first sorts the rules stored at the provided node, then recursively calls itself on
+   * the child nodes of the provided node.
+   *
+   * @param node Grammar node in the <code>Trie</code> whose rules should be sorted.
+   * @param models Feature function models to use during sorting.
+   */
+  private void sort(Trie node, List<FeatureFunction> models) {
+
+    if (node != null) {
+      if (node.hasRules()) {
+        RuleCollection rules = node.getRuleCollection();
+        LOG.debug("Sorting node {}", Arrays.toString(rules.getSourceSide()));
+
+        /* This causes the rules at this trie node to be sorted */
+        rules.getSortedRules(models);
+
+        if (LOG.isDebugEnabled()) {
+          StringBuilder s = new StringBuilder();
+          for (Rule r : rules.getSortedRules(models)) {
+            s.append("\n\t" + r.getLHS() + " ||| " + Arrays.toString(r.getFrench()) + " ||| "
+                + Arrays.toString(r.getEnglish()) + " ||| " + r.getFeatureVector() + " ||| "
+                + r.getEstimatedCost() + "  " + r.getClass().getName() + "@"
+                + Integer.toHexString(System.identityHashCode(r)));
+          }
+          LOG.debug("{}", s);
+        }
+      }
+
+      if (node.hasExtensions()) {
+        for (Trie child : node.getExtensions()) {
+          sort(child, models);
+        }
+      } else {
+        LOG.debug("Node has 0 children to extend: {}", node);
+      }
+    }
+  }
+
+  // write grammar to disk
+  public void writeGrammarOnDisk(String file) {
+  }
+
+  /**
+   * Adds OOV rules for all words in the input lattice to the current grammar. Uses addOOVRule() so that
+   * sub-grammars can define different types of OOV rules if needed (as is used in {@link PhraseTable}).
+   *
+   * @param grammar Grammar in the Trie
+   * @param inputLattice the lattice representing the input sentence
+   * @param featureFunctions a list of feature functions used for scoring
+   * @param onlyTrue determine if word is actual OOV.
+   */
+  public static void addOOVRules(Grammar grammar, Lattice<Token> inputLattice,
+      List<FeatureFunction> featureFunctions, boolean onlyTrue) {
+    /*
+     * Add OOV rules; This should be called after the manual constraints have
+     * been set up.
+     */
+    HashSet<Integer> words = new HashSet<Integer>();
+    for (Node<Token> node : inputLattice) {
+      for (Arc<Token> arc : node.getOutgoingArcs()) {
+        // create a rule, but do not add into the grammar trie
+        // TODO: which grammar should we use to create an OOV rule?
+        int sourceWord = arc.getLabel().getWord();
+        if (sourceWord == Vocabulary.id(Vocabulary.START_SYM)
+            || sourceWord == Vocabulary.id(Vocabulary.STOP_SYM))
+          continue;
+
+        // Determine if word is actual OOV.
+        if (onlyTrue && ! Vocabulary.hasId(sourceWord))
+          continue;
+
+        words.add(sourceWord);
+      }
+    }
+
+    for (int sourceWord: words)
+      grammar.addOOVRules(sourceWord, featureFunctions);
+
+    // Sort all the rules (not much to actually do, this just marks it as sorted)
+    grammar.sortGrammar(featureFunctions);
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/BasicRuleCollection.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/BasicRuleCollection.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/BasicRuleCollection.java
new file mode 100644
index 0000000..4cffb2f
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/BasicRuleCollection.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.tm;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.joshua.decoder.ff.FeatureFunction;
+
+/**
+ * Basic collection of translation rules.
+ * 
+ * @author Lane Schwartz
+ * @author Zhifei Li
+ */
+public class BasicRuleCollection implements RuleCollection {
+
+  /**
+   * Indicates whether the rules in this collection have been sorted based on the latest feature
+   * function values.
+   */
+  protected boolean sorted;
+
+  /** List of rules stored in this collection. */
+  protected final List<Rule> rules;
+
+  /** Number of nonterminals in the source pattern. */
+  protected int arity;
+
+  /**
+   * Sequence of terminals and nonterminals in the source pattern.
+   */
+  protected int[] sourceTokens;
+
+  /**
+   * Constructs an initially empty rule collection.
+   * 
+   * @param arity Number of nonterminals in the source pattern
+   * @param sourceTokens Sequence of terminals and nonterminals in the source pattern
+   */
+  public BasicRuleCollection(int arity, int[] sourceTokens) {
+    this.rules = new ArrayList<Rule>();
+    this.sourceTokens = sourceTokens;
+    this.arity = arity;
+    this.sorted = false;
+  }
+
+  public int getArity() {
+    return this.arity;
+  }
+
+  /**
+   * Returns a list of the rules, without ensuring that they are first sorted.
+   */
+  @Override
+  public List<Rule> getRules() {
+    return this.rules;
+  }
+  
+  @Override
+  public boolean isSorted() {
+    return sorted;
+  }
+
+  /**
+   * Return a list of rules sorted according to their estimated model costs.
+   */
+  @Override
+  public synchronized List<Rule> getSortedRules(List<FeatureFunction> models) {
+    if (! isSorted()) {
+      for (Rule rule: getRules())
+        rule.estimateRuleCost(models);
+
+      Collections.sort(rules, Rule.EstimatedCostComparator);
+      this.sorted = true;      
+    }
+    
+    return this.rules;
+  }
+
+  public int[] getSourceSide() {
+    return this.sourceTokens;
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/CreateGlueGrammar.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/CreateGlueGrammar.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/CreateGlueGrammar.java
new file mode 100644
index 0000000..ce1e7d1
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/CreateGlueGrammar.java
@@ -0,0 +1,126 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.tm;
+
+import static org.apache.joshua.decoder.ff.tm.packed.PackedGrammar.VOCABULARY_FILENAME;
+import static org.apache.joshua.util.FormatUtils.cleanNonTerminal;
+import static org.apache.joshua.util.FormatUtils.isNonterminal;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.util.io.LineReader;
+
+import org.kohsuke.args4j.CmdLineException;
+import org.kohsuke.args4j.CmdLineParser;
+import org.kohsuke.args4j.Option;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class CreateGlueGrammar {
+
+
+  private static final Logger LOG = LoggerFactory.getLogger(CreateGlueGrammar.class);
+
+  private final Set<String> nonTerminalSymbols = new HashSet<>();
+
+  @Option(name = "--grammar", aliases = {"-g"}, required = true, usage = "provide grammar to determine list of NonTerminal symbols.")
+  private String grammarPath;
+  
+  @Option(name = "--goal", aliases = {"-goal"}, required = false, usage = "specify custom GOAL symbol. Default: 'GOAL'")
+  private String goalSymbol = cleanNonTerminal(new JoshuaConfiguration().goal_symbol);
+
+  /* Rule templates */
+  // [GOAL] ||| <s> ||| <s> ||| 0
+  private static final String R_START = "[%1$s] ||| <s> ||| <s> ||| 0";
+  // [GOAL] ||| [GOAL,1] [X,2] ||| [GOAL,1] [X,2] ||| -1
+  private static final String R_TWO = "[%1$s] ||| [%1$s,1] [%2$s,2] ||| [%1$s,1] [%2$s,2] ||| -1";
+  // [GOAL] ||| [GOAL,1] </s> ||| [GOAL,1] </s> ||| 0
+  private static final String R_END = "[%1$s] ||| [%1$s,1] </s> ||| [%1$s,1] </s> ||| 0";
+  // [GOAL] ||| <s> [X,1] </s> ||| <s> [X,1] </s> ||| 0
+  private static final String R_TOP = "[%1$s] ||| <s> [%2$s,1] </s> ||| <s> [%2$s,1] </s> ||| 0";
+  
+  private void run() throws IOException {
+    
+    File grammar_file = new File(grammarPath);
+    if (!grammar_file.exists()) {
+      throw new IOException("Grammar file doesn't exist: " + grammarPath);
+    }
+
+    // in case of a packedGrammar, we read the serialized vocabulary,
+    // collecting all cleaned nonTerminal symbols.
+    if (grammar_file.isDirectory()) {
+      Vocabulary.read(new File(grammarPath + File.separator + VOCABULARY_FILENAME));
+      for (int i = 0; i < Vocabulary.size(); ++i) {
+        final String token = Vocabulary.word(i);
+        if (isNonterminal(token)) {
+          nonTerminalSymbols.add(cleanNonTerminal(token));
+        }
+      }
+    // otherwise we collect cleaned left-hand sides from the rules in the text grammar.
+    } else { 
+      final LineReader reader = new LineReader(grammarPath);
+      while (reader.hasNext()) {
+        final String line = reader.next();
+        int lhsStart = line.indexOf("[") + 1;
+        int lhsEnd = line.indexOf("]");
+        if (lhsStart < 1 || lhsEnd < 0) {
+          LOG.info("malformed rule: {}\n", line);
+          continue;
+        }
+        final String lhs = line.substring(lhsStart, lhsEnd);
+        nonTerminalSymbols.add(lhs);
+      }
+    }
+    
+    LOG.info("{} nonTerminal symbols read: {}", nonTerminalSymbols.size(),
+        nonTerminalSymbols.toString());
+
+    // write glue rules to stdout
+    
+    System.out.println(String.format(R_START, goalSymbol));
+    
+    for (String nt : nonTerminalSymbols)
+      System.out.println(String.format(R_TWO, goalSymbol, nt));
+    
+    System.out.println(String.format(R_END, goalSymbol));
+    
+    for (String nt : nonTerminalSymbols)
+      System.out.println(String.format(R_TOP, goalSymbol, nt));
+
+  }
+  
+  public static void main(String[] args) throws IOException {
+    final CreateGlueGrammar glueCreator = new CreateGlueGrammar();
+    final CmdLineParser parser = new CmdLineParser(glueCreator);
+
+    try {
+      parser.parseArgument(args);
+      glueCreator.run();
+    } catch (CmdLineException e) {
+      LOG.error(e.getMessage(), e);
+      parser.printUsage(System.err);
+      System.exit(1);
+    }
+   }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Grammar.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Grammar.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Grammar.java
new file mode 100644
index 0000000..06252a1
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Grammar.java
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.tm;
+
+import java.util.List;
+
+import org.apache.joshua.decoder.ff.FeatureFunction;
+
+/**
+ * Grammar is a class for wrapping a trie of TrieGrammar in order to store holistic metadata.
+ * 
+ * @author wren ng thornton wren@users.sourceforge.net
+ * @author Zhifei Li, zhifei.work@gmail.com
+ */
+public interface Grammar {
+
+  /**
+   * Gets the root of the <code>Trie</code> backing this grammar.
+   * <p>
+   * <em>Note</em>: This method should run as a small constant-time function.
+   * 
+   * @return the root of the <code>Trie</code> backing this grammar
+   */
+  Trie getTrieRoot();
+
+  /**
+   * After calling this method, the rules in this grammar are guaranteed to be sorted based on the
+   * latest feature function values.
+   * <p>
+   * Cube-pruning requires that the grammar be sorted based on the latest feature functions.
+   * 
+   * @param models list of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s
+   */
+  void sortGrammar(List<FeatureFunction> models);
+
+  /**
+   * Determines whether the rules in this grammar have been sorted based on the latest feature
+   * function values.
+   * <p>
+   * This method is needed for the cube-pruning algorithm.
+   * 
+   * @return <code>true</code> if the rules in this grammar have been sorted based on the latest
+   *         feature function values, <code>false</code> otherwise
+   */
+  boolean isSorted();
+
+  /**
+   * Returns whether this grammar has any valid rules for covering a particular span of a sentence.
+   * Hiero's "glue" grammar will only say True if the span is longer than our span limit, and is
+   * anchored at startIndex==0. Hiero's "regular" grammar will only say True if the span is less
+   * than the span limit. Other grammars, e.g. for rule-based systems, may have different behaviors.
+   * 
+   * @param startIndex Indicates the starting index of a phrase in a source input phrase, or a
+   *          starting node identifier in a source input lattice
+   * @param endIndex Indicates the ending index of a phrase in a source input phrase, or an ending
+   *          node identifier in a source input lattice
+   * @param pathLength Length of the input path in a source input lattice. If a source input phrase
+   *          is used instead of a lattice, this value will likely be ignored by the underlying
+   *          implementation, but would normally be defined as <code>endIndex-startIndex</code>
+   * @return true if there is a rule for this span
+   */
+  boolean hasRuleForSpan(int startIndex, int endIndex, int pathLength);
+
+  /**
+   * Gets the number of rules stored in the grammar.
+   * 
+   * @return the number of rules stored in the grammar
+   */
+  int getNumRules();
+  
+  /**
+   * Returns the number of dense features.
+   * 
+   * @return the number of dense features
+   */
+  int getNumDenseFeatures();
+
+  /**
+   * Return the grammar's owner.
+   * @return grammar owner
+   */
+  int getOwner();
+
+  /**
+   * Return the maximum source phrase length (terminals + nonterminals)
+   * @return the maximum source phrase length
+   */
+  int getMaxSourcePhraseLength();
+  
+  /**
+   * Add an OOV rule for the requested word for the grammar.
+   * 
+   * @param word input word to add rules to
+   * @param featureFunctions a {@link java.util.List} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s
+   */
+  void addOOVRules(int word, List<FeatureFunction> featureFunctions);
+  
+  /**
+   * Add a rule to the grammar.
+   *
+   * @param rule the {@link org.apache.joshua.decoder.ff.tm.Rule}
+   */
+  void addRule(Rule rule);
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/GrammarReader.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/GrammarReader.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/GrammarReader.java
new file mode 100644
index 0000000..df00255
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/GrammarReader.java
@@ -0,0 +1,158 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.tm;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.joshua.decoder.Decoder;
+import org.apache.joshua.util.io.LineReader;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This is a base class for simple, ASCII line-based grammars that are stored on disk.
+ * 
+ * @author Juri Ganitkevitch
+ * 
+ */
+public abstract class GrammarReader<R extends Rule> implements Iterable<R>, Iterator<R> {
+
+  private static final Logger LOG = LoggerFactory.getLogger(GrammarReader.class);
+
+  protected static String fieldDelimiter;
+  protected static String description;
+
+  protected String fileName;
+  protected LineReader reader;
+  protected String lookAhead;
+  protected int numRulesRead;
+
+
+  // dummy constructor for
+  public GrammarReader() {
+    this.fileName = null;
+  }
+
+  public GrammarReader(String fileName) throws IOException {
+    this.fileName = fileName;
+    this.reader = new LineReader(fileName);
+    LOG.info("Reading grammar from file {}...", fileName);
+    numRulesRead = 0;
+    advanceReader();
+  }
+
+  // the reader is the iterator itself
+  public Iterator<R> iterator() {
+    return this;
+  }
+
+  /** Unsupported Iterator method. */
+  public void remove() throws UnsupportedOperationException {
+    throw new UnsupportedOperationException();
+  }
+
+  public void close() {
+    if (null != this.reader) {
+      try {
+        this.reader.close();
+      } catch (IOException e) {
+        LOG.warn(e.getMessage(), e);
+        LOG.error("Error closing grammar file stream: {}",  this.fileName);
+      }
+      this.reader = null;
+    }
+  }
+
+  /**
+   * For correct behavior <code>close</code> must be called on every GrammarReader, however this
+   * code attempts to avoid resource leaks.
+   * 
+   * @see org.apache.joshua.util.io.LineReader
+   */
+  @Override
+  protected void finalize() throws Throwable {
+    if (this.reader != null) {
+      LOG.error("Grammar file stream was not closed, this indicates a coding error: {}",
+          this.fileName);
+    }
+
+    this.close();
+    super.finalize();
+  }
+
+  @Override
+  public boolean hasNext() {
+    return lookAhead != null;
+  }
+
+  private void advanceReader() {
+    try {
+      lookAhead = reader.readLine();
+      numRulesRead++;
+    } catch (IOException e) {
+      LOG.error("Error reading grammar from file: {}", fileName);
+      LOG.error(e.getMessage(), e);
+    }
+    if (lookAhead == null && reader != null) {
+      this.close();
+    }
+  }
+
+  /**
+   * Read the next line, and print reader progress.
+   */
+  @Override
+  public R next() {
+    String line = lookAhead;
+
+    int oldProgress = reader.progress();
+    advanceReader();
+
+
+    if (Decoder.VERBOSE >= 1) {
+      int newProgress = (reader != null) ? reader.progress() : 100;
+
+      //TODO: review this code. It is better to print progress based on time gap (like for every 1s or 2sec) than %!
+      if (newProgress > oldProgress) {
+        for (int i = oldProgress + 1; i <= newProgress; i++)
+          if (i == 97) {
+            System.err.print("1");
+          } else if (i == 98) {
+            System.err.print("0");
+          } else if (i == 99) {
+            System.err.print("0");
+          } else if (i == 100) {
+            System.err.println("%");
+          } else if (i % 10 == 0) {
+            System.err.print(String.format("%d", i));
+            System.err.flush();
+          } else if ((i - 1) % 10 == 0)
+            ; // skip at 11 since 10, 20, etc take two digits
+          else {
+            System.err.print(".");
+            System.err.flush();
+          }
+      }
+    }
+    return parseLine(line);
+  }
+
+  protected abstract R parseLine(String line);
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Rule.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Rule.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Rule.java
new file mode 100644
index 0000000..0e5a4bc
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Rule.java
@@ -0,0 +1,633 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.tm;
+
+import java.util.ArrayList;
+import java.util.Arrays;  
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+import com.google.common.base.Supplier;
+import com.google.common.base.Suppliers;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.ff.FeatureFunction;
+import org.apache.joshua.decoder.ff.FeatureVector;
+import org.apache.joshua.decoder.segment_file.Sentence;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class define the interface for Rule. 
+ * 
+ * All feature scores are interpreted as negative log probabilities, and are therefore negated.
+ * Note that not all features need to be negative log probs, but you should be aware that they
+ * will be negated, so if you want a positive count, it should come in as negative.
+ * 
+ * Normally, the feature score in the rule should be *cost* (i.e., -LogP), so that the feature
+ * weight should be positive
+ * 
+ * @author Zhifei Li, zhifei.work@gmail.com
+ * @author Matt Post post@cs.jhu.edu
+ */
+public class Rule implements Comparator<Rule>, Comparable<Rule> {
+
+  private static final Logger LOG = LoggerFactory.getLogger(Rule.class);
+  private int lhs; // tag of this rule
+  private int[] source; // pointer to the RuleCollection, as all the rules under it share the same
+                         // Source side
+  protected int arity;
+
+  // And a string containing the sparse ones
+  //protected final String sparseFeatureString;
+  protected final Supplier<String> sparseFeatureStringSupplier;
+  private final Supplier<FeatureVector> featuresSupplier;
+
+  /*
+   * a feature function will be fired for this rule only if the owner of the rule matches the owner
+   * of the feature function
+   */
+  private int owner = -1;
+
+  /**
+   * This is the cost computed only from the features present with the grammar rule. This cost is
+   * needed to sort the rules in the grammar for cube pruning, but isn't the full cost of applying
+   * the rule (which will include contextual features that can't be computed until the rule is
+   * applied).
+   */
+  private float estimatedCost = Float.NEGATIVE_INFINITY;
+
+  private float precomputableCost = Float.NEGATIVE_INFINITY;
+
+  private int[] target;
+
+  // The alignment string, e.g., 0-0 0-1 1-1 2-1
+  private String alignmentString;
+  private final Supplier<byte[]> alignmentSupplier;
+
+  /**
+   * Constructs a new rule using the provided parameters. Rule id for this rule is
+   * undefined. Note that some of the sparse features may be unlabeled, but they cannot be mapped to
+   * their default names ("tm_OWNER_INDEX") until later, when we know the owner of the rule. This is
+   * not known until the rule is actually added to a grammar in Grammar::addRule().
+   * 
+   * Constructor used by other constructors below;
+   * 
+   * @param lhs Left-hand side of the rule.
+   * @param source Source language right-hand side of the rule.
+   * @param target Target language right-hand side of the rule.
+   * @param sparseFeatures Feature value scores for the rule.
+   * @param arity Number of nonterminals in the source language right-hand side.
+   * @param owner todo
+   */
+  public Rule(int lhs, int[] source, int[] target, String sparseFeatures, int arity, int owner) {
+    this.lhs = lhs;
+    this.source = source;
+    this.arity = arity;
+    this.owner = owner;
+    this.target = target;
+    this.sparseFeatureStringSupplier = Suppliers.memoize(() -> { return sparseFeatures; });
+    this.featuresSupplier = initializeFeatureSupplierFromString();
+    this.alignmentSupplier = initializeAlignmentSupplier();
+  }
+  
+  /**
+   * Constructor used by PackedGrammar's sortRules()
+   * @param lhs todo
+   * @param sourceRhs todo
+   * @param targetRhs todo
+   * @param features todo
+   * @param arity todo
+   * @param owner todo
+   */
+  public Rule(int lhs, int[] sourceRhs, int[] targetRhs, FeatureVector features, int arity, int owner) {
+    this.lhs = lhs;
+    this.source = sourceRhs;
+    this.arity = arity;
+    this.owner = owner;
+    this.target = targetRhs;
+    this.featuresSupplier = Suppliers.memoize(() -> { return features; });
+    this.sparseFeatureStringSupplier = initializeSparseFeaturesStringSupplier();
+    this.alignmentSupplier = initializeAlignmentSupplier();
+  }
+
+  /**
+   * Constructor used for SamtFormatReader and GrammarBuilderWalkerFunction's getRuleWithSpans()
+   * Owner set to -1
+   * @param lhs todo
+   * @param sourceRhs todo
+   * @param targetRhs todo
+   * @param sparseFeatures todo
+   * @param arity todo
+   */
+  public Rule(int lhs, int[] sourceRhs, int[] targetRhs, String sparseFeatures, int arity) {
+    this(lhs, sourceRhs, targetRhs, sparseFeatures, arity, -1);
+  }
+
+  /**
+   * Constructor used for addOOVRules(), HieroFormatReader and PhraseRule.
+   * @param lhs todo
+   * @param sourceRhs todo
+   * @param targetRhs todo
+   * @param sparseFeatures todo
+   * @param arity todo
+   * @param alignment todo
+   */
+  public Rule(int lhs, int[] sourceRhs, int[] targetRhs, String sparseFeatures, int arity, String alignment) {
+    this(lhs, sourceRhs, targetRhs, sparseFeatures, arity);
+    this.alignmentString = alignment;
+  }
+  
+  /**
+   * Constructor (implicitly) used by PackedRule
+   */
+  public Rule() {
+    this.lhs = -1;
+    this.sparseFeatureStringSupplier = initializeSparseFeaturesStringSupplier();
+    this.featuresSupplier = initializeFeatureSupplierFromString();
+    this.alignmentSupplier = initializeAlignmentSupplier();
+  }
+
+  // ==========================================================================
+  // Lazy loading Suppliers for alignments, feature vector, and feature strings
+  // ==========================================================================
+  
+  private Supplier<byte[]> initializeAlignmentSupplier(){
+    return Suppliers.memoize(() ->{
+      byte[] alignment = null;
+      String alignmentString = getAlignmentString();
+      if (alignmentString != null) {
+        String[] tokens = alignmentString.split("[-\\s]+");
+        alignment = new byte[tokens.length];
+        for (int i = 0; i < tokens.length; i++)
+          alignment[i] = (byte) Short.parseShort(tokens[i]);
+      }
+      return alignment;
+    });
+  }
+  
+  /**
+   * If Rule was constructed with sparseFeatures String, we lazily populate the
+   * FeatureSupplier.
+   */
+  private Supplier<FeatureVector> initializeFeatureSupplierFromString(){
+    return Suppliers.memoize(() ->{
+      if (owner != -1) {
+        return new FeatureVector(getFeatureString(), "tm_" + Vocabulary.word(owner) + "_");
+      } else {
+        return new FeatureVector();
+      }
+    });
+  }
+  
+  /**
+   * If Rule was constructed with a FeatureVector, we lazily populate the sparseFeaturesStringSupplier.
+   */
+  private Supplier<String> initializeSparseFeaturesStringSupplier() {
+    return Suppliers.memoize(() -> {
+      return getFeatureVector().toString();
+    });
+  }
+
+  // ===============================================================
+  // Attributes
+  // ===============================================================
+
+  public void setEnglish(int[] eng) {
+    this.target = eng;
+  }
+
+  public int[] getEnglish() {
+    return this.target;
+  }
+
+  /**
+   * Two Rules are equal of they have the same LHS, the same source RHS and the same target
+   * RHS.
+   * 
+   * @param o the object to check for equality
+   * @return true if o is the same Rule as this rule, false otherwise
+   */
+  public boolean equals(Object o) {
+    if (!(o instanceof Rule)) {
+      return false;
+    }
+    Rule other = (Rule) o;
+    if (getLHS() != other.getLHS()) {
+      return false;
+    }
+    if (!Arrays.equals(getFrench(), other.getFrench())) {
+      return false;
+    }
+    if (!Arrays.equals(target, other.getEnglish())) {
+      return false;
+    }
+    return true;
+  }
+
+  public int hashCode() {
+    // I just made this up. If two rules are equal they'll have the
+    // same hashcode. Maybe someone else can do a better job though?
+    int frHash = Arrays.hashCode(getFrench());
+    int enHash = Arrays.hashCode(target);
+    return frHash ^ enHash ^ getLHS();
+  }
+
+  // ===============================================================
+  // Attributes
+  // ===============================================================
+
+  public void setArity(int arity) {
+    this.arity = arity;
+  }
+
+  public int getArity() {
+    return this.arity;
+  }
+
+  public void setOwner(int owner) {
+    this.owner = owner;
+  }
+
+  public int getOwner() {
+    return this.owner;
+  }
+
+  public void setLHS(int lhs) {
+    this.lhs = lhs;
+  }
+
+  public int getLHS() {
+    return this.lhs;
+  }
+
+  public void setFrench(int[] french) {
+    this.source = french;
+  }
+
+  public int[] getFrench() {
+    return this.source;
+  }
+
+  /**
+   * This function does the work of turning the string version of the sparse features (passed in
+   * when the rule was created) into an actual set of features. This is a bit complicated because we
+   * support intermingled labeled and unlabeled features, where the unlabeled features are mapped to
+   * a default name template of the form "tm_OWNER_INDEX".
+   * 
+   * This function returns the dense (phrasal) features discovered when the rule was loaded. Dense
+   * features are the list of unlabeled features that preceded labeled ones. They can also be
+   * specified as labeled features of the form "tm_OWNER_INDEX", but the former format is preferred.
+   * 
+   * @return the {@link org.apache.joshua.decoder.ff.FeatureVector} for this rule
+   */
+  public FeatureVector getFeatureVector() {
+    return featuresSupplier.get();
+  }
+
+  /**
+   * This function returns the estimated cost of a rule, which should have been computed when the
+   * grammar was first sorted via a call to Rule::estimateRuleCost(). This function is a getter
+   * only; it will not compute the value if it has not already been set. It is necessary in addition
+   * to estimateRuleCost(models) because sometimes the value needs to be retrieved from contexts
+   * that do not have access to the feature functions.
+   * 
+   * This function is called by the rule comparator when sorting the grammar. As such it may be
+   * called many times and any implementation of it should be a cached implementation.
+   * 
+   * @return the estimated cost of the rule (a lower bound on the true cost)
+   */
+  public float getEstimatedCost() {
+    return estimatedCost;
+  }
+
+  /**
+   * Precomputable costs is the inner product of the weights found on each grammar rule and the
+   * weight vector. This is slightly different from the estimated rule cost, which can include other
+   * features (such as a language model estimate). This getter and setter should also be cached, and
+   * is basically provided to allow the PhraseModel feature to cache its (expensive) computation for
+   * each rule.
+   *
+   * The weights are passed in as dense weights and sparse weights. This allows the dense weight
+   * precomputation to be even faster (since we don't have to query a hash map. 
+   *
+   * @param dense_weights the dense weights from the model
+   * @param weights the sparse weights from the model
+   */
+  public void setPrecomputableCost(float[] dense_weights, FeatureVector weights) {
+    float cost = 0.0f;
+    FeatureVector features = getFeatureVector();
+    for (int i = 0; i < features.getDenseFeatures().size() && i < dense_weights.length; i++) {
+      cost += dense_weights[i] * features.getDense(i);
+    }
+
+    for (String key: features.getSparseFeatures().keySet()) {
+      cost += weights.getSparse(key) * features.getSparse(key);
+    }
+    
+    this.precomputableCost = cost;
+  }
+  
+  /**
+   * @return the precomputed model cost of each rule
+   */
+  public float getPrecomputableCost() {
+    return precomputableCost;
+  }
+  
+  public float getDenseFeature(int k) {
+    return getFeatureVector().getDense(k);
+  }
+  
+  /**
+   * This function estimates the cost of a rule, which is used for sorting the rules for cube
+   * pruning. The estimated cost is basically the set of precomputable features (features listed
+   * along with the rule in the grammar file) along with any other estimates that other features
+   * would like to contribute (e.g., a language model estimate). This cost will be a lower bound on
+   * the rule's actual cost.
+   * 
+   * The value of this function is used only for sorting the rules. When the rule is later applied
+   * in context to particular hypernodes, the rule's actual cost is computed.
+   * 
+   * @param models the list of models available to the decoder
+   * @return estimated cost of the rule
+   */
+  public float estimateRuleCost(List<FeatureFunction> models) {
+    if (null == models)
+      return 0.0f;
+
+    if (this.estimatedCost <= Float.NEGATIVE_INFINITY) {
+      this.estimatedCost = 0.0f; // weights.innerProduct(computeFeatures());
+
+      LOG.debug("estimateCost({} ;; {})", getFrenchWords(), getEnglishWords());
+      for (FeatureFunction ff : models) {
+        float val = ff.estimateCost(this, null);
+        LOG.debug("  FEATURE {} -> {}", ff.getName(), val);
+        this.estimatedCost += val; 
+      }
+    }
+    
+    return estimatedCost;
+  }
+
+  // ===============================================================
+  // Methods
+  // ===============================================================
+
+  public String toString() {
+    StringBuffer sb = new StringBuffer();
+    sb.append(Vocabulary.word(this.getLHS()));
+    sb.append(" ||| ");
+    sb.append(getFrenchWords());
+    sb.append(" ||| ");
+    sb.append(getEnglishWords());
+    sb.append(" |||");
+    sb.append(" " + getFeatureVector());
+    sb.append(String.format(" ||| est=%.3f", getEstimatedCost()));
+    sb.append(String.format(" pre=%.3f", getPrecomputableCost()));
+    return sb.toString();
+  }
+  
+  /**
+   * Returns a version of the rule suitable for reading in from a text file.
+   * 
+   * @return string version of the rule
+   */
+  public String textFormat() {
+    StringBuffer sb = new StringBuffer();
+    sb.append(Vocabulary.word(this.getLHS()));
+    sb.append(" |||");
+    
+    int nt = 1;
+    for (int i = 0; i < getFrench().length; i++) {
+      if (getFrench()[i] < 0)
+        sb.append(" " + Vocabulary.word(getFrench()[i]).replaceFirst("\\]", String.format(",%d]", nt++)));
+      else
+        sb.append(" " + Vocabulary.word(getFrench()[i]));
+    }
+    sb.append(" |||");
+    nt = 1;
+    for (int i = 0; i < getEnglish().length; i++) {
+      if (getEnglish()[i] < 0)
+        sb.append(" " + Vocabulary.word(getEnglish()[i]).replaceFirst("\\]", String.format(",%d]", nt++)));
+      else
+        sb.append(" " + Vocabulary.word(getEnglish()[i]));
+    }
+    sb.append(" |||");
+    sb.append(" " + getFeatureString());
+    if (getAlignmentString() != null)
+      sb.append(" ||| " + getAlignmentString());
+    return sb.toString();
+  }
+
+  public String getFeatureString() {
+    return sparseFeatureStringSupplier.get();
+  }
+
+  /**
+   * Returns an alignment as a sequence of integers. The integers at positions i and i+1 are paired,
+   * with position i indexing the source and i+1 the target.
+   * 
+   * @return a byte[] from the {@link com.google.common.base.Supplier}
+   */
+  public byte[] getAlignment() {
+    return this.alignmentSupplier.get();
+  }
+  
+  public String getAlignmentString() {
+    return this.alignmentString;
+  }
+
+  /**
+   * The nonterminals on the English side are pointers to the source side nonterminals (-1 and -2),
+   * rather than being directly encoded. These number indicate the correspondence between the
+   * nonterminals on each side, introducing a level of indirection however when we want to resolve
+   * them. So to get the ID, we need to look up the corresponding source side ID.
+   * 
+   * @return The string of English words
+   */
+  public String getEnglishWords() {
+    int[] foreignNTs = getForeignNonTerminals();
+  
+    StringBuilder sb = new StringBuilder();
+    for (Integer index : getEnglish()) {
+      if (index >= 0)
+        sb.append(Vocabulary.word(index) + " ");
+      else
+        sb.append(Vocabulary.word(foreignNTs[-index - 1]).replace("]",
+            String.format(",%d] ", Math.abs(index))));
+    }
+  
+    return sb.toString().trim();
+  }
+
+  public boolean isTerminal() {
+    for (int i = 0; i < getEnglish().length; i++)
+      if (getEnglish()[i] < 0)
+        return false;
+  
+    return true;
+  }
+
+  /**
+   * Return the French (source) nonterminals as list of Strings
+   * 
+   * @return a list of strings
+   */
+  public int[] getForeignNonTerminals() {
+    int[] nts = new int[getArity()];
+    int index = 0;
+    for (int id : getFrench())
+      if (id < 0)
+        nts[index++] = -id;
+    return nts;
+  }
+  
+  /**
+   * Returns an array of size getArity() containing the source indeces of non terminals.
+   * 
+   * @return an array of size getArity() containing the source indeces of non terminals
+   */
+  public int[] getNonTerminalSourcePositions() {
+    int[] nonTerminalPositions = new int[getArity()];
+    int ntPos = 0;
+    for (int sourceIdx = 0; sourceIdx < getFrench().length; sourceIdx++) {
+      if (getFrench()[sourceIdx] < 0)
+        nonTerminalPositions[ntPos++] = sourceIdx;
+    }
+    return nonTerminalPositions;
+  }
+  
+  /**
+   * Parses the Alignment byte[] into a Map from target to (possibly a list of) source positions.
+   * Used by the WordAlignmentExtractor.
+   * 
+   * @return a {@link java.util.Map} of alignments
+   */
+  public Map<Integer, List<Integer>> getAlignmentMap() {
+    byte[] alignmentArray = getAlignment();
+    Map<Integer, List<Integer>> alignmentMap = new HashMap<Integer, List<Integer>>();
+    if (alignmentArray != null) {
+      for (int alignmentIdx = 0; alignmentIdx < alignmentArray.length; alignmentIdx += 2 ) {
+        int s = alignmentArray[alignmentIdx];
+        int t = alignmentArray[alignmentIdx + 1];
+        List<Integer> values = alignmentMap.get(t);
+        if (values == null)
+          alignmentMap.put(t, values = new ArrayList<Integer>());
+        values.add(s);
+      }
+    }
+    return alignmentMap;
+  }
+
+  /**
+   * Return the English (target) nonterminals as list of Strings
+   * 
+   * @return list of strings
+   */
+  public int[] getEnglishNonTerminals() {
+    int[] nts = new int[getArity()];
+    int[] foreignNTs = getForeignNonTerminals();
+    int index = 0;
+  
+    for (int i : getEnglish()) {
+      if (i < 0)
+        nts[index++] = foreignNTs[Math.abs(getEnglish()[i]) - 1];
+    }
+  
+    return nts;
+  }
+
+  private int[] getNormalizedEnglishNonterminalIndices() {
+    int[] result = new int[getArity()];
+  
+    int ntIndex = 0;
+    for (Integer index : getEnglish()) {
+      if (index < 0)
+        result[ntIndex++] = -index - 1;
+    }
+  
+    return result;
+  }
+
+  public boolean isInverting() {
+    int[] normalizedEnglishNonTerminalIndices = getNormalizedEnglishNonterminalIndices();
+    if (normalizedEnglishNonTerminalIndices.length == 2) {
+      if (normalizedEnglishNonTerminalIndices[0] == 1) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  public String getFrenchWords() {
+    return Vocabulary.getWords(getFrench());
+  }
+
+  public static final String NT_REGEX = "\\[[^\\]]+?\\]";
+
+  private Pattern getPattern() {
+    String source = getFrenchWords();
+    String pattern = Pattern.quote(source);
+    pattern = pattern.replaceAll(NT_REGEX, "\\\\E.+\\\\Q");
+    pattern = pattern.replaceAll("\\\\Q\\\\E", "");
+    pattern = "(?:^|\\s)" + pattern + "(?:$|\\s)";
+    return Pattern.compile(pattern);
+  }
+
+  /**
+   * Matches the string representation of the rule's source side against a sentence
+   * 
+   * @param sentence {@link org.apache.joshua.lattice.Lattice} input
+   * @return true if there is a match
+   */
+  public boolean matches(Sentence sentence) {
+    boolean match = getPattern().matcher(sentence.fullSource()).find();
+    // System.err.println(String.format("match(%s,%s) = %s", Pattern.quote(getFrenchWords()),
+    // sentence.annotatedSource(), match));
+    return match;
+  }
+
+  /**
+   * This comparator is used for sorting the rules during cube pruning. An estimate of the cost
+   * of each rule is computed and used to sort. 
+   */
+  public static Comparator<Rule> EstimatedCostComparator = new Comparator<Rule>() {
+    public int compare(Rule rule1, Rule rule2) {
+      float cost1 = rule1.getEstimatedCost();
+      float cost2 = rule2.getEstimatedCost();
+      return Float.compare(cost2,  cost1);
+    }
+  };
+  
+  public int compare(Rule rule1, Rule rule2) {
+    return EstimatedCostComparator.compare(rule1, rule2);
+  }
+
+  public int compareTo(Rule other) {
+    return EstimatedCostComparator.compare(this, other);
+  }
+
+  public String getRuleString() {
+    return String.format("%s -> %s ||| %s", Vocabulary.word(getLHS()), getFrenchWords(), getEnglishWords());
+  }
+}