You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/06/23 18:45:48 UTC
[37/60] [partial] incubator-joshua git commit: maven multi-module layout 1st commit: moving files into joshua-core

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/RuleCollection.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/RuleCollection.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/RuleCollection.java
new file mode 100644
index 0000000..a45c41b
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/RuleCollection.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.tm;
+
+import java.util.List;
+
+import org.apache.joshua.decoder.ff.FeatureFunction;
+
+/**
+ * A RuleCollection represents a set of rules that share the same source side (and hence the same
+ * arity). These rules are likely stored together in a Trie data structure, although the interface
+ * allows any implementation to be used.
+ * 
+ * @author Zhifei Li
+ * @author Lane Schwartz
+ * @author Matt Post post@cs.jhu.edu
+ */
+public interface RuleCollection {
+
+  /**
+   * Returns true if the rules are sorted. This is used to allow rules to be sorted in an amortized
+   * fashion; rather than sorting all trie nodes when the grammar is originally loaded, we sort them
+   * only as the decoder actually needs them.
+   * @return true if rules are sorted
+   */
+  boolean isSorted();
+
+  /**
+   * This returns a list of the rules, sorting them if necessary.
+   * 
+   * Implementations of this function should be synchronized.
+   * @param models {@link java.util.List} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s
+   * @return the {@link java.util.List} of sorted rules
+   */
+  List<Rule> getSortedRules(List<FeatureFunction> models);
+
+  /**
+   * Get the list of rules. There are no guarantees about whether they're sorted or not.
+   * @return the {@link java.util.List} of rules, there is no gurantee they will be sorted
+   */
+  List<Rule> getRules();
+
+  /**
+   * Gets the source side for all rules in this RuleCollection. This source side is the same for all
+   * the rules in the RuleCollection.
+   * 
+   * @return the (common) source side for all rules in this RuleCollection
+   */
+  int[] getSourceSide();
+
+  /**
+   * Gets the number of nonterminals in the source side of the rules in this RuleCollection. The
+   * source side is the same for all the rules in the RuleCollection, so the arity will also be the
+   * same for all of these rules.
+   * 
+   * @return the (common) number of nonterminals in the source side of the rules in this
+   *         RuleCollection
+   */
+  int getArity();
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/SentenceFilteredGrammar.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/SentenceFilteredGrammar.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/SentenceFilteredGrammar.java
new file mode 100644
index 0000000..c952b05
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/SentenceFilteredGrammar.java
@@ -0,0 +1,366 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.tm;
+
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map.Entry;
+
+import org.apache.joshua.decoder.ff.tm.hash_based.ExtensionIterator;
+import org.apache.joshua.decoder.ff.tm.hash_based.MemoryBasedBatchGrammar;
+import org.apache.joshua.decoder.segment_file.Sentence;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class implements dynamic sentence-level filtering. This is accomplished with a parallel
+ * trie, a subset of the original trie, that only contains trie paths that are reachable from
+ * traversals of the current sentence.
+ * 
+ * @author Matt Post post@cs.jhu.edu
+ */
+public class SentenceFilteredGrammar extends MemoryBasedBatchGrammar {
+
+  private static final Logger LOG = LoggerFactory.getLogger(SentenceFilteredGrammar.class);
+
+  private AbstractGrammar baseGrammar;
+  private SentenceFilteredTrie filteredTrie;
+  private int[] tokens;
+  private Sentence sentence;
+
+  /**
+   * Construct a new sentence-filtered grammar. The main work is done in the enclosed trie (obtained
+   * from the base grammar, which contains the complete grammar).
+   * 
+   * @param baseGrammar a new {@link org.apache.joshua.decoder.ff.tm.AbstractGrammar} to populate
+   * @param sentence {@link org.apache.joshua.lattice.Lattice} input
+   */
+  SentenceFilteredGrammar(AbstractGrammar baseGrammar, Sentence sentence) {
+    super(baseGrammar.joshuaConfiguration);
+    this.baseGrammar = baseGrammar;
+    this.sentence = sentence;
+    this.tokens = sentence.getWordIDs();
+
+    int origCount = getNumRules(baseGrammar.getTrieRoot());
+    long startTime = System.currentTimeMillis();
+
+    /* Filter the rules; returns non-null object */
+    this.filteredTrie = filter(baseGrammar.getTrieRoot());
+    int filteredCount = getNumRules();
+
+    float seconds = (System.currentTimeMillis() - startTime) / 1000.0f;
+
+    LOG.debug("Sentence-level filtering of sentence {} ({} -> {} rules) in {} seconds",
+        sentence.id(), origCount, filteredCount, seconds);
+  }
+
+  @Override
+  public Trie getTrieRoot() {
+    return filteredTrie;
+  }
+
+  /**
+   * This function is poorly named: it doesn't mean whether a rule exists in the grammar for the
+   * current span, but whether the grammar is permitted to apply rules to the current span (a
+   * grammar-level parameter). As such we can just chain to the underlying grammar.
+   */
+  @Override
+  public boolean hasRuleForSpan(int startIndex, int endIndex, int pathLength) {
+    return baseGrammar.hasRuleForSpan(startIndex, endIndex, pathLength);
+  }
+
+  @Override
+  public int getNumRules() {
+    return getNumRules(getTrieRoot());
+  }
+
+  /**
+   * A convenience function that counts the number of rules in a grammar's trie.
+   * 
+   * @param node the {@link org.apache.joshua.decoder.ff.tm.Trie} implementation for which to count rules
+   * @return the number of rules
+   */
+  public int getNumRules(Trie node) {
+    int numRules = 0;
+    if (node != null) {
+      if (node.getRuleCollection() != null)
+        numRules += node.getRuleCollection().getRules().size();
+
+      if (node.getExtensions() != null)
+        for (Trie child : node.getExtensions())
+          numRules += getNumRules(child);
+    }
+
+    return numRules;
+  }
+
+  /**
+   * What is the algorithm?
+   * 
+   * Take the first word of the sentence, and start at the root of the trie. There are two things to
+   * consider: (a) word matches and (b) nonterminal matches.
+   * 
+   * For a word match, simply follow that arc along the trie. We create a parallel arc in our
+   * filtered grammar to represent it. Each arc in the filtered trie knows about its
+   * corresponding/underlying node in the unfiltered grammar trie.
+   * 
+   * A nonterminal is always permitted to match. The question then is how much of the input sentence
+   * we imagine it consumed. The answer is that it could have been any amount. So the recursive call
+   * has to be a set of calls, one each to the next trie node with different lengths of the sentence
+   * remaining.
+   * 
+   * A problem occurs when we have multiple sequential nonterminals. For scope-3 grammars, there can
+   * be four sequential nonterminals (in the case when they are grounded by terminals on both ends
+   * of the nonterminal chain). We'd like to avoid looking at all possible ways to split up the
+   * subsequence, because with respect to filtering rules, they are all the same.
+   * 
+   * We accomplish this with the following restriction: for purposes of grammar filtering, only the
+   * first in a sequence of nonterminal traversals can consume more than one word. Each of the
+   * subsequent ones would have to consume just one word. We then just have to record in the
+   * recursive call whether the last traversal was a nonterminal or not.
+   * 
+   * @param unfilteredTrieRoot todo
+   * @return the root of the filtered trie
+   */
+  private SentenceFilteredTrie filter(Trie unfilteredTrieRoot) {
+    SentenceFilteredTrie filteredTrieRoot = new SentenceFilteredTrie(unfilteredTrieRoot);
+
+    // System.err.println(String.format("FILTERING TO SENTENCE\n  %s\n",
+    // Vocabulary.getWords(tokens)));
+
+    /*
+     * The root of the trie is where rule applications start, so we simply try all possible
+     * positions in the sentence.
+     */
+    for (int i = 0; i < tokens.length; i++) {
+      filter(i, filteredTrieRoot, false);
+    }
+
+    return filteredTrieRoot;
+  }
+
+  /**
+   * Matches rules against the sentence. Intelligently handles chains of sequential nonterminals.
+   * Marks arcs that are traversable for this sentence.
+   * 
+   * @param i the position in the sentence to start matching
+   * @param trie the trie node to match against
+   * @param lastWasNT true if the match that brought us here was against a nonterminal
+   */
+  private void filter(int i, SentenceFilteredTrie trieNode, boolean lastWasNT) {
+    if (i >= tokens.length)
+      return;
+
+    /* Make sure the underlying unfiltered node has children. */
+    Trie unfilteredTrieNode = trieNode.unfilteredTrieNode;
+    if (unfilteredTrieNode.getChildren() == null) {
+      // trieNode.path.retreat();
+      return;
+    }
+
+    /* Match a word */
+    Trie trie = unfilteredTrieNode.match(tokens[i]);
+    if (trie != null) {
+      /*
+       * The current filtered node might already have an arc for this label. If so, retrieve it
+       * (since we still need to follow it); if not, create it.
+       */
+      SentenceFilteredTrie nextFilteredTrie = trieNode.match(tokens[i]);
+      if (nextFilteredTrie == null) {
+        nextFilteredTrie = new SentenceFilteredTrie(trie);
+        trieNode.children.put(tokens[i], nextFilteredTrie);
+      }
+
+      /*
+       * Now continue, trying to match the child node against the next position in the sentence. The
+       * third argument records that this match was not against a nonterminal.
+       */
+      filter(i + 1, nextFilteredTrie, false);
+    }
+
+    /*
+     * Now we attempt to match nonterminals. Any nonterminal is permitted to match any region of the
+     * sentence, up to the maximum span for that grammar. So we enumerate all children of the
+     * current (unfiltered) trie grammar node, looking for nonterminals (items whose label value is
+     * less than 0), then recurse.
+     * 
+     * There is one subtlely. Adjacent nonterminals in a grammar rule can match a span (i, j) in (j
+     * - i - 1) ways, but for purposes of determining whether a rule fits, this is all wasted
+     * effort. To handle this, we allow the first nonterminal in a sequence to record 1, 2, 3, ...
+     * terminals (up to the grammar's span limit, or the rest of the sentence, whichever is
+     * shorter). Subsequent adjacent nonterminals are permitted to consume only a single terminal.
+     */
+    HashMap<Integer, ? extends Trie> children = unfilteredTrieNode.getChildren();
+    if (children != null) {
+      for (int label : children.keySet()) {
+        if (label < 0) {
+          SentenceFilteredTrie nextFilteredTrie = trieNode.match(label);
+          if (nextFilteredTrie == null) {
+            nextFilteredTrie = new SentenceFilteredTrie(unfilteredTrieNode.match(label));
+            trieNode.children.put(label, nextFilteredTrie);
+          }
+
+          /*
+           * Recurse. If the last match was a nonterminal, we can only consume one more token.
+           * 
+           * TODO: This goes too far by looking at the whole sentence; each grammar has a maximum
+           * span limit which should be consulted. What we should be doing is passing the point
+           * where we started matching the current sentence, so we can apply this span limit, which
+           * is easily accessible (baseGrammar.spanLimit).
+           */
+          int maxJ = lastWasNT ? (i + 1) : tokens.length;
+          for (int j = i + 1; j <= maxJ; j++) {
+            filter(j, nextFilteredTrie, true);
+          }
+        }
+      }
+    }
+  }
+
+  /**
+   * Alternate filter that uses regular expressions, walking the grammar trie and matching the
+   * source side of each rule collection against the input sentence. Failed matches are discarded,
+   * and trie nodes extending from that position need not be explored.
+   * 
+   * @param unfilteredTrie todo
+   * @return the root of the filtered trie if any rules were retained, otherwise null
+   */
+  @SuppressWarnings("unused")
+  private SentenceFilteredTrie filter_regexp(Trie unfilteredTrie) {
+    SentenceFilteredTrie trie = null;
+
+    /* Case 1: keep the trie node if it has a rule collection that matches the sentence */
+    if (unfilteredTrie.hasRules())
+      if (matchesSentence(unfilteredTrie))
+        trie = new SentenceFilteredTrie(unfilteredTrie);
+      else
+        return null;
+
+    /* Case 2: keep the trie node if it has children who have valid rule collections */
+    if (unfilteredTrie.hasExtensions())
+      for (Entry<Integer, ? extends Trie> arc : unfilteredTrie.getChildren().entrySet()) {
+        Trie unfilteredChildTrie = arc.getValue();
+        SentenceFilteredTrie nextTrie = filter_regexp(unfilteredChildTrie);
+        if (nextTrie != null) {
+          if (trie == null)
+            trie = new SentenceFilteredTrie(unfilteredTrie);
+          trie.children.put(arc.getKey(), nextTrie);
+        }
+      }
+
+    return trie;
+  }
+
+  private boolean matchesSentence(Trie childTrie) {
+    Rule rule = childTrie.getRuleCollection().getRules().get(0);
+    return rule.matches(sentence);
+  }
+
+  /**
+   * Implements a filtered trie, by sitting on top of a base trie and annotating nodes that match
+   * the given input sentence.
+   * 
+   * @author Matt Post post@cs.jhu.edu
+   * 
+   */
+  public class SentenceFilteredTrie implements Trie {
+
+    /* The underlying unfiltered trie node. */
+    private Trie unfilteredTrieNode;
+
+    /* The child nodes in the filtered trie. */
+    private HashMap<Integer, SentenceFilteredTrie> children = null;
+
+    /**
+     * Constructor.
+     * 
+     * @param unfilteredTrieNode todo
+     */
+    public SentenceFilteredTrie(Trie unfilteredTrieNode) {
+      this.unfilteredTrieNode = unfilteredTrieNode;
+      this.children = new HashMap<Integer, SentenceFilteredTrie>();
+    }
+
+    @Override
+    public SentenceFilteredTrie match(int wordID) {
+      if (children != null)
+        return children.get(wordID);
+      return null;
+    }
+
+    @Override
+    public boolean hasExtensions() {
+      return children != null;
+    }
+
+    @Override
+    public Collection<SentenceFilteredTrie> getExtensions() {
+      if (children != null)
+        return children.values();
+
+      return null;
+    }
+
+    @Override
+    public HashMap<Integer, SentenceFilteredTrie> getChildren() {
+      return children;
+    }
+
+    @Override
+    public boolean hasRules() {
+      // Chain to the underlying unfiltered node.
+      return unfilteredTrieNode.hasRules();
+    }
+
+    @Override
+    public RuleCollection getRuleCollection() {
+      // Chain to the underlying unfiltered node, since the rule collection just varies by target
+      // side.
+      return unfilteredTrieNode.getRuleCollection();
+    }
+
+    /**
+     * Counts the number of rules.
+     * 
+     * @return the number of rules rooted at this node.
+     */
+    public int getNumRules() {
+      int numRules = 0;
+      if (getTrieRoot() != null)
+        if (getTrieRoot().getRuleCollection() != null)
+          numRules += getTrieRoot().getRuleCollection().getRules().size();
+
+      for (SentenceFilteredTrie node : getExtensions())
+        numRules += node.getNumRules();
+
+      return numRules;
+    }
+
+    @Override
+    public Iterator<Integer> getTerminalExtensionIterator() {
+      return new ExtensionIterator(children, true);
+    }
+
+    @Override
+    public Iterator<Integer> getNonterminalExtensionIterator() {
+      return new ExtensionIterator(children, false);
+    }
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Trie.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Trie.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Trie.java
new file mode 100644
index 0000000..51d2dd8
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Trie.java
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.tm;
+
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Iterator;
+
+/**
+ * An interface for trie-like data structures.
+ * 
+ * @author wren ng thornton wren@users.sourceforge.net
+ * @author Zhifei Li, zhifei.work@gmail.com
+ */
+public interface Trie {
+
+  /**
+   * Traverse one ply further down the trie. If there is no match, the result is null.
+   * 
+   * @param wordID input word ID
+   * @return Child node of this trie
+   */
+  Trie match(int wordID);
+
+  
+  /**
+   * Returns whether matchOne(Symbol) could succeed for any symbol.
+   * 
+   * @return <code>true</code> if {@link #match(int)} could succeed for some symbol,
+   *         <code>false</code> otherwise
+   */
+  boolean hasExtensions();
+
+
+  /**
+   * If the trie node has extensions, then return a list of extended trie nodes, otherwise return
+   * null.
+   * 
+   * @return A list of extended <code>Trie</code> nodes if this node has extensions,
+   *         <code>null</code>
+   *         otherwise
+   */
+  Collection<? extends Trie> getExtensions();
+
+
+  /**
+   * If the trie node has extensions, get a {@link java.util.HashMap} of their labels.
+   * 
+   * @return a {@link java.util.HashMap} pf node extensions
+   */
+  HashMap<Integer,? extends Trie> getChildren();
+
+  /**
+   * Returns an iterator over the trie node's extensions with terminal labels.
+   * 
+   * @return the {@link java.util.Iterator} created over the trie node's extensions with terminal labels
+   */
+  Iterator<Integer> getTerminalExtensionIterator();
+  
+  /**
+   * Returns an iterator over the trie node's extensions with nonterminal labels.
+   * 
+   * @return the {@link java.util.Iterator} created over the trie node's extensions with terminal labels
+   */
+  Iterator<Integer> getNonterminalExtensionIterator();
+  
+  
+  /**
+   * Gets whether the current node/state is a "final state" that has matching rules.
+   * 
+   * @return <code>true</code> if the current node/state is a "final state" that has matching rules,
+   *         <code>false</code> otherwise
+   */
+  boolean hasRules();
+
+
+  /**
+   * Retrieve the rules at the current node/state. The implementation of this method must adhere to
+   * the following laws:
+   * 
+   * <ol>
+   * <li>The return value is always non-null. The collection may be empty however.</li>
+   * <li>The collection must be empty if hasRules() is false, and must be non-empty if hasRules() is
+   * true.</li>
+   * <li>The collection must be sorted (at least as used by TMGrammar)</li>
+   * </ol>
+   * @return a {@link org.apache.joshua.decoder.ff.tm.RuleCollection} representing the rules 
+   * at the current node/state
+   */
+  RuleCollection getRuleCollection();
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/UnsortedRuleCollectionException.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/UnsortedRuleCollectionException.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/UnsortedRuleCollectionException.java
new file mode 100644
index 0000000..3358775
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/UnsortedRuleCollectionException.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.tm;
+
+/**
+ * Unchecked runtime exception thrown to indicate that a collection of rules has not been properly
+ * sorted according to the feature functions in effect.
+ * 
+ * @author Lane Schwartz
+ */
+public class UnsortedRuleCollectionException extends RuntimeException {
+
+  private static final long serialVersionUID = -4819014771607378835L;
+
+  /**
+   * Constructs an <code>UnsortedRuleCollectionException</code> with the specified detail message.
+   * 
+   * @param message the detail message
+   */
+  public UnsortedRuleCollectionException(String message) {
+    super(message);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/format/HieroFormatReader.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/format/HieroFormatReader.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/format/HieroFormatReader.java
new file mode 100644
index 0000000..45c8c33
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/format/HieroFormatReader.java
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.tm.format;
+
+import java.io.IOException;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.ff.tm.GrammarReader;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.util.Constants;
+import org.apache.joshua.util.FormatUtils;
+
+/**
+ * This class implements reading files in the format defined by David Chiang for Hiero. 
+ * 
+ * @author Matt Post post@cs.jhu.edu
+ */
+
+public class HieroFormatReader extends GrammarReader<Rule> {
+
+  static {
+    description = "Original Hiero format";
+  }
+
+  public HieroFormatReader() {
+    super();
+  }
+
+  public HieroFormatReader(String grammarFile) throws IOException {
+    super(grammarFile);
+  }
+
+  @Override
+  public Rule parseLine(String line) {
+    String[] fields = line.split(Constants.fieldDelimiter);
+    if (fields.length < 3) {
+      throw new RuntimeException(String.format("Rule '%s' does not have four fields", line));
+    }
+
+    int lhs = Vocabulary.id(fields[0]);
+
+    /**
+     * On the foreign side, we map nonterminals to negative IDs, and terminals to positive IDs.
+     */
+    int arity = 0;
+    String[] sourceWords = fields[1].split("\\s+");
+    int[] sourceIDs = new int[sourceWords.length];
+    for (int i = 0; i < sourceWords.length; i++) {
+      /* NOTE: This redundantly creates vocab items for terms like [X,1]. This might actually
+       * be necessary, so don't try to turn this into an if/else.
+       */
+      sourceIDs[i] = Vocabulary.id(sourceWords[i]);
+      if (FormatUtils.isNonterminal(sourceWords[i])) {
+        sourceIDs[i] = Vocabulary.id(FormatUtils.stripNonTerminalIndex(sourceWords[i]));
+        arity++;
+        
+        // TODO: the arity here (after incrementing) should match the rule index. Should
+        // check that arity == FormatUtils.getNonterminalIndex(foreignWords[i]), throw runtime
+        // error if not
+      }
+    }
+
+    /**
+     * The English side maps terminal symbols to positive IDs. Nonterminal symbols are linked
+     * to the index of the source-side nonterminal they are linked to. So for a rule
+     * 
+     * [X] ||| [X,1] [X,2] [X,3] ||| [X,2] [X,1] [X,3] ||| ...
+     * 
+     * the English side nonterminals will be -2, -1, -3. This assumes that the source side of
+     * the rule is always listed monotonically.
+     */
+    String[] targetWords = fields[2].split("\\s+");
+    int[] targetIDs = new int[targetWords.length];
+    for (int i = 0; i < targetWords.length; i++) {
+      targetIDs[i] = Vocabulary.id(targetWords[i]);
+      if (FormatUtils.isNonterminal(targetWords[i])) {
+        targetIDs[i] = -FormatUtils.getNonterminalIndex(targetWords[i]);
+      }
+    }
+
+    String sparse_features = (fields.length > 3 ? fields[3] : "");
+    String alignment = (fields.length > 4) ? fields[4] : null;
+
+    return new Rule(lhs, sourceIDs, targetIDs, sparse_features, arity, alignment);
+  }
+  
+  public static boolean isNonTerminal(final String word) {
+    return FormatUtils.isNonterminal(word);
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/format/MosesFormatReader.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/format/MosesFormatReader.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/format/MosesFormatReader.java
new file mode 100644
index 0000000..7811b3b
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/format/MosesFormatReader.java
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.tm.format;
+
+import java.io.IOException;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.util.io.LineReader;
+import org.apache.joshua.util.Constants;
+import org.apache.joshua.util.FormatUtils;
+
+/***
+ * This class reads in the Moses phrase table format, with support for the source and target side,
+ * list of features, and word alignments. It works by
+ * 
+ * - casting the phrase-based rules to left-branching hierarchical rules and passing them on \
+ *   to its parent class, {@link HieroFormatReader}.
+ * - converting the probabilities to -log probabilities
+ * 
+ * There is also a tool to convert the grammars directly, so that they can be suitably packed. Usage:
+ * 
+ * <pre>
+ *     cat PHRASE_TABLE | java -cp $JOSHUA/target/classes org.apache.joshua.decoder.ff.tm.format.MosesFormatReader
+ * </pre>
+ * 
+ * @author Matt Post post@cs.jhu.edu
+ *
+ */
+
+public class MosesFormatReader extends HieroFormatReader {
+
+  public MosesFormatReader(String grammarFile) throws IOException {
+    super(grammarFile);
+    Vocabulary.id(Constants.defaultNT);
+  }
+  
+  public MosesFormatReader() {
+    super();
+    Vocabulary.id(Constants.defaultNT);
+  }
+  
+  /**
+   * When dealing with Moses format, this munges a Moses-style phrase table into a grammar.
+   * 
+   *    mots francaises ||| French words ||| 1 2 3 ||| 0-1 1-0
+   *    
+   * becomes
+   * 
+   *    [X] ||| [X,1] mots francaises ||| [X,1] French words ||| 1 2 3  ||| 0-1 1-0
+   *    
+   * For thrax-extracted phrasal grammars, it transforms
+   * 
+   *    [X] ||| mots francaises ||| French words ||| 1 2 3 ||| 0-1 1-0
+   *
+   * into
+   * 
+   *    [X] ||| [X,1] mots francaises ||| [X,1] French words ||| 1 2 3 ||| 0-1 1-0
+   */
+  @Override
+  public Rule parseLine(String line) {
+    String[] fields = line.split(Constants.fieldDelimiter);
+    
+    String nt = FormatUtils.cleanNonTerminal(Constants.defaultNT);
+    StringBuffer hieroLine = new StringBuffer(Constants.defaultNT + " ||| [" + nt + ",1] " + fields[0] + " ||| [" + nt + ",1] " + fields[1] + " |||");
+
+    String mosesFeatureString = fields[2];
+    for (String value: mosesFeatureString.split(" ")) {
+      float f = Float.parseFloat(value);
+      hieroLine.append(String.format(" %f", f <= 0.0 ? -100 : -Math.log(f)));
+    }
+
+    // alignments
+    if (fields.length >= 4)
+      hieroLine.append(" ||| " + fields[3]);
+
+    return super.parseLine(hieroLine.toString());
+  }
+  
+  /**
+   * Converts a Moses phrase table to a Joshua grammar. 
+   * 
+   * @param args the command-line arguments
+   */
+  public static void main(String[] args) {
+    MosesFormatReader reader = new MosesFormatReader();
+    for (String line: new LineReader(System.in)) {
+      Rule rule = reader.parseLine(line);
+      System.out.println(rule.textFormat());
+    }    
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/ExtensionIterator.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/ExtensionIterator.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/ExtensionIterator.java
new file mode 100644
index 0000000..ecb355d
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/ExtensionIterator.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.tm.hash_based;
+
+import java.util.HashMap;
+import java.util.Iterator;
+
+public class ExtensionIterator implements Iterator<Integer> {
+
+  private Iterator<Integer> iterator;
+  private boolean terminal;
+  private boolean done;
+  private int next;
+
+  public ExtensionIterator(HashMap<Integer, ?> map, boolean terminal) {
+    this.terminal = terminal;
+    done = false;
+    if (map == null) {
+      done = true;
+    } else {
+      this.iterator = map.keySet().iterator();
+      forward();
+    }
+  }
+
+  private void forward() {
+    if (done)
+      return;
+    while (iterator.hasNext()) {
+      int candidate = iterator.next();
+      if ((terminal && candidate > 0) || (!terminal && candidate < 0)) {
+        next = candidate;
+        return;
+      }
+    }
+    done = true;
+  }
+
+  @Override
+  public boolean hasNext() {
+    return !done;
+  }
+
+  @Override
+  public Integer next() {
+    if (done)
+      throw new RuntimeException();
+    int consumed = next;
+    forward();
+    return consumed;
+  }
+
+  @Override
+  public void remove() {
+    throw new UnsupportedOperationException();
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
new file mode 100644
index 0000000..f346e7a
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
@@ -0,0 +1,292 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.tm.hash_based;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.JoshuaConfiguration.OOVItem;
+import org.apache.joshua.decoder.ff.FeatureFunction;
+import org.apache.joshua.decoder.ff.tm.AbstractGrammar;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.ff.tm.GrammarReader;
+import org.apache.joshua.decoder.ff.tm.Trie;
+import org.apache.joshua.decoder.ff.tm.format.HieroFormatReader;
+import org.apache.joshua.decoder.ff.tm.format.MosesFormatReader;
+import org.apache.joshua.util.FormatUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class implements a memory-based bilingual BatchGrammar.
+ * <p>
+ * The rules are stored in a trie. Each trie node has: (1) RuleBin: a list of rules matching the
+ * french sides so far (2) A HashMap of next-layer trie nodes, the next french word used as the key
+ * in HashMap
+ * 
+ * @author Zhifei Li zhifei.work@gmail.com
+ * @author Matt Post post@cs.jhu.edu
+ */
+public class MemoryBasedBatchGrammar extends AbstractGrammar {
+
+  private static final Logger LOG = LoggerFactory.getLogger(MemoryBasedBatchGrammar.class);
+
+  // ===============================================================
+  // Instance Fields
+  // ===============================================================
+
+  /* The number of rules read. */
+  private int qtyRulesRead = 0;
+
+  /* The number of distinct source sides. */
+  private int qtyRuleBins = 0;
+
+  private int numDenseFeatures = 0;
+
+  /* The trie root. */
+  private MemoryBasedTrie root = null;
+
+  /* The file containing the grammar. */
+  private String grammarFile;
+
+  private GrammarReader<Rule> modelReader;
+
+  // ===============================================================
+  // Static Fields
+  // ===============================================================
+
+  // ===============================================================
+  // Constructors
+  // ===============================================================
+
+  public MemoryBasedBatchGrammar(JoshuaConfiguration joshuaConfiguration) {
+    super(joshuaConfiguration);
+    this.root = new MemoryBasedTrie();
+    this.joshuaConfiguration = joshuaConfiguration;
+    setSpanLimit(20);
+  }
+
+  public MemoryBasedBatchGrammar(String owner, JoshuaConfiguration joshuaConfiguration) {
+    this(joshuaConfiguration);
+    this.owner = Vocabulary.id(owner);
+  }
+
+  public MemoryBasedBatchGrammar(GrammarReader<Rule> gr, JoshuaConfiguration joshuaConfiguration) {
+    // this.defaultOwner = Vocabulary.id(defaultOwner);
+    // this.defaultLHS = Vocabulary.id(defaultLHSSymbol);
+    this(joshuaConfiguration);
+    modelReader = gr;
+  }
+
+  public MemoryBasedBatchGrammar(String formatKeyword, String grammarFile, String owner,
+      String defaultLHSSymbol, int spanLimit, JoshuaConfiguration joshuaConfiguration)
+      throws IOException {
+
+    this(joshuaConfiguration);
+    this.owner = Vocabulary.id(owner);
+    Vocabulary.id(defaultLHSSymbol);
+    this.spanLimit = spanLimit;
+    this.grammarFile = grammarFile;
+
+    // ==== loading grammar
+    this.modelReader = createReader(formatKeyword, grammarFile);
+    if (modelReader != null) {
+      for (Rule rule : modelReader)
+        if (rule != null) {
+          addRule(rule);
+        }
+    } else {
+      LOG.info("Couldn't create a GrammarReader for file {} with format {}",
+          grammarFile, formatKeyword);
+    }
+
+    this.printGrammar();
+  }
+
+  protected GrammarReader<Rule> createReader(String format, String grammarFile) throws IOException {
+
+    if (grammarFile != null) {
+      if ("hiero".equals(format) || "thrax".equals(format)) {
+        return new HieroFormatReader(grammarFile);
+      } else if ("moses".equals(format)) {
+        return new MosesFormatReader(grammarFile);
+      } else {
+        throw new RuntimeException(String.format("* FATAL: unknown grammar format '%s'", format));
+      }
+    }
+    return null;
+  }
+
+  // ===============================================================
+  // Methods
+  // ===============================================================
+
+  public void setSpanLimit(int spanLimit) {
+    this.spanLimit = spanLimit;
+  }
+
+  @Override
+  public int getNumRules() {
+    return this.qtyRulesRead;
+  }
+
+  /**
+   * if the span covered by the chart bin is greater than the limit, then return false
+   */
+  public boolean hasRuleForSpan(int i, int j, int pathLength) {
+    if (this.spanLimit == -1) { // mono-glue grammar
+      return (i == 0);
+    } else {
+      // System.err.println(String.format("%s HASRULEFORSPAN(%d,%d,%d)/%d = %s",
+      // Vocabulary.word(this.owner), i, j, pathLength, spanLimit, pathLength <= this.spanLimit));
+      return (pathLength <= this.spanLimit);
+    }
+  }
+
+  public Trie getTrieRoot() {
+    return this.root;
+  }
+
+  /**
+   * Adds a rule to the grammar.
+   */
+  public void addRule(Rule rule) {
+
+    // TODO: Why two increments?
+    this.qtyRulesRead++;
+
+    // if (owner == -1) {
+    // System.err.println("* FATAL: MemoryBasedBatchGrammar::addRule(): owner not set for grammar");
+    // System.exit(1);
+    // }
+    rule.setOwner(owner);
+
+    if (numDenseFeatures == 0)
+      numDenseFeatures = rule.getFeatureVector().getDenseFeatures().size();
+
+    // === identify the position, and insert the trie nodes as necessary
+    MemoryBasedTrie pos = root;
+    int[] french = rule.getFrench();
+
+    maxSourcePhraseLength = Math.max(maxSourcePhraseLength, french.length);
+
+    for (int k = 0; k < french.length; k++) {
+      int curSymID = french[k];
+
+      /*
+       * Note that the nonTerminal symbol in the french is not cleaned (i.e., will be sth like
+       * [X,1]), but the symbol in the Trie has to be cleaned, so that the match does not care about
+       * the markup (i.e., [X,1] or [X,2] means the same thing, that is X) if
+       * (Vocabulary.nt(french[k])) { curSymID = modelReader.cleanNonTerminal(french[k]); if
+       * (logger.isLoggable(Level.FINEST)) logger.finest("Amended to: " + curSymID); }
+       */
+
+      MemoryBasedTrie nextLayer = (MemoryBasedTrie) pos.match(curSymID);
+      if (null == nextLayer) {
+        nextLayer = new MemoryBasedTrie();
+        if (pos.hasExtensions() == false) {
+          pos.childrenTbl = new HashMap<Integer, MemoryBasedTrie>();
+        }
+        pos.childrenTbl.put(curSymID, nextLayer);
+      }
+      pos = nextLayer;
+    }
+
+    // === add the rule into the trie node
+    if (!pos.hasRules()) {
+      pos.ruleBin = new MemoryBasedRuleBin(rule.getArity(), rule.getFrench());
+      this.qtyRuleBins++;
+    }
+    pos.ruleBin.addRule(rule);
+  }
+
+  protected void printGrammar() {
+    LOG.info("MemoryBasedBatchGrammar: Read {} rules with {} distinct source sides from '{}'",
+        this.qtyRulesRead, this.qtyRuleBins, grammarFile);
+  }
+
+  /***
+   * Takes an input word and creates an OOV rule in the current grammar for that word.
+   * 
+   * @param sourceWord integer representation of word
+   * @param featureFunctions {@link java.util.List} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s
+   */
+  @Override
+  public void addOOVRules(int sourceWord, List<FeatureFunction> featureFunctions) {
+
+    // TODO: _OOV shouldn't be outright added, since the word might not be OOV for the LM (but now
+    // almost
+    // certainly is)
+    final int targetWord = this.joshuaConfiguration.mark_oovs ? Vocabulary.id(Vocabulary
+        .word(sourceWord) + "_OOV") : sourceWord;
+
+    int[] sourceWords = { sourceWord };
+    int[] targetWords = { targetWord };
+    final String oovAlignment = "0-0";
+
+    if (this.joshuaConfiguration.oovList != null && this.joshuaConfiguration.oovList.size() != 0) {
+      for (OOVItem item : this.joshuaConfiguration.oovList) {
+        Rule oovRule = new Rule(Vocabulary.id(item.label), sourceWords, targetWords, "", 0,
+            oovAlignment);
+        addRule(oovRule);
+        oovRule.estimateRuleCost(featureFunctions);
+      }
+    } else {
+      int nt_i = Vocabulary.id(this.joshuaConfiguration.default_non_terminal);
+      Rule oovRule = new Rule(nt_i, sourceWords, targetWords, "", 0, oovAlignment);
+      addRule(oovRule);
+      oovRule.estimateRuleCost(featureFunctions);
+    }
+  }
+
+  /**
+   * Adds a default set of glue rules.
+   * 
+   * @param featureFunctions an {@link java.util.ArrayList} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s
+   */
+  public void addGlueRules(ArrayList<FeatureFunction> featureFunctions) {
+    HieroFormatReader reader = new HieroFormatReader();
+
+    String goalNT = FormatUtils.cleanNonTerminal(joshuaConfiguration.goal_symbol);
+    String defaultNT = FormatUtils.cleanNonTerminal(joshuaConfiguration.default_non_terminal);
+
+    String[] ruleStrings = new String[] {
+        String.format("[%s] ||| %s ||| %s ||| 0", goalNT, Vocabulary.START_SYM,
+            Vocabulary.START_SYM),
+        String.format("[%s] ||| [%s,1] [%s,2] ||| [%s,1] [%s,2] ||| -1", goalNT, goalNT, defaultNT,
+            goalNT, defaultNT),
+        String.format("[%s] ||| [%s,1] %s ||| [%s,1] %s ||| 0", goalNT, goalNT,
+            Vocabulary.STOP_SYM, goalNT, Vocabulary.STOP_SYM) };
+
+    for (String ruleString : ruleStrings) {
+      Rule rule = reader.parseLine(ruleString);
+      addRule(rule);
+      rule.estimateRuleCost(featureFunctions);
+    }
+  }
+
+  @Override
+  public int getNumDenseFeatures() {
+    return numDenseFeatures;
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedRuleBin.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedRuleBin.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedRuleBin.java
new file mode 100644
index 0000000..f91df1e
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedRuleBin.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.tm.hash_based;
+
+import org.apache.joshua.decoder.ff.tm.BasicRuleCollection;
+import org.apache.joshua.decoder.ff.tm.Rule;
+
+/**
+ * Stores a collection of all rules with the same french side (and thus same arity).
+ * 
+ * @author Zhifei Li, zhifei.work@gmail.com
+ */
+public class MemoryBasedRuleBin extends BasicRuleCollection {
+
+  /**
+   * Constructs an initially empty rule collection.
+   * 
+   * @param arity Number of nonterminals in the source pattern
+   * @param sourceTokens Sequence of terminals and nonterminals in the source pattern
+   */
+  public MemoryBasedRuleBin(int arity, int[] sourceTokens) {
+    super(arity, sourceTokens);
+  }
+
+  /**
+   * Adds a rule to this collection.
+   * 
+   * @param rule Rule to add to this collection.
+   */
+  public void addRule(Rule rule) {
+    // XXX This if clause seems bogus.
+    if (rules.size() <= 0) { // first time
+      this.arity = rule.getArity();
+      this.sourceTokens = rule.getFrench();
+    }
+    if (rule.getArity() != this.arity) {
+      return;
+    }
+    rules.add(rule);
+    sorted = false;
+    rule.setFrench(this.sourceTokens);
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedTrie.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedTrie.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedTrie.java
new file mode 100644
index 0000000..998688a
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedTrie.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.tm.hash_based;
+
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Iterator;
+
+import org.apache.joshua.decoder.ff.tm.RuleCollection;
+import org.apache.joshua.decoder.ff.tm.Trie;
+
+/**
+ * @author Zhifei Li, zhifei.work@gmail.com
+ */
+public class MemoryBasedTrie implements Trie {
+  MemoryBasedRuleBin ruleBin = null;
+  HashMap<Integer, MemoryBasedTrie> childrenTbl = null;
+
+  public MemoryBasedTrie() {
+  }
+
+  @Override
+  public Trie match(int wordID) {
+    if (childrenTbl != null)
+      return childrenTbl.get(wordID);
+    return null;
+  }
+
+  /* See Javadoc for Trie interface. */
+  public boolean hasExtensions() {
+    return (null != this.childrenTbl);
+  }
+
+  public HashMap<Integer, MemoryBasedTrie> getChildren() {
+    return this.childrenTbl;
+  }
+
+  public void setExtensions(HashMap<Integer, MemoryBasedTrie> tbl_children_) {
+    this.childrenTbl = tbl_children_;
+  }
+
+  /* See Javadoc for Trie interface. */
+  public boolean hasRules() {
+    return (null != this.ruleBin);
+  }
+
+  public void setRuleBin(MemoryBasedRuleBin rb) {
+    ruleBin = rb;
+  }
+
+  /* See Javadoc for Trie interface. */
+  public RuleCollection getRuleCollection() {
+    return this.ruleBin;
+  }
+
+  /* See Javadoc for Trie interface. */
+  public Collection<MemoryBasedTrie> getExtensions() {
+    if (this.childrenTbl != null)
+      return this.childrenTbl.values();
+    return null;
+  }
+
+  @Override
+  public Iterator<Integer> getTerminalExtensionIterator() {
+    return new ExtensionIterator(childrenTbl, true);
+  }
+
+  @Override
+  public Iterator<Integer> getNonterminalExtensionIterator() {
+    return new ExtensionIterator(childrenTbl, false);
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/package-info.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/package-info.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/package-info.java
new file mode 100644
index 0000000..695a0a4
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/**
+ * Provides implementations of hierarchical phrase-based translation grammars.
+ */
+package org.apache.joshua.decoder.ff.tm.hash_based;

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/package-info.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/package-info.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/package-info.java
new file mode 100644
index 0000000..b804db6
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/package-info.java
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/** 
+ * Defines interfaces and provides infrastructure for hierarchical 
+ * phrase-based translation grammars.
+ */
+package org.apache.joshua.decoder.ff.tm;
+