You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/06/23 18:45:48 UTC
[37/60] [partial] incubator-joshua git commit: maven multi-module
layout 1st commit: moving files into joshua-core
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/RuleCollection.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/RuleCollection.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/RuleCollection.java
new file mode 100644
index 0000000..a45c41b
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/RuleCollection.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.tm;
+
+import java.util.List;
+
+import org.apache.joshua.decoder.ff.FeatureFunction;
+
+/**
+ * A RuleCollection represents a set of rules that share the same source side (and hence the same
+ * arity). These rules are likely stored together in a Trie data structure, although the interface
+ * allows any implementation to be used.
+ *
+ * @author Zhifei Li
+ * @author Lane Schwartz
+ * @author Matt Post post@cs.jhu.edu
+ */
+public interface RuleCollection {
+
+ /**
+ * Returns true if the rules are sorted. This is used to allow rules to be sorted in an amortized
+ * fashion; rather than sorting all trie nodes when the grammar is originally loaded, we sort them
+ * only as the decoder actually needs them.
+ * @return true if rules are sorted
+ */
+ boolean isSorted();
+
+ /**
+ * This returns a list of the rules, sorting them if necessary.
+ *
+ * Implementations of this function should be synchronized.
+ * @param models {@link java.util.List} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s
+ * @return the {@link java.util.List} of sorted rules
+ */
+ List<Rule> getSortedRules(List<FeatureFunction> models);
+
+ /**
+ * Get the list of rules. There are no guarantees about whether they're sorted or not.
+ * @return the {@link java.util.List} of rules, there is no gurantee they will be sorted
+ */
+ List<Rule> getRules();
+
+ /**
+ * Gets the source side for all rules in this RuleCollection. This source side is the same for all
+ * the rules in the RuleCollection.
+ *
+ * @return the (common) source side for all rules in this RuleCollection
+ */
+ int[] getSourceSide();
+
+ /**
+ * Gets the number of nonterminals in the source side of the rules in this RuleCollection. The
+ * source side is the same for all the rules in the RuleCollection, so the arity will also be the
+ * same for all of these rules.
+ *
+ * @return the (common) number of nonterminals in the source side of the rules in this
+ * RuleCollection
+ */
+ int getArity();
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/SentenceFilteredGrammar.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/SentenceFilteredGrammar.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/SentenceFilteredGrammar.java
new file mode 100644
index 0000000..c952b05
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/SentenceFilteredGrammar.java
@@ -0,0 +1,366 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.tm;
+
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map.Entry;
+
+import org.apache.joshua.decoder.ff.tm.hash_based.ExtensionIterator;
+import org.apache.joshua.decoder.ff.tm.hash_based.MemoryBasedBatchGrammar;
+import org.apache.joshua.decoder.segment_file.Sentence;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class implements dynamic sentence-level filtering. This is accomplished with a parallel
+ * trie, a subset of the original trie, that only contains trie paths that are reachable from
+ * traversals of the current sentence.
+ *
+ * @author Matt Post post@cs.jhu.edu
+ */
+public class SentenceFilteredGrammar extends MemoryBasedBatchGrammar {
+
+ private static final Logger LOG = LoggerFactory.getLogger(SentenceFilteredGrammar.class);
+
+ private AbstractGrammar baseGrammar;
+ private SentenceFilteredTrie filteredTrie;
+ private int[] tokens;
+ private Sentence sentence;
+
+ /**
+ * Construct a new sentence-filtered grammar. The main work is done in the enclosed trie (obtained
+ * from the base grammar, which contains the complete grammar).
+ *
+ * @param baseGrammar a new {@link org.apache.joshua.decoder.ff.tm.AbstractGrammar} to populate
+ * @param sentence {@link org.apache.joshua.lattice.Lattice} input
+ */
+ SentenceFilteredGrammar(AbstractGrammar baseGrammar, Sentence sentence) {
+ super(baseGrammar.joshuaConfiguration);
+ this.baseGrammar = baseGrammar;
+ this.sentence = sentence;
+ this.tokens = sentence.getWordIDs();
+
+ int origCount = getNumRules(baseGrammar.getTrieRoot());
+ long startTime = System.currentTimeMillis();
+
+ /* Filter the rules; returns non-null object */
+ this.filteredTrie = filter(baseGrammar.getTrieRoot());
+ int filteredCount = getNumRules();
+
+ float seconds = (System.currentTimeMillis() - startTime) / 1000.0f;
+
+ LOG.debug("Sentence-level filtering of sentence {} ({} -> {} rules) in {} seconds",
+ sentence.id(), origCount, filteredCount, seconds);
+ }
+
+ @Override
+ public Trie getTrieRoot() {
+ return filteredTrie;
+ }
+
+ /**
+ * This function is poorly named: it doesn't mean whether a rule exists in the grammar for the
+ * current span, but whether the grammar is permitted to apply rules to the current span (a
+ * grammar-level parameter). As such we can just chain to the underlying grammar.
+ */
+ @Override
+ public boolean hasRuleForSpan(int startIndex, int endIndex, int pathLength) {
+ return baseGrammar.hasRuleForSpan(startIndex, endIndex, pathLength);
+ }
+
+ @Override
+ public int getNumRules() {
+ return getNumRules(getTrieRoot());
+ }
+
+ /**
+ * A convenience function that counts the number of rules in a grammar's trie.
+ *
+ * @param node the {@link org.apache.joshua.decoder.ff.tm.Trie} implementation for which to count rules
+ * @return the number of rules
+ */
+ public int getNumRules(Trie node) {
+ int numRules = 0;
+ if (node != null) {
+ if (node.getRuleCollection() != null)
+ numRules += node.getRuleCollection().getRules().size();
+
+ if (node.getExtensions() != null)
+ for (Trie child : node.getExtensions())
+ numRules += getNumRules(child);
+ }
+
+ return numRules;
+ }
+
+ /**
+ * What is the algorithm?
+ *
+ * Take the first word of the sentence, and start at the root of the trie. There are two things to
+ * consider: (a) word matches and (b) nonterminal matches.
+ *
+ * For a word match, simply follow that arc along the trie. We create a parallel arc in our
+ * filtered grammar to represent it. Each arc in the filtered trie knows about its
+ * corresponding/underlying node in the unfiltered grammar trie.
+ *
+ * A nonterminal is always permitted to match. The question then is how much of the input sentence
+ * we imagine it consumed. The answer is that it could have been any amount. So the recursive call
+ * has to be a set of calls, one each to the next trie node with different lengths of the sentence
+ * remaining.
+ *
+ * A problem occurs when we have multiple sequential nonterminals. For scope-3 grammars, there can
+ * be four sequential nonterminals (in the case when they are grounded by terminals on both ends
+ * of the nonterminal chain). We'd like to avoid looking at all possible ways to split up the
+ * subsequence, because with respect to filtering rules, they are all the same.
+ *
+ * We accomplish this with the following restriction: for purposes of grammar filtering, only the
+ * first in a sequence of nonterminal traversals can consume more than one word. Each of the
+ * subsequent ones would have to consume just one word. We then just have to record in the
+ * recursive call whether the last traversal was a nonterminal or not.
+ *
+ * @param unfilteredTrieRoot todo
+ * @return the root of the filtered trie
+ */
+ private SentenceFilteredTrie filter(Trie unfilteredTrieRoot) {
+ SentenceFilteredTrie filteredTrieRoot = new SentenceFilteredTrie(unfilteredTrieRoot);
+
+ // System.err.println(String.format("FILTERING TO SENTENCE\n %s\n",
+ // Vocabulary.getWords(tokens)));
+
+ /*
+ * The root of the trie is where rule applications start, so we simply try all possible
+ * positions in the sentence.
+ */
+ for (int i = 0; i < tokens.length; i++) {
+ filter(i, filteredTrieRoot, false);
+ }
+
+ return filteredTrieRoot;
+ }
+
+ /**
+ * Matches rules against the sentence. Intelligently handles chains of sequential nonterminals.
+ * Marks arcs that are traversable for this sentence.
+ *
+ * @param i the position in the sentence to start matching
+ * @param trie the trie node to match against
+ * @param lastWasNT true if the match that brought us here was against a nonterminal
+ */
+ private void filter(int i, SentenceFilteredTrie trieNode, boolean lastWasNT) {
+ if (i >= tokens.length)
+ return;
+
+ /* Make sure the underlying unfiltered node has children. */
+ Trie unfilteredTrieNode = trieNode.unfilteredTrieNode;
+ if (unfilteredTrieNode.getChildren() == null) {
+ // trieNode.path.retreat();
+ return;
+ }
+
+ /* Match a word */
+ Trie trie = unfilteredTrieNode.match(tokens[i]);
+ if (trie != null) {
+ /*
+ * The current filtered node might already have an arc for this label. If so, retrieve it
+ * (since we still need to follow it); if not, create it.
+ */
+ SentenceFilteredTrie nextFilteredTrie = trieNode.match(tokens[i]);
+ if (nextFilteredTrie == null) {
+ nextFilteredTrie = new SentenceFilteredTrie(trie);
+ trieNode.children.put(tokens[i], nextFilteredTrie);
+ }
+
+ /*
+ * Now continue, trying to match the child node against the next position in the sentence. The
+ * third argument records that this match was not against a nonterminal.
+ */
+ filter(i + 1, nextFilteredTrie, false);
+ }
+
+ /*
+ * Now we attempt to match nonterminals. Any nonterminal is permitted to match any region of the
+ * sentence, up to the maximum span for that grammar. So we enumerate all children of the
+ * current (unfiltered) trie grammar node, looking for nonterminals (items whose label value is
+ * less than 0), then recurse.
+ *
+ * There is one subtlely. Adjacent nonterminals in a grammar rule can match a span (i, j) in (j
+ * - i - 1) ways, but for purposes of determining whether a rule fits, this is all wasted
+ * effort. To handle this, we allow the first nonterminal in a sequence to record 1, 2, 3, ...
+ * terminals (up to the grammar's span limit, or the rest of the sentence, whichever is
+ * shorter). Subsequent adjacent nonterminals are permitted to consume only a single terminal.
+ */
+ HashMap<Integer, ? extends Trie> children = unfilteredTrieNode.getChildren();
+ if (children != null) {
+ for (int label : children.keySet()) {
+ if (label < 0) {
+ SentenceFilteredTrie nextFilteredTrie = trieNode.match(label);
+ if (nextFilteredTrie == null) {
+ nextFilteredTrie = new SentenceFilteredTrie(unfilteredTrieNode.match(label));
+ trieNode.children.put(label, nextFilteredTrie);
+ }
+
+ /*
+ * Recurse. If the last match was a nonterminal, we can only consume one more token.
+ *
+ * TODO: This goes too far by looking at the whole sentence; each grammar has a maximum
+ * span limit which should be consulted. What we should be doing is passing the point
+ * where we started matching the current sentence, so we can apply this span limit, which
+ * is easily accessible (baseGrammar.spanLimit).
+ */
+ int maxJ = lastWasNT ? (i + 1) : tokens.length;
+ for (int j = i + 1; j <= maxJ; j++) {
+ filter(j, nextFilteredTrie, true);
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * Alternate filter that uses regular expressions, walking the grammar trie and matching the
+ * source side of each rule collection against the input sentence. Failed matches are discarded,
+ * and trie nodes extending from that position need not be explored.
+ *
+ * @param unfilteredTrie todo
+ * @return the root of the filtered trie if any rules were retained, otherwise null
+ */
+ @SuppressWarnings("unused")
+ private SentenceFilteredTrie filter_regexp(Trie unfilteredTrie) {
+ SentenceFilteredTrie trie = null;
+
+ /* Case 1: keep the trie node if it has a rule collection that matches the sentence */
+ if (unfilteredTrie.hasRules())
+ if (matchesSentence(unfilteredTrie))
+ trie = new SentenceFilteredTrie(unfilteredTrie);
+ else
+ return null;
+
+ /* Case 2: keep the trie node if it has children who have valid rule collections */
+ if (unfilteredTrie.hasExtensions())
+ for (Entry<Integer, ? extends Trie> arc : unfilteredTrie.getChildren().entrySet()) {
+ Trie unfilteredChildTrie = arc.getValue();
+ SentenceFilteredTrie nextTrie = filter_regexp(unfilteredChildTrie);
+ if (nextTrie != null) {
+ if (trie == null)
+ trie = new SentenceFilteredTrie(unfilteredTrie);
+ trie.children.put(arc.getKey(), nextTrie);
+ }
+ }
+
+ return trie;
+ }
+
+ private boolean matchesSentence(Trie childTrie) {
+ Rule rule = childTrie.getRuleCollection().getRules().get(0);
+ return rule.matches(sentence);
+ }
+
+ /**
+ * Implements a filtered trie, by sitting on top of a base trie and annotating nodes that match
+ * the given input sentence.
+ *
+ * @author Matt Post post@cs.jhu.edu
+ *
+ */
+ public class SentenceFilteredTrie implements Trie {
+
+ /* The underlying unfiltered trie node. */
+ private Trie unfilteredTrieNode;
+
+ /* The child nodes in the filtered trie. */
+ private HashMap<Integer, SentenceFilteredTrie> children = null;
+
+ /**
+ * Constructor.
+ *
+ * @param unfilteredTrieNode todo
+ */
+ public SentenceFilteredTrie(Trie unfilteredTrieNode) {
+ this.unfilteredTrieNode = unfilteredTrieNode;
+ this.children = new HashMap<Integer, SentenceFilteredTrie>();
+ }
+
+ @Override
+ public SentenceFilteredTrie match(int wordID) {
+ if (children != null)
+ return children.get(wordID);
+ return null;
+ }
+
+ @Override
+ public boolean hasExtensions() {
+ return children != null;
+ }
+
+ @Override
+ public Collection<SentenceFilteredTrie> getExtensions() {
+ if (children != null)
+ return children.values();
+
+ return null;
+ }
+
+ @Override
+ public HashMap<Integer, SentenceFilteredTrie> getChildren() {
+ return children;
+ }
+
+ @Override
+ public boolean hasRules() {
+ // Chain to the underlying unfiltered node.
+ return unfilteredTrieNode.hasRules();
+ }
+
+ @Override
+ public RuleCollection getRuleCollection() {
+ // Chain to the underlying unfiltered node, since the rule collection just varies by target
+ // side.
+ return unfilteredTrieNode.getRuleCollection();
+ }
+
+ /**
+ * Counts the number of rules.
+ *
+ * @return the number of rules rooted at this node.
+ */
+ public int getNumRules() {
+ int numRules = 0;
+ if (getTrieRoot() != null)
+ if (getTrieRoot().getRuleCollection() != null)
+ numRules += getTrieRoot().getRuleCollection().getRules().size();
+
+ for (SentenceFilteredTrie node : getExtensions())
+ numRules += node.getNumRules();
+
+ return numRules;
+ }
+
+ @Override
+ public Iterator<Integer> getTerminalExtensionIterator() {
+ return new ExtensionIterator(children, true);
+ }
+
+ @Override
+ public Iterator<Integer> getNonterminalExtensionIterator() {
+ return new ExtensionIterator(children, false);
+ }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Trie.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Trie.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Trie.java
new file mode 100644
index 0000000..51d2dd8
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Trie.java
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.tm;
+
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Iterator;
+
+/**
+ * An interface for trie-like data structures.
+ *
+ * @author wren ng thornton wren@users.sourceforge.net
+ * @author Zhifei Li, zhifei.work@gmail.com
+ */
+public interface Trie {
+
+ /**
+ * Traverse one ply further down the trie. If there is no match, the result is null.
+ *
+ * @param wordID input word ID
+ * @return Child node of this trie
+ */
+ Trie match(int wordID);
+
+
+ /**
+ * Returns whether matchOne(Symbol) could succeed for any symbol.
+ *
+ * @return <code>true</code> if {@link #match(int)} could succeed for some symbol,
+ * <code>false</code> otherwise
+ */
+ boolean hasExtensions();
+
+
+ /**
+ * If the trie node has extensions, then return a list of extended trie nodes, otherwise return
+ * null.
+ *
+ * @return A list of extended <code>Trie</code> nodes if this node has extensions,
+ * <code>null</code>
+ * otherwise
+ */
+ Collection<? extends Trie> getExtensions();
+
+
+ /**
+ * If the trie node has extensions, get a {@link java.util.HashMap} of their labels.
+ *
+ * @return a {@link java.util.HashMap} pf node extensions
+ */
+ HashMap<Integer,? extends Trie> getChildren();
+
+ /**
+ * Returns an iterator over the trie node's extensions with terminal labels.
+ *
+ * @return the {@link java.util.Iterator} created over the trie node's extensions with terminal labels
+ */
+ Iterator<Integer> getTerminalExtensionIterator();
+
+ /**
+ * Returns an iterator over the trie node's extensions with nonterminal labels.
+ *
+ * @return the {@link java.util.Iterator} created over the trie node's extensions with terminal labels
+ */
+ Iterator<Integer> getNonterminalExtensionIterator();
+
+
+ /**
+ * Gets whether the current node/state is a "final state" that has matching rules.
+ *
+ * @return <code>true</code> if the current node/state is a "final state" that has matching rules,
+ * <code>false</code> otherwise
+ */
+ boolean hasRules();
+
+
+ /**
+ * Retrieve the rules at the current node/state. The implementation of this method must adhere to
+ * the following laws:
+ *
+ * <ol>
+ * <li>The return value is always non-null. The collection may be empty however.</li>
+ * <li>The collection must be empty if hasRules() is false, and must be non-empty if hasRules() is
+ * true.</li>
+ * <li>The collection must be sorted (at least as used by TMGrammar)</li>
+ * </ol>
+ * @return a {@link org.apache.joshua.decoder.ff.tm.RuleCollection} representing the rules
+ * at the current node/state
+ */
+ RuleCollection getRuleCollection();
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/UnsortedRuleCollectionException.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/UnsortedRuleCollectionException.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/UnsortedRuleCollectionException.java
new file mode 100644
index 0000000..3358775
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/UnsortedRuleCollectionException.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.tm;
+
+/**
+ * Unchecked runtime exception thrown to indicate that a collection of rules has not been properly
+ * sorted according to the feature functions in effect.
+ *
+ * @author Lane Schwartz
+ */
+public class UnsortedRuleCollectionException extends RuntimeException {
+
+ private static final long serialVersionUID = -4819014771607378835L;
+
+ /**
+ * Constructs an <code>UnsortedRuleCollectionException</code> with the specified detail message.
+ *
+ * @param message the detail message
+ */
+ public UnsortedRuleCollectionException(String message) {
+ super(message);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/format/HieroFormatReader.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/format/HieroFormatReader.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/format/HieroFormatReader.java
new file mode 100644
index 0000000..45c8c33
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/format/HieroFormatReader.java
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.tm.format;
+
+import java.io.IOException;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.ff.tm.GrammarReader;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.util.Constants;
+import org.apache.joshua.util.FormatUtils;
+
+/**
+ * This class implements reading files in the format defined by David Chiang for Hiero.
+ *
+ * @author Matt Post post@cs.jhu.edu
+ */
+
+public class HieroFormatReader extends GrammarReader<Rule> {
+
+ static {
+ description = "Original Hiero format";
+ }
+
+ public HieroFormatReader() {
+ super();
+ }
+
+ public HieroFormatReader(String grammarFile) throws IOException {
+ super(grammarFile);
+ }
+
+ @Override
+ public Rule parseLine(String line) {
+ String[] fields = line.split(Constants.fieldDelimiter);
+ if (fields.length < 3) {
+ throw new RuntimeException(String.format("Rule '%s' does not have four fields", line));
+ }
+
+ int lhs = Vocabulary.id(fields[0]);
+
+ /**
+ * On the foreign side, we map nonterminals to negative IDs, and terminals to positive IDs.
+ */
+ int arity = 0;
+ String[] sourceWords = fields[1].split("\\s+");
+ int[] sourceIDs = new int[sourceWords.length];
+ for (int i = 0; i < sourceWords.length; i++) {
+ /* NOTE: This redundantly creates vocab items for terms like [X,1]. This might actually
+ * be necessary, so don't try to turn this into an if/else.
+ */
+ sourceIDs[i] = Vocabulary.id(sourceWords[i]);
+ if (FormatUtils.isNonterminal(sourceWords[i])) {
+ sourceIDs[i] = Vocabulary.id(FormatUtils.stripNonTerminalIndex(sourceWords[i]));
+ arity++;
+
+ // TODO: the arity here (after incrementing) should match the rule index. Should
+ // check that arity == FormatUtils.getNonterminalIndex(foreignWords[i]), throw runtime
+ // error if not
+ }
+ }
+
+ /**
+ * The English side maps terminal symbols to positive IDs. Nonterminal symbols are linked
+ * to the index of the source-side nonterminal they are linked to. So for a rule
+ *
+ * [X] ||| [X,1] [X,2] [X,3] ||| [X,2] [X,1] [X,3] ||| ...
+ *
+ * the English side nonterminals will be -2, -1, -3. This assumes that the source side of
+ * the rule is always listed monotonically.
+ */
+ String[] targetWords = fields[2].split("\\s+");
+ int[] targetIDs = new int[targetWords.length];
+ for (int i = 0; i < targetWords.length; i++) {
+ targetIDs[i] = Vocabulary.id(targetWords[i]);
+ if (FormatUtils.isNonterminal(targetWords[i])) {
+ targetIDs[i] = -FormatUtils.getNonterminalIndex(targetWords[i]);
+ }
+ }
+
+ String sparse_features = (fields.length > 3 ? fields[3] : "");
+ String alignment = (fields.length > 4) ? fields[4] : null;
+
+ return new Rule(lhs, sourceIDs, targetIDs, sparse_features, arity, alignment);
+ }
+
+ public static boolean isNonTerminal(final String word) {
+ return FormatUtils.isNonterminal(word);
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/format/MosesFormatReader.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/format/MosesFormatReader.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/format/MosesFormatReader.java
new file mode 100644
index 0000000..7811b3b
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/format/MosesFormatReader.java
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.tm.format;
+
+import java.io.IOException;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.util.io.LineReader;
+import org.apache.joshua.util.Constants;
+import org.apache.joshua.util.FormatUtils;
+
+/***
+ * This class reads in the Moses phrase table format, with support for the source and target side,
+ * list of features, and word alignments. It works by
+ *
+ * - casting the phrase-based rules to left-branching hierarchical rules and passing them on \
+ * to its parent class, {@link HieroFormatReader}.
+ * - converting the probabilities to -log probabilities
+ *
+ * There is also a tool to convert the grammars directly, so that they can be suitably packed. Usage:
+ *
+ * <pre>
+ * cat PHRASE_TABLE | java -cp $JOSHUA/target/classes org.apache.joshua.decoder.ff.tm.format.MosesFormatReader
+ * </pre>
+ *
+ * @author Matt Post post@cs.jhu.edu
+ *
+ */
+
+public class MosesFormatReader extends HieroFormatReader {
+
+ public MosesFormatReader(String grammarFile) throws IOException {
+ super(grammarFile);
+ Vocabulary.id(Constants.defaultNT);
+ }
+
+ public MosesFormatReader() {
+ super();
+ Vocabulary.id(Constants.defaultNT);
+ }
+
+ /**
+ * When dealing with Moses format, this munges a Moses-style phrase table into a grammar.
+ *
+ * mots francaises ||| French words ||| 1 2 3 ||| 0-1 1-0
+ *
+ * becomes
+ *
+ * [X] ||| [X,1] mots francaises ||| [X,1] French words ||| 1 2 3 ||| 0-1 1-0
+ *
+ * For thrax-extracted phrasal grammars, it transforms
+ *
+ * [X] ||| mots francaises ||| French words ||| 1 2 3 ||| 0-1 1-0
+ *
+ * into
+ *
+ * [X] ||| [X,1] mots francaises ||| [X,1] French words ||| 1 2 3 ||| 0-1 1-0
+ */
+ @Override
+ public Rule parseLine(String line) {
+ String[] fields = line.split(Constants.fieldDelimiter);
+
+ String nt = FormatUtils.cleanNonTerminal(Constants.defaultNT);
+ StringBuffer hieroLine = new StringBuffer(Constants.defaultNT + " ||| [" + nt + ",1] " + fields[0] + " ||| [" + nt + ",1] " + fields[1] + " |||");
+
+ String mosesFeatureString = fields[2];
+ for (String value: mosesFeatureString.split(" ")) {
+ float f = Float.parseFloat(value);
+ hieroLine.append(String.format(" %f", f <= 0.0 ? -100 : -Math.log(f)));
+ }
+
+ // alignments
+ if (fields.length >= 4)
+ hieroLine.append(" ||| " + fields[3]);
+
+ return super.parseLine(hieroLine.toString());
+ }
+
+ /**
+ * Converts a Moses phrase table to a Joshua grammar.
+ *
+ * @param args the command-line arguments
+ */
+ public static void main(String[] args) {
+ MosesFormatReader reader = new MosesFormatReader();
+ for (String line: new LineReader(System.in)) {
+ Rule rule = reader.parseLine(line);
+ System.out.println(rule.textFormat());
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/ExtensionIterator.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/ExtensionIterator.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/ExtensionIterator.java
new file mode 100644
index 0000000..ecb355d
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/ExtensionIterator.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.tm.hash_based;
+
+import java.util.HashMap;
+import java.util.Iterator;
+
+public class ExtensionIterator implements Iterator<Integer> {
+
+ private Iterator<Integer> iterator;
+ private boolean terminal;
+ private boolean done;
+ private int next;
+
+ public ExtensionIterator(HashMap<Integer, ?> map, boolean terminal) {
+ this.terminal = terminal;
+ done = false;
+ if (map == null) {
+ done = true;
+ } else {
+ this.iterator = map.keySet().iterator();
+ forward();
+ }
+ }
+
+ private void forward() {
+ if (done)
+ return;
+ while (iterator.hasNext()) {
+ int candidate = iterator.next();
+ if ((terminal && candidate > 0) || (!terminal && candidate < 0)) {
+ next = candidate;
+ return;
+ }
+ }
+ done = true;
+ }
+
+ @Override
+ public boolean hasNext() {
+ return !done;
+ }
+
+ @Override
+ public Integer next() {
+ if (done)
+ throw new RuntimeException();
+ int consumed = next;
+ forward();
+ return consumed;
+ }
+
+ @Override
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
new file mode 100644
index 0000000..f346e7a
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
@@ -0,0 +1,292 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.tm.hash_based;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.JoshuaConfiguration.OOVItem;
+import org.apache.joshua.decoder.ff.FeatureFunction;
+import org.apache.joshua.decoder.ff.tm.AbstractGrammar;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.ff.tm.GrammarReader;
+import org.apache.joshua.decoder.ff.tm.Trie;
+import org.apache.joshua.decoder.ff.tm.format.HieroFormatReader;
+import org.apache.joshua.decoder.ff.tm.format.MosesFormatReader;
+import org.apache.joshua.util.FormatUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class implements a memory-based bilingual BatchGrammar.
+ * <p>
+ * The rules are stored in a trie. Each trie node has: (1) RuleBin: a list of rules matching the
+ * french sides so far (2) A HashMap of next-layer trie nodes, the next french word used as the key
+ * in HashMap
+ *
+ * @author Zhifei Li zhifei.work@gmail.com
+ * @author Matt Post post@cs.jhu.edu
+ */
+public class MemoryBasedBatchGrammar extends AbstractGrammar {
+
+ private static final Logger LOG = LoggerFactory.getLogger(MemoryBasedBatchGrammar.class);
+
+ // ===============================================================
+ // Instance Fields
+ // ===============================================================
+
+ /* The number of rules read. */
+ private int qtyRulesRead = 0;
+
+ /* The number of distinct source sides. */
+ private int qtyRuleBins = 0;
+
+ private int numDenseFeatures = 0;
+
+ /* The trie root. */
+ private MemoryBasedTrie root = null;
+
+ /* The file containing the grammar. */
+ private String grammarFile;
+
+ private GrammarReader<Rule> modelReader;
+
+ // ===============================================================
+ // Static Fields
+ // ===============================================================
+
+ // ===============================================================
+ // Constructors
+ // ===============================================================
+
+ public MemoryBasedBatchGrammar(JoshuaConfiguration joshuaConfiguration) {
+ super(joshuaConfiguration);
+ this.root = new MemoryBasedTrie();
+ this.joshuaConfiguration = joshuaConfiguration;
+ setSpanLimit(20);
+ }
+
+ public MemoryBasedBatchGrammar(String owner, JoshuaConfiguration joshuaConfiguration) {
+ this(joshuaConfiguration);
+ this.owner = Vocabulary.id(owner);
+ }
+
+ public MemoryBasedBatchGrammar(GrammarReader<Rule> gr, JoshuaConfiguration joshuaConfiguration) {
+ // this.defaultOwner = Vocabulary.id(defaultOwner);
+ // this.defaultLHS = Vocabulary.id(defaultLHSSymbol);
+ this(joshuaConfiguration);
+ modelReader = gr;
+ }
+
+ public MemoryBasedBatchGrammar(String formatKeyword, String grammarFile, String owner,
+ String defaultLHSSymbol, int spanLimit, JoshuaConfiguration joshuaConfiguration)
+ throws IOException {
+
+ this(joshuaConfiguration);
+ this.owner = Vocabulary.id(owner);
+ Vocabulary.id(defaultLHSSymbol);
+ this.spanLimit = spanLimit;
+ this.grammarFile = grammarFile;
+
+ // ==== loading grammar
+ this.modelReader = createReader(formatKeyword, grammarFile);
+ if (modelReader != null) {
+ for (Rule rule : modelReader)
+ if (rule != null) {
+ addRule(rule);
+ }
+ } else {
+ LOG.info("Couldn't create a GrammarReader for file {} with format {}",
+ grammarFile, formatKeyword);
+ }
+
+ this.printGrammar();
+ }
+
+ protected GrammarReader<Rule> createReader(String format, String grammarFile) throws IOException {
+
+ if (grammarFile != null) {
+ if ("hiero".equals(format) || "thrax".equals(format)) {
+ return new HieroFormatReader(grammarFile);
+ } else if ("moses".equals(format)) {
+ return new MosesFormatReader(grammarFile);
+ } else {
+ throw new RuntimeException(String.format("* FATAL: unknown grammar format '%s'", format));
+ }
+ }
+ return null;
+ }
+
+ // ===============================================================
+ // Methods
+ // ===============================================================
+
+ public void setSpanLimit(int spanLimit) {
+ this.spanLimit = spanLimit;
+ }
+
+ @Override
+ public int getNumRules() {
+ return this.qtyRulesRead;
+ }
+
+ /**
+ * if the span covered by the chart bin is greater than the limit, then return false
+ */
+ public boolean hasRuleForSpan(int i, int j, int pathLength) {
+ if (this.spanLimit == -1) { // mono-glue grammar
+ return (i == 0);
+ } else {
+ // System.err.println(String.format("%s HASRULEFORSPAN(%d,%d,%d)/%d = %s",
+ // Vocabulary.word(this.owner), i, j, pathLength, spanLimit, pathLength <= this.spanLimit));
+ return (pathLength <= this.spanLimit);
+ }
+ }
+
+ public Trie getTrieRoot() {
+ return this.root;
+ }
+
+ /**
+ * Adds a rule to the grammar.
+ */
+ public void addRule(Rule rule) {
+
+ // TODO: Why two increments?
+ this.qtyRulesRead++;
+
+ // if (owner == -1) {
+ // System.err.println("* FATAL: MemoryBasedBatchGrammar::addRule(): owner not set for grammar");
+ // System.exit(1);
+ // }
+ rule.setOwner(owner);
+
+ if (numDenseFeatures == 0)
+ numDenseFeatures = rule.getFeatureVector().getDenseFeatures().size();
+
+ // === identify the position, and insert the trie nodes as necessary
+ MemoryBasedTrie pos = root;
+ int[] french = rule.getFrench();
+
+ maxSourcePhraseLength = Math.max(maxSourcePhraseLength, french.length);
+
+ for (int k = 0; k < french.length; k++) {
+ int curSymID = french[k];
+
+ /*
+ * Note that the nonTerminal symbol in the french is not cleaned (i.e., will be sth like
+ * [X,1]), but the symbol in the Trie has to be cleaned, so that the match does not care about
+ * the markup (i.e., [X,1] or [X,2] means the same thing, that is X) if
+ * (Vocabulary.nt(french[k])) { curSymID = modelReader.cleanNonTerminal(french[k]); if
+ * (logger.isLoggable(Level.FINEST)) logger.finest("Amended to: " + curSymID); }
+ */
+
+ MemoryBasedTrie nextLayer = (MemoryBasedTrie) pos.match(curSymID);
+ if (null == nextLayer) {
+ nextLayer = new MemoryBasedTrie();
+ if (pos.hasExtensions() == false) {
+ pos.childrenTbl = new HashMap<Integer, MemoryBasedTrie>();
+ }
+ pos.childrenTbl.put(curSymID, nextLayer);
+ }
+ pos = nextLayer;
+ }
+
+ // === add the rule into the trie node
+ if (!pos.hasRules()) {
+ pos.ruleBin = new MemoryBasedRuleBin(rule.getArity(), rule.getFrench());
+ this.qtyRuleBins++;
+ }
+ pos.ruleBin.addRule(rule);
+ }
+
+ protected void printGrammar() {
+ LOG.info("MemoryBasedBatchGrammar: Read {} rules with {} distinct source sides from '{}'",
+ this.qtyRulesRead, this.qtyRuleBins, grammarFile);
+ }
+
+ /***
+ * Takes an input word and creates an OOV rule in the current grammar for that word.
+ *
+ * @param sourceWord integer representation of word
+ * @param featureFunctions {@link java.util.List} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s
+ */
+ @Override
+ public void addOOVRules(int sourceWord, List<FeatureFunction> featureFunctions) {
+
+ // TODO: _OOV shouldn't be outright added, since the word might not be OOV for the LM (but now
+ // almost
+ // certainly is)
+ final int targetWord = this.joshuaConfiguration.mark_oovs ? Vocabulary.id(Vocabulary
+ .word(sourceWord) + "_OOV") : sourceWord;
+
+ int[] sourceWords = { sourceWord };
+ int[] targetWords = { targetWord };
+ final String oovAlignment = "0-0";
+
+ if (this.joshuaConfiguration.oovList != null && this.joshuaConfiguration.oovList.size() != 0) {
+ for (OOVItem item : this.joshuaConfiguration.oovList) {
+ Rule oovRule = new Rule(Vocabulary.id(item.label), sourceWords, targetWords, "", 0,
+ oovAlignment);
+ addRule(oovRule);
+ oovRule.estimateRuleCost(featureFunctions);
+ }
+ } else {
+ int nt_i = Vocabulary.id(this.joshuaConfiguration.default_non_terminal);
+ Rule oovRule = new Rule(nt_i, sourceWords, targetWords, "", 0, oovAlignment);
+ addRule(oovRule);
+ oovRule.estimateRuleCost(featureFunctions);
+ }
+ }
+
+ /**
+ * Adds a default set of glue rules.
+ *
+ * @param featureFunctions an {@link java.util.ArrayList} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s
+ */
+ public void addGlueRules(ArrayList<FeatureFunction> featureFunctions) {
+ HieroFormatReader reader = new HieroFormatReader();
+
+ String goalNT = FormatUtils.cleanNonTerminal(joshuaConfiguration.goal_symbol);
+ String defaultNT = FormatUtils.cleanNonTerminal(joshuaConfiguration.default_non_terminal);
+
+ String[] ruleStrings = new String[] {
+ String.format("[%s] ||| %s ||| %s ||| 0", goalNT, Vocabulary.START_SYM,
+ Vocabulary.START_SYM),
+ String.format("[%s] ||| [%s,1] [%s,2] ||| [%s,1] [%s,2] ||| -1", goalNT, goalNT, defaultNT,
+ goalNT, defaultNT),
+ String.format("[%s] ||| [%s,1] %s ||| [%s,1] %s ||| 0", goalNT, goalNT,
+ Vocabulary.STOP_SYM, goalNT, Vocabulary.STOP_SYM) };
+
+ for (String ruleString : ruleStrings) {
+ Rule rule = reader.parseLine(ruleString);
+ addRule(rule);
+ rule.estimateRuleCost(featureFunctions);
+ }
+ }
+
+ @Override
+ public int getNumDenseFeatures() {
+ return numDenseFeatures;
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedRuleBin.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedRuleBin.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedRuleBin.java
new file mode 100644
index 0000000..f91df1e
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedRuleBin.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.tm.hash_based;
+
+import org.apache.joshua.decoder.ff.tm.BasicRuleCollection;
+import org.apache.joshua.decoder.ff.tm.Rule;
+
+/**
+ * Stores a collection of all rules with the same french side (and thus same arity).
+ *
+ * @author Zhifei Li, zhifei.work@gmail.com
+ */
+public class MemoryBasedRuleBin extends BasicRuleCollection {
+
+ /**
+ * Constructs an initially empty rule collection.
+ *
+ * @param arity Number of nonterminals in the source pattern
+ * @param sourceTokens Sequence of terminals and nonterminals in the source pattern
+ */
+ public MemoryBasedRuleBin(int arity, int[] sourceTokens) {
+ super(arity, sourceTokens);
+ }
+
+ /**
+ * Adds a rule to this collection.
+ *
+ * @param rule Rule to add to this collection.
+ */
+ public void addRule(Rule rule) {
+ // XXX This if clause seems bogus.
+ if (rules.size() <= 0) { // first time
+ this.arity = rule.getArity();
+ this.sourceTokens = rule.getFrench();
+ }
+ if (rule.getArity() != this.arity) {
+ return;
+ }
+ rules.add(rule);
+ sorted = false;
+ rule.setFrench(this.sourceTokens);
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedTrie.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedTrie.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedTrie.java
new file mode 100644
index 0000000..998688a
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedTrie.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.tm.hash_based;
+
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Iterator;
+
+import org.apache.joshua.decoder.ff.tm.RuleCollection;
+import org.apache.joshua.decoder.ff.tm.Trie;
+
+/**
+ * @author Zhifei Li, zhifei.work@gmail.com
+ */
+public class MemoryBasedTrie implements Trie {
+ MemoryBasedRuleBin ruleBin = null;
+ HashMap<Integer, MemoryBasedTrie> childrenTbl = null;
+
+ public MemoryBasedTrie() {
+ }
+
+ @Override
+ public Trie match(int wordID) {
+ if (childrenTbl != null)
+ return childrenTbl.get(wordID);
+ return null;
+ }
+
+ /* See Javadoc for Trie interface. */
+ public boolean hasExtensions() {
+ return (null != this.childrenTbl);
+ }
+
+ public HashMap<Integer, MemoryBasedTrie> getChildren() {
+ return this.childrenTbl;
+ }
+
+ public void setExtensions(HashMap<Integer, MemoryBasedTrie> tbl_children_) {
+ this.childrenTbl = tbl_children_;
+ }
+
+ /* See Javadoc for Trie interface. */
+ public boolean hasRules() {
+ return (null != this.ruleBin);
+ }
+
+ public void setRuleBin(MemoryBasedRuleBin rb) {
+ ruleBin = rb;
+ }
+
+ /* See Javadoc for Trie interface. */
+ public RuleCollection getRuleCollection() {
+ return this.ruleBin;
+ }
+
+ /* See Javadoc for Trie interface. */
+ public Collection<MemoryBasedTrie> getExtensions() {
+ if (this.childrenTbl != null)
+ return this.childrenTbl.values();
+ return null;
+ }
+
+ @Override
+ public Iterator<Integer> getTerminalExtensionIterator() {
+ return new ExtensionIterator(childrenTbl, true);
+ }
+
+ @Override
+ public Iterator<Integer> getNonterminalExtensionIterator() {
+ return new ExtensionIterator(childrenTbl, false);
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/package-info.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/package-info.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/package-info.java
new file mode 100644
index 0000000..695a0a4
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/**
+ * Provides implementations of hierarchical phrase-based translation grammars.
+ */
+package org.apache.joshua.decoder.ff.tm.hash_based;
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/package-info.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/package-info.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/package-info.java
new file mode 100644
index 0000000..b804db6
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/package-info.java
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/**
+ * Defines interfaces and provides infrastructure for hierarchical
+ * phrase-based translation grammars.
+ */
+package org.apache.joshua.decoder.ff.tm;
+