You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/06/23 18:46:08 UTC
[57/60] incubator-joshua git commit: Merge branch
'maven-multi-module' of https://github.com/logogin/incubator-joshua into
maven-multi-module
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/1bb8a203/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Rule.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Rule.java
index 0000000,0e5a4bc..15fbec1
mode 000000,100644..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Rule.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Rule.java
@@@ -1,0 -1,633 +1,635 @@@
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+ package org.apache.joshua.decoder.ff.tm;
+
++import static org.apache.joshua.decoder.ff.tm.OwnerMap.UNKNOWN_OWNER_ID;
++
+ import java.util.ArrayList;
+ import java.util.Arrays;
+ import java.util.Comparator;
+ import java.util.HashMap;
+ import java.util.List;
+ import java.util.Map;
+ import java.util.regex.Pattern;
+
+ import com.google.common.base.Supplier;
+ import com.google.common.base.Suppliers;
+
+ import org.apache.joshua.corpus.Vocabulary;
+ import org.apache.joshua.decoder.ff.FeatureFunction;
+ import org.apache.joshua.decoder.ff.FeatureVector;
+ import org.apache.joshua.decoder.segment_file.Sentence;
+ import org.slf4j.Logger;
+ import org.slf4j.LoggerFactory;
+
+ /**
+ * This class define the interface for Rule.
+ *
+ * All feature scores are interpreted as negative log probabilities, and are therefore negated.
+ * Note that not all features need to be negative log probs, but you should be aware that they
+ * will be negated, so if you want a positive count, it should come in as negative.
+ *
+ * Normally, the feature score in the rule should be *cost* (i.e., -LogP), so that the feature
+ * weight should be positive
+ *
+ * @author Zhifei Li, zhifei.work@gmail.com
+ * @author Matt Post post@cs.jhu.edu
+ */
+ public class Rule implements Comparator<Rule>, Comparable<Rule> {
+
+ private static final Logger LOG = LoggerFactory.getLogger(Rule.class);
+ private int lhs; // tag of this rule
+ private int[] source; // pointer to the RuleCollection, as all the rules under it share the same
+ // Source side
+ protected int arity;
+
+ // And a string containing the sparse ones
+ //protected final String sparseFeatureString;
+ protected final Supplier<String> sparseFeatureStringSupplier;
+ private final Supplier<FeatureVector> featuresSupplier;
+
+ /*
+ * a feature function will be fired for this rule only if the owner of the rule matches the owner
+ * of the feature function
+ */
- private int owner = -1;
++ private OwnerId owner = UNKNOWN_OWNER_ID;
+
+ /**
+ * This is the cost computed only from the features present with the grammar rule. This cost is
+ * needed to sort the rules in the grammar for cube pruning, but isn't the full cost of applying
+ * the rule (which will include contextual features that can't be computed until the rule is
+ * applied).
+ */
+ private float estimatedCost = Float.NEGATIVE_INFINITY;
+
+ private float precomputableCost = Float.NEGATIVE_INFINITY;
+
+ private int[] target;
+
+ // The alignment string, e.g., 0-0 0-1 1-1 2-1
+ private String alignmentString;
+ private final Supplier<byte[]> alignmentSupplier;
+
+ /**
+ * Constructs a new rule using the provided parameters. Rule id for this rule is
+ * undefined. Note that some of the sparse features may be unlabeled, but they cannot be mapped to
+ * their default names ("tm_OWNER_INDEX") until later, when we know the owner of the rule. This is
+ * not known until the rule is actually added to a grammar in Grammar::addRule().
+ *
+ * Constructor used by other constructors below;
+ *
+ * @param lhs Left-hand side of the rule.
+ * @param source Source language right-hand side of the rule.
+ * @param target Target language right-hand side of the rule.
+ * @param sparseFeatures Feature value scores for the rule.
+ * @param arity Number of nonterminals in the source language right-hand side.
+ * @param owner todo
+ */
- public Rule(int lhs, int[] source, int[] target, String sparseFeatures, int arity, int owner) {
++ public Rule(int lhs, int[] source, int[] target, String sparseFeatures, int arity, OwnerId owner) {
+ this.lhs = lhs;
+ this.source = source;
+ this.arity = arity;
+ this.owner = owner;
+ this.target = target;
+ this.sparseFeatureStringSupplier = Suppliers.memoize(() -> { return sparseFeatures; });
+ this.featuresSupplier = initializeFeatureSupplierFromString();
+ this.alignmentSupplier = initializeAlignmentSupplier();
+ }
+
+ /**
+ * Constructor used by PackedGrammar's sortRules()
+ * @param lhs todo
+ * @param sourceRhs todo
+ * @param targetRhs todo
+ * @param features todo
+ * @param arity todo
+ * @param owner todo
+ */
- public Rule(int lhs, int[] sourceRhs, int[] targetRhs, FeatureVector features, int arity, int owner) {
++ public Rule(int lhs, int[] sourceRhs, int[] targetRhs, FeatureVector features, int arity, OwnerId owner) {
+ this.lhs = lhs;
+ this.source = sourceRhs;
+ this.arity = arity;
+ this.owner = owner;
+ this.target = targetRhs;
+ this.featuresSupplier = Suppliers.memoize(() -> { return features; });
+ this.sparseFeatureStringSupplier = initializeSparseFeaturesStringSupplier();
+ this.alignmentSupplier = initializeAlignmentSupplier();
+ }
+
+ /**
+ * Constructor used for SamtFormatReader and GrammarBuilderWalkerFunction's getRuleWithSpans()
- * Owner set to -1
++ * Rule is unowned.
+ * @param lhs todo
+ * @param sourceRhs todo
+ * @param targetRhs todo
+ * @param sparseFeatures todo
+ * @param arity todo
+ */
+ public Rule(int lhs, int[] sourceRhs, int[] targetRhs, String sparseFeatures, int arity) {
- this(lhs, sourceRhs, targetRhs, sparseFeatures, arity, -1);
++ this(lhs, sourceRhs, targetRhs, sparseFeatures, arity, OwnerMap.UNKNOWN_OWNER_ID);
+ }
+
+ /**
+ * Constructor used for addOOVRules(), HieroFormatReader and PhraseRule.
+ * @param lhs todo
+ * @param sourceRhs todo
+ * @param targetRhs todo
+ * @param sparseFeatures todo
+ * @param arity todo
+ * @param alignment todo
+ */
+ public Rule(int lhs, int[] sourceRhs, int[] targetRhs, String sparseFeatures, int arity, String alignment) {
+ this(lhs, sourceRhs, targetRhs, sparseFeatures, arity);
+ this.alignmentString = alignment;
+ }
+
+ /**
+ * Constructor (implicitly) used by PackedRule
+ */
+ public Rule() {
+ this.lhs = -1;
+ this.sparseFeatureStringSupplier = initializeSparseFeaturesStringSupplier();
+ this.featuresSupplier = initializeFeatureSupplierFromString();
+ this.alignmentSupplier = initializeAlignmentSupplier();
+ }
+
+ // ==========================================================================
+ // Lazy loading Suppliers for alignments, feature vector, and feature strings
+ // ==========================================================================
+
+ private Supplier<byte[]> initializeAlignmentSupplier(){
+ return Suppliers.memoize(() ->{
+ byte[] alignment = null;
+ String alignmentString = getAlignmentString();
+ if (alignmentString != null) {
+ String[] tokens = alignmentString.split("[-\\s]+");
+ alignment = new byte[tokens.length];
+ for (int i = 0; i < tokens.length; i++)
+ alignment[i] = (byte) Short.parseShort(tokens[i]);
+ }
+ return alignment;
+ });
+ }
+
+ /**
+ * If Rule was constructed with sparseFeatures String, we lazily populate the
+ * FeatureSupplier.
+ */
+ private Supplier<FeatureVector> initializeFeatureSupplierFromString(){
+ return Suppliers.memoize(() ->{
- if (owner != -1) {
- return new FeatureVector(getFeatureString(), "tm_" + Vocabulary.word(owner) + "_");
++ if (!owner.equals(UNKNOWN_OWNER_ID)) {
++ return new FeatureVector(getFeatureString(), "tm_" + OwnerMap.getOwner(owner) + "_");
+ } else {
+ return new FeatureVector();
+ }
+ });
+ }
+
+ /**
+ * If Rule was constructed with a FeatureVector, we lazily populate the sparseFeaturesStringSupplier.
+ */
+ private Supplier<String> initializeSparseFeaturesStringSupplier() {
+ return Suppliers.memoize(() -> {
+ return getFeatureVector().toString();
+ });
+ }
+
+ // ===============================================================
+ // Attributes
+ // ===============================================================
+
+ public void setEnglish(int[] eng) {
+ this.target = eng;
+ }
+
+ public int[] getEnglish() {
+ return this.target;
+ }
+
+ /**
+ * Two Rules are equal of they have the same LHS, the same source RHS and the same target
+ * RHS.
+ *
+ * @param o the object to check for equality
+ * @return true if o is the same Rule as this rule, false otherwise
+ */
+ public boolean equals(Object o) {
+ if (!(o instanceof Rule)) {
+ return false;
+ }
+ Rule other = (Rule) o;
+ if (getLHS() != other.getLHS()) {
+ return false;
+ }
+ if (!Arrays.equals(getFrench(), other.getFrench())) {
+ return false;
+ }
+ if (!Arrays.equals(target, other.getEnglish())) {
+ return false;
+ }
+ return true;
+ }
+
+ public int hashCode() {
+ // I just made this up. If two rules are equal they'll have the
+ // same hashcode. Maybe someone else can do a better job though?
+ int frHash = Arrays.hashCode(getFrench());
+ int enHash = Arrays.hashCode(target);
+ return frHash ^ enHash ^ getLHS();
+ }
+
+ // ===============================================================
+ // Attributes
+ // ===============================================================
+
+ public void setArity(int arity) {
+ this.arity = arity;
+ }
+
+ public int getArity() {
+ return this.arity;
+ }
+
- public void setOwner(int owner) {
++ public void setOwner(final OwnerId owner) {
+ this.owner = owner;
+ }
+
- public int getOwner() {
++ public OwnerId getOwner() {
+ return this.owner;
+ }
+
+ public void setLHS(int lhs) {
+ this.lhs = lhs;
+ }
+
+ public int getLHS() {
+ return this.lhs;
+ }
+
+ public void setFrench(int[] french) {
+ this.source = french;
+ }
+
+ public int[] getFrench() {
+ return this.source;
+ }
+
+ /**
+ * This function does the work of turning the string version of the sparse features (passed in
+ * when the rule was created) into an actual set of features. This is a bit complicated because we
+ * support intermingled labeled and unlabeled features, where the unlabeled features are mapped to
+ * a default name template of the form "tm_OWNER_INDEX".
+ *
+ * This function returns the dense (phrasal) features discovered when the rule was loaded. Dense
+ * features are the list of unlabeled features that preceded labeled ones. They can also be
+ * specified as labeled features of the form "tm_OWNER_INDEX", but the former format is preferred.
+ *
+ * @return the {@link org.apache.joshua.decoder.ff.FeatureVector} for this rule
+ */
+ public FeatureVector getFeatureVector() {
+ return featuresSupplier.get();
+ }
+
+ /**
+ * This function returns the estimated cost of a rule, which should have been computed when the
+ * grammar was first sorted via a call to Rule::estimateRuleCost(). This function is a getter
+ * only; it will not compute the value if it has not already been set. It is necessary in addition
+ * to estimateRuleCost(models) because sometimes the value needs to be retrieved from contexts
+ * that do not have access to the feature functions.
+ *
+ * This function is called by the rule comparator when sorting the grammar. As such it may be
+ * called many times and any implementation of it should be a cached implementation.
+ *
+ * @return the estimated cost of the rule (a lower bound on the true cost)
+ */
+ public float getEstimatedCost() {
+ return estimatedCost;
+ }
+
+ /**
+ * Precomputable costs is the inner product of the weights found on each grammar rule and the
+ * weight vector. This is slightly different from the estimated rule cost, which can include other
+ * features (such as a language model estimate). This getter and setter should also be cached, and
+ * is basically provided to allow the PhraseModel feature to cache its (expensive) computation for
+ * each rule.
+ *
+ * The weights are passed in as dense weights and sparse weights. This allows the dense weight
+ * precomputation to be even faster (since we don't have to query a hash map.
+ *
+ * @param dense_weights the dense weights from the model
+ * @param weights the sparse weights from the model
+ */
+ public void setPrecomputableCost(float[] dense_weights, FeatureVector weights) {
+ float cost = 0.0f;
+ FeatureVector features = getFeatureVector();
+ for (int i = 0; i < features.getDenseFeatures().size() && i < dense_weights.length; i++) {
+ cost += dense_weights[i] * features.getDense(i);
+ }
+
+ for (String key: features.getSparseFeatures().keySet()) {
+ cost += weights.getSparse(key) * features.getSparse(key);
+ }
+
+ this.precomputableCost = cost;
+ }
+
+ /**
+ * @return the precomputed model cost of each rule
+ */
+ public float getPrecomputableCost() {
+ return precomputableCost;
+ }
+
+ public float getDenseFeature(int k) {
+ return getFeatureVector().getDense(k);
+ }
+
+ /**
+ * This function estimates the cost of a rule, which is used for sorting the rules for cube
+ * pruning. The estimated cost is basically the set of precomputable features (features listed
+ * along with the rule in the grammar file) along with any other estimates that other features
+ * would like to contribute (e.g., a language model estimate). This cost will be a lower bound on
+ * the rule's actual cost.
+ *
+ * The value of this function is used only for sorting the rules. When the rule is later applied
+ * in context to particular hypernodes, the rule's actual cost is computed.
+ *
+ * @param models the list of models available to the decoder
+ * @return estimated cost of the rule
+ */
+ public float estimateRuleCost(List<FeatureFunction> models) {
+ if (null == models)
+ return 0.0f;
+
+ if (this.estimatedCost <= Float.NEGATIVE_INFINITY) {
+ this.estimatedCost = 0.0f; // weights.innerProduct(computeFeatures());
+
+ LOG.debug("estimateCost({} ;; {})", getFrenchWords(), getEnglishWords());
+ for (FeatureFunction ff : models) {
+ float val = ff.estimateCost(this, null);
+ LOG.debug(" FEATURE {} -> {}", ff.getName(), val);
+ this.estimatedCost += val;
+ }
+ }
+
+ return estimatedCost;
+ }
+
+ // ===============================================================
+ // Methods
+ // ===============================================================
+
+ public String toString() {
+ StringBuffer sb = new StringBuffer();
+ sb.append(Vocabulary.word(this.getLHS()));
+ sb.append(" ||| ");
+ sb.append(getFrenchWords());
+ sb.append(" ||| ");
+ sb.append(getEnglishWords());
+ sb.append(" |||");
+ sb.append(" " + getFeatureVector());
+ sb.append(String.format(" ||| est=%.3f", getEstimatedCost()));
+ sb.append(String.format(" pre=%.3f", getPrecomputableCost()));
+ return sb.toString();
+ }
+
+ /**
+ * Returns a version of the rule suitable for reading in from a text file.
+ *
+ * @return string version of the rule
+ */
+ public String textFormat() {
+ StringBuffer sb = new StringBuffer();
+ sb.append(Vocabulary.word(this.getLHS()));
+ sb.append(" |||");
+
+ int nt = 1;
+ for (int i = 0; i < getFrench().length; i++) {
+ if (getFrench()[i] < 0)
+ sb.append(" " + Vocabulary.word(getFrench()[i]).replaceFirst("\\]", String.format(",%d]", nt++)));
+ else
+ sb.append(" " + Vocabulary.word(getFrench()[i]));
+ }
+ sb.append(" |||");
+ nt = 1;
+ for (int i = 0; i < getEnglish().length; i++) {
+ if (getEnglish()[i] < 0)
+ sb.append(" " + Vocabulary.word(getEnglish()[i]).replaceFirst("\\]", String.format(",%d]", nt++)));
+ else
+ sb.append(" " + Vocabulary.word(getEnglish()[i]));
+ }
+ sb.append(" |||");
+ sb.append(" " + getFeatureString());
+ if (getAlignmentString() != null)
+ sb.append(" ||| " + getAlignmentString());
+ return sb.toString();
+ }
+
+ public String getFeatureString() {
+ return sparseFeatureStringSupplier.get();
+ }
+
+ /**
+ * Returns an alignment as a sequence of integers. The integers at positions i and i+1 are paired,
+ * with position i indexing the source and i+1 the target.
+ *
+ * @return a byte[] from the {@link com.google.common.base.Supplier}
+ */
+ public byte[] getAlignment() {
+ return this.alignmentSupplier.get();
+ }
+
+ public String getAlignmentString() {
+ return this.alignmentString;
+ }
+
+ /**
+ * The nonterminals on the English side are pointers to the source side nonterminals (-1 and -2),
+ * rather than being directly encoded. These number indicate the correspondence between the
+ * nonterminals on each side, introducing a level of indirection however when we want to resolve
+ * them. So to get the ID, we need to look up the corresponding source side ID.
+ *
+ * @return The string of English words
+ */
+ public String getEnglishWords() {
+ int[] foreignNTs = getForeignNonTerminals();
+
+ StringBuilder sb = new StringBuilder();
+ for (Integer index : getEnglish()) {
+ if (index >= 0)
+ sb.append(Vocabulary.word(index) + " ");
+ else
+ sb.append(Vocabulary.word(foreignNTs[-index - 1]).replace("]",
+ String.format(",%d] ", Math.abs(index))));
+ }
+
+ return sb.toString().trim();
+ }
+
+ public boolean isTerminal() {
+ for (int i = 0; i < getEnglish().length; i++)
+ if (getEnglish()[i] < 0)
+ return false;
+
+ return true;
+ }
+
+ /**
+ * Return the French (source) nonterminals as list of Strings
+ *
+ * @return a list of strings
+ */
+ public int[] getForeignNonTerminals() {
+ int[] nts = new int[getArity()];
+ int index = 0;
+ for (int id : getFrench())
+ if (id < 0)
+ nts[index++] = -id;
+ return nts;
+ }
+
+ /**
+ * Returns an array of size getArity() containing the source indeces of non terminals.
+ *
+ * @return an array of size getArity() containing the source indeces of non terminals
+ */
+ public int[] getNonTerminalSourcePositions() {
+ int[] nonTerminalPositions = new int[getArity()];
+ int ntPos = 0;
+ for (int sourceIdx = 0; sourceIdx < getFrench().length; sourceIdx++) {
+ if (getFrench()[sourceIdx] < 0)
+ nonTerminalPositions[ntPos++] = sourceIdx;
+ }
+ return nonTerminalPositions;
+ }
+
+ /**
+ * Parses the Alignment byte[] into a Map from target to (possibly a list of) source positions.
+ * Used by the WordAlignmentExtractor.
+ *
+ * @return a {@link java.util.Map} of alignments
+ */
+ public Map<Integer, List<Integer>> getAlignmentMap() {
+ byte[] alignmentArray = getAlignment();
+ Map<Integer, List<Integer>> alignmentMap = new HashMap<Integer, List<Integer>>();
+ if (alignmentArray != null) {
+ for (int alignmentIdx = 0; alignmentIdx < alignmentArray.length; alignmentIdx += 2 ) {
+ int s = alignmentArray[alignmentIdx];
+ int t = alignmentArray[alignmentIdx + 1];
+ List<Integer> values = alignmentMap.get(t);
+ if (values == null)
+ alignmentMap.put(t, values = new ArrayList<Integer>());
+ values.add(s);
+ }
+ }
+ return alignmentMap;
+ }
+
+ /**
+ * Return the English (target) nonterminals as list of Strings
+ *
+ * @return list of strings
+ */
+ public int[] getEnglishNonTerminals() {
+ int[] nts = new int[getArity()];
+ int[] foreignNTs = getForeignNonTerminals();
+ int index = 0;
+
+ for (int i : getEnglish()) {
+ if (i < 0)
+ nts[index++] = foreignNTs[Math.abs(getEnglish()[i]) - 1];
+ }
+
+ return nts;
+ }
+
+ private int[] getNormalizedEnglishNonterminalIndices() {
+ int[] result = new int[getArity()];
+
+ int ntIndex = 0;
+ for (Integer index : getEnglish()) {
+ if (index < 0)
+ result[ntIndex++] = -index - 1;
+ }
+
+ return result;
+ }
+
+ public boolean isInverting() {
+ int[] normalizedEnglishNonTerminalIndices = getNormalizedEnglishNonterminalIndices();
+ if (normalizedEnglishNonTerminalIndices.length == 2) {
+ if (normalizedEnglishNonTerminalIndices[0] == 1) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ public String getFrenchWords() {
+ return Vocabulary.getWords(getFrench());
+ }
+
+ public static final String NT_REGEX = "\\[[^\\]]+?\\]";
+
+ private Pattern getPattern() {
+ String source = getFrenchWords();
+ String pattern = Pattern.quote(source);
+ pattern = pattern.replaceAll(NT_REGEX, "\\\\E.+\\\\Q");
+ pattern = pattern.replaceAll("\\\\Q\\\\E", "");
+ pattern = "(?:^|\\s)" + pattern + "(?:$|\\s)";
+ return Pattern.compile(pattern);
+ }
+
+ /**
+ * Matches the string representation of the rule's source side against a sentence
+ *
+ * @param sentence {@link org.apache.joshua.lattice.Lattice} input
+ * @return true if there is a match
+ */
+ public boolean matches(Sentence sentence) {
+ boolean match = getPattern().matcher(sentence.fullSource()).find();
+ // System.err.println(String.format("match(%s,%s) = %s", Pattern.quote(getFrenchWords()),
+ // sentence.annotatedSource(), match));
+ return match;
+ }
+
+ /**
+ * This comparator is used for sorting the rules during cube pruning. An estimate of the cost
+ * of each rule is computed and used to sort.
+ */
+ public static Comparator<Rule> EstimatedCostComparator = new Comparator<Rule>() {
+ public int compare(Rule rule1, Rule rule2) {
+ float cost1 = rule1.getEstimatedCost();
+ float cost2 = rule2.getEstimatedCost();
+ return Float.compare(cost2, cost1);
+ }
+ };
+
+ public int compare(Rule rule1, Rule rule2) {
+ return EstimatedCostComparator.compare(rule1, rule2);
+ }
+
+ public int compareTo(Rule other) {
+ return EstimatedCostComparator.compare(this, other);
+ }
+
+ public String getRuleString() {
+ return String.format("%s -> %s ||| %s", Vocabulary.word(getLHS()), getFrenchWords(), getEnglishWords());
+ }
+ }
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/1bb8a203/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/SentenceFilteredGrammar.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/SentenceFilteredGrammar.java
index 0000000,c952b05..54f68b2
mode 000000,100644..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/SentenceFilteredGrammar.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/SentenceFilteredGrammar.java
@@@ -1,0 -1,366 +1,366 @@@
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+ package org.apache.joshua.decoder.ff.tm;
+
+ import java.util.Collection;
+ import java.util.HashMap;
+ import java.util.Iterator;
+ import java.util.Map.Entry;
+
+ import org.apache.joshua.decoder.ff.tm.hash_based.ExtensionIterator;
+ import org.apache.joshua.decoder.ff.tm.hash_based.MemoryBasedBatchGrammar;
+ import org.apache.joshua.decoder.segment_file.Sentence;
+ import org.slf4j.Logger;
+ import org.slf4j.LoggerFactory;
+
+ /**
+ * This class implements dynamic sentence-level filtering. This is accomplished with a parallel
+ * trie, a subset of the original trie, that only contains trie paths that are reachable from
+ * traversals of the current sentence.
+ *
+ * @author Matt Post post@cs.jhu.edu
+ */
+ public class SentenceFilteredGrammar extends MemoryBasedBatchGrammar {
+
+ private static final Logger LOG = LoggerFactory.getLogger(SentenceFilteredGrammar.class);
+
+ private AbstractGrammar baseGrammar;
+ private SentenceFilteredTrie filteredTrie;
+ private int[] tokens;
+ private Sentence sentence;
+
+ /**
+ * Construct a new sentence-filtered grammar. The main work is done in the enclosed trie (obtained
+ * from the base grammar, which contains the complete grammar).
+ *
+ * @param baseGrammar a new {@link org.apache.joshua.decoder.ff.tm.AbstractGrammar} to populate
+ * @param sentence {@link org.apache.joshua.lattice.Lattice} input
+ */
+ SentenceFilteredGrammar(AbstractGrammar baseGrammar, Sentence sentence) {
- super(baseGrammar.joshuaConfiguration);
++ super(OwnerMap.getOwner(baseGrammar.getOwner()), baseGrammar.joshuaConfiguration, baseGrammar.getSpanLimit());
+ this.baseGrammar = baseGrammar;
+ this.sentence = sentence;
+ this.tokens = sentence.getWordIDs();
+
+ int origCount = getNumRules(baseGrammar.getTrieRoot());
+ long startTime = System.currentTimeMillis();
+
+ /* Filter the rules; returns non-null object */
+ this.filteredTrie = filter(baseGrammar.getTrieRoot());
+ int filteredCount = getNumRules();
+
+ float seconds = (System.currentTimeMillis() - startTime) / 1000.0f;
+
+ LOG.debug("Sentence-level filtering of sentence {} ({} -> {} rules) in {} seconds",
+ sentence.id(), origCount, filteredCount, seconds);
+ }
+
+ @Override
+ public Trie getTrieRoot() {
+ return filteredTrie;
+ }
+
+ /**
+ * This function is poorly named: it doesn't mean whether a rule exists in the grammar for the
+ * current span, but whether the grammar is permitted to apply rules to the current span (a
+ * grammar-level parameter). As such we can just chain to the underlying grammar.
+ */
+ @Override
+ public boolean hasRuleForSpan(int startIndex, int endIndex, int pathLength) {
+ return baseGrammar.hasRuleForSpan(startIndex, endIndex, pathLength);
+ }
+
+ @Override
+ public int getNumRules() {
+ return getNumRules(getTrieRoot());
+ }
+
+ /**
+ * A convenience function that counts the number of rules in a grammar's trie.
+ *
+ * @param node the {@link org.apache.joshua.decoder.ff.tm.Trie} implementation for which to count rules
+ * @return the number of rules
+ */
+ public int getNumRules(Trie node) {
+ int numRules = 0;
+ if (node != null) {
+ if (node.getRuleCollection() != null)
+ numRules += node.getRuleCollection().getRules().size();
+
+ if (node.getExtensions() != null)
+ for (Trie child : node.getExtensions())
+ numRules += getNumRules(child);
+ }
+
+ return numRules;
+ }
+
+ /**
+ * What is the algorithm?
+ *
+ * Take the first word of the sentence, and start at the root of the trie. There are two things to
+ * consider: (a) word matches and (b) nonterminal matches.
+ *
+ * For a word match, simply follow that arc along the trie. We create a parallel arc in our
+ * filtered grammar to represent it. Each arc in the filtered trie knows about its
+ * corresponding/underlying node in the unfiltered grammar trie.
+ *
+ * A nonterminal is always permitted to match. The question then is how much of the input sentence
+ * we imagine it consumed. The answer is that it could have been any amount. So the recursive call
+ * has to be a set of calls, one each to the next trie node with different lengths of the sentence
+ * remaining.
+ *
+ * A problem occurs when we have multiple sequential nonterminals. For scope-3 grammars, there can
+ * be four sequential nonterminals (in the case when they are grounded by terminals on both ends
+ * of the nonterminal chain). We'd like to avoid looking at all possible ways to split up the
+ * subsequence, because with respect to filtering rules, they are all the same.
+ *
+ * We accomplish this with the following restriction: for purposes of grammar filtering, only the
+ * first in a sequence of nonterminal traversals can consume more than one word. Each of the
+ * subsequent ones would have to consume just one word. We then just have to record in the
+ * recursive call whether the last traversal was a nonterminal or not.
+ *
+ * @param unfilteredTrieRoot todo
+ * @return the root of the filtered trie
+ */
+ private SentenceFilteredTrie filter(Trie unfilteredTrieRoot) {
+ SentenceFilteredTrie filteredTrieRoot = new SentenceFilteredTrie(unfilteredTrieRoot);
+
+ // System.err.println(String.format("FILTERING TO SENTENCE\n %s\n",
+ // Vocabulary.getWords(tokens)));
+
+ /*
+ * The root of the trie is where rule applications start, so we simply try all possible
+ * positions in the sentence.
+ */
+ for (int i = 0; i < tokens.length; i++) {
+ filter(i, filteredTrieRoot, false);
+ }
+
+ return filteredTrieRoot;
+ }
+
+ /**
+ * Matches rules against the sentence. Intelligently handles chains of sequential nonterminals.
+ * Marks arcs that are traversable for this sentence.
+ *
+ * @param i the position in the sentence to start matching
+ * @param trie the trie node to match against
+ * @param lastWasNT true if the match that brought us here was against a nonterminal
+ */
+ private void filter(int i, SentenceFilteredTrie trieNode, boolean lastWasNT) {
+ if (i >= tokens.length)
+ return;
+
+ /* Make sure the underlying unfiltered node has children. */
+ Trie unfilteredTrieNode = trieNode.unfilteredTrieNode;
+ if (unfilteredTrieNode.getChildren() == null) {
+ // trieNode.path.retreat();
+ return;
+ }
+
+ /* Match a word */
+ Trie trie = unfilteredTrieNode.match(tokens[i]);
+ if (trie != null) {
+ /*
+ * The current filtered node might already have an arc for this label. If so, retrieve it
+ * (since we still need to follow it); if not, create it.
+ */
+ SentenceFilteredTrie nextFilteredTrie = trieNode.match(tokens[i]);
+ if (nextFilteredTrie == null) {
+ nextFilteredTrie = new SentenceFilteredTrie(trie);
+ trieNode.children.put(tokens[i], nextFilteredTrie);
+ }
+
+ /*
+ * Now continue, trying to match the child node against the next position in the sentence. The
+ * third argument records that this match was not against a nonterminal.
+ */
+ filter(i + 1, nextFilteredTrie, false);
+ }
+
+ /*
+ * Now we attempt to match nonterminals. Any nonterminal is permitted to match any region of the
+ * sentence, up to the maximum span for that grammar. So we enumerate all children of the
+ * current (unfiltered) trie grammar node, looking for nonterminals (items whose label value is
+ * less than 0), then recurse.
+ *
+ * There is one subtlely. Adjacent nonterminals in a grammar rule can match a span (i, j) in (j
+ * - i - 1) ways, but for purposes of determining whether a rule fits, this is all wasted
+ * effort. To handle this, we allow the first nonterminal in a sequence to record 1, 2, 3, ...
+ * terminals (up to the grammar's span limit, or the rest of the sentence, whichever is
+ * shorter). Subsequent adjacent nonterminals are permitted to consume only a single terminal.
+ */
+ HashMap<Integer, ? extends Trie> children = unfilteredTrieNode.getChildren();
+ if (children != null) {
+ for (int label : children.keySet()) {
+ if (label < 0) {
+ SentenceFilteredTrie nextFilteredTrie = trieNode.match(label);
+ if (nextFilteredTrie == null) {
+ nextFilteredTrie = new SentenceFilteredTrie(unfilteredTrieNode.match(label));
+ trieNode.children.put(label, nextFilteredTrie);
+ }
+
+ /*
+ * Recurse. If the last match was a nonterminal, we can only consume one more token.
+ *
+ * TODO: This goes too far by looking at the whole sentence; each grammar has a maximum
+ * span limit which should be consulted. What we should be doing is passing the point
+ * where we started matching the current sentence, so we can apply this span limit, which
+ * is easily accessible (baseGrammar.spanLimit).
+ */
+ int maxJ = lastWasNT ? (i + 1) : tokens.length;
+ for (int j = i + 1; j <= maxJ; j++) {
+ filter(j, nextFilteredTrie, true);
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * Alternate filter that uses regular expressions, walking the grammar trie and matching the
+ * source side of each rule collection against the input sentence. Failed matches are discarded,
+ * and trie nodes extending from that position need not be explored.
+ *
+ * @param unfilteredTrie todo
+ * @return the root of the filtered trie if any rules were retained, otherwise null
+ */
+ @SuppressWarnings("unused")
+ private SentenceFilteredTrie filter_regexp(Trie unfilteredTrie) {
+ SentenceFilteredTrie trie = null;
+
+ /* Case 1: keep the trie node if it has a rule collection that matches the sentence */
+ if (unfilteredTrie.hasRules())
+ if (matchesSentence(unfilteredTrie))
+ trie = new SentenceFilteredTrie(unfilteredTrie);
+ else
+ return null;
+
+ /* Case 2: keep the trie node if it has children who have valid rule collections */
+ if (unfilteredTrie.hasExtensions())
+ for (Entry<Integer, ? extends Trie> arc : unfilteredTrie.getChildren().entrySet()) {
+ Trie unfilteredChildTrie = arc.getValue();
+ SentenceFilteredTrie nextTrie = filter_regexp(unfilteredChildTrie);
+ if (nextTrie != null) {
+ if (trie == null)
+ trie = new SentenceFilteredTrie(unfilteredTrie);
+ trie.children.put(arc.getKey(), nextTrie);
+ }
+ }
+
+ return trie;
+ }
+
+ private boolean matchesSentence(Trie childTrie) {
+ Rule rule = childTrie.getRuleCollection().getRules().get(0);
+ return rule.matches(sentence);
+ }
+
+ /**
+ * Implements a filtered trie, by sitting on top of a base trie and annotating nodes that match
+ * the given input sentence.
+ *
+ * @author Matt Post post@cs.jhu.edu
+ *
+ */
+ public class SentenceFilteredTrie implements Trie {
+
+ /* The underlying unfiltered trie node. */
+ private Trie unfilteredTrieNode;
+
+ /* The child nodes in the filtered trie. */
+ private HashMap<Integer, SentenceFilteredTrie> children = null;
+
+ /**
+ * Constructor.
+ *
+ * @param unfilteredTrieNode todo
+ */
+ public SentenceFilteredTrie(Trie unfilteredTrieNode) {
+ this.unfilteredTrieNode = unfilteredTrieNode;
+ this.children = new HashMap<Integer, SentenceFilteredTrie>();
+ }
+
+ @Override
+ public SentenceFilteredTrie match(int wordID) {
+ if (children != null)
+ return children.get(wordID);
+ return null;
+ }
+
+ @Override
+ public boolean hasExtensions() {
+ return children != null;
+ }
+
+ @Override
+ public Collection<SentenceFilteredTrie> getExtensions() {
+ if (children != null)
+ return children.values();
+
+ return null;
+ }
+
+ @Override
+ public HashMap<Integer, SentenceFilteredTrie> getChildren() {
+ return children;
+ }
+
+ @Override
+ public boolean hasRules() {
+ // Chain to the underlying unfiltered node.
+ return unfilteredTrieNode.hasRules();
+ }
+
+ @Override
+ public RuleCollection getRuleCollection() {
+ // Chain to the underlying unfiltered node, since the rule collection just varies by target
+ // side.
+ return unfilteredTrieNode.getRuleCollection();
+ }
+
+ /**
+ * Counts the number of rules.
+ *
+ * @return the number of rules rooted at this node.
+ */
+ public int getNumRules() {
+ int numRules = 0;
+ if (getTrieRoot() != null)
+ if (getTrieRoot().getRuleCollection() != null)
+ numRules += getTrieRoot().getRuleCollection().getRules().size();
+
+ for (SentenceFilteredTrie node : getExtensions())
+ numRules += node.getNumRules();
+
+ return numRules;
+ }
+
+ @Override
+ public Iterator<Integer> getTerminalExtensionIterator() {
+ return new ExtensionIterator(children, true);
+ }
+
+ @Override
+ public Iterator<Integer> getNonterminalExtensionIterator() {
+ return new ExtensionIterator(children, false);
+ }
+ }
+ }
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/1bb8a203/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
index 0000000,f346e7a..365102b
mode 000000,100644..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
@@@ -1,0 -1,292 +1,279 @@@
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+ package org.apache.joshua.decoder.ff.tm.hash_based;
+
+ import java.io.IOException;
+ import java.util.ArrayList;
+ import java.util.HashMap;
+ import java.util.List;
+
+ import org.apache.joshua.corpus.Vocabulary;
+ import org.apache.joshua.decoder.JoshuaConfiguration;
+ import org.apache.joshua.decoder.JoshuaConfiguration.OOVItem;
+ import org.apache.joshua.decoder.ff.FeatureFunction;
+ import org.apache.joshua.decoder.ff.tm.AbstractGrammar;
++import org.apache.joshua.decoder.ff.tm.OwnerMap;
+ import org.apache.joshua.decoder.ff.tm.Rule;
+ import org.apache.joshua.decoder.ff.tm.GrammarReader;
+ import org.apache.joshua.decoder.ff.tm.Trie;
+ import org.apache.joshua.decoder.ff.tm.format.HieroFormatReader;
+ import org.apache.joshua.decoder.ff.tm.format.MosesFormatReader;
+ import org.apache.joshua.util.FormatUtils;
+ import org.slf4j.Logger;
+ import org.slf4j.LoggerFactory;
+
+ /**
+ * This class implements a memory-based bilingual BatchGrammar.
+ * <p>
+ * The rules are stored in a trie. Each trie node has: (1) RuleBin: a list of rules matching the
+ * french sides so far (2) A HashMap of next-layer trie nodes, the next french word used as the key
+ * in HashMap
+ *
+ * @author Zhifei Li zhifei.work@gmail.com
+ * @author Matt Post post@cs.jhu.edu
+ */
+ public class MemoryBasedBatchGrammar extends AbstractGrammar {
+
+ private static final Logger LOG = LoggerFactory.getLogger(MemoryBasedBatchGrammar.class);
+
+ // ===============================================================
+ // Instance Fields
+ // ===============================================================
+
+ /* The number of rules read. */
+ private int qtyRulesRead = 0;
+
+ /* The number of distinct source sides. */
+ private int qtyRuleBins = 0;
+
+ private int numDenseFeatures = 0;
+
+ /* The trie root. */
- private MemoryBasedTrie root = null;
++ private MemoryBasedTrie root = new MemoryBasedTrie();
+
+ /* The file containing the grammar. */
+ private String grammarFile;
+
+ private GrammarReader<Rule> modelReader;
+
+ // ===============================================================
+ // Static Fields
+ // ===============================================================
+
+ // ===============================================================
+ // Constructors
+ // ===============================================================
+
- public MemoryBasedBatchGrammar(JoshuaConfiguration joshuaConfiguration) {
- super(joshuaConfiguration);
- this.root = new MemoryBasedTrie();
- this.joshuaConfiguration = joshuaConfiguration;
- setSpanLimit(20);
- }
-
- public MemoryBasedBatchGrammar(String owner, JoshuaConfiguration joshuaConfiguration) {
- this(joshuaConfiguration);
- this.owner = Vocabulary.id(owner);
++ /**
++ * Constructor used by Decoder mostly. Default spanLimit of 20
++ */
++ public MemoryBasedBatchGrammar(String owner, JoshuaConfiguration config, int spanLimit) {
++ super(owner, config, spanLimit);
+ }
+
- public MemoryBasedBatchGrammar(GrammarReader<Rule> gr, JoshuaConfiguration joshuaConfiguration) {
- // this.defaultOwner = Vocabulary.id(defaultOwner);
- // this.defaultLHS = Vocabulary.id(defaultLHSSymbol);
- this(joshuaConfiguration);
- modelReader = gr;
++ /**
++ * Constructor to initialize a GrammarReader (unowned)
++ */
++ public MemoryBasedBatchGrammar(
++ final GrammarReader<Rule> reader, final JoshuaConfiguration config, final int spanLimit) {
++ super(OwnerMap.UNKNOWN_OWNER, config, spanLimit);
++ modelReader = reader;
+ }
+
+ public MemoryBasedBatchGrammar(String formatKeyword, String grammarFile, String owner,
+ String defaultLHSSymbol, int spanLimit, JoshuaConfiguration joshuaConfiguration)
+ throws IOException {
+
- this(joshuaConfiguration);
- this.owner = Vocabulary.id(owner);
++ super(owner, joshuaConfiguration, spanLimit);
+ Vocabulary.id(defaultLHSSymbol);
- this.spanLimit = spanLimit;
+ this.grammarFile = grammarFile;
+
+ // ==== loading grammar
+ this.modelReader = createReader(formatKeyword, grammarFile);
+ if (modelReader != null) {
+ for (Rule rule : modelReader)
+ if (rule != null) {
+ addRule(rule);
+ }
+ } else {
+ LOG.info("Couldn't create a GrammarReader for file {} with format {}",
+ grammarFile, formatKeyword);
+ }
+
+ this.printGrammar();
+ }
+
+ protected GrammarReader<Rule> createReader(String format, String grammarFile) throws IOException {
+
+ if (grammarFile != null) {
+ if ("hiero".equals(format) || "thrax".equals(format)) {
+ return new HieroFormatReader(grammarFile);
+ } else if ("moses".equals(format)) {
+ return new MosesFormatReader(grammarFile);
+ } else {
+ throw new RuntimeException(String.format("* FATAL: unknown grammar format '%s'", format));
+ }
+ }
+ return null;
+ }
+
+ // ===============================================================
+ // Methods
+ // ===============================================================
+
- public void setSpanLimit(int spanLimit) {
- this.spanLimit = spanLimit;
- }
-
+ @Override
+ public int getNumRules() {
+ return this.qtyRulesRead;
+ }
+
+ /**
+ * if the span covered by the chart bin is greater than the limit, then return false
+ */
+ public boolean hasRuleForSpan(int i, int j, int pathLength) {
+ if (this.spanLimit == -1) { // mono-glue grammar
+ return (i == 0);
+ } else {
+ // System.err.println(String.format("%s HASRULEFORSPAN(%d,%d,%d)/%d = %s",
+ // Vocabulary.word(this.owner), i, j, pathLength, spanLimit, pathLength <= this.spanLimit));
+ return (pathLength <= this.spanLimit);
+ }
+ }
+
+ public Trie getTrieRoot() {
+ return this.root;
+ }
+
+ /**
+ * Adds a rule to the grammar.
+ */
+ public void addRule(Rule rule) {
+
- // TODO: Why two increments?
+ this.qtyRulesRead++;
+
- // if (owner == -1) {
- // System.err.println("* FATAL: MemoryBasedBatchGrammar::addRule(): owner not set for grammar");
- // System.exit(1);
- // }
+ rule.setOwner(owner);
+
+ if (numDenseFeatures == 0)
+ numDenseFeatures = rule.getFeatureVector().getDenseFeatures().size();
+
+ // === identify the position, and insert the trie nodes as necessary
+ MemoryBasedTrie pos = root;
+ int[] french = rule.getFrench();
+
+ maxSourcePhraseLength = Math.max(maxSourcePhraseLength, french.length);
+
+ for (int k = 0; k < french.length; k++) {
+ int curSymID = french[k];
+
+ /*
+ * Note that the nonTerminal symbol in the french is not cleaned (i.e., will be sth like
+ * [X,1]), but the symbol in the Trie has to be cleaned, so that the match does not care about
+ * the markup (i.e., [X,1] or [X,2] means the same thing, that is X) if
+ * (Vocabulary.nt(french[k])) { curSymID = modelReader.cleanNonTerminal(french[k]); if
+ * (logger.isLoggable(Level.FINEST)) logger.finest("Amended to: " + curSymID); }
+ */
+
+ MemoryBasedTrie nextLayer = (MemoryBasedTrie) pos.match(curSymID);
+ if (null == nextLayer) {
+ nextLayer = new MemoryBasedTrie();
+ if (pos.hasExtensions() == false) {
+ pos.childrenTbl = new HashMap<Integer, MemoryBasedTrie>();
+ }
+ pos.childrenTbl.put(curSymID, nextLayer);
+ }
+ pos = nextLayer;
+ }
+
+ // === add the rule into the trie node
+ if (!pos.hasRules()) {
+ pos.ruleBin = new MemoryBasedRuleBin(rule.getArity(), rule.getFrench());
+ this.qtyRuleBins++;
+ }
+ pos.ruleBin.addRule(rule);
+ }
+
+ protected void printGrammar() {
+ LOG.info("MemoryBasedBatchGrammar: Read {} rules with {} distinct source sides from '{}'",
+ this.qtyRulesRead, this.qtyRuleBins, grammarFile);
+ }
+
+ /***
+ * Takes an input word and creates an OOV rule in the current grammar for that word.
+ *
+ * @param sourceWord integer representation of word
+ * @param featureFunctions {@link java.util.List} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s
+ */
+ @Override
+ public void addOOVRules(int sourceWord, List<FeatureFunction> featureFunctions) {
+
+ // TODO: _OOV shouldn't be outright added, since the word might not be OOV for the LM (but now
+ // almost
+ // certainly is)
+ final int targetWord = this.joshuaConfiguration.mark_oovs ? Vocabulary.id(Vocabulary
+ .word(sourceWord) + "_OOV") : sourceWord;
+
+ int[] sourceWords = { sourceWord };
+ int[] targetWords = { targetWord };
+ final String oovAlignment = "0-0";
+
+ if (this.joshuaConfiguration.oovList != null && this.joshuaConfiguration.oovList.size() != 0) {
+ for (OOVItem item : this.joshuaConfiguration.oovList) {
+ Rule oovRule = new Rule(Vocabulary.id(item.label), sourceWords, targetWords, "", 0,
+ oovAlignment);
+ addRule(oovRule);
+ oovRule.estimateRuleCost(featureFunctions);
+ }
+ } else {
+ int nt_i = Vocabulary.id(this.joshuaConfiguration.default_non_terminal);
+ Rule oovRule = new Rule(nt_i, sourceWords, targetWords, "", 0, oovAlignment);
+ addRule(oovRule);
+ oovRule.estimateRuleCost(featureFunctions);
+ }
+ }
+
+ /**
+ * Adds a default set of glue rules.
+ *
+ * @param featureFunctions an {@link java.util.ArrayList} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s
+ */
+ public void addGlueRules(ArrayList<FeatureFunction> featureFunctions) {
+ HieroFormatReader reader = new HieroFormatReader();
+
+ String goalNT = FormatUtils.cleanNonTerminal(joshuaConfiguration.goal_symbol);
+ String defaultNT = FormatUtils.cleanNonTerminal(joshuaConfiguration.default_non_terminal);
+
+ String[] ruleStrings = new String[] {
+ String.format("[%s] ||| %s ||| %s ||| 0", goalNT, Vocabulary.START_SYM,
+ Vocabulary.START_SYM),
+ String.format("[%s] ||| [%s,1] [%s,2] ||| [%s,1] [%s,2] ||| -1", goalNT, goalNT, defaultNT,
+ goalNT, defaultNT),
+ String.format("[%s] ||| [%s,1] %s ||| [%s,1] %s ||| 0", goalNT, goalNT,
+ Vocabulary.STOP_SYM, goalNT, Vocabulary.STOP_SYM) };
+
+ for (String ruleString : ruleStrings) {
+ Rule rule = reader.parseLine(ruleString);
+ addRule(rule);
+ rule.estimateRuleCost(featureFunctions);
+ }
+ }
+
+ @Override
+ public int getNumDenseFeatures() {
+ return numDenseFeatures;
+ }
+ }