Posted to commits@joshua.apache.org by mj...@apache.org on 2016/08/23 22:17:54 UTC
[37/50] [abbrv] incubator-joshua git commit: Merge branch 'master' into 7
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/dc756709/joshua-core/src/main/java/org/apache/joshua/decoder/ff/FeatureFunction.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/FeatureFunction.java
index fb8c789,0000000..802aadd
mode 100644,000000..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/FeatureFunction.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/FeatureFunction.java
@@@ -1,339 -1,0 +1,339 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff;
+
+import static org.apache.joshua.decoder.ff.FeatureMap.hashFeature;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.chart_parser.SourcePath;
+import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.hypergraph.HGNode;
+import org.apache.joshua.decoder.segment_file.Sentence;
+
+/**
+ * <p>This class defines Joshua's feature function interface, for both sparse and
+ * dense features. It is immediately inherited by StatelessFF and StatefulFF,
+ * which provide functionality common to stateless and stateful features,
+ * respectively. Any feature implementation should extend those classes, and not
+ * this one. The distinction between stateless and stateful features is somewhat
+ * narrow: all features have the opportunity to return an instance of a
+ * {@link DPState} object, and stateless ones just return null.</p>
+ *
+ * <p>Features in Joshua work like templates. Each feature function defines any
+ * number of actual features, which are associated with weights. The task of the
+ * feature function is to compute the features that are fired in different
+ * circumstances and then return the inner product of those features with the
+ * weight vector. Feature functions can also produce estimates of their future
+ * cost (via {@link org.apache.joshua.decoder.ff.FeatureFunction#estimateCost(Rule, Sentence)});
+ * these values are not used in computing the
+ * score, but are only used for sorting rules during cube pruning. The
+ * individual features produced by each template should have globally unique
+ * names; a good convention is to prefix each feature with the name of the
+ * template that produced it.</p>
+ *
+ * <p>Joshua does not retain individual feature values while decoding, since this
+ * requires keeping a sparse feature vector along every hyperedge, which can be
+ * expensive. Instead, it computes only the weighted cost of each edge. If the
+ * individual feature values are requested, the feature functions are replayed
+ * in post-processing, say during k-best list extraction. This is implemented in
+ * a generic way by passing an {@link Accumulator} object to the compute()
+ * function. During decoding, the accumulator simply sums weighted features in a
+ * scalar. During k-best extraction, when individual feature values are needed,
+ * a {@link FeatureAccumulator} is used to retain the individual values.</p>
+ *
+ * @author Matt Post post@cs.jhu.edu
+ * @author Juri Ganitkevich juri@cs.jhu.edu
+ */
+public abstract class FeatureFunction {
+
+ /*
+ * The name of the feature function; this generally matches the weight name on
+ * the config file. This can also be used as a prefix for feature / weight
+ * names, for templates that define multiple features.
+ */
+ protected String name = null;
+
+ /*
 + * The hashed feature id corresponding to name. If name is changed, this should
 + * be updated as well; it provides a good default id for most cases.
+ */
+ protected int featureId;
+
+ // The list of arguments passed to the feature, and the hash for the parsed args
+ protected final String[] args;
+ protected final HashMap<String, String> parsedArgs;
+
+ /*
 + * The global weight vector used by the decoder, passed in when the feature is
+ * instantiated
+ */
+ protected final FeatureVector weights;
+
+ /* The config */
- protected JoshuaConfiguration config;
++ protected final JoshuaConfiguration config;
+
+ public String getName() {
+ return name;
+ }
+
+ // Whether the feature has state.
+ public abstract boolean isStateful();
+
+ public FeatureFunction(FeatureVector weights, String name, String[] args, JoshuaConfiguration config) {
+ this.weights = weights;
+ this.name = name;
+ this.featureId = FeatureMap.hashFeature(this.name);
+ this.args = args;
+ this.config = config;
+ this.parsedArgs = FeatureFunction.parseArgs(args);
+ }
+
+ public String logString() {
+ return String.format("%s (weight %.3f)", name, weights.getOrDefault(hashFeature(name)));
+ }
+
+ /**
+ * This is the main function for defining feature values. The implementor
+ * should compute all the features along the hyperedge, calling
+ * {@link org.apache.joshua.decoder.ff.FeatureFunction.Accumulator#add(String, float)}
+ * for each feature. It then returns the newly-computed dynamic
+ * programming state for this feature (for example, for the
+ * {@link org.apache.joshua.decoder.ff.lm.LanguageModelFF} feature, this returns the new language model
+ * context). For stateless features, this value is null.
+ *
+ * Note that the accumulator accumulates *unweighted* feature values. The
 + * feature vector is multiplied by the weight vector later on.
+ *
+ * @param rule {@link org.apache.joshua.decoder.ff.tm.Rule} to be utilized within computation
+ * @param tailNodes {@link java.util.List} of {@link org.apache.joshua.decoder.hypergraph.HGNode} tail nodes
 + * @param i span start index
 + * @param j span end index
+ * @param sourcePath information about a path taken through the source {@link org.apache.joshua.lattice.Lattice}
+ * @param sentence {@link org.apache.joshua.lattice.Lattice} input
+ * @param acc {@link org.apache.joshua.decoder.ff.FeatureFunction.Accumulator} object permitting generalization of feature computation
+ * @return the new dynamic programming state (null for stateless features)
+ */
+ public abstract DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j,
+ SourcePath sourcePath, Sentence sentence, Accumulator acc);
+
+ /**
 + * Feature functions must override this. StatefulFF and StatelessFF provide
+ * reasonable defaults since most features do not fire on the goal node.
+ *
+ * @param tailNode single {@link org.apache.joshua.decoder.hypergraph.HGNode} representing tail node
 + * @param i span start index
 + * @param j span end index
+ * @param sourcePath information about a path taken through the source {@link org.apache.joshua.lattice.Lattice}
+ * @param sentence {@link org.apache.joshua.lattice.Lattice} input
+ * @param acc {@link org.apache.joshua.decoder.ff.FeatureFunction.Accumulator} object permitting generalization of feature computation
+ * @return the DPState (null if none)
+ */
+ public abstract DPState computeFinal(HGNode tailNode, int i, int j, SourcePath sourcePath,
+ Sentence sentence, Accumulator acc);
+
+ /**
+ * This is a convenience function for retrieving the features fired when
+ * applying a rule, provided for backward compatibility.
+ *
+ * Returns the *unweighted* cost of the features delta computed at this
+ * position. Note that this is a feature delta, so existing feature costs of
+ * the tail nodes should not be incorporated, and it is very important not to
+ * incorporate the feature weights. This function is used in the kbest
+ * extraction code but could also be used in computing the cost.
+ *
+ * @param rule {@link org.apache.joshua.decoder.ff.tm.Rule} to be utilized within computation
+ * @param tailNodes {@link java.util.List} of {@link org.apache.joshua.decoder.hypergraph.HGNode} tail nodes
 + * @param i span start index
 + * @param j span end index
+ * @param sourcePath information about a path taken through the source {@link org.apache.joshua.lattice.Lattice}
+ * @param sentence {@link org.apache.joshua.lattice.Lattice} input
+ * @return an *unweighted* feature delta
+ */
+ public final FeatureVector computeFeatures(Rule rule, List<HGNode> tailNodes, int i, int j,
+ SourcePath sourcePath, Sentence sentence) {
+
+ FeatureAccumulator features = new FeatureAccumulator();
+ compute(rule, tailNodes, i, j, sourcePath, sentence, features);
+ return features.getFeatures();
+ }
+
+ /**
+ * This function is called for the final transition. For example, the
+ * LanguageModel feature function treats the last rule specially. It needs to
+ * return the *weighted* cost of applying the feature. Provided for backward
+ * compatibility.
+ *
+ * @param tailNode single {@link org.apache.joshua.decoder.hypergraph.HGNode} representing tail node
 + * @param i span start index
 + * @param j span end index
+ * @param sourcePath information about a path taken through the source {@link org.apache.joshua.lattice.Lattice}
+ * @param sentence {@link org.apache.joshua.lattice.Lattice} input
+ * @return a *weighted* feature cost
+ */
+ public final float computeFinalCost(HGNode tailNode, int i, int j, SourcePath sourcePath,
+ Sentence sentence) {
+
+ ScoreAccumulator score = new ScoreAccumulator();
+ computeFinal(tailNode, i, j, sourcePath, sentence, score);
+ return score.getScore();
+ }
+
+ /**
+ * Returns the *unweighted* feature delta for the final transition (e.g., for
+ * the language model feature function). Provided for backward compatibility.
+ *
+ * @param tailNode single {@link org.apache.joshua.decoder.hypergraph.HGNode} representing tail node
 + * @param i span start index
 + * @param j span end index
+ * @param sourcePath information about a path taken through the source {@link org.apache.joshua.lattice.Lattice}
+ * @param sentence {@link org.apache.joshua.lattice.Lattice} input
 + * @return an *unweighted* feature vector
+ */
+ public final FeatureVector computeFinalFeatures(HGNode tailNode, int i, int j,
+ SourcePath sourcePath, Sentence sentence) {
+
+ FeatureAccumulator features = new FeatureAccumulator();
+ computeFinal(tailNode, i, j, sourcePath, sentence, features);
+ return features.getFeatures();
+ }
+
+ /**
+ * This function is called when sorting rules for cube pruning. It must return
+ * the *weighted* estimated cost of applying a feature. This need not be the
+ * actual cost of applying the rule in context. Basically, it's the inner
+ * product of the weight vector and all features found in the grammar rule,
+ * though some features (like LanguageModelFF) can also compute some of their
 + * values. This is just an estimate of the cost, which helps produce a better
 + * sorting. Later, the real cost of this feature function is computed via
 + * compute().
+ *
+ * @param rule {@link org.apache.joshua.decoder.ff.tm.Rule} to be utilized within computation
+ * @param sentence {@link org.apache.joshua.lattice.Lattice} input
+ * @return the *weighted* cost of applying the feature.
+ */
+ public abstract float estimateCost(Rule rule, Sentence sentence);
+
+ /**
 + * This function is called to produce a *weighted estimate* of the future cost
 + * of applying this feature. This value is not incorporated into the model
 + * score but is used in pruning decisions. Stateless features return 0.0f by
 + * default, but stateful features might want to override this.
+ *
+ * @param rule {@link org.apache.joshua.decoder.ff.tm.Rule} to be utilized within computation
 + * @param state the dynamic programming state of this feature
+ * @param sentence {@link org.apache.joshua.lattice.Lattice} input
+ * @return the *weighted* future cost estimate of applying this rule in
+ * context.
+ */
+ public abstract float estimateFutureCost(Rule rule, DPState state, Sentence sentence);
+
+ /**
 + * Parses the arguments passed to a feature function in the Joshua config file.
 + * TODO: Replace this with a proper CLI library at some point.
 + * Expects key-value pairs in the form "-argname value". Any key without a value
 + * is added with an empty string as its value. Multiple values for the same key
 + * are not parsed; the first one is used.
+ *
+ * @param args A string with the raw arguments and their names
+ * @return A hash with the keys and the values of the string
+ */
+ public static HashMap<String, String> parseArgs(String[] args) {
- HashMap<String, String> parsedArgs = new HashMap<String, String>();
++ HashMap<String, String> parsedArgs = new HashMap<>();
+ boolean lookingForValue = false;
+ String currentKey = null;
- for (int i = 0; i < args.length; i++) {
++ for (String arg : args) {
+
+ Pattern argKeyPattern = Pattern.compile("^-[a-zA-Z]\\S+");
- Matcher argKey = argKeyPattern.matcher(args[i]);
++ Matcher argKey = argKeyPattern.matcher(arg);
+ if (argKey.find()) {
+ // This is a key
+ // First check to see if there is a key that is waiting to be written
+ if (lookingForValue) {
+ // This is a key with no specified value
+ parsedArgs.put(currentKey, "");
+ }
+ // Now store the new key and look for its value
- currentKey = args[i].substring(1);
++ currentKey = arg.substring(1);
+ lookingForValue = true;
+ } else {
+ // This is a value
+ if (lookingForValue) {
- parsedArgs.put(currentKey, args[i]);
++ parsedArgs.put(currentKey, arg);
+ lookingForValue = false;
+ }
+ }
+ }
+
+ // make sure we add the last key without value
+ if (lookingForValue && currentKey != null) {
+ // end of line, no value
+ parsedArgs.put(currentKey, "");
+ }
+ return parsedArgs;
+ }
+
+ /**
+ * Accumulator objects allow us to generalize feature computation.
 + * ScoreAccumulator takes (feature, value) pairs and simply stores the weighted
 + * sum (for decoding). FeatureAccumulator records the named feature values
+ * (for k-best extraction).
+ */
+ public interface Accumulator {
+ public void add(int featureId, float value);
+ }
+
+ public class ScoreAccumulator implements Accumulator {
+ private float score;
+
+ public ScoreAccumulator() {
+ this.score = 0.0f;
+ }
+
+ @Override
+ public void add(int featureId, float value) {
+ score += value * weights.getOrDefault(featureId);
+ }
+
+ public float getScore() {
+ return score;
+ }
+ }
+
+ public class FeatureAccumulator implements Accumulator {
- private FeatureVector features;
++ private final FeatureVector features;
+
+ public FeatureAccumulator() {
+ this.features = new FeatureVector(10);
+ }
+
+ @Override
+ public void add(int id, float value) {
+ features.add(id, value);
+ }
+
+ public FeatureVector getFeatures() {
+ return features;
+ }
+ }
+}
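To make the feature-template and Accumulator contract described in the class javadoc concrete, here is a minimal sketch of a stateless feature in the style this interface expects. The class and the feature it fires are hypothetical; it assumes the StatelessFF base class used by the other features in this commit supplies defaults for computeFinal and the estimate methods, and that Rule#getArity is available as in Joshua's rule API.

    package org.apache.joshua.decoder.ff;

    import static org.apache.joshua.decoder.ff.FeatureMap.hashFeature;

    import java.util.List;

    import org.apache.joshua.decoder.JoshuaConfiguration;
    import org.apache.joshua.decoder.chart_parser.SourcePath;
    import org.apache.joshua.decoder.ff.state_maintenance.DPState;
    import org.apache.joshua.decoder.ff.tm.Rule;
    import org.apache.joshua.decoder.hypergraph.HGNode;
    import org.apache.joshua.decoder.segment_file.Sentence;

    /** Hypothetical template: fires one indicator feature per rule arity. */
    public class ArityIndicatorFF extends StatelessFF {

      public ArityIndicatorFF(FeatureVector weights, String[] args, JoshuaConfiguration config) {
        super(weights, "ArityIndicator", args, config);
      }

      @Override
      public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j,
          SourcePath sourcePath, Sentence sentence, Accumulator acc) {
        if (rule != null) {
          // Accumulate an *unweighted* indicator; the Accumulator decides whether
          // to weight it (ScoreAccumulator) or record it raw (FeatureAccumulator).
          acc.add(hashFeature(name + "_arity" + rule.getArity()), 1);
        }
        return null; // stateless features carry no dynamic programming state
      }
    }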
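The parseArgs contract documented above can likewise be illustrated with a short fragment (hypothetical flags):

    // "-owner pt -sides both -mark" parses to {owner=pt, sides=both, mark=}
    String[] args = { "-owner", "pt", "-sides", "both", "-mark" };
    HashMap<String, String> parsed = FeatureFunction.parseArgs(args);
    assert "pt".equals(parsed.get("owner"));
    assert "".equals(parsed.get("mark")); // a flag with no value maps to ""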
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/dc756709/joshua-core/src/main/java/org/apache/joshua/decoder/ff/LabelSubstitutionFF.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/LabelSubstitutionFF.java
index 766ea0b,0000000..9be3f88
mode 100644,000000..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/LabelSubstitutionFF.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/LabelSubstitutionFF.java
@@@ -1,133 -1,0 +1,133 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff;
+
+import static org.apache.joshua.decoder.ff.FeatureMap.hashFeature;
+
+/***
+ * @author Gideon Wenniger
+ */
+
+import java.util.List;
+
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.chart_parser.SourcePath;
+import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.hypergraph.HGNode;
+import org.apache.joshua.decoder.segment_file.Sentence;
+import org.apache.joshua.util.ListUtil;
+
+public class LabelSubstitutionFF extends StatelessFF {
+ private static final String MATCH_SUFFIX = "MATCH";
+ private static final String NO_MATCH_SUFFIX = "NOMATCH";
+
+ public LabelSubstitutionFF(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+ super(weights, "LabelSubstitution", args, config);
+ }
+
+ public String getLowerCasedFeatureName() {
+ return name.toLowerCase();
+ }
+
+ public String getMatchFeatureSuffix(String ruleNonterminal, String substitutionNonterminal) {
+ if (ruleNonterminal.equals(substitutionNonterminal)) {
+ return MATCH_SUFFIX;
+ } else {
+ return NO_MATCH_SUFFIX;
+ }
+ }
+
+ public static String getSubstitutionSuffix(String ruleNonterminal, String substitutionNonterminal) {
+ return substitutionNonterminal + "_substitutes_" + ruleNonterminal;
+ }
+
- private final String computeLabelMatchingFeature(String ruleNonterminal,
++ private String computeLabelMatchingFeature(String ruleNonterminal,
+ String substitutionNonterminal) {
+ String result = getLowerCasedFeatureName() + "_";
+ result += getMatchFeatureSuffix(ruleNonterminal, substitutionNonterminal);
+ return result;
+ }
+
- private final String computeLabelSubstitutionFeature(String ruleNonterminal,
++ private String computeLabelSubstitutionFeature(String ruleNonterminal,
+ String substitutionNonterminal) {
+ String result = getLowerCasedFeatureName() + "_";
+ result += getSubstitutionSuffix(ruleNonterminal, substitutionNonterminal);
+ return result;
+ }
+
- private static final String getRuleLabelsDescriptorString(Rule rule) {
++ private static String getRuleLabelsDescriptorString(Rule rule) {
+ String result = "";
+ String leftHandSide = RulePropertiesQuerying.getLHSAsString(rule);
+ List<String> ruleSourceNonterminals = RulePropertiesQuerying
+ .getRuleSourceNonterminalStrings(rule);
+ boolean isInverting = rule.isInverting();
+ result += "<LHS>" + leftHandSide + "</LHS>";
+ result += "_<Nont>";
+ result += ListUtil.stringListStringWithoutBracketsCommaSeparated(ruleSourceNonterminals);
+ result += "</Nont>";
 + if (isInverting) {
 + result += "_INV";
 + } else {
 + result += "_MONO";
 + }
+
+ return result;
+ }
+
- private static final String getSubstitutionsDescriptorString(List<HGNode> tailNodes) {
++ private static String getSubstitutionsDescriptorString(List<HGNode> tailNodes) {
+ String result = "_<Subst>";
+ List<String> substitutionNonterminals = RulePropertiesQuerying
+ .getSourceNonterminalStrings(tailNodes);
+ result += ListUtil.stringListStringWithoutBracketsCommaSeparated(substitutionNonterminals);
+ result += "</Subst>";
+ return result;
+ }
+
+ public final String getGapLabelsForRuleSubstitutionSuffix(Rule rule, List<HGNode> tailNodes) {
+ String result = getLowerCasedFeatureName() + "_";
+ result += getRuleLabelsDescriptorString(rule);
+ result += getSubstitutionsDescriptorString(tailNodes);
+ return result;
+ }
+
+ @Override
+ public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+ Sentence sentence, Accumulator acc) {
+ if (rule != null && (tailNodes != null)) {
+
+ List<String> ruleSourceNonterminals = RulePropertiesQuerying
+ .getRuleSourceNonterminalStrings(rule);
+ List<String> substitutionNonterminals = RulePropertiesQuerying
+ .getSourceNonterminalStrings(tailNodes);
+ // Assert.assertEquals(ruleSourceNonterminals.size(), substitutionNonterminals.size());
 + for (int nonterminalIndex = 0; nonterminalIndex < ruleSourceNonterminals.size(); nonterminalIndex++) {
 + String ruleNonterminal = ruleSourceNonterminals.get(nonterminalIndex);
 + String substitutionNonterminal = substitutionNonterminals.get(nonterminalIndex);
+ acc.add(hashFeature(computeLabelMatchingFeature(ruleNonterminal, substitutionNonterminal)), 1);
+ acc.add(hashFeature(computeLabelSubstitutionFeature(ruleNonterminal, substitutionNonterminal)), 1);
+ }
+ acc.add(hashFeature(getGapLabelsForRuleSubstitutionSuffix(rule, tailNodes)), 1);
+ }
+ return null;
+ }
+}
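For reference, a sketch of the names this template fires for one hypothetical substitution, an NP gap filled by a tail node labeled DT, following the helper methods above:

    ff.getMatchFeatureSuffix("NP", "DT");                  // -> "NOMATCH"
    LabelSubstitutionFF.getSubstitutionSuffix("NP", "DT"); // -> "DT_substitutes_NP"
    // so compute() fires "labelsubstitution_NOMATCH" and
    // "labelsubstitution_DT_substitutes_NP"; a matching pair ("NP", "NP")
    // would fire "labelsubstitution_MATCH" instead.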
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/dc756709/joshua-core/src/main/java/org/apache/joshua/decoder/ff/LexicalFeatures.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/LexicalFeatures.java
index 4eacd26,0000000..63d350e
mode 100644,000000..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/LexicalFeatures.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/LexicalFeatures.java
@@@ -1,153 -1,0 +1,153 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff;
+
+import static com.google.common.cache.CacheBuilder.newBuilder;
+import static org.apache.joshua.decoder.ff.FeatureMap.hashFeature;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.chart_parser.SourcePath;
+import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+import org.apache.joshua.decoder.ff.tm.OwnerId;
+import org.apache.joshua.decoder.ff.tm.OwnerMap;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.hypergraph.HGNode;
+import org.apache.joshua.decoder.segment_file.Sentence;
+import org.apache.joshua.util.FormatUtils;
+
+import com.google.common.cache.Cache;
+
+/**
+ * Lexical alignment features denoting alignments, deletions, and insertions.
+ */
+public class LexicalFeatures extends StatelessFF {
+
+ private final boolean useAlignments;
+ private final boolean useDeletions;
+ private final boolean useInsertions;
+
+ private static final String NAME = "LexicalFeatures";
+ // value to fire for features
+ private static final int VALUE = 1;
+ //whether this feature is restricted to a certain grammar/owner
+ private final boolean ownerRestriction;
+ // the grammar/owner this feature is restricted to fire
+ private final OwnerId owner;
 + // String separating words
+ private static final String SEPARATOR = "~";
+
+ private final Cache<Rule, List<Integer>> featureCache;
+
+ public LexicalFeatures(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+ super(weights, NAME, args, config);
+
- ownerRestriction = (parsedArgs.containsKey("owner")) ? true : false;
++ ownerRestriction = (parsedArgs.containsKey("owner"));
+ owner = ownerRestriction ? OwnerMap.register(parsedArgs.get("owner")) : OwnerMap.UNKNOWN_OWNER_ID;
+
+ useAlignments = parsedArgs.containsKey("alignments");
+ useDeletions = parsedArgs.containsKey("deletions");
+ useInsertions = parsedArgs.containsKey("insertions");
+
+ // initialize cache
+ if (parsedArgs.containsKey("cacheSize")) {
+ featureCache = newBuilder().maximumSize(Integer.parseInt(parsedArgs.get("cacheSize"))).build();
+ } else {
+ featureCache = newBuilder().maximumSize(config.cachedRuleSize).build();
+ }
+ }
+
+ @Override
+ public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+ Sentence sentence, Accumulator acc) {
+
 + if (ownerRestriction && !rule.getOwner().equals(owner)) {
+ return null;
+ }
+
+ List<Integer> featureIds = featureCache.getIfPresent(rule);
+ if (featureIds == null) {
+ featureIds = getFeatures(rule);
+ featureCache.put(rule, featureIds);
+ }
+ for (int featureId : featureIds) {
+ acc.add(featureId, VALUE);
+ }
+
+ return null;
+ }
+
+ /**
+ * Obtains the feature ids for the given rule.
 + * @param rule the rule for which to compute feature ids
 + * @return the list of feature ids fired by the rule
+ */
+ private List<Integer> getFeatures(final Rule rule) {
+ final List<Integer> result = new ArrayList<>();
+
+ byte[] alignments = rule.getAlignment();
+ if (alignments == null) {
+ return result;
+ }
+ int[] sourceWords = rule.getSource();
+ int[] targetWords = rule.getTarget();
+
+ // sourceAligned & targetAligned indicate whether an index is covered by alignments
+ boolean[] sourceAligned = new boolean[sourceWords.length];
+ boolean[] targetAligned = new boolean[targetWords.length];
+
+ // translations: aligned words
+ for (int i = 0; i < alignments.length; i+=2) {
+ byte sourceIndex = alignments[i];
+ byte targetIndex = alignments[i + 1];
+ sourceAligned[sourceIndex] = true;
+ targetAligned[targetIndex] = true;
+ if (useAlignments) {
+ result.add(hashFeature(
+ "T:" +
+ Vocabulary.word(sourceWords[sourceIndex]) +
+ SEPARATOR +
+ Vocabulary.word(targetWords[targetIndex])));
+ }
+ }
+
+ // deletions: unaligned source words
+ if (useDeletions) {
+ for (int i = 0; i < sourceAligned.length; i++) {
 + if (!sourceAligned[i] && !FormatUtils.isNonterminal(sourceWords[i])) {
+ result.add(hashFeature("D:" + Vocabulary.word(sourceWords[i])));
+ }
+ }
+ }
+
+ // insertions: unaligned target words
+ if (useInsertions) {
+ for (int i = 0; i < targetAligned.length; i++) {
 + if (!targetAligned[i] && !FormatUtils.isNonterminal(targetWords[i])) {
+ result.add(hashFeature("I:" + Vocabulary.word(targetWords[i])));
+ }
+ }
+ }
+
+ return result;
+ }
+}
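The alignment byte array read by getFeatures stores flat (source, target) index pairs. A sketch of the features fired for one hypothetical rule:

    // Rule: source "le chat", target "the cat", alignments {0,0, 1,1}
    // (le-the, chat-cat). With -alignments this fires:
    //   T:le~the
    //   T:chat~cat
    // If "chat" were unaligned, -deletions would fire "D:chat";
    // an unaligned target word would fire an "I:..." insertion feature.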
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/dc756709/joshua-core/src/main/java/org/apache/joshua/decoder/ff/OOVPenalty.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/OOVPenalty.java
index 6eb1293,0000000..5e99428
mode 100644,000000..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/OOVPenalty.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/OOVPenalty.java
@@@ -1,98 -1,0 +1,97 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff;
+
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.JoshuaConfiguration.OOVItem;
+import org.apache.joshua.decoder.chart_parser.SourcePath;
+import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+import org.apache.joshua.decoder.ff.tm.OwnerId;
+import org.apache.joshua.decoder.ff.tm.OwnerMap;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.hypergraph.HGNode;
+import org.apache.joshua.decoder.segment_file.Sentence;
+
+/**
+ * This feature is fired when an out-of-vocabulary word (with respect to the translation model) is
+ * entered into the chart. OOVs work in the following manner: for each word in the input that is OOV
+ * with respect to the translation model, we create a rule that pushes that word through
+ * untranslated (the suffix "_OOV" can optionally be appended according to the runtime parameter
+ * "mark-oovs") . These rules are all stored in a grammar whose owner is "oov". The OOV feature
+ * function template then fires the "OOVPenalty" feature whenever it is asked to score an OOV rule.
+ *
+ * @author Matt Post post@cs.jhu.edu
+ */
+public class OOVPenalty extends StatelessFF {
+ private final OwnerId ownerID;
-
- /* The default value returned for OOVs. Can be overridden with -oov-list */
- private final float defaultValue = -100f;
++
+ private final HashMap<Integer,Float> oovWeights;
+
+ public OOVPenalty(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+ super(weights, "OOVPenalty", args, config);
+ ownerID = OwnerMap.register("oov");
- oovWeights = new HashMap<Integer,Float>();
++ oovWeights = new HashMap<>();
+
+ if (config.oovList != null) {
+ for (OOVItem item: config.oovList) {
+ oovWeights.put(Vocabulary.id(item.label), item.weight);
+ }
+ }
+ }
+
+ /**
+ * OOV rules cover exactly one word, and such rules belong to a grammar whose owner is "oov". Each
+ * OOV fires the OOVPenalty feature with a value of 1, so the cost is simply the weight, which was
+ * cached when the feature was created.
+ */
+ @Override
+ public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+ Sentence sentence, Accumulator acc) {
+
+ if (rule != null && this.ownerID.equals(rule.getOwner())) {
+ acc.add(featureId, getValue(rule.getLHS()));
+ }
+
+ return null;
+ }
+
+ /**
+ * It's important for the OOV feature to contribute to the rule's estimated cost, so that OOV
+ * rules (which are added for all words, not just ones without translation options) get sorted
+ * to the bottom during cube pruning.
+ *
+ * Important! estimateCost returns the *weighted* feature value.
+ */
+ @Override
+ public float estimateCost(Rule rule, Sentence sentence) {
+ if (rule != null && this.ownerID.equals(rule.getOwner())) {
+ return weights.getOrDefault(featureId) * getValue(rule.getLHS());
+ }
+ return 0.0f;
+ }
+
+ private float getValue(int lhs) {
++ float defaultValue = -100f;
+ return oovWeights.containsKey(lhs) ? oovWeights.get(lhs) : defaultValue;
+ }
+}
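As a worked example of the weighted estimate (hypothetical numbers): with the OOVPenalty weight set to 2.0 and no -oov-list entry for the rule's LHS, getValue() falls back to -100, so estimateCost evaluates

    float estimate = weights.getOrDefault(featureId) * getValue(rule.getLHS());
    // 2.0f * -100f = -200.0f, sorting OOV rules to the bottom during cube pruning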
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/dc756709/joshua-core/src/main/java/org/apache/joshua/decoder/ff/PhrasePenalty.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/PhrasePenalty.java
index acda1d2,0000000..4f6a61c
mode 100644,000000..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/PhrasePenalty.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/PhrasePenalty.java
@@@ -1,80 -1,0 +1,80 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff;
+
+import java.util.List;
+
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.chart_parser.SourcePath;
+import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+import org.apache.joshua.decoder.ff.tm.OwnerId;
+import org.apache.joshua.decoder.ff.tm.OwnerMap;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.hypergraph.HGNode;
+import org.apache.joshua.decoder.phrase.Hypothesis;
+import org.apache.joshua.decoder.segment_file.Sentence;
+
+/**
+ * This feature just counts rules that are used. You can restrict it with a number of flags:
+ *
+ * -owner OWNER
+ * Only count rules owned by OWNER
+ * -target|-source
+ * Only count the target or source side (plus the LHS)
+ *
+ * TODO: add an option to separately provide a list of rule counts, restrict to counts above a threshold.
+ */
+public class PhrasePenalty extends StatelessFF {
+
+ private final OwnerId owner;
- private float value = 1.0f;
++ private final float value = 1.0f;
+
+ public PhrasePenalty(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+ super(weights, "PhrasePenalty", args, config);
+ if (parsedArgs.containsKey("owner"))
+ this.owner = OwnerMap.register(parsedArgs.get("owner"));
+ else // default
+ this.owner = OwnerMap.register("pt");
+ }
+
+ @Override
+ public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+ Sentence sentence, Accumulator acc) {
+
+ if (rule != null && rule != Hypothesis.BEGIN_RULE && rule != Hypothesis.END_RULE
+ && (rule.getOwner().equals(owner))) {
+ acc.add(featureId, value);
+ }
+
+ return null;
+ }
+
+ /**
+ * Returns the *weighted* estimate.
+ *
+ */
+ @Override
+ public float estimateCost(Rule rule, Sentence sentence) {
+ if (rule != null && rule != Hypothesis.BEGIN_RULE && rule != Hypothesis.END_RULE
+ && (rule.getOwner().equals(owner))) {
+ return weights.getOrDefault(featureId) * value;
+ }
+ return 0.0f;
+ }
+}
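In a Joshua config file this template is typically enabled with a line like the following sketch; the flag follows the parseArgs format above, and "pt" is the default owner the constructor registers when none is given:

    feature-function = PhrasePenalty -owner pt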
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/dc756709/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleFF.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleFF.java
index df2b180,0000000..7a08043
mode 100644,000000..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleFF.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleFF.java
@@@ -1,126 -1,0 +1,126 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff;
+
+import static com.google.common.cache.CacheBuilder.newBuilder;
+import static org.apache.joshua.decoder.ff.tm.OwnerMap.UNKNOWN_OWNER_ID;
+
+import java.util.List;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.chart_parser.SourcePath;
+import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+import org.apache.joshua.decoder.ff.tm.OwnerId;
+import org.apache.joshua.decoder.ff.tm.OwnerMap;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.hypergraph.HGNode;
+import org.apache.joshua.decoder.segment_file.Sentence;
+
+import com.google.common.cache.Cache;
+
+/**
+ * This feature fires for rule ids.
+ * Firing can be restricted to rules from a certain owner, and rule ids
+ * can be generated from source side and/or target side.
+ */
+public class RuleFF extends StatelessFF {
+
- private enum Sides { SOURCE, TARGET, BOTH };
-
++ private enum Sides { SOURCE, TARGET, BOTH }
++
+ private static final String NAME = "RuleFF";
+ // value to fire for features
+ private static final int VALUE = 1;
+ // whether this feature is restricted to a certain grammar/owner
+ private final boolean ownerRestriction;
+ // the grammar/owner this feature is restricted to fire
+ private final OwnerId owner;
 + // what part of the rule should be extracted
+ private final Sides sides;
+ // Strings separating words and rule sides
+ private static final String SEPARATOR = "~";
+ private static final String SIDES_SEPARATOR = "->";
+
+ private final Cache<Rule, Integer> featureCache;
+
+ public RuleFF(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+ super(weights, NAME, args, config);
+
- ownerRestriction = (parsedArgs.containsKey("owner")) ? true : false;
++ ownerRestriction = (parsedArgs.containsKey("owner"));
+ owner = ownerRestriction ? OwnerMap.register(parsedArgs.get("owner")) : UNKNOWN_OWNER_ID;
+
+ if (parsedArgs.containsKey("sides")) {
+ final String sideValue = parsedArgs.get("sides");
+ if (sideValue.equalsIgnoreCase("source")) {
+ sides = Sides.SOURCE;
+ } else if (sideValue.equalsIgnoreCase("target")) {
+ sides = Sides.TARGET;
+ } else if (sideValue.equalsIgnoreCase("both")){
+ sides = Sides.BOTH;
+ } else {
 + throw new RuntimeException("Unknown side value: " + sideValue);
+ }
+ } else {
+ sides = Sides.BOTH;
+ }
+
+ // initialize cache
+ if (parsedArgs.containsKey("cacheSize")) {
+ featureCache = newBuilder().maximumSize(Integer.parseInt(parsedArgs.get("cacheSize"))).build();
+ } else {
+ featureCache = newBuilder().maximumSize(config.cachedRuleSize).build();
+ }
+ }
+
+ @Override
+ public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+ Sentence sentence, Accumulator acc) {
+
+ if (ownerRestriction && !rule.getOwner().equals(owner)) {
+ return null;
+ }
+
+ Integer featureId = featureCache.getIfPresent(rule);
+ if (featureId == null) {
+ featureId = hashRuleFeature(rule);
+ featureCache.put(rule, featureId);
+ }
+ acc.add(featureId, VALUE);
+
+ return null;
+ }
+
+ /**
+ * Obtains the feature id for the given rule.
 + * @param rule the rule for which to compute the feature id
 + * @return the hashed feature id for the rule
+ */
+ private int hashRuleFeature(final Rule rule) {
+ final StringBuilder sb = new StringBuilder(Vocabulary.word(rule.getLHS()))
+ .append(SIDES_SEPARATOR);
+ if (sides == Sides.SOURCE || sides == Sides.BOTH) {
+ sb.append(Vocabulary.getWords(rule.getSource(), SEPARATOR));
+ }
+ sb.append(SIDES_SEPARATOR);
+ if (sides == Sides.TARGET || sides == Sides.BOTH) {
+ sb.append(Vocabulary.getWords(rule.getTarget(), SEPARATOR));
+ }
+ return FeatureMap.hashFeature(sb.toString());
+ }
+}
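A sketch of the string hashRuleFeature builds for a hypothetical rule [X] -> un chat / a cat:

    // LHS, then the requested sides joined by "->", words joined by "~":
    //   sides=both:   "[X]->un~chat->a~cat"
    //   sides=source: "[X]->un~chat->"   (target segment left empty)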
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/dc756709/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RulePropertiesQuerying.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/RulePropertiesQuerying.java
index a1867a3,0000000..0ee41be
mode 100644,000000..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RulePropertiesQuerying.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RulePropertiesQuerying.java
@@@ -1,49 -1,0 +1,49 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff;
+
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.hypergraph.HGNode;
+
+public class RulePropertiesQuerying {
+
- public static final String getLHSAsString(Rule rule) {
++ public static String getLHSAsString(Rule rule) {
+ return Vocabulary.word(rule.getLHS());
+ }
+
+ public static List<String> getRuleSourceNonterminalStrings(Rule rule) {
- List<String> result = new ArrayList<String>();
++ List<String> result = new ArrayList<>();
+ for (int nonTerminalIndex : rule.getForeignNonTerminals()) {
+ result.add(Vocabulary.word(nonTerminalIndex));
+ }
+ return result;
+ }
+
+ public static List<String> getSourceNonterminalStrings(List<HGNode> tailNodes) {
- List<String> result = new ArrayList<String>();
++ List<String> result = new ArrayList<>();
+ for (HGNode tailNode : tailNodes) {
+ result.add(Vocabulary.word(tailNode.lhs));
+ }
+ return result;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/dc756709/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleShape.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleShape.java
index b389774,0000000..eb7bd50
mode 100644,000000..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleShape.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleShape.java
@@@ -1,101 -1,0 +1,101 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff;
+
+import static org.apache.joshua.decoder.ff.FeatureMap.hashFeature;
+
+import java.util.List;
+
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.chart_parser.SourcePath;
+import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.hypergraph.HGNode;
+import org.apache.joshua.decoder.segment_file.Sentence;
+import org.apache.joshua.util.FormatUtils;
+
+/*
+ * Implements the RuleShape feature for source, target, and paired source+target sides.
+ */
+public class RuleShape extends StatelessFF {
+
+ public RuleShape(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+ super(weights, "RuleShape", args, config);
+ }
+
 + private enum WordType {
 + N("N"), T("x"), P("+");
 + private final String string;
 +
- private WordType(final String string) {
++ WordType(final String string) {
 + this.string = string;
 + }
 +
 + @Override
 + public String toString() {
 + return this.string;
 + }
 + }
+
+ private WordType getWordType(int id) {
+ if (FormatUtils.isNonterminal(id)) {
+ return WordType.N;
+ } else {
+ return WordType.T;
+ }
+ }
+
 + /**
 + * Returns a String describing the rule pattern, marking a run of two or more
 + * items of the same type with "+" (e.g. "x+Nx").
 + */
 + private String getRulePattern(int[] ids) {
 + final StringBuilder pattern = new StringBuilder();
 + WordType currentType = getWordType(ids[0]);
 + boolean repeats = false;
 + for (int i = 1; i < ids.length; i++) {
 + if (getWordType(ids[i]) != currentType) {
 + pattern.append(currentType.toString()).append(repeats ? "+" : "");
 + currentType = getWordType(ids[i]);
 + repeats = false;
 + } else {
 + repeats = true;
 + }
 + }
 + pattern.append(currentType.toString()).append(repeats ? "+" : "");
 + return pattern.toString();
 + }
+
+ @Override
+ public DPState compute(Rule rule, List<HGNode> tailNodes, int i_, int j, SourcePath sourcePath,
+ Sentence sentence, Accumulator acc) {
+ final String sourceShape = getRulePattern(rule.getSource());
+ final String targetShape = getRulePattern(rule.getTarget());
+ acc.add(hashFeature(name + "_source_" + sourceShape), 1);
+ acc.add(hashFeature(name + "_target_" + sourceShape), 1);
+ acc.add(hashFeature(name + "_sourceTarget_" + sourceShape + "_" + targetShape), 1);
+ return null;
+ }
+}
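A sketch of the patterns getRulePattern produces and the features then fired, for a hypothetical rule whose source is "the cat [X,1] sleeps" (types T T N T) and whose target is "[X,1] dort" (types N T):

    // getRulePattern(source) -> "x+Nx"; getRulePattern(target) -> "Nx"
    // fired features:
    //   RuleShape_source_x+Nx
    //   RuleShape_target_Nx
    //   RuleShape_sourceTarget_x+Nx_Nx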
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/dc756709/joshua-core/src/main/java/org/apache/joshua/decoder/ff/SourceDependentFF.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/SourceDependentFF.java
index 841402a,0000000..dec509f
mode 100644,000000..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/SourceDependentFF.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/SourceDependentFF.java
@@@ -1,29 -1,0 +1,29 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff;
+
+import org.apache.joshua.decoder.segment_file.Sentence;
+
+public interface SourceDependentFF extends Cloneable {
+
- public void setSource(Sentence sentence);
++ void setSource(Sentence sentence);
+
- public FeatureFunction clone();
++ FeatureFunction clone();
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/dc756709/joshua-core/src/main/java/org/apache/joshua/decoder/ff/SourcePathFF.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/SourcePathFF.java
index cb902a0,0000000..1d0e6e7
mode 100644,000000..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/SourcePathFF.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/SourcePathFF.java
@@@ -1,53 -1,0 +1,53 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff;
+
+import java.util.List;
+
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.chart_parser.SourcePath;
+import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.hypergraph.HGNode;
+import org.apache.joshua.decoder.segment_file.Sentence;
+
+/**
+ * This feature returns the scored path through the source lattice, which is recorded in a
+ * SourcePath object.
+ *
+ * @author Chris Dyer redpony@umd.edu
+ * @author Matt Post post@cs.jhu.edu
+ */
+public final class SourcePathFF extends StatelessFF {
+
+ /*
+ * This is a single-value feature template, so we cache the weight here.
+ */
+ public SourcePathFF(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+ super(weights, "SourcePath", args, config);
+ }
-
++
+ @Override
+ public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+ Sentence sentence, Accumulator acc) {
+
+ acc.add(featureId, sourcePath.getPathCost());
+ return null;
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/dc756709/joshua-core/src/main/java/org/apache/joshua/decoder/ff/TargetBigram.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/TargetBigram.java
index 888fa03,0000000..9338b0d
mode 100644,000000..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/TargetBigram.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/TargetBigram.java
@@@ -1,218 -1,0 +1,215 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff;
+
+import static org.apache.joshua.decoder.ff.FeatureMap.hashFeature;
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.chart_parser.SourcePath;
+import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+import org.apache.joshua.decoder.ff.state_maintenance.NgramDPState;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.hypergraph.HGNode;
+import org.apache.joshua.decoder.segment_file.Sentence;
+import org.apache.joshua.util.FormatUtils;
+import org.apache.joshua.util.io.LineReader;
+
+/***
 + * The TargetBigram feature is an indicator feature that counts target word bigrams that are created when
+ * a rule is applied. It accepts three parameters:
+ *
+ * -vocab /path/to/vocab
+ *
+ * The path to a vocabulary, where each line is of the format ID WORD COUNT.
+ *
+ * -threshold N
+ *
+ * Mask to UNK all words whose COUNT is less than N.
+ *
+ * -top-n N
+ *
+ * Only use the top N words.
+ */
+
+public class TargetBigram extends StatefulFF {
+
+ private HashSet<String> vocab = null;
+ private int maxTerms = 1000000;
+ private int threshold = 0;
+
+ public TargetBigram(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+ super(weights, "TargetBigram", args, config);
+
+ if (parsedArgs.containsKey("threshold"))
+ threshold = Integer.parseInt(parsedArgs.get("threshold"));
+
+ if (parsedArgs.containsKey("top-n"))
+ maxTerms = Integer.parseInt(parsedArgs.get("top-n"));
+
+ if (parsedArgs.containsKey("vocab")) {
+ loadVocab(parsedArgs.get("vocab"));
+ }
+ }
+
+ /**
+ * Load vocabulary items passing the 'threshold' and 'top-n' filters.
+ *
 + * @param filename path to the vocabulary file
+ */
+ private void loadVocab(String filename) {
- this.vocab = new HashSet<String>();
++ this.vocab = new HashSet<>();
+ this.vocab.add("<s>");
+ this.vocab.add("</s>");
+ try {
+ LineReader lineReader = new LineReader(filename);
+ for (String line: lineReader) {
+ if (lineReader.lineno() > maxTerms)
+ break;
+
+ String[] tokens = line.split("\\s+");
+ String word = tokens[1];
+ int count = Integer.parseInt(tokens[2]);
+
+ if (count >= threshold)
+ vocab.add(word);
+ }
+
+ } catch (IOException e) {
+ throw new RuntimeException(String.format(
+ "* FATAL: couldn't load TargetBigram vocabulary '%s'", filename), e);
+ }
+ }
+
+ @Override
+ public DPState compute(Rule rule, List<HGNode> tailNodes, int spanStart, int spanEnd,
+ SourcePath sourcePath, Sentence sentence, Accumulator acc) {
+
+ int[] enWords = rule.getTarget();
+
+ int left = -1;
+ int right = -1;
+
- List<String> currentNgram = new LinkedList<String>();
- for (int c = 0; c < enWords.length; c++) {
- int curID = enWords[c];
-
++ List<String> currentNgram = new LinkedList<>();
++ for (int curID : enWords) {
+ if (FormatUtils.isNonterminal(curID)) {
+ int index = -(curID + 1);
+ NgramDPState state = (NgramDPState) tailNodes.get(index).getDPState(stateIndex);
+ int[] leftContext = state.getLeftLMStateWords();
+ int[] rightContext = state.getRightLMStateWords();
+
+ // Left context.
+ for (int token : leftContext) {
+ currentNgram.add(getWord(token));
+ if (left == -1)
+ left = token;
+ right = token;
+ if (currentNgram.size() == 2) {
+ String ngram = join(currentNgram);
+ acc.add(hashFeature(String.format("%s_%s", name, ngram)), 1);
+ // System.err.println(String.format("ADDING %s_%s", name, ngram));
+ currentNgram.remove(0);
+ }
+ }
+ // Replace right context.
+ int tSize = currentNgram.size();
+ for (int i = 0; i < rightContext.length; i++)
+ currentNgram.set(tSize - rightContext.length + i, getWord(rightContext[i]));
+
+ } else { // terminal words
+ currentNgram.add(getWord(curID));
+ if (left == -1)
+ left = curID;
+ right = curID;
+ if (currentNgram.size() == 2) {
+ String ngram = join(currentNgram);
+ acc.add(hashFeature(String.format("%s_%s", name, ngram)), 1);
+ // System.err.println(String.format("ADDING %s_%s", name, ngram));
+ currentNgram.remove(0);
+ }
+ }
+ }
+
- NgramDPState state = new NgramDPState(new int[] { left }, new int[] { right });
+ // System.err.println(String.format("RULE %s -> state %s", rule.getRuleString(), state));
- return state;
++ return new NgramDPState(new int[] { left }, new int[] { right });
+ }
+
+ /**
+ * Returns the word after comparing against the private vocabulary (if set).
+ *
 + * @param curID the vocabulary id of the word
+ * @return the word
+ */
+ private String getWord(int curID) {
+ String word = Vocabulary.word(curID);
+
+ if (vocab != null && ! vocab.contains(word)) {
+ return "UNK";
+ }
+
+ return word;
+ }
+
+ /**
+ * We don't compute a future cost.
+ */
+ @Override
+ public float estimateFutureCost(Rule rule, DPState state, Sentence sentence) {
+ return 0.0f;
+ }
+
+ /**
+ * There is nothing to be done here, since <s> and </s> are included in rules that are part
+ * of the grammar. We simply return the DP state of the tail node.
+ */
+ @Override
+ public DPState computeFinal(HGNode tailNode, int i, int j, SourcePath sourcePath,
+ Sentence sentence, Accumulator acc) {
+
+ return tailNode.getDPState(stateIndex);
+ }
+
+ /**
+ * TargetBigram features are only computed across hyperedges, so there is nothing to be done here.
+ */
+ @Override
+ public float estimateCost(Rule rule, Sentence sentence) {
+ return 0.0f;
+ }
+
+ /**
+ * Join a list with the _ character. (Since Java 8, String.join("_", list) does the same.)
+ *
+ * @param list a list of strings
+ * @return the joined String
+ */
+ private String join(List<String> list) {
+ StringBuilder sb = new StringBuilder();
+ for (String item : list) {
- sb.append(item.toString() + "_");
++ sb.append(item).append("_");
+ }
+
+ return sb.substring(0, sb.length() - 1);
+ }
+}
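
The compute() method above is, at heart, a sliding two-word window over the target side: each word (or each word recovered from a tail node's LM state) is appended, and whenever the window holds two words a feature fires and the left word is dropped. A minimal standalone sketch of that windowing logic, with plain strings instead of vocabulary IDs (class and feature names here are illustrative only):

    import java.util.Arrays;
    import java.util.LinkedList;
    import java.util.List;

    public class BigramWindowSketch {
      public static void main(String[] args) {
        List<String> words = Arrays.asList("the", "man", "said", "that");
        List<String> window = new LinkedList<>();
        for (String word : words) {
          window.add(word);
          if (window.size() == 2) {
            // One feature per bigram, named like TargetBigram_the_man.
            System.out.println("TargetBigram_" + String.join("_", window));
            window.remove(0); // slide the window one word to the right
          }
        }
      }
    }

This prints TargetBigram_the_man, TargetBigram_man_said, and TargetBigram_said_that.
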
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/dc756709/joshua-core/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/ConcatenationIterator.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/ConcatenationIterator.java
index f75dffa,0000000..1d181e7
mode 100644,000000..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/ConcatenationIterator.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/ConcatenationIterator.java
@@@ -1,93 -1,0 +1,91 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.fragmentlm;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.NoSuchElementException;
+
+/**
+ * Concatenates an iterator over iterators into one long iterator.
+ *
+ * @author Dan Klein
+ */
+public class ConcatenationIterator<E> implements Iterator<E> {
+
- Iterator<Iterator<E>> sourceIterators;
++ final Iterator<Iterator<E>> sourceIterators;
+ Iterator<E> currentIterator;
+ Iterator<E> lastIteratorToReturn;
+
+ public boolean hasNext() {
- if (currentIterator.hasNext())
- return true;
- return false;
++ return currentIterator.hasNext();
+ }
+
+ public E next() {
+ if (currentIterator.hasNext()) {
+ E e = currentIterator.next();
+ lastIteratorToReturn = currentIterator;
+ advance();
+ return e;
+ }
+ throw new NoSuchElementException();
+ }
+
+ private void advance() {
+ while (! currentIterator.hasNext() && sourceIterators.hasNext()) {
+ currentIterator = sourceIterators.next();
+ }
+ }
+
+ public void remove() {
+ if (lastIteratorToReturn == null)
+ throw new IllegalStateException();
+ currentIterator.remove();
+ }
+
+ public ConcatenationIterator(Iterator<Iterator<E>> sourceIterators) {
+ this.sourceIterators = sourceIterators;
+ this.currentIterator = (new ArrayList<E>()).iterator();
+ this.lastIteratorToReturn = null;
+ advance();
+ }
+
+ public ConcatenationIterator(Collection<Iterator<E>> iteratorCollection) {
+ this(iteratorCollection.iterator());
+ }
+
+ public static void main(String[] args) {
+ List<String> list0 = Collections.emptyList();
+ List<String> list1 = Arrays.asList("a b c d".split(" "));
+ List<String> list2 = Arrays.asList("e f".split(" "));
- List<Iterator<String>> iterators = new ArrayList<Iterator<String>>();
++ List<Iterator<String>> iterators = new ArrayList<>();
+ iterators.add(list1.iterator());
+ iterators.add(list0.iterator());
+ iterators.add(list2.iterator());
+ iterators.add(list0.iterator());
- Iterator<String> iterator = new ConcatenationIterator<String>(iterators);
++ Iterator<String> iterator = new ConcatenationIterator<>(iterators);
+ while (iterator.hasNext()) {
+ System.out.println(iterator.next());
+ }
+ }
+}
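
The main() method above doubles as a usage example. As a design note, when the inputs are collections rather than bare iterators, the same flattening is available from Java 8 streams; a minimal sketch (names illustrative):

    import java.util.Arrays;
    import java.util.Collections;
    import java.util.Iterator;
    import java.util.List;

    public class FlattenSketch {
      public static void main(String[] args) {
        List<List<String>> lists = Arrays.asList(
            Arrays.asList("a", "b", "c", "d"),
            Collections.<String>emptyList(),
            Arrays.asList("e", "f"));
        // flatMap concatenates the element streams in order, just as
        // ConcatenationIterator chains its source iterators.
        Iterator<String> flat = lists.stream().flatMap(List::stream).iterator();
        while (flat.hasNext()) {
          System.out.println(flat.next());
        }
      }
    }

ConcatenationIterator remains useful when the sources really are one-shot iterators, or must themselves be consumed lazily from another iterator.
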
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/dc756709/joshua-core/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/FragmentLMFF.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/FragmentLMFF.java
index 5d6780b,0000000..5332135
mode 100644,000000..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/FragmentLMFF.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/FragmentLMFF.java
@@@ -1,324 -1,0 +1,317 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.fragmentlm;
+
+import static org.apache.joshua.decoder.ff.FeatureMap.hashFeature;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Stack;
+
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.chart_parser.SourcePath;
+import org.apache.joshua.decoder.ff.FeatureVector;
+import org.apache.joshua.decoder.ff.StatefulFF;
+import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.hypergraph.HGNode;
+import org.apache.joshua.decoder.segment_file.Sentence;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * <p>Feature function that reads in a list of language model fragments and matches them against the
+ * hypergraph. This allows for language model fragment "glue" features, which fire when LM fragments
+ * (supplied as input) are assembled. These LM fragments are presumably useful in ensuring
+ * grammaticality and can be independent of the translation model fragments.</p>
+ *
+ * <p>Usage: in the Joshua Configuration file, put</p>
+ *
+ * <code>feature-function = FragmentLM -lm LM_FRAGMENTS_FILE -map RULE_FRAGMENTS_MAP_FILE</code>
+ *
+ * <p>LM_FRAGMENTS_FILE is a pointer to a file containing a list of fragments that it should look for.
+ * The format of the file is one fragment per line in PTB format, e.g.:</p>
+ *
+ * <code>(S NP (VP (VBD said) SBAR) (. .))</code>
+ *
+ * <p>RULE_FRAGMENTS_MAP_FILE points to a file that maps fragments to the flattened SCFG rule format
+ * that Joshua uses. This mapping is necessary because Joshua's rules have been flattened, meaning
+ * that their internal structure has been removed, yet this structure is needed for matching LM
+ * fragments. The format of the file is</p>
+ *
+ * <code>FRAGMENT ||| RULE-TARGET-SIDE</code>
+ *
+ * <p>for example,</p>
+ *
+ * <code>(S (NP (DT the) (NN man)) VP .) ||| the man [VP,1] [.,2]</code>
+ * <code>(SBAR (IN that) (S (NP (PRP he)) (VP (VBD was) (VB done)))) ||| that he was done</code>
+ * <code>(VP (VBD said) SBAR) ||| said SBAR</code>
+ *
+ * @author Matt Post post@cs.jhu.edu
+ */
+public class FragmentLMFF extends StatefulFF {
+
+ private static final Logger LOG = LoggerFactory.getLogger(FragmentLMFF.class);
+
+ /*
+ * When building a fragment from a rule rooted in the hypergraph, this parameter determines how
+ * deep we'll go. Smaller values mean less hypergraph traversal but may also limit the LM
+ * fragments that can be fired.
+ */
+ private int BUILD_DEPTH = 1;
+
+ /*
+ * The maximum depth of a fragment, defined as the longest path from the fragment root to any of
+ * its leaves.
+ */
+ private int MAX_DEPTH = 0;
+
+ /*
+ * This is the minimum depth for lexicalized LM fragments. This allows you to easily exclude small
+ * depth-one fragments that may be overfit to the training data. A depth of 1 (the default) does
+ * not exclude any fragments.
+ */
+ private int MIN_LEX_DEPTH = 1;
+
+ /*
- * Set to true to activate meta-features.
- */
- private boolean OPTS_DEPTH = false;
-
- /*
+ * This contains a list of the language model fragments, indexed by LHS.
+ */
+ private HashMap<String, ArrayList<Tree>> lmFragments = null;
+
+ private int numFragments = 0;
+
+ /* The location of the file containing the language model fragments */
+ private String fragmentLMFile = "";
+
+ /**
+ * @param weights a {@link org.apache.joshua.decoder.ff.FeatureVector} with weights
+ * @param args arguments passed to the feature function
+ * @param config the {@link org.apache.joshua.decoder.JoshuaConfiguration}
+ */
+ public FragmentLMFF(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+ super(weights, "FragmentLMFF", args, config);
+
- lmFragments = new HashMap<String, ArrayList<Tree>>();
++ lmFragments = new HashMap<>();
+
+ fragmentLMFile = parsedArgs.get("lm");
+ BUILD_DEPTH = Integer.parseInt(parsedArgs.get("build-depth"));
+ MAX_DEPTH = Integer.parseInt(parsedArgs.get("max-depth"));
+ MIN_LEX_DEPTH = Integer.parseInt(parsedArgs.get("min-lex-depth"));
+
+ /* Read in the language model fragments */
+ try {
+ Collection<Tree> trees = PennTreebankReader.readTrees(fragmentLMFile);
- for (Tree fragment : trees) {
- addLMFragment(fragment);
-
- // System.err.println(String.format("Read fragment: %s",
- // lmFragments.get(lmFragments.size()-1)));
- }
++ // System.err.println(String.format("Read fragment: %s",
++ // lmFragments.get(lmFragments.size()-1)));
++ trees.forEach(this::addLMFragment);
+ } catch (IOException e) {
+ throw new RuntimeException(String.format("* WARNING: couldn't read fragment LM file '%s'",
+ fragmentLMFile), e);
+ }
+ LOG.info("FragmentLMFF: Read {} LM fragments from '{}'", numFragments, fragmentLMFile);
+ }
+
+ /**
+ * Add the provided fragment to the language model, subject to some filtering.
+ *
+ * @param fragment a {@link org.apache.joshua.decoder.ff.fragmentlm.Tree} fragment
+ */
+ public void addLMFragment(Tree fragment) {
+ if (lmFragments == null)
+ return;
+
+ int fragmentDepth = fragment.getDepth();
+
+ if (MAX_DEPTH != 0 && fragmentDepth > MAX_DEPTH) {
+ LOG.warn("Skipping fragment {} (depth {} > {})", fragment, fragmentDepth, MAX_DEPTH);
+ return;
+ }
+
+ if (MIN_LEX_DEPTH > 1 && fragment.isLexicalized() && fragmentDepth < MIN_LEX_DEPTH) {
+ LOG.warn("Skipping fragment {} (lex depth {} < {})", fragment, fragmentDepth, MIN_LEX_DEPTH);
+ return;
+ }
+
+ if (lmFragments.get(fragment.getRule()) == null) {
- lmFragments.put(fragment.getRule(), new ArrayList<Tree>());
++ lmFragments.put(fragment.getRule(), new ArrayList<>());
+ }
+ lmFragments.get(fragment.getRule()).add(fragment);
+ numFragments++;
+ }
+
+ /**
+ * This function computes the features that fire when the current rule is applied. The features
+ * that fire are any LM fragments that match the fragment associated with the current rule. LM
+ * fragments may recurse over the tail nodes, following 1-best backpointers until the fragment
+ * either matches or fails.
+ *
+ * @param rule {@link org.apache.joshua.decoder.ff.tm.Rule} to be utilized within computation
+ * @param tailNodes {@link java.util.List} of {@link org.apache.joshua.decoder.hypergraph.HGNode} tail nodes
+ * @param i the start of the source span covered by this rule
+ * @param j the end of the source span covered by this rule
+ * @param sourcePath information about a path taken through the source {@link org.apache.joshua.lattice.Lattice}
+ * @param sentence the input {@link org.apache.joshua.decoder.segment_file.Sentence}
+ * @param acc {@link org.apache.joshua.decoder.ff.FeatureFunction.Accumulator} object permitting generalization of feature computation
+ * @return the new dynamic programming state (null for stateless features)
+ */
+ @Override
+ public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+ Sentence sentence, Accumulator acc) {
+
+ /*
+ * Get the fragment associated with the target side of this rule.
+ *
+ * This could be done more efficiently. For example, just build the tree fragment once and then
+ * pattern match against it. This would circumvent having to build the tree possibly once every
+ * time you try to apply a rule.
+ */
+ Tree baseTree = Tree.buildTree(rule, tailNodes, BUILD_DEPTH);
+
- Stack<Tree> nodeStack = new Stack<Tree>();
++ Stack<Tree> nodeStack = new Stack<>();
+ nodeStack.add(baseTree);
+ while (!nodeStack.empty()) {
+ Tree tree = nodeStack.pop();
+ if (tree == null)
+ continue;
+
+ if (lmFragments.get(tree.getRule()) != null) {
+ for (Tree fragment : lmFragments.get(tree.getRule())) {
+// System.err.println(String.format("Does\n %s match\n %s??\n -> %s", fragment, tree,
+// match(fragment, tree)));
+
+ if (fragment.getLabel() == tree.getLabel() && match(fragment, tree)) {
+// System.err.println(String.format(" FIRING: matched %s against %s", fragment, tree));
+ acc.add(hashFeature(fragment.escapedString()), 1);
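++ // Compile-time switch for the depth meta-features below; set to true to activate them.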
++ boolean OPTS_DEPTH = false;
+ if (OPTS_DEPTH)
+ if (fragment.isLexicalized())
+ acc.add(hashFeature(String.format("FragmentFF_lexdepth%d", fragment.getDepth())), 1);
+ else
+ acc.add(hashFeature(String.format("FragmentFF_depth%d", fragment.getDepth())), 1);
+ }
+ }
+ }
+
+ // We also need to try matching rules against internal nodes of the fragment
+ // corresponding to this rule.
+ if (tree.getChildren() != null)
+ for (Tree childNode : tree.getChildren()) {
+ if (!childNode.isBoundary())
+ nodeStack.add(childNode);
+ }
+ }
+
+ return new FragmentState(baseTree);
+ }
+
+ /**
+ * Matches the fragment against the (possibly partially-built) tree: the root
+ * labels must agree; a childless fragment node then matches any subtree, while
+ * a fragment with children requires the tree's children to agree in number and
+ * label and to match recursively.
+ *
+ * @param fragment the language model fragment
+ * @param tree the tree to match against (expanded from the hypergraph)
+ * @return true if the fragment matches the tree
+ */
+ private boolean match(Tree fragment, Tree tree) {
+ // System.err.println(String.format("MATCH(%s,%s)", fragment, tree));
+
+ /* Make sure the root labels match. */
+ if (fragment.getLabel() != tree.getLabel()) {
+ return false;
+ }
+
+ /* Same number of kids? */
+ List<Tree> fkids = fragment.getChildren();
+ if (fkids.size() > 0) {
+ List<Tree> tkids = tree.getChildren();
+ if (fkids.size() != tkids.size()) {
+ return false;
+ }
+
+ /* Do the kids match on all labels? */
+ for (int i = 0; i < fkids.size(); i++)
+ if (fkids.get(i).getLabel() != tkids.get(i).getLabel())
+ return false;
+
+ /* Recursive match. */
+ for (int i = 0; i < fkids.size(); i++) {
+ if (!match(fkids.get(i), tkids.get(i)))
+ return false;
+ }
+ }
+
+ return true;
+ }
+
+ @Override
+ public DPState computeFinal(HGNode tailNodes, int i, int j, SourcePath sourcePath, Sentence sentence,
+ Accumulator acc) {
+ // Nothing to do at the final transition for this feature.
+ return null;
+ }
+
+ @Override
+ public float estimateFutureCost(Rule rule, DPState state, Sentence sentence) {
+ // This feature does not estimate a future cost.
+ return 0;
+ }
+
+ @Override
+ public float estimateCost(Rule rule, Sentence sentence) {
+ // This feature does not estimate a cost for sorting rules in cube pruning.
+ return 0;
+ }
+
+ /**
+ * Maintains the tree fragment assembled so far at this node, so that fragment
+ * matching can continue across hyperedges.
+ *
+ * @author Matt Post post@cs.jhu.edu
+ * @author Juri Ganitkevitch juri@cs.jhu.edu
+ */
+ public class FragmentState extends DPState {
+
+ private Tree tree = null;
+
+ public FragmentState(Tree tree) {
+ this.tree = tree;
+ }
+
+ /**
+ * Every tree is unique.
+ *
+ * Some savings could be had here if we grouped together items with the same string.
+ */
+ @Override
+ public int hashCode() {
+ return tree.hashCode();
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ return (other instanceof FragmentState && this == other);
+ }
+
+ @Override
+ public String toString() {
+ return String.format("[FragmentState %s]", tree);
+ }
+ }
+}
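
To make the contract of match() above concrete, here is the same top-down comparison on a toy tree class, detached from Joshua's Tree (all names are illustrative, and labels are compared with equals() rather than the interned == comparison used above):

    import java.util.Arrays;
    import java.util.List;

    public class MatchSketch {
      static class Node {
        final String label;
        final List<Node> kids;
        Node(String label, Node... kids) {
          this.label = label;
          this.kids = Arrays.asList(kids);
        }
      }

      // Root labels must agree; a childless fragment node matches any subtree;
      // otherwise the child lists must agree in size and match recursively.
      static boolean match(Node fragment, Node tree) {
        if (!fragment.label.equals(tree.label))
          return false;
        if (fragment.kids.isEmpty())
          return true;
        if (fragment.kids.size() != tree.kids.size())
          return false;
        for (int i = 0; i < fragment.kids.size(); i++)
          if (!match(fragment.kids.get(i), tree.kids.get(i)))
            return false;
        return true;
      }

      public static void main(String[] args) {
        Node tree = new Node("VP", new Node("VBD", new Node("said")), new Node("SBAR"));
        Node fragment = new Node("VP", new Node("VBD"), new Node("SBAR"));
        System.out.println(match(fragment, tree)); // prints true
      }
    }
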
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/dc756709/joshua-core/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/PennTreebankReader.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/PennTreebankReader.java
index 1637b5f,0000000..bb1c29a
mode 100644,000000..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/PennTreebankReader.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/PennTreebankReader.java
@@@ -1,135 -1,0 +1,134 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.fragmentlm;
+
+import java.util.*;
+import java.io.*;
+import java.nio.charset.Charset;
+import java.nio.charset.UnsupportedCharsetException;
+
+/**
+ * @author Dan Klein
+ */
+public class PennTreebankReader {
+
+ static class TreeCollection extends AbstractCollection<Tree> {
+
- List<File> files;
- Charset charset;
++ final List<File> files;
++ final Charset charset;
+
+ static class TreeIteratorIterator implements Iterator<Iterator<Tree>> {
- Iterator<File> fileIterator;
++ final Iterator<File> fileIterator;
+ Iterator<Tree> nextTreeIterator;
- Charset charset;
++ final Charset charset;
+
+ public boolean hasNext() {
+ return nextTreeIterator != null;
+ }
+
+ public Iterator<Tree> next() {
+ Iterator<Tree> currentTreeIterator = nextTreeIterator;
+ advance();
+ return currentTreeIterator;
+ }
+
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+
+ private void advance() {
+ nextTreeIterator = null;
+ while (nextTreeIterator == null && fileIterator.hasNext()) {
+ File file = fileIterator.next();
+ // System.out.println(file);
+ try {
+ nextTreeIterator = new Trees.PennTreeReader(new BufferedReader(new InputStreamReader(
+ new FileInputStream(file), this.charset)));
+ } catch (FileNotFoundException e) {
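+ // A file that has gone missing is skipped; the loop advances to the next one.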
+ } catch (UnsupportedCharsetException e) {
+ throw new Error("Unsupported charset in file " + file.getPath());
+ }
+ }
+ }
+
+ TreeIteratorIterator(List<File> files, Charset charset) {
+ this.fileIterator = files.iterator();
+ this.charset = charset;
+ advance();
+ }
+ }
+
+ public Iterator<Tree> iterator() {
- return new ConcatenationIterator<Tree>(new TreeIteratorIterator(files, this.charset));
++ return new ConcatenationIterator<>(new TreeIteratorIterator(files, this.charset));
+ }
+
+ public int size() {
+ int size = 0;
+ Iterator<Tree> i = iterator();
+ while (i.hasNext()) {
+ size++;
+ i.next();
+ }
+ return size;
+ }
+
+ @SuppressWarnings("unused")
+ private List<File> getFilesUnder(String path, FileFilter fileFilter) {
+ File root = new File(path);
- List<File> files = new ArrayList<File>();
++ List<File> files = new ArrayList<>();
+ addFilesUnder(root, files, fileFilter);
+ return files;
+ }
+
+ private void addFilesUnder(File root, List<File> files, FileFilter fileFilter) {
+ if (!fileFilter.accept(root))
+ return;
+ if (root.isFile()) {
+ files.add(root);
+ return;
+ }
+ if (root.isDirectory()) {
+ File[] children = root.listFiles();
- for (int i = 0; i < children.length; i++) {
- File child = children[i];
++ for (File child : children) {
+ addFilesUnder(child, files, fileFilter);
+ }
+ }
+ }
+
+ public TreeCollection(String file) throws IOException {
- this.files = new ArrayList<File>();
++ this.files = new ArrayList<>();
+ this.files.add(new File(file));
+ this.charset = Charset.defaultCharset();
+ }
+ }
+
+ public static Collection<Tree> readTrees(String path) throws IOException {
+ return new TreeCollection(path);
+ }
+
+ public static void main(String[] args) {
+/* Collection<Tree> trees = readTrees(args[0], Charset.defaultCharset());
+ for (Tree tree : trees) {
+ tree = (new Trees.StandardTreeNormalizer()).transformTree(tree);
+ System.out.println(Trees.PennTreeRenderer.render(tree));
+ }
+ */
+ }
+
+}
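
For completeness, a minimal usage sketch against the API as merged here, mirroring the commented-out main() above (it assumes the same package as PennTreebankReader; the file name is illustrative):

    package org.apache.joshua.decoder.ff.fragmentlm;

    import java.io.IOException;
    import java.util.Collection;

    public class ReadTreesSketch {
      public static void main(String[] args) throws IOException {
        // readTrees() returns a lazy TreeCollection; trees are parsed on iteration.
        Collection<Tree> trees = PennTreebankReader.readTrees("fragments.ptb");
        for (Tree tree : trees) {
          System.out.println(Trees.PennTreeRenderer.render(tree));
        }
      }
    }
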