You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/06/23 18:45:52 UTC
[41/60] [partial] incubator-joshua git commit: maven multi-module
layout 1st commit: moving files into joshua-core
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/FeatureFunction.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/FeatureFunction.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/FeatureFunction.java
new file mode 100644
index 0000000..6f231ae
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/FeatureFunction.java
@@ -0,0 +1,364 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.chart_parser.SourcePath;
+import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.hypergraph.HGNode;
+import org.apache.joshua.decoder.segment_file.Sentence;
+
+/**
+ * <p>This class defines Joshua's feature function interface, for both sparse and
+ * dense features. It is immediately inherited by StatelessFF and StatefulFF,
+ * which provide functionality common to stateless and stateful features,
+ * respectively. Any feature implementation should extend those classes, and not
+ * this one. The distinction between stateless and stateful features is somewhat
+ * narrow: all features have the opportunity to return an instance of a
+ * {@link DPState} object, and stateless ones just return null.</p>
+ *
+ * <p>Features in Joshua work like templates. Each feature function defines any
+ * number of actual features, which are associated with weights. The task of the
+ * feature function is to compute the features that are fired in different
+ * circumstances and then return the inner product of those features with the
+ * weight vector. Feature functions can also produce estimates of their future
+ * cost (via {@link org.apache.joshua.decoder.ff.FeatureFunction#estimateCost(Rule, Sentence)});
+ * these values are not used in computing the
+ * score, but are only used for sorting rules during cube pruning. The
+ * individual features produced by each template should have globally unique
+ * names; a good convention is to prefix each feature with the name of the
+ * template that produced it.</p>
+ *
+ * <p>Joshua does not retain individual feature values while decoding, since this
+ * requires keeping a sparse feature vector along every hyperedge, which can be
+ * expensive. Instead, it computes only the weighted cost of each edge. If the
+ * individual feature values are requested, the feature functions are replayed
+ * in post-processing, say during k-best list extraction. This is implemented in
+ * a generic way by passing an {@link Accumulator} object to the compute()
+ * function. During decoding, the accumulator simply sums weighted features in a
+ * scalar. During k-best extraction, when individual feature values are needed,
+ * a {@link FeatureAccumulator} is used to retain the individual values.</p>
+ *
+ * @author Matt Post post@cs.jhu.edu
+ * @author Juri Ganitkevich juri@cs.jhu.edu
+ */
+public abstract class FeatureFunction {
+
+  /*
+   * Recognizes an argument key: a dash, one ASCII letter, then at least one more
+   * non-whitespace character. Values such as "-1.5" or a bare "-" are therefore
+   * not treated as keys. Compiled once because Pattern compilation is costly and
+   * Pattern instances are immutable.
+   */
+  private static final Pattern ARG_KEY_PATTERN = Pattern.compile("^-[a-zA-Z]\\S+");
+
+  /*
+   * The name of the feature function; this generally matches the weight name on
+   * the config file. This can also be used as a prefix for feature / weight
+   * names, for templates that define multiple features.
+   */
+  protected String name = null;
+
+  /*
+   * The list of features each function can contribute, along with the dense feature IDs.
+   */
+  protected String[] denseFeatureNames = null;
+  protected int[] denseFeatureIDs = null;
+
+  /*
+   * The first dense feature index
+   */
+  protected int denseFeatureIndex = -1;
+
+  // The list of arguments passed to the feature, and the hash for the parsed args
+  protected String[] args;
+  protected HashMap<String, String> parsedArgs = null;
+
+  /*
+   * The global weight vector used by the decoder, passed in when the feature is
+   * instantiated
+   */
+  protected FeatureVector weights;
+
+  /* The config */
+  protected JoshuaConfiguration config;
+
+  /**
+   * @return the name of this feature function
+   */
+  public String getName() {
+    return name;
+  }
+
+  /**
+   * @return whether the feature has state
+   */
+  public abstract boolean isStateful();
+
+  /**
+   * Stores the supplied collaborators and parses {@code args} into {@link #parsedArgs}
+   * via {@link #parseArgs(String[])}.
+   *
+   * @param weights the global weight vector used by the decoder
+   * @param name the name of the feature function
+   * @param args the raw arguments passed to the feature
+   * @param config the decoder configuration
+   */
+  public FeatureFunction(FeatureVector weights, String name, String[] args, JoshuaConfiguration config) {
+    this.weights = weights;
+    this.name = name;
+    this.args = args;
+    this.config = config;
+
+    this.parsedArgs = FeatureFunction.parseArgs(args);
+  }
+
+  /**
+   * Any feature function can use this to report dense features names to the master code. The
+   * parameter tells the feature function the index of the first available dense feature ID; the feature
+   * function will then use IDs (id..id+names.size()-1). This default implementation reports no
+   * dense features.
+   *
+   * @param id the id of the first dense feature id to use
+   * @return a list of dense feature names
+   */
+  public ArrayList<String> reportDenseFeatures(int id) {
+    return new ArrayList<String>();
+  }
+
+  /**
+   * Describes this feature function for logging.
+   *
+   * @return the name followed by its sparse weight, or just the name if looking up or
+   *         formatting the weight throws a {@link RuntimeException}
+   */
+  public String logString() {
+    try {
+      return String.format("%s (weight %.3f)", name, weights.getSparse(name));
+    } catch (RuntimeException e) {
+      // Deliberate best-effort: fall back to the bare name rather than fail while logging.
+      return name;
+    }
+  }
+
+  /**
+   * This is the main function for defining feature values. The implementor
+   * should compute all the features along the hyperedge, calling
+   * {@link org.apache.joshua.decoder.ff.FeatureFunction.Accumulator#add(String, float)}
+   * for each feature. It then returns the newly-computed dynamic
+   * programming state for this feature (for example, for the
+   * {@link org.apache.joshua.decoder.ff.lm.LanguageModelFF} feature, this returns the new language model
+   * context). For stateless features, this value is null.
+   *
+   * Note that the accumulator accumulates *unweighted* feature values. The
+   * feature vector is multiplied times the weight vector later on.
+   *
+   * @param rule {@link org.apache.joshua.decoder.ff.tm.Rule} to be utilized within computation
+   * @param tailNodes {@link java.util.List} of {@link org.apache.joshua.decoder.hypergraph.HGNode} tail nodes
+   * @param i todo (presumably the start of the covered span; confirm with callers)
+   * @param j todo (presumably the end of the covered span; confirm with callers)
+   * @param sourcePath information about a path taken through the source {@link org.apache.joshua.lattice.Lattice}
+   * @param sentence the input {@link Sentence}
+   * @param acc {@link org.apache.joshua.decoder.ff.FeatureFunction.Accumulator} object permitting generalization of feature computation
+   * @return the new dynamic programming state (null for stateless features)
+   */
+  public abstract DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j,
+      SourcePath sourcePath, Sentence sentence, Accumulator acc);
+
+  /**
+   * Feature functions must override this. StatefulFF and StatelessFF provide
+   * reasonable defaults since most features do not fire on the goal node.
+   *
+   * @param tailNode single {@link org.apache.joshua.decoder.hypergraph.HGNode} representing tail node
+   * @param i todo (presumably the start of the covered span; confirm with callers)
+   * @param j todo (presumably the end of the covered span; confirm with callers)
+   * @param sourcePath information about a path taken through the source {@link org.apache.joshua.lattice.Lattice}
+   * @param sentence the input {@link Sentence}
+   * @param acc {@link org.apache.joshua.decoder.ff.FeatureFunction.Accumulator} object permitting generalization of feature computation
+   * @return the DPState (null if none)
+   */
+  public abstract DPState computeFinal(HGNode tailNode, int i, int j, SourcePath sourcePath,
+      Sentence sentence, Accumulator acc);
+
+  /**
+   * This is a convenience function for retrieving the features fired when
+   * applying a rule, provided for backward compatibility.
+   *
+   * Returns the *unweighted* cost of the features delta computed at this
+   * position. Note that this is a feature delta, so existing feature costs of
+   * the tail nodes should not be incorporated, and it is very important not to
+   * incorporate the feature weights. This function is used in the kbest
+   * extraction code but could also be used in computing the cost.
+   *
+   * @param rule {@link org.apache.joshua.decoder.ff.tm.Rule} to be utilized within computation
+   * @param tailNodes {@link java.util.List} of {@link org.apache.joshua.decoder.hypergraph.HGNode} tail nodes
+   * @param i todo (presumably the start of the covered span; confirm with callers)
+   * @param j todo (presumably the end of the covered span; confirm with callers)
+   * @param sourcePath information about a path taken through the source {@link org.apache.joshua.lattice.Lattice}
+   * @param sentence the input {@link Sentence}
+   * @return an *unweighted* feature delta
+   */
+  public final FeatureVector computeFeatures(Rule rule, List<HGNode> tailNodes, int i, int j,
+      SourcePath sourcePath, Sentence sentence) {
+
+    FeatureAccumulator features = new FeatureAccumulator();
+    compute(rule, tailNodes, i, j, sourcePath, sentence, features);
+    return features.getFeatures();
+  }
+
+  /**
+   * This function is called for the final transition. For example, the
+   * LanguageModel feature function treats the last rule specially. It needs to
+   * return the *weighted* cost of applying the feature. Provided for backward
+   * compatibility.
+   *
+   * @param tailNode single {@link org.apache.joshua.decoder.hypergraph.HGNode} representing tail node
+   * @param i todo (presumably the start of the covered span; confirm with callers)
+   * @param j todo (presumably the end of the covered span; confirm with callers)
+   * @param sourcePath information about a path taken through the source {@link org.apache.joshua.lattice.Lattice}
+   * @param sentence the input {@link Sentence}
+   * @return a *weighted* feature cost
+   */
+  public final float computeFinalCost(HGNode tailNode, int i, int j, SourcePath sourcePath,
+      Sentence sentence) {
+
+    ScoreAccumulator score = new ScoreAccumulator();
+    computeFinal(tailNode, i, j, sourcePath, sentence, score);
+    return score.getScore();
+  }
+
+  /**
+   * Returns the *unweighted* feature delta for the final transition (e.g., for
+   * the language model feature function). Provided for backward compatibility.
+   *
+   * @param tailNode single {@link org.apache.joshua.decoder.hypergraph.HGNode} representing tail node
+   * @param i todo (presumably the start of the covered span; confirm with callers)
+   * @param j todo (presumably the end of the covered span; confirm with callers)
+   * @param sourcePath information about a path taken through the source {@link org.apache.joshua.lattice.Lattice}
+   * @param sentence the input {@link Sentence}
+   * @return an *unweighted* feature vector
+   */
+  public final FeatureVector computeFinalFeatures(HGNode tailNode, int i, int j,
+      SourcePath sourcePath, Sentence sentence) {
+
+    FeatureAccumulator features = new FeatureAccumulator();
+    computeFinal(tailNode, i, j, sourcePath, sentence, features);
+    return features.getFeatures();
+  }
+
+  /**
+   * This function is called when sorting rules for cube pruning. It must return
+   * the *weighted* estimated cost of applying a feature. This need not be the
+   * actual cost of applying the rule in context. Basically, it's the inner
+   * product of the weight vector and all features found in the grammar rule,
+   * though some features (like LanguageModelFF) can also compute some of their
+   * values. This is just an estimate of the cost, which helps do better
+   * sorting. Later, the real cost of this feature function is called via
+   * compute();
+   *
+   * @param rule {@link org.apache.joshua.decoder.ff.tm.Rule} to be utilized within computation
+   * @param sentence the input {@link Sentence}
+   * @return the *weighted* cost of applying the feature.
+   */
+  public abstract float estimateCost(Rule rule, Sentence sentence);
+
+  /**
+   * This feature is called to produce a *weighted estimate* of the future cost
+   * of applying this feature. This value is not incorporated into the model
+   * score but is used in pruning decisions. Stateless features return 0.0f by
+   * default, but Stateful features might want to override this.
+   *
+   * @param rule {@link org.apache.joshua.decoder.ff.tm.Rule} to be utilized within computation
+   * @param state todo
+   * @param sentence the input {@link Sentence}
+   * @return the *weighted* future cost estimate of applying this rule in
+   *         context.
+   */
+  public abstract float estimateFutureCost(Rule rule, DPState state, Sentence sentence);
+
+  /**
+   * Parses the arguments passed to a feature function in the Joshua config file.
+   * TODO: Replace this with a proper CLI library at some point.
+   *
+   * <p>Expects key value pairs in the form {@code -argname value}. A token is a key when it
+   * starts with a dash followed by a letter and at least one more non-whitespace character; the
+   * leading dash is stripped from the stored key. Any key without a value (whether it is followed
+   * by another key or is the last token) is added with an empty string as value. Only the first
+   * token after a key is taken as its value; further non-key tokens are ignored. If the same key
+   * appears more than once, the last occurrence wins.</p>
+   *
+   * @param args the raw argument tokens; {@code null} is treated as no arguments
+   * @return A hash with the keys and the values of the arguments
+   */
+  public static HashMap<String, String> parseArgs(String[] args) {
+    HashMap<String, String> parsedArgs = new HashMap<String, String>();
+    if (args == null) {
+      return parsedArgs;
+    }
+
+    // The key that has been seen but has not yet received a value, or null if none.
+    String pendingKey = null;
+    for (String arg : args) {
+      if (ARG_KEY_PATTERN.matcher(arg).find()) {
+        // A new key starts; a key still waiting for its value had none specified.
+        if (pendingKey != null) {
+          parsedArgs.put(pendingKey, "");
+        }
+        pendingKey = arg.substring(1);
+      } else if (pendingKey != null) {
+        parsedArgs.put(pendingKey, arg);
+        pendingKey = null;
+      }
+    }
+
+    // A key at the very end of the argument list has no value either; previously it was dropped.
+    if (pendingKey != null) {
+      parsedArgs.put(pendingKey, "");
+    }
+    return parsedArgs;
+  }
+
+  /**
+   * Accumulator objects allow us to generalize feature computation.
+   * ScoreAccumulator takes (feature,value) pairs and simply stores the weighted
+   * sum (for decoding). FeatureAccumulator records the named feature values
+   * (for k-best extraction).
+   */
+  public interface Accumulator {
+    public void add(String name, float value);
+    public void add(int id, float value);
+  }
+
+  /**
+   * Accumulates a single weighted score: each added value is multiplied by the matching
+   * weight from the enclosing feature function's weight vector.
+   */
+  public class ScoreAccumulator implements Accumulator {
+    private float score;
+
+    public ScoreAccumulator() {
+      this.score = 0.0f;
+    }
+
+    @Override
+    public void add(String name, float value) {
+      score += value * weights.getSparse(name);
+    }
+
+    @Override
+    public void add(int id, float value) {
+      score += value * weights.getDense(id);
+    }
+
+    public float getScore() {
+      return score;
+    }
+  }
+
+  /**
+   * Accumulates the individual (unweighted) feature values into a fresh {@link FeatureVector}.
+   */
+  public class FeatureAccumulator implements Accumulator {
+    private FeatureVector features;
+
+    public FeatureAccumulator() {
+      this.features = new FeatureVector();
+    }
+
+    @Override
+    public void add(String name, float value) {
+      features.increment(name, value);
+    }
+
+    @Override
+    public void add(int id, float value) {
+      features.increment(id, value);
+    }
+
+    public FeatureVector getFeatures() {
+      return features;
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/FeatureVector.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/FeatureVector.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/FeatureVector.java
new file mode 100644
index 0000000..1b39c78
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/FeatureVector.java
@@ -0,0 +1,385 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * An implementation of a sparse feature vector, using for representing both weights and feature
+ * values.
+ *
+ * This class is used to hold both the decoder weights and the feature values accumulated across
+ * each edge. When features are read in upon decoder startup, they all start out as sparse features
+ * and are stored in the hash table. After the feature functions have been loaded, the decoder
+ * queries each of them for their sparse features via {@link registerDenseFeatures}. Those features
+ * returned by each decoder are then *removed* from the sparse feature hash and placed in the dense
+ * feature array. Therefore, when a feature registers a dense feature, it should take care to
+ * query either {@link org.apache.joshua.decoder.ff.FeatureVector#getDense(int)} or
+ * {@link org.apache.joshua.decoder.ff.FeatureVector#getSparse(String)} when asking for the feature
+ * values later on.
+ *
+ * @author Matt Post post@cs.jhu.edu
+ */
+
+public class FeatureVector {
+ /*
+ * A list of the dense feature names. Increased via calls to registerDenseFeatures()
+ */
+ public static ArrayList<String> DENSE_FEATURE_NAMES = new ArrayList<String>();
+
+ /*
+ * The values of each of the dense features, defaulting to 0.
+ */
+ private ArrayList<Float> denseFeatures = null;
+
+ /*
+ * Value of sparse features.
+ */
+ private HashMap<String, Float> sparseFeatures;
+
+ public FeatureVector() {
+ sparseFeatures = new HashMap<String, Float>();
+ denseFeatures = new ArrayList<Float>(DENSE_FEATURE_NAMES.size());
+ for (int i = 0; i < denseFeatures.size(); i++)
+ denseFeatures.set(i, 0.0f);
+ }
+
+ /**
+ * This version of the constructor takes an uninitialized feature with potentially intermingled
+ * labeled and unlabeled feature values, of the format:
+ *
+ * [feature1=]value [feature2=]value
+ *
+ * It produces a Feature Vector where all unlabeled features have been labeled by appending the
+ * unlabeled feature index (starting at 0) to the defaultPrefix value.
+ *
+ * **IMPORTANT** The feature values are inverted, for historical reasons, which leads to a lot
+ * of confusion. They have to be inverted here and when the score is actually computed. They
+ * are inverted here (which is used to build the feature vector representation of a rule's dense
+ * features) and in {@link org.apache.joshua.decoder.ff.tm.Rule#estimateRuleCost(java.util.List)}
+ * , where the rule's precomputable (weighted) score is cached.
+ *
+ * @param featureString, the string of labeled and unlabeled features (probably straight from the
+ * grammar text file)
+ * @param prefix, the prefix to use for unlabeled features (probably "tm_OWNER_")
+ */
+ public FeatureVector(String featureString, String prefix) {
+
+// System.err.println(String.format("FEATURES_OF(%s, %s)", featureString, prefix));
+
+ /*
+ * Read through the features on this rule, adding them to the feature vector. Unlabeled features
+ * are converted to a canonical form.
+ *
+ * Note that it's bad form to mix unlabeled features and the named feature index they are mapped
+ * to, but we are being liberal in what we accept.
+ *
+ * IMPORTANT: Note that, for historical reasons, the sign is reversed on all *dense* scores.
+ * This is the source of *no end* of confusion and should be done away with.
+ */
+ this();
+
+ int denseFeatureIndex = 0;
+
+ if (!featureString.trim().equals("")) {
+ for (String token : featureString.split("\\s+")) {
+ if (token.indexOf('=') == -1) {
+ /*
+ * If we encounter an unlabeled feature, it is the next dense feature
+ */
+ while (denseFeatures.size() <= denseFeatureIndex)
+ denseFeatures.add(0.0f);
+ denseFeatures.set(denseFeatureIndex, -Float.parseFloat(token));
+ denseFeatureIndex++;
+ } else {
+ /*
+ * Labeled features are of two types: if they start with the prefix, they are actually
+ * dense feature in disguise; otherwise, they are proper sparse features.
+ */
+ int splitPoint = token.indexOf('=');
+ if (token.startsWith(prefix)) {
+// System.err.println(String.format(" PREFIX=%s '%s'.substring(%d,%d) = %s", prefix, token, prefix.length(), splitPoint,
+// token.substring(prefix.length(), splitPoint)));
+ int index = Integer.parseInt(token.substring(prefix.length(), splitPoint));
+ while (denseFeatures.size() <= index)
+ denseFeatures.add(0.0f);
+ denseFeatures.set(index, 1.0f * Float.parseFloat(token.substring(splitPoint + 1)));
+ } else {
+ sparseFeatures.put(token.substring(0, splitPoint),
+ Float.parseFloat(token.substring(splitPoint + 1)));
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * Register one or more dense features with the global weight vector. This assumes them global
+ * IDs, and then returns the index of the first feature (from which the calling feature function
+ * can infer them all). This *must* be called by every feature function wishing to register
+ * dense features!
+ *
+ * @param featureFunctions {@link java.util.ArrayList} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s
+ */
+ public void registerDenseFeatures(ArrayList<FeatureFunction> featureFunctions) {
+ for (FeatureFunction feature: featureFunctions) {
+ ArrayList<String> names = feature.reportDenseFeatures(denseFeatures.size());
+ for (String name: names) {
+ DENSE_FEATURE_NAMES.add(name);
+ denseFeatures.add(getSparse(name));
+ sparseFeatures.remove(name);
+ }
+ }
+ }
+
+ public ArrayList<Float> getDenseFeatures() {
+ return denseFeatures;
+ }
+
+ public HashMap<String,Float> getSparseFeatures() {
+ return sparseFeatures;
+ }
+
+ public Set<String> keySet() {
+ return sparseFeatures.keySet();
+ }
+
+ public int size() {
+ return sparseFeatures.size() + denseFeatures.size();
+ }
+
+ public FeatureVector clone() {
+ FeatureVector newOne = new FeatureVector();
+ for (String key : this.sparseFeatures.keySet())
+ newOne.set(key, this.sparseFeatures.get(key));
+ for (int i = 0; i < denseFeatures.size(); i++)
+ newOne.set(i, getDense(i));
+ return newOne;
+ }
+
+ /**
+ * Subtracts the weights in the other feature vector from this one. Note that this is not set
+ * subtraction; keys found in the other FeatureVector but not in this one will be initialized with
+ * a value of 0.0f before subtraction.
+ *
+ * @param other another {@link org.apache.joshua.decoder.ff.FeatureVector} from which to subtract its score
+ */
+ public void subtract(FeatureVector other) {
+ for (int i = 0; i < denseFeatures.size(); i++)
+ denseFeatures.set(i, getDense(i) - other.getDense(i));
+
+ for (String key : other.keySet()) {
+ float oldValue = (sparseFeatures.containsKey(key)) ? sparseFeatures.get(key) : 0.0f;
+ sparseFeatures.put(key, oldValue - other.getSparse(key));
+ }
+ }
+
+ /**
+ * Adds the weights in the other feature vector to this one. This is set union, with values shared
+ * between the two being summed.
+ *
+ * @param other another {@link org.apache.joshua.decoder.ff.FeatureVector} from which to add its score
+ */
+ public void add(FeatureVector other) {
+ while (denseFeatures.size() < other.denseFeatures.size())
+ denseFeatures.add(0.0f);
+
+ for (int i = 0; i < other.denseFeatures.size(); i++)
+ increment(i, other.getDense(i));
+
+ for (String key : other.keySet()) {
+ if (!sparseFeatures.containsKey(key))
+ sparseFeatures.put(key, other.getSparse(key));
+ else
+ sparseFeatures.put(key, sparseFeatures.get(key) + other.getSparse(key));
+ }
+ }
+
+ /**
+ * Return the weight of a feature by name, after checking to determine if it is sparse or dense.
+ *
+ * @param feature String name of some feature
+ * @return the feature's weight
+ */
+ public float getWeight(String feature) {
+ for (int i = 0; i < DENSE_FEATURE_NAMES.size(); i++) {
+ if (DENSE_FEATURE_NAMES.get(i).equals(feature)) {
+ return getDense(i);
+ }
+ }
+ return getSparse(feature);
+ }
+
+ /**
+ * Return the weight of a sparse feature, indexed by its name.
+ *
+ * @param feature String name of some feature
+ * @return the sparse feature's weight, or 0 if not found.
+ */
+ public float getSparse(String feature) {
+ if (sparseFeatures.containsKey(feature))
+ return sparseFeatures.get(feature);
+ return 0.0f;
+ }
+
+ public boolean hasValue(String name) {
+ return sparseFeatures.containsKey(name);
+ }
+
+ /**
+ * Return the weight of a dense feature, indexed by its feature index, or 0.0f, if the feature
+ * is not found. In other words, this is a safe way to query the dense feature vector.
+ *
+ * @param id int representing of some dense feature
+ * @return the dense feature's value, or 0 if not found.
+ */
+ public float getDense(int id) {
+ if (id < denseFeatures.size())
+ return denseFeatures.get(id);
+ return 0.0f;
+ }
+
+ public void increment(String feature, float value) {
+ sparseFeatures.put(feature, getSparse(feature) + value);
+ }
+
+ public void increment(int id, float value) {
+ while (id >= denseFeatures.size())
+ denseFeatures.add(0.0f);
+ denseFeatures.set(id, getDense(id) + value);
+ }
+
+ /**
+ * Set the value of a feature. We need to first determine whether the feature is a dense or
+ * sparse one, then set accordingly.
+ *
+ * @param feature String name of some feature
+ * @param value float value to set to the featue with the associated name
+ */
+ public void set(String feature, float value) {
+ for (int i = 0; i < DENSE_FEATURE_NAMES.size(); i++) {
+ if (DENSE_FEATURE_NAMES.get(i).equals(feature)) {
+ denseFeatures.set(i, value);
+ return;
+ }
+ }
+ // No dense feature was found; assume it's sparse
+ sparseFeatures.put(feature, value);
+ }
+
+ public void set(int id, float value) {
+ while (id >= denseFeatures.size())
+ denseFeatures.add(0.0f);
+ denseFeatures.set(id, value);
+ }
+
+ public Map<String, Float> getMap() {
+ Map<String, Float> allFeatures = new HashMap<>(sparseFeatures.size() + denseFeatures.size());
+ allFeatures.putAll(sparseFeatures);
+ for (int i = 0; i < DENSE_FEATURE_NAMES.size(); i++) {
+ allFeatures.put(DENSE_FEATURE_NAMES.get(i), getDense(i));
+ }
+ return allFeatures;
+ }
+
+ /**
+ * Computes the inner product between this feature vector and another one.
+ *
+ * @param other a {@link org.apache.joshua.decoder.ff.FeatureVector} with which to compute the inner product
+ * @return float value representing the computation
+ */
+ public float innerProduct(FeatureVector other) {
+ float cost = 0.0f;
+ for (int i = 0; i < DENSE_FEATURE_NAMES.size(); i++)
+ cost += getDense(i) * other.getDense(i);
+
+ for (String key : sparseFeatures.keySet())
+ cost += sparseFeatures.get(key) * other.getSparse(key);
+
+ return cost;
+ }
+
+ public void times(float value) {
+ for (String key : sparseFeatures.keySet())
+ sparseFeatures.put(key, sparseFeatures.get(key) * value);
+ }
+
+ /***
+ * Moses distinguishes sparse features as those containing an underscore, so we have to fake it
+ * to be compatible with their tuners.
+ *
+ * @return trimmed Moses output string
+ */
+ public String mosesString() {
+ StringBuilder outputString = new StringBuilder();
+
+ HashSet<String> printed_keys = new HashSet<String>();
+
+ // First print all the dense feature names in order
+ for (int i = 0; i < DENSE_FEATURE_NAMES.size(); i++) {
+ outputString.append(String.format("%s=%.3f ", DENSE_FEATURE_NAMES.get(i).replaceAll("_", "-"), getDense(i)));
+ printed_keys.add(DENSE_FEATURE_NAMES.get(i));
+ }
+
+ // Now print the sparse features
+ ArrayList<String> keys = new ArrayList<String>(sparseFeatures.keySet());
+ Collections.sort(keys);
+ for (String key: keys) {
+ if (! printed_keys.contains(key)) {
+ float value = sparseFeatures.get(key);
+ if (key.equals("OOVPenalty"))
+ // force moses to see it as sparse
+ key = "OOV_Penalty";
+ outputString.append(String.format("%s=%.3f ", key, value));
+ }
+ }
+ return outputString.toString().trim();
+ }
+
+ /***
+ * Outputs a list of feature names. All dense features are printed. Feature names are printed
+ * in the order they were read in.
+ */
+ @Override
+ public String toString() {
+ StringBuilder outputString = new StringBuilder();
+
+ HashSet<String> printed_keys = new HashSet<String>();
+
+ // First print all the dense feature names in order
+ for (int i = 0; i < DENSE_FEATURE_NAMES.size(); i++) {
+ outputString.append(String.format("%s=%.3f ", DENSE_FEATURE_NAMES.get(i), getDense(i)));
+ printed_keys.add(DENSE_FEATURE_NAMES.get(i));
+ }
+
+ // Now print the rest of the features
+ ArrayList<String> keys = new ArrayList<String>(sparseFeatures.keySet());
+ Collections.sort(keys);
+ for (String key: keys)
+ if (! printed_keys.contains(key))
+ outputString.append(String.format("%s=%.3f ", key, sparseFeatures.get(key)));
+
+ return outputString.toString().trim();
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/LabelCombinationFF.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/LabelCombinationFF.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/LabelCombinationFF.java
new file mode 100644
index 0000000..bfebaa5
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/LabelCombinationFF.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff;
+
+/***
+ * @author Gideon Wenniger
+ */
+
+import java.util.List;
+
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.chart_parser.SourcePath;
+import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.hypergraph.HGNode;
+import org.apache.joshua.decoder.segment_file.Sentence;
+
+/**
+ * Stateless feature function that fires one sparse feature per applied rule, named after the
+ * rule's left-hand side and its source-side nonterminals.
+ */
+public class LabelCombinationFF extends StatelessFF {
+
+  public LabelCombinationFF(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+    super(weights, "LabelCombination", args, config);
+  }
+
+  /**
+   * @return this feature function's name in lower case
+   */
+  public String getLowerCasedFeatureName() {
+    return name.toLowerCase();
+  }
+
+  /**
+   * Builds the feature name for a rule: the lower-cased feature name, an underscore, the rule's
+   * left-hand side, and then each source-side nonterminal preceded by an underscore.
+   */
+  private final String computeRuleLabelCombinationDescriptor(Rule rule) {
+    StringBuilder descriptor = new StringBuilder();
+    descriptor.append(getLowerCasedFeatureName()).append("_");
+    descriptor.append(RulePropertiesQuerying.getLHSAsString(rule));
+    for (String sourceNonterminal : RulePropertiesQuerying.getRuleSourceNonterminalStrings(rule)) {
+      descriptor.append("_");
+      descriptor.append(sourceNonterminal);
+    }
+    return descriptor.toString();
+  }
+
+  /**
+   * Adds a count of 1 for the rule's label-combination feature; does nothing for a null rule.
+   *
+   * @return always null, since this feature keeps no state
+   */
+  @Override
+  public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+      Sentence sentence, Accumulator acc) {
+    if (rule == null) {
+      return null;
+    }
+
+    acc.add(computeRuleLabelCombinationDescriptor(rule), 1);
+    return null;
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/LabelSubstitutionFF.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/LabelSubstitutionFF.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/LabelSubstitutionFF.java
new file mode 100644
index 0000000..8735be6
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/LabelSubstitutionFF.java
@@ -0,0 +1,131 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff;
+
+/***
+ * @author Gideon Wenniger
+ */
+
+import java.util.List;
+
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.chart_parser.SourcePath;
+import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.hypergraph.HGNode;
+import org.apache.joshua.decoder.segment_file.Sentence;
+import org.apache.joshua.util.ListUtil;
+
+public class LabelSubstitutionFF extends StatelessFF {
+  private static final String MATCH_SUFFIX = "MATCH";
+  private static final String NO_MATCH_SUFFIX = "NOMATCH";
+
+  /**
+   * Creates the feature function, registering it under the name "LabelSubstitution".
+   *
+   * @param weights feature weights, passed through to the superclass
+   * @param args feature arguments, passed through to the superclass
+   * @param config decoder configuration, passed through to the superclass
+   */
+  public LabelSubstitutionFF(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+    super(weights, "LabelSubstitution", args, config);
+  }
+
+  /**
+   * @return the name of this feature function converted to lower case
+   */
+  public String getLowerCasedFeatureName() {
+    return name.toLowerCase();
+  }
+
+  /**
+   * @param ruleNonterminal the nonterminal label found in the rule
+   * @param substitutionNonterminal the label of the node substituted into that position
+   * @return "MATCH" when both labels are equal, "NOMATCH" otherwise
+   */
+  public String getMatchFeatureSuffix(String ruleNonterminal, String substitutionNonterminal) {
+    return ruleNonterminal.equals(substitutionNonterminal) ? MATCH_SUFFIX : NO_MATCH_SUFFIX;
+  }
+
+  /**
+   * @param ruleNonterminal the nonterminal label found in the rule
+   * @param substitutionNonterminal the label of the node substituted into that position
+   * @return a suffix of the form {@code <substitution>_substitutes_<rule>}
+   */
+  public static String getSubstitutionSuffix(String ruleNonterminal, String substitutionNonterminal) {
+    return substitutionNonterminal + "_substitutes_" + ruleNonterminal;
+  }
+
+  /** Feature name stating only whether the two labels match. */
+  private final String computeLabelMatchingFeature(String ruleNonterminal,
+      String substitutionNonterminal) {
+    return getLowerCasedFeatureName() + "_"
+        + getMatchFeatureSuffix(ruleNonterminal, substitutionNonterminal);
+  }
+
+  /** Feature name identifying the concrete pair of substituted and substituting labels. */
+  private final String computeLabelSubstitutionFeature(String ruleNonterminal,
+      String substitutionNonterminal) {
+    return getLowerCasedFeatureName() + "_"
+        + getSubstitutionSuffix(ruleNonterminal, substitutionNonterminal);
+  }
+
+  /**
+   * Describes a rule by its left-hand side, its source-side nonterminals, and whether the rule
+   * is inverting ("_INV") or not ("_MONO").
+   */
+  private static final String getRuleLabelsDescriptorString(Rule rule) {
+    final String lhs = RulePropertiesQuerying.getLHSAsString(rule);
+    final List<String> sourceLabels = RulePropertiesQuerying.getRuleSourceNonterminalStrings(rule);
+    final boolean inverting = rule.isInverting();
+
+    final StringBuilder descriptor = new StringBuilder();
+    descriptor.append("<LHS>").append(lhs).append("</LHS>");
+    descriptor.append("_<Nont>");
+    descriptor.append(ListUtil.stringListStringWithoutBracketsCommaSeparated(sourceLabels));
+    descriptor.append("</Nont>");
+    descriptor.append(inverting ? "_INV" : "_MONO");
+    return descriptor.toString();
+  }
+
+  /** Describes the source-side labels of the tail nodes that fill the rule's gaps. */
+  private static final String getSubstitutionsDescriptorString(List<HGNode> tailNodes) {
+    final List<String> substitutedLabels = RulePropertiesQuerying
+        .getSourceNonterminalStrings(tailNodes);
+    return "_<Subst>"
+        + ListUtil.stringListStringWithoutBracketsCommaSeparated(substitutedLabels)
+        + "</Subst>";
+  }
+
+  /**
+   * Builds the feature name that combines the rule's label description with the labels of the
+   * nodes substituted into it.
+   *
+   * @param rule the applied rule
+   * @param tailNodes the nodes substituted into the rule's nonterminal positions
+   * @return the combined feature name, prefixed with the lower-cased feature name
+   */
+  public final String getGapLabelsForRuleSubstitutionSuffix(Rule rule, List<HGNode> tailNodes) {
+    final StringBuilder feature = new StringBuilder(getLowerCasedFeatureName());
+    feature.append("_");
+    feature.append(getRuleLabelsDescriptorString(rule));
+    feature.append(getSubstitutionsDescriptorString(tailNodes));
+    return feature.toString();
+  }
+
+  /**
+   * For every nonterminal position of the rule, fires a match/no-match feature and a
+   * label-substitution feature, followed by one feature describing the whole rule together with
+   * its substitutions. Nothing is fired when the rule or the tail nodes are null.
+   *
+   * @return always null, since this feature function keeps no dynamic programming state
+   */
+  @Override
+  public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+      Sentence sentence, Accumulator acc) {
+    if (rule == null || tailNodes == null) {
+      return null;
+    }
+
+    final List<String> ruleLabels = RulePropertiesQuerying.getRuleSourceNonterminalStrings(rule);
+    final List<String> substitutedLabels = RulePropertiesQuerying
+        .getSourceNonterminalStrings(tailNodes);
+
+    // The two lists are walked in parallel; the rule's list determines the number of positions.
+    final int positions = ruleLabels.size();
+    for (int position = 0; position < positions; position++) {
+      final String ruleLabel = ruleLabels.get(position);
+      final String substitutedLabel = substitutedLabels.get(position);
+      acc.add(computeLabelMatchingFeature(ruleLabel, substitutedLabel), 1);
+      acc.add(computeLabelSubstitutionFeature(ruleLabel, substitutedLabel), 1);
+    }
+    acc.add(getGapLabelsForRuleSubstitutionSuffix(rule, tailNodes), 1);
+
+    return null;
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/LexicalFeatures.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/LexicalFeatures.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/LexicalFeatures.java
new file mode 100644
index 0000000..58de5f4
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/LexicalFeatures.java
@@ -0,0 +1,150 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff;
+
+import static com.google.common.cache.CacheBuilder.newBuilder;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.chart_parser.SourcePath;
+import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.hypergraph.HGNode;
+import org.apache.joshua.decoder.segment_file.Sentence;
+import org.apache.joshua.util.FormatUtils;
+
+import com.google.common.cache.Cache;
+
+/**
+ * Lexical alignment features denoting alignments, deletions, and insertions.
+ */
+public class LexicalFeatures extends StatelessFF {
+
+  // which of the three feature templates are active
+  private final boolean useAlignments;
+  private final boolean useDeletions;
+  private final boolean useInsertions;
+
+  private static final String NAME = "LexicalFeatures";
+  // value to fire for features
+  private static final int VALUE = 1;
+  // whether this feature is restricted to a certain grammar/owner
+  private final boolean ownerRestriction;
+  // the grammar/owner this feature is restricted to fire
+  private final int owner;
+  // Strings separating words
+  private static final String SEPARATOR = "~";
+
+  // feature names already computed for a rule, bounded in size
+  private final Cache<Rule, List<String>> featureCache;
+
+  /**
+   * Reads the optional arguments "owner", "alignments", "deletions", "insertions" and
+   * "cacheSize". When "cacheSize" is absent, the cache is bounded by
+   * {@code config.cachedRuleSize}.
+   *
+   * @param weights feature weights, passed through to the superclass
+   * @param args feature arguments, passed through to the superclass
+   * @param config decoder configuration
+   */
+  public LexicalFeatures(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+    super(weights, NAME, args, config);
+
+    ownerRestriction = parsedArgs.containsKey("owner");
+    owner = ownerRestriction ? Vocabulary.id(parsedArgs.get("owner")) : 0;
+
+    useAlignments = parsedArgs.containsKey("alignments");
+    useDeletions = parsedArgs.containsKey("deletions");
+    useInsertions = parsedArgs.containsKey("insertions");
+
+    // initialize cache
+    if (parsedArgs.containsKey("cacheSize")) {
+      featureCache = newBuilder().maximumSize(Integer.parseInt(parsedArgs.get("cacheSize"))).build();
+    } else {
+      featureCache = newBuilder().maximumSize(config.cachedRuleSize).build();
+    }
+  }
+
+  /**
+   * Fires every lexical feature of the given rule with a value of 1. Feature names are computed
+   * once per rule and then served from the cache.
+   *
+   * @return always null, since this feature function keeps no dynamic programming state
+   */
+  @Override
+  public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+      Sentence sentence, Accumulator acc) {
+
+    // A null rule has no lexical content; it would also fail both the owner check and the
+    // cache lookup (Guava caches reject null keys), so bail out early.
+    if (rule == null) {
+      return null;
+    }
+
+    if (ownerRestriction && rule.getOwner() != owner) {
+      return null;
+    }
+
+    List<String> featureNames = featureCache.getIfPresent(rule);
+    if (featureNames == null) {
+      featureNames = getFeatures(rule);
+      featureCache.put(rule, featureNames);
+    }
+    for (String feature : featureNames) {
+      acc.add(feature, VALUE);
+    }
+
+    return null;
+  }
+
+  /**
+   * Obtains the feature names for the given rule: "T:source~target" for each alignment point,
+   * "D:word" for each unaligned source terminal, and "I:word" for each unaligned target
+   * terminal, each only when the corresponding template is enabled.
+   *
+   * @param rule the rule to extract features from
+   * @return the feature names; empty when the rule carries no alignment
+   */
+  private List<String> getFeatures(final Rule rule) {
+    final List<String> result = new ArrayList<>();
+
+    byte[] alignments = rule.getAlignment();
+    if (alignments == null) {
+      return result;
+    }
+    int[] sourceWords = rule.getFrench();
+    int[] targetWords = rule.getEnglish();
+
+    // sourceAligned & targetAligned indicate whether an index is covered by alignments
+    boolean[] sourceAligned = new boolean[sourceWords.length];
+    boolean[] targetAligned = new boolean[targetWords.length];
+
+    // translations: aligned words; the array is read as consecutive (source, target) index pairs
+    for (int i = 0; i < alignments.length; i += 2) {
+      byte sourceIndex = alignments[i];
+      byte targetIndex = alignments[i + 1];
+      sourceAligned[sourceIndex] = true;
+      targetAligned[targetIndex] = true;
+      if (useAlignments) {
+        result.add(
+            "T:" +
+            Vocabulary.word(sourceWords[sourceIndex]) +
+            SEPARATOR +
+            Vocabulary.word(targetWords[targetIndex]));
+      }
+    }
+
+    // deletions: unaligned source words
+    if (useDeletions) {
+      for (int i = 0; i < sourceAligned.length; i++) {
+        if (!sourceAligned[i] && !FormatUtils.isNonterminal(sourceWords[i])) {
+          result.add("D:" + Vocabulary.word(sourceWords[i]));
+        }
+      }
+    }
+
+    // insertions: unaligned target words
+    if (useInsertions) {
+      for (int i = 0; i < targetAligned.length; i++) {
+        if (!targetAligned[i] && !FormatUtils.isNonterminal(targetWords[i])) {
+          result.add("I:" + Vocabulary.word(targetWords[i]));
+        }
+      }
+    }
+
+    return result;
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/OOVPenalty.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/OOVPenalty.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/OOVPenalty.java
new file mode 100644
index 0000000..5278172
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/OOVPenalty.java
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.JoshuaConfiguration.OOVItem;
+import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.hypergraph.HGNode;
+import org.apache.joshua.decoder.segment_file.Sentence;
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.chart_parser.SourcePath;
+
+/**
+ * This feature is fired when an out-of-vocabulary word (with respect to the translation model) is
+ * entered into the chart. OOVs work in the following manner: for each word in the input that is OOV
+ * with respect to the translation model, we create a rule that pushes that word through
+ * untranslated (the suffix "_OOV" can optionally be appended according to the runtime parameter
+ * "mark-oovs") . These rules are all stored in a grammar whose owner is "oov". The OOV feature
+ * function template then fires the "OOVPenalty" feature whenever it is asked to score an OOV rule.
+ *
+ * @author Matt Post post@cs.jhu.edu
+ */
+public class OOVPenalty extends StatelessFF {
+
+  /* The default value returned for OOVs. Can be overridden with -oov-list */
+  private static final float DEFAULT_OOV_VALUE = -100f;
+
+  /* Vocabulary id of the "oov" grammar owner. */
+  private final int ownerID;
+
+  /* Per-label feature values taken from the configured OOV list, keyed by vocabulary id. */
+  private final HashMap<Integer,Float> oovWeights;
+
+  /**
+   * Creates the feature and copies the per-label values from {@code config.oovList}, if one was
+   * configured.
+   *
+   * @param weights feature weights, passed through to the superclass
+   * @param args feature arguments, passed through to the superclass
+   * @param config decoder configuration providing the optional OOV list
+   */
+  public OOVPenalty(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+    super(weights, "OOVPenalty", args, config);
+
+    ownerID = Vocabulary.id("oov");
+    oovWeights = new HashMap<>();
+
+    if (config.oovList == null) {
+      return;
+    }
+    for (OOVItem item : config.oovList) {
+      oovWeights.put(Vocabulary.id(item.label), item.weight);
+    }
+  }
+
+  /**
+   * Records the dense feature index assigned to this feature and reports its single dense
+   * feature name.
+   *
+   * @param index the dense index assigned to this feature
+   * @return a one-element list holding this feature's name
+   */
+  @Override
+  public ArrayList<String> reportDenseFeatures(int index) {
+    denseFeatureIndex = index;
+
+    final ArrayList<String> denseNames = new ArrayList<>(1);
+    denseNames.add(name);
+    return denseNames;
+  }
+
+  /**
+   * Adds the OOV value for the rule's left-hand side to the dense feature whenever the rule
+   * belongs to the "oov" grammar. Rules of any other owner, and null rules, contribute nothing.
+   *
+   * @return always null, since this feature function keeps no dynamic programming state
+   */
+  @Override
+  public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+      Sentence sentence, Accumulator acc) {
+
+    if (isOovRule(rule)) {
+      acc.add(denseFeatureIndex, getValue(rule.getLHS()));
+    }
+
+    return null;
+  }
+
+  /**
+   * It's important for the OOV feature to contribute to the rule's estimated cost, so that OOV
+   * rules (which are added for all words, not just ones without translation options) get sorted
+   * to the bottom during cube pruning.
+   *
+   * Important! estimateCost returns the *weighted* feature value.
+   */
+  @Override
+  public float estimateCost(Rule rule, Sentence sentence) {
+    if (!isOovRule(rule)) {
+      return 0.0f;
+    }
+    return weights.getDense(denseFeatureIndex) * getValue(rule.getLHS());
+  }
+
+  /** True when the rule is non-null and owned by the "oov" grammar. */
+  private boolean isOovRule(Rule rule) {
+    return rule != null && this.ownerID == rule.getOwner();
+  }
+
+  /** Looks up the value configured for the given left-hand side, falling back to the default. */
+  private float getValue(int lhs) {
+    if (oovWeights.containsKey(lhs)) {
+      return oovWeights.get(lhs);
+    }
+    return DEFAULT_OOV_VALUE;
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/PhraseModel.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/PhraseModel.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/PhraseModel.java
new file mode 100644
index 0000000..2324292
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/PhraseModel.java
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.chart_parser.SourcePath;
+import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+import org.apache.joshua.decoder.ff.tm.Grammar;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.hypergraph.HGNode;
+import org.apache.joshua.decoder.segment_file.Sentence;
+
+/**
+ * This feature handles the dense features that may be associated with the rules in a grammar
+ * file. The feature names of these dense features are a function of the phrase model owner.
+ * When the feature is loaded, it
+ * queries the weights for the set of features that are active for this grammar, storing them in an
+ * array.
+ *
+ * @author Matt Post post@cs.jhu.edu
+ * @author Zhifei Li zhifei.work@gmail.com
+ */
+
+public class PhraseModel extends StatelessFF {
+
+ /* The owner of the grammar. */
+ private int ownerID;
+ private String owner;
+
+ private float[] phrase_weights = null;
+
+ public PhraseModel(FeatureVector weights, String[] args, JoshuaConfiguration config, Grammar g) {
+ super(weights, "tm_", args, config);
+
+ String owner = parsedArgs.get("owner");
+ this.name = String.format("tm_%s", owner);
+
+ /*
+ * Determine the number of features by querying the example grammar that was passed in.
+ */
+ phrase_weights = new float[g.getNumDenseFeatures()];
+// System.err.println(String.format("GOT %d FEATURES FOR %s", g.getNumDenseFeatures(), owner));
+ for (int i = 0; i < phrase_weights.length; i++)
+ phrase_weights[i] = weights.getSparse(String.format("tm_%s_%d", owner, i));
+
+ // Store the owner.
+ this.owner = owner;
+ this.ownerID = Vocabulary.id(owner);
+ }
+
+ /**
+ * Just register a single weight, tm_OWNER, and use that to set its precomputed cost
+ */
+ @Override
+ public ArrayList<String> reportDenseFeatures(int index) {
+ denseFeatureIndex = index;
+
+ ArrayList<String> names = new ArrayList<String>();
+ for (int i = 0; i < phrase_weights.length; i++)
+ names.add(String.format("tm_%s_%d", owner, i));
+ return names;
+ }
+
+ /**
+ * Estimates the cost of applying this rule, which is just the score of the precomputable feature
+ * functions.
+ */
+ @Override
+ public float estimateCost(final Rule rule, Sentence sentence) {
+
+ if (rule != null && rule.getOwner() == ownerID) {
+ if (rule.getPrecomputableCost() <= Float.NEGATIVE_INFINITY)
+ rule.setPrecomputableCost(phrase_weights, weights);
+
+ return rule.getPrecomputableCost();
+ }
+
+ return 0.0f;
+ }
+
+ /**
+ * Just chain to computeFeatures(rule), since this feature doesn't use the sourcePath or sentID. *
+ */
+ @Override
+ public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+ Sentence sentence, Accumulator acc) {
+
+ if (rule != null && rule.getOwner() == ownerID) {
+ /*
+ * Here, we peak at the Accumulator object. If it's asking for scores, then we don't bother to
+ * add each feature, but rather compute the inner product and add *that*. This is totally
+ * cheating; the Accumulator is supposed to be a generic object. But without this cheat
+ */
+ if (rule.getPrecomputableCost() <= Float.NEGATIVE_INFINITY) {
+ // float score = rule.getFeatureVector().innerProduct(weights);
+ rule.setPrecomputableCost(phrase_weights, weights);
+ }
+
+// System.err.println(String.format("RULE = %s / %f", rule.getEnglishWords(), rule.getPrecomputableCost()));
+ for (int k = 0; k < phrase_weights.length; k++) {
+// System.err.println(String.format("k = %d, denseFeatureIndex = %d, owner = %s, ownerID = %d", k, denseFeatureIndex, owner, ownerID));
+ acc.add(k + denseFeatureIndex, rule.getDenseFeature(k));
+ }
+
+ for (String key: rule.getFeatureVector().keySet())
+ acc.add(key, rule.getFeatureVector().getSparse(key));
+ }
+
+ return null;
+ }
+
+ public String toString() {
+ return name + " " + Vocabulary.word(ownerID);
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/PhrasePenalty.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/PhrasePenalty.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/PhrasePenalty.java
new file mode 100644
index 0000000..3c38e60
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/PhrasePenalty.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.chart_parser.SourcePath;
+import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.hypergraph.HGNode;
+import org.apache.joshua.decoder.phrase.Hypothesis;
+import org.apache.joshua.decoder.segment_file.Sentence;
+
+/**
+ * This feature just counts rules that are used. You can restrict it with a number of flags:
+ *
+ * -owner OWNER
+ * Only count rules owned by OWNER
+ * -target|-source
+ * Only count the target or source side (plus the LHS)
+ *
+ * TODO: add an option to separately provide a list of rule counts, restrict to counts above a threshold.
+ */
+public class PhrasePenalty extends StatelessFF {
+
+  /* Vocabulary id of the grammar owner whose rules are counted. */
+  private final int owner;
+  /* Value fired for every counted rule. */
+  private final float value = 1.0f;
+
+  /**
+   * Creates the feature. Rules are counted for the owner given by the "owner" argument, or for
+   * the owner "pt" when no such argument is present.
+   *
+   * @param weights feature weights, passed through to the superclass
+   * @param args feature arguments, passed through to the superclass
+   * @param config decoder configuration, passed through to the superclass
+   */
+  public PhrasePenalty(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+    super(weights, "PhrasePenalty", args, config);
+    final String ownerName = parsedArgs.containsKey("owner") ? parsedArgs.get("owner") : "pt";
+    this.owner = Vocabulary.id(ownerName);
+  }
+
+  /**
+   * Adds the penalty value to the dense feature for every rule that is counted (see
+   * {@link #isCounted(Rule)}).
+   *
+   * @return always null, since this feature function keeps no dynamic programming state
+   */
+  @Override
+  public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+      Sentence sentence, Accumulator acc) {
+
+    if (isCounted(rule)) {
+      acc.add(denseFeatureIndex, value);
+    }
+
+    return null;
+  }
+
+  /**
+   * Records the dense feature index assigned to this feature and reports its single dense
+   * feature name.
+   *
+   * @param index the dense index assigned to this feature
+   * @return a one-element list holding this feature's name
+   */
+  @Override
+  public ArrayList<String> reportDenseFeatures(int index) {
+    denseFeatureIndex = index;
+
+    final ArrayList<String> denseNames = new ArrayList<String>();
+    denseNames.add(name);
+    return denseNames;
+  }
+
+  /**
+   * Returns the *weighted* estimate: the dense weight times the penalty value for counted rules,
+   * 0 for all others.
+   */
+  @Override
+  public float estimateCost(Rule rule, Sentence sentence) {
+    if (!isCounted(rule)) {
+      return 0.0f;
+    }
+    return weights.getDense(denseFeatureIndex) * value;
+  }
+
+  /**
+   * A rule is counted when it is non-null, is neither the phrase-based begin nor end rule, and
+   * either no owner is set (id 0) or the rule belongs to the configured owner.
+   */
+  private boolean isCounted(Rule rule) {
+    if (rule == null || rule == Hypothesis.BEGIN_RULE || rule == Hypothesis.END_RULE) {
+      return false;
+    }
+    return owner == 0 || rule.getOwner() == owner;
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleCountBin.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleCountBin.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleCountBin.java
new file mode 100644
index 0000000..5ba0c66
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleCountBin.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff;
+
+import java.util.List;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.chart_parser.SourcePath;
+import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.hypergraph.HGNode;
+import org.apache.joshua.decoder.segment_file.Sentence;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/*
+ * This feature computes a bin for the rule and activates a feature for it. It requires access to
+ * the index of the RarityPenalty field, from which the rule count can be computed.
+ */
+public class RuleCountBin extends StatelessFF {
+
+  private static final Logger LOG = LoggerFactory.getLogger(RuleCountBin.class);
+
+  /* Upper bounds of the count bins, in ascending order; larger counts fall into the "inf" bin. */
+  private static final int[] BINS = { 1, 2, 4, 8, 16, 32, 64, 128, 1000, 10000 };
+
+  /* Index of the grammar feature (tm_pt_FIELD) holding the rarity penalty. */
+  private final int field;
+
+  /* Vocabulary id of the "pt" owner; only rules of this owner are binned. */
+  private final int ownerID;
+
+  /**
+   * Creates the feature; the "field" argument names the index of the rarity penalty feature.
+   *
+   * @param weights feature weights, passed through to the superclass
+   * @param args feature arguments, passed through to the superclass
+   * @param config decoder configuration, passed through to the superclass
+   * @throws NumberFormatException if the "field" argument is missing or not an integer
+   */
+  public RuleCountBin(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+    super(weights, "RuleCountBin", args, config);
+
+    field = Integer.parseInt(parsedArgs.get("field"));
+    // Looked up once here rather than on every call to compute().
+    ownerID = Vocabulary.id("pt");
+  }
+
+  /**
+   * Derives a count from the rule's rarity penalty as {@code (int) (1 - ln(rarityPenalty))} and
+   * fires the feature of the first bin whose upper bound is not below that count, or
+   * "RuleCountBin_inf" when the count exceeds every bin. Null rules and rules not owned by "pt"
+   * fire nothing.
+   *
+   * @return always null, since this feature function keeps no dynamic programming state
+   */
+  @Override
+  public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+      Sentence sentence, Accumulator acc) {
+
+    // Guard against null rules, as the sibling feature functions do, instead of throwing.
+    if (rule == null || rule.getOwner() != ownerID)
+      return null;
+
+    // The stored feature value is negated to recover the rarity penalty.
+    float rarityPenalty = -rule.getFeatureVector().getSparse(String.format("tm_pt_%d", field));
+    int count = (int) (1.0 - Math.log(rarityPenalty));
+
+    String feature = "RuleCountBin_inf";
+
+    for (int k : BINS) {
+      if (count <= k) {
+        feature = String.format("RuleCountBin_%d", k);
+        break;
+      }
+    }
+
+    LOG.debug("RuleCountBin({}) = {} ==> {}", rarityPenalty, count, feature);
+
+    acc.add(feature, 1.0f);
+
+    return null;
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleFF.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleFF.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleFF.java
new file mode 100644
index 0000000..909e481
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleFF.java
@@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff;
+
+import static com.google.common.cache.CacheBuilder.newBuilder;
+
+import java.util.List;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.chart_parser.SourcePath;
+import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.hypergraph.HGNode;
+import org.apache.joshua.decoder.segment_file.Sentence;
+
+import com.google.common.cache.Cache;
+
+/**
+ * This feature fires for rule ids.
+ * Firing can be restricted to rules from a certain owner, and rule ids
+ * can be generated from source side and/or target side.
+ */
+public class RuleFF extends StatelessFF {
+
+  /** Which side(s) of a rule contribute to the generated feature name. */
+  private enum Sides { SOURCE, TARGET, BOTH }
+
+  private static final String NAME = "RuleFF";
+  // value to fire for features
+  private static final int VALUE = 1;
+  // whether this feature is restricted to a certain grammar/owner
+  private final boolean ownerRestriction;
+  // the grammar/owner this feature is restricted to fire
+  private final int owner;
+  // what part of the rule should be extracted;
+  private final Sides sides;
+  // Strings separating words and rule sides
+  private static final String SEPARATOR = "~";
+  private static final String SIDES_SEPARATOR = "->";
+
+  // feature names already computed for a rule, bounded in size
+  private final Cache<Rule, String> featureCache;
+
+  /**
+   * Reads the optional arguments "owner", "sides" (one of "source", "target" or "both",
+   * case-insensitive; defaults to both) and "cacheSize". When "cacheSize" is absent, the cache
+   * is bounded by {@code config.cachedRuleSize}.
+   *
+   * @param weights feature weights, passed through to the superclass
+   * @param args feature arguments, passed through to the superclass
+   * @param config decoder configuration
+   * @throws RuntimeException if the "sides" argument has an unknown value
+   */
+  public RuleFF(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+    super(weights, NAME, args, config);
+
+    ownerRestriction = parsedArgs.containsKey("owner");
+    owner = ownerRestriction ? Vocabulary.id(parsedArgs.get("owner")) : 0;
+
+    if (parsedArgs.containsKey("sides")) {
+      final String sideValue = parsedArgs.get("sides");
+      if (sideValue.equalsIgnoreCase("source")) {
+        sides = Sides.SOURCE;
+      } else if (sideValue.equalsIgnoreCase("target")) {
+        sides = Sides.TARGET;
+      } else if (sideValue.equalsIgnoreCase("both")) {
+        sides = Sides.BOTH;
+      } else {
+        throw new RuntimeException("Unknown side value.");
+      }
+    } else {
+      sides = Sides.BOTH;
+    }
+
+    // initialize cache
+    if (parsedArgs.containsKey("cacheSize")) {
+      featureCache = newBuilder().maximumSize(Integer.parseInt(parsedArgs.get("cacheSize"))).build();
+    } else {
+      featureCache = newBuilder().maximumSize(config.cachedRuleSize).build();
+    }
+  }
+
+  /**
+   * Fires the rule-identity feature for the given rule with a value of 1. The feature name is
+   * computed once per rule and then served from the cache.
+   *
+   * @return always null, since this feature function keeps no dynamic programming state
+   */
+  @Override
+  public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+      Sentence sentence, Accumulator acc) {
+
+    // A null rule has no identity to fire; it would also fail both the owner check and the
+    // cache lookup (Guava caches reject null keys), so bail out early.
+    if (rule == null) {
+      return null;
+    }
+
+    if (ownerRestriction && rule.getOwner() != owner) {
+      return null;
+    }
+
+    String featureName = featureCache.getIfPresent(rule);
+    if (featureName == null) {
+      featureName = getRuleString(rule);
+      featureCache.put(rule, featureName);
+    }
+    acc.add(featureName, VALUE);
+
+    return null;
+  }
+
+  /**
+   * Obtains the feature name for the given rule, of the form {@code LHS->source->target}. The
+   * source and/or target part is left empty when that side is not selected; words within a side
+   * are joined with "~".
+   *
+   * @param rule the rule to describe
+   * @return String representing the feature name.
+   */
+  private String getRuleString(final Rule rule) {
+    final StringBuilder sb = new StringBuilder(Vocabulary.word(rule.getLHS()))
+        .append(SIDES_SEPARATOR);
+    if (sides == Sides.SOURCE || sides == Sides.BOTH) {
+      sb.append(Vocabulary.getWords(rule.getFrench(), SEPARATOR));
+    }
+    sb.append(SIDES_SEPARATOR);
+    if (sides == Sides.TARGET || sides == Sides.BOTH) {
+      sb.append(Vocabulary.getWords(rule.getEnglish(), SEPARATOR));
+    }
+    return sb.toString();
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleLength.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleLength.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleLength.java
new file mode 100644
index 0000000..02c520b
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleLength.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff;
+
+import java.util.List;
+
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.chart_parser.SourcePath;
+import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.hypergraph.HGNode;
+import org.apache.joshua.decoder.segment_file.Sentence;
+
+/*
+ * This feature computes three feature templates: a feature indicating the length of the rule's
+ * source side, its target side, and a feature that pairs them.
+ */
+public abstract class RuleLength extends StatelessFF {
+
+ private static final int VALUE = 1;
+
+ public RuleLength(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+ super(weights, "RuleLength", args, config);
+ }
+
+ @Override
+ public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+ Sentence sentence, Accumulator acc) {
+ int sourceLength = rule.getFrench().length;
+ int targetLength = rule.getEnglish().length;
+ acc.add(name + "_source" + sourceLength, VALUE);
+ acc.add(name + "_target" + sourceLength, VALUE);
+ acc.add(name + "_sourceTarget" + sourceLength + "-" + targetLength, VALUE);
+ return null;
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RulePropertiesQuerying.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RulePropertiesQuerying.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RulePropertiesQuerying.java
new file mode 100644
index 0000000..a1867a3
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RulePropertiesQuerying.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff;
+
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.hypergraph.HGNode;
+
+ /**
+  * Static helpers that turn vocabulary ids found on rules and hypergraph nodes into strings.
+  */
+ public class RulePropertiesQuerying {
+
+   /**
+    * @param rule the rule to inspect
+    * @return the vocabulary word for the rule's left-hand side
+    */
+   public static final String getLHSAsString(Rule rule) {
+     final int lhsId = rule.getLHS();
+     return Vocabulary.word(lhsId);
+   }
+
+   /**
+    * @param rule the rule to inspect
+    * @return the vocabulary words for the rule's foreign nonterminals, in iteration order
+    */
+   public static List<String> getRuleSourceNonterminalStrings(Rule rule) {
+     final List<String> labels = new ArrayList<String>();
+     for (final int nonterminalId : rule.getForeignNonTerminals()) {
+       final String label = Vocabulary.word(nonterminalId);
+       labels.add(label);
+     }
+     return labels;
+   }
+
+   /**
+    * @param tailNodes the tail nodes to inspect
+    * @return the vocabulary word for each tail node's {@code lhs}, in list order
+    */
+   public static List<String> getSourceNonterminalStrings(List<HGNode> tailNodes) {
+     final List<String> labels = new ArrayList<String>();
+     for (final HGNode node : tailNodes) {
+       final String label = Vocabulary.word(node.lhs);
+       labels.add(label);
+     }
+     return labels;
+   }
+
+ }
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleShape.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleShape.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleShape.java
new file mode 100644
index 0000000..8483ad6
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleShape.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff;
+
+import java.util.List;
+
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.chart_parser.SourcePath;
+import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.hypergraph.HGNode;
+import org.apache.joshua.decoder.segment_file.Sentence;
+import org.apache.joshua.util.FormatUtils;
+
+/*
+ * Implements the RuleShape feature for source, target, and paired source+target sides.
+ */
+public class RuleShape extends StatelessFF {
+
+ public RuleShape(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+ super(weights, "RuleShape", args, config);
+ }
+
+ private enum WordType {
+ N("N"), T("x"), P("+");
+ private final String string;
+ private boolean repeats;
+
+ private WordType(final String string) {
+ this.string = string;
+ this.repeats = false;
+ }
+
+ private void setRepeats() {
+ repeats = true;
+ }
+
+ @Override
+ public String toString() {
+ if (repeats) {
+ return this.string + "+";
+ }
+ return this.string;
+ }
+ }
+
+ private WordType getWordType(int id) {
+ if (FormatUtils.isNonterminal(id)) {
+ return WordType.N;
+ } else {
+ return WordType.T;
+ }
+ }
+
+ /**
+ * Returns a String describing the rule pattern.
+ */
+ private String getRulePattern(int[] ids) {
+ final StringBuilder pattern = new StringBuilder();
+ WordType currentType = getWordType(ids[0]);
+ for (int i = 1; i < ids.length; i++) {
+ if (getWordType(ids[i]) != currentType) {
+ pattern.append(currentType.toString());
+ currentType = getWordType(ids[i]);
+ } else {
+ currentType.setRepeats();
+ }
+ }
+ pattern.append(currentType.toString());
+ return pattern.toString();
+ }
+
+ @Override
+ public DPState compute(Rule rule, List<HGNode> tailNodes, int i_, int j, SourcePath sourcePath,
+ Sentence sentence, Accumulator acc) {
+ final String sourceShape = getRulePattern(rule.getFrench());
+ final String targetShape = getRulePattern(rule.getEnglish());
+ acc.add(name + "_source_" + sourceShape, 1);
+ acc.add(name + "_target_" + sourceShape, 1);
+ acc.add(name + "_sourceTarget_" + sourceShape + "_" + targetShape, 1);
+ return null;
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/SourceDependentFF.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/SourceDependentFF.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/SourceDependentFF.java
new file mode 100644
index 0000000..841402a
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/SourceDependentFF.java
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff;
+
+import org.apache.joshua.decoder.segment_file.Sentence;
+
+ /**
+  * Contract for feature functions that accept a source {@link Sentence} and can be cloned.
+  */
+ public interface SourceDependentFF extends Cloneable {
+
+   /**
+    * Hands the source sentence to this feature function.
+    *
+    * @param sentence the source sentence
+    */
+   void setSource(Sentence sentence);
+
+   /**
+    * @return a clone of this object, typed as a {@link FeatureFunction}
+    */
+   FeatureFunction clone();
+
+ }
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/SourcePathFF.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/SourcePathFF.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/SourcePathFF.java
new file mode 100644
index 0000000..b138426
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/SourcePathFF.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.chart_parser.SourcePath;
+import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.hypergraph.HGNode;
+import org.apache.joshua.decoder.segment_file.Sentence;
+
+/**
+ * This feature returns the scored path through the source lattice, which is recorded in a
+ * SourcePath object.
+ *
+ * @author Chris Dyer redpony@umd.edu
+ * @author Matt Post post@cs.jhu.edu
+ */
+public final class SourcePathFF extends StatelessFF {
+
+ /*
+ * This is a single-value feature template, so we cache the weight here.
+ */
+ public SourcePathFF(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+ super(weights, "SourcePath", args, config);
+ }
+
+ @Override
+ public ArrayList<String> reportDenseFeatures(int index) {
+ denseFeatureIndex = index;
+
+ ArrayList<String> names = new ArrayList<String>();
+ names.add(name);
+ return names;
+ }
+
+ @Override
+ public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+ Sentence sentence, Accumulator acc) {
+
+ acc.add(denseFeatureIndex, sourcePath.getPathCost());
+ return null;
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/StatefulFF.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/StatefulFF.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/StatefulFF.java
new file mode 100644
index 0000000..1f5d0ed
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/StatefulFF.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff;
+
+import java.util.List;
+
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.chart_parser.SourcePath;
+import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.hypergraph.HGNode;
+import org.apache.joshua.decoder.segment_file.Sentence;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Stateful features contribute dynamic programming state. Unlike earlier versions of Joshua, the
+ * stateful feature itself is responsible for computing and return its updated state. Each
+ * state-computing feature function is assigned a global index, which is used to index the list of
+ * state-contributing objects in each HGNode. State can no longer be shared among different feature
+ * functions.
+ *
+ * @author Matt Post post@cs.jhu.edu
+ * @author Juri Ganitkevich juri@cs.jhu.edu
+ */
+public abstract class StatefulFF extends FeatureFunction {
+
+ private static final Logger LOG = LoggerFactory.getLogger(StatefulFF.class);
+ /* Every stateful FF takes a unique index value and increments this. */
+ static int GLOBAL_STATE_INDEX = 0;
+
+ /* This records the state index for each instantiated stateful feature function. */
+ protected int stateIndex = 0;
+
+ public StatefulFF(FeatureVector weights, String name, String[] args, JoshuaConfiguration config) {
+ super(weights, name, args, config);
+
+ LOG.info("Stateful object with state index {}", GLOBAL_STATE_INDEX);
+ stateIndex = GLOBAL_STATE_INDEX++;
+ }
+
+ public static void resetGlobalStateIndex() {
+ GLOBAL_STATE_INDEX = 0;
+ }
+
+ public final boolean isStateful() {
+ return true;
+ }
+
+ public final int getStateIndex() {
+ return stateIndex;
+ }
+
+ /**
+ * Function computing the features that this function fires when a rule is applied. Must return
+ * its updated DPState. The accumulator is used to record every feature that fires.
+ */
+ @Override
+ public abstract DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j,
+ SourcePath sourcePath, Sentence sentence, Accumulator acc);
+
+ @Override
+ public abstract DPState computeFinal(HGNode tailNodes, int i, int j, SourcePath sourcePath,
+ Sentence sentence, Accumulator acc);
+
+ /**
+ * Computes an estimated future cost of this rule. Note that this is not compute as part of the
+ * score but is used for pruning.
+ */
+ @Override
+ public abstract float estimateFutureCost(Rule rule, DPState state, Sentence sentence);
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ff/StatelessFF.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/StatelessFF.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/StatelessFF.java
new file mode 100644
index 0000000..e473c37
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/StatelessFF.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff;
+
+import java.util.List;
+
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.chart_parser.SourcePath;
+import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.hypergraph.HGNode;
+import org.apache.joshua.decoder.segment_file.Sentence;
+
+/**
+ * Stateless feature functions do not contribute any state. You need not implement this class to
+ * create a stateless feature function, but it provides a few convenience functions.
+ *
+ * @author Matt Post post@cs.jhu.edu
+ * @author Juri Ganitkevich juri@cs.jhu.edu
+ */
+
+public abstract class StatelessFF extends FeatureFunction {
+
+ public StatelessFF(FeatureVector weights, String name, String[] args, JoshuaConfiguration config) {
+ super(weights, name, args, config);
+ }
+
+ public final boolean isStateful() {
+ return false;
+ }
+
+ /**
+ * The estimated cost of applying this feature, given only the rule. This is used in sorting the
+ * rules for cube pruning. For most features, this will be 0.0.
+ */
+ public float estimateCost(Rule rule, Sentence sentence) {
+ return 0.0f;
+ }
+
+ /**
+ * Implementations of this should return null, since no state is contributed.
+ */
+ @Override
+ public abstract DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j,
+ SourcePath sourcePath, Sentence sentence, Accumulator acc);
+
+ /**
+ * Implementations of this should return null, since no state is contributed.
+ */
+ @Override
+ public DPState computeFinal(HGNode tailNode, int i, int j, SourcePath sourcePath, Sentence sentence,
+ Accumulator acc) {
+ return null;
+ }
+
+ /**
+ * Stateless functions do not have an estimate of the future cost because they do not have access
+ * to the state.
+ */
+ public final float estimateFutureCost(Rule rule, DPState state, Sentence sentence) {
+ return 0.0f;
+ }
+}