You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/06/23 18:46:09 UTC
[58/60] incubator-joshua git commit: Merge branch
'maven-multi-module' of https://github.com/logogin/incubator-joshua into
maven-multi-module
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/1bb8a203/joshua-core/src/main/java/org/apache/joshua/decoder/ff/OOVPenalty.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/OOVPenalty.java
index 0000000,5278172..92ee740
mode 000000,100644..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/OOVPenalty.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/OOVPenalty.java
@@@ -1,0 -1,106 +1,108 @@@
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+ package org.apache.joshua.decoder.ff;
+
+ import java.util.ArrayList;
+ import java.util.HashMap;
+ import java.util.List;
+
+ import org.apache.joshua.decoder.JoshuaConfiguration;
+ import org.apache.joshua.decoder.JoshuaConfiguration.OOVItem;
+ import org.apache.joshua.decoder.ff.state_maintenance.DPState;
++import org.apache.joshua.decoder.ff.tm.OwnerId;
++import org.apache.joshua.decoder.ff.tm.OwnerMap;
+ import org.apache.joshua.decoder.ff.tm.Rule;
+ import org.apache.joshua.decoder.hypergraph.HGNode;
+ import org.apache.joshua.decoder.segment_file.Sentence;
+ import org.apache.joshua.corpus.Vocabulary;
+ import org.apache.joshua.decoder.chart_parser.SourcePath;
+
+ /**
+ * This feature is fired when an out-of-vocabulary word (with respect to the translation model) is
+ * entered into the chart. OOVs work in the following manner: for each word in the input that is OOV
+ * with respect to the translation model, we create a rule that pushes that word through
+ * untranslated (the suffix "_OOV" can optionally be appended according to the runtime parameter
+ * "mark-oovs") . These rules are all stored in a grammar whose owner is "oov". The OOV feature
+ * function template then fires the "OOVPenalty" feature whenever it is asked to score an OOV rule.
+ *
+ * @author Matt Post post@cs.jhu.edu
+ */
+ public class OOVPenalty extends StatelessFF {
- private final int ownerID;
++ private final OwnerId ownerID;
+
+ /* The default value returned for OOVs. Can be overridden with -oov-list */
+ private final float defaultValue = -100f;
+ private final HashMap<Integer,Float> oovWeights;
+
+ public OOVPenalty(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+ super(weights, "OOVPenalty", args, config);
+
- ownerID = Vocabulary.id("oov");
++ ownerID = OwnerMap.register("oov");
+ oovWeights = new HashMap<Integer,Float>();
+
+ if (config.oovList != null) {
+ for (OOVItem item: config.oovList) {
+ oovWeights.put(Vocabulary.id(item.label), item.weight);
+ }
+ }
+ }
+
+ @Override
+ public ArrayList<String> reportDenseFeatures(int index) {
+ denseFeatureIndex = index;
+
+ ArrayList<String> names = new ArrayList<>(1);
+ names.add(name);
+ return names;
+ }
+
+ /**
+ * OOV rules cover exactly one word, and such rules belong to a grammar whose owner is "oov". Each
+ * OOV fires the OOVPenalty feature with a value of 1, so the cost is simply the weight, which was
+ * cached when the feature was created.
+ */
+ @Override
+ public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+ Sentence sentence, Accumulator acc) {
+
- if (rule != null && this.ownerID == rule.getOwner()) {
++ if (rule != null && this.ownerID.equals(rule.getOwner())) {
+ acc.add(denseFeatureIndex, getValue(rule.getLHS()));
+ }
+
+ return null;
+ }
+
+ /**
+ * It's important for the OOV feature to contribute to the rule's estimated cost, so that OOV
+ * rules (which are added for all words, not just ones without translation options) get sorted
+ * to the bottom during cube pruning.
+ *
+ * Important! estimateCost returns the *weighted* feature value.
+ */
+ @Override
+ public float estimateCost(Rule rule, Sentence sentence) {
- if (rule != null && this.ownerID == rule.getOwner())
++ if (rule != null && this.ownerID.equals(rule.getOwner()))
+ return weights.getDense(denseFeatureIndex) * getValue(rule.getLHS());
+ return 0.0f;
+ }
+
+ private float getValue(int lhs) {
+ return oovWeights.containsKey(lhs) ? oovWeights.get(lhs) : defaultValue;
+ }
+ }
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/1bb8a203/joshua-core/src/main/java/org/apache/joshua/decoder/ff/PhraseModel.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/PhraseModel.java
index 0000000,2324292..7ae3dbc
mode 000000,100644..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/PhraseModel.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/PhraseModel.java
@@@ -1,0 -1,135 +1,134 @@@
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+ package org.apache.joshua.decoder.ff;
+
+ import java.util.ArrayList;
+ import java.util.List;
+
-import org.apache.joshua.corpus.Vocabulary;
+ import org.apache.joshua.decoder.JoshuaConfiguration;
+ import org.apache.joshua.decoder.chart_parser.SourcePath;
+ import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+ import org.apache.joshua.decoder.ff.tm.Grammar;
++import org.apache.joshua.decoder.ff.tm.OwnerId;
++import org.apache.joshua.decoder.ff.tm.OwnerMap;
+ import org.apache.joshua.decoder.ff.tm.Rule;
+ import org.apache.joshua.decoder.hypergraph.HGNode;
+ import org.apache.joshua.decoder.segment_file.Sentence;
+
+ /**
+ * This feature handles the list of features that are found with grammar rules in the grammar file.
+ * dense features that may be associated with the rules in a grammar file. The feature names of
+ * these dense rules are a function of the phrase model owner. When the feature is loaded, it
+ * queries the weights for the set of features that are active for this grammar, storing them in an
+ * array.
+ *
+ * @author Matt Post post@cs.jhu.edu
+ * @author Zhifei Li zhifei.work@gmail.com
+ */
+
+ public class PhraseModel extends StatelessFF {
+
+ /* The owner of the grammar. */
- private int ownerID;
- private String owner;
++ private final OwnerId ownerID;
++ private final String owner;
+
+ private float[] phrase_weights = null;
+
+ public PhraseModel(FeatureVector weights, String[] args, JoshuaConfiguration config, Grammar g) {
+ super(weights, "tm_", args, config);
+
- String owner = parsedArgs.get("owner");
- this.name = String.format("tm_%s", owner);
++ // Store the owner and name
++ this.owner = parsedArgs.get("owner");
++ this.ownerID = OwnerMap.register(owner);
++ this.name = String.format("tm_%s", this.owner);
+
+ /*
+ * Determine the number of features by querying the example grammar that was passed in.
+ */
+ phrase_weights = new float[g.getNumDenseFeatures()];
-// System.err.println(String.format("GOT %d FEATURES FOR %s", g.getNumDenseFeatures(), owner));
+ for (int i = 0; i < phrase_weights.length; i++)
+ phrase_weights[i] = weights.getSparse(String.format("tm_%s_%d", owner, i));
-
- // Store the owner.
- this.owner = owner;
- this.ownerID = Vocabulary.id(owner);
++
+ }
+
+ /**
+ * Just register a single weight, tm_OWNER, and use that to set its precomputed cost
+ */
+ @Override
+ public ArrayList<String> reportDenseFeatures(int index) {
+ denseFeatureIndex = index;
+
+ ArrayList<String> names = new ArrayList<String>();
+ for (int i = 0; i < phrase_weights.length; i++)
+ names.add(String.format("tm_%s_%d", owner, i));
+ return names;
+ }
+
+ /**
+ * Estimates the cost of applying this rule, which is just the score of the precomputable feature
+ * functions.
+ */
+ @Override
+ public float estimateCost(final Rule rule, Sentence sentence) {
+
- if (rule != null && rule.getOwner() == ownerID) {
++ if (rule != null && rule.getOwner().equals(ownerID)) {
+ if (rule.getPrecomputableCost() <= Float.NEGATIVE_INFINITY)
+ rule.setPrecomputableCost(phrase_weights, weights);
+
+ return rule.getPrecomputableCost();
+ }
+
+ return 0.0f;
+ }
+
+ /**
+ * Just chain to computeFeatures(rule), since this feature doesn't use the sourcePath or sentID. *
+ */
+ @Override
+ public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+ Sentence sentence, Accumulator acc) {
+
- if (rule != null && rule.getOwner() == ownerID) {
++ if (rule != null && rule.getOwner().equals(ownerID)) {
+ /*
+ * Here, we peak at the Accumulator object. If it's asking for scores, then we don't bother to
+ * add each feature, but rather compute the inner product and add *that*. This is totally
+ * cheating; the Accumulator is supposed to be a generic object. But without this cheat
+ */
+ if (rule.getPrecomputableCost() <= Float.NEGATIVE_INFINITY) {
+ // float score = rule.getFeatureVector().innerProduct(weights);
+ rule.setPrecomputableCost(phrase_weights, weights);
+ }
+
+ // System.err.println(String.format("RULE = %s / %f", rule.getEnglishWords(), rule.getPrecomputableCost()));
+ for (int k = 0; k < phrase_weights.length; k++) {
+ // System.err.println(String.format("k = %d, denseFeatureIndex = %d, owner = %s, ownerID = %d", k, denseFeatureIndex, owner, ownerID));
+ acc.add(k + denseFeatureIndex, rule.getDenseFeature(k));
+ }
+
+ for (String key: rule.getFeatureVector().keySet())
+ acc.add(key, rule.getFeatureVector().getSparse(key));
+ }
+
+ return null;
+ }
+
+ public String toString() {
- return name + " " + Vocabulary.word(ownerID);
++ return name;
+ }
+ }
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/1bb8a203/joshua-core/src/main/java/org/apache/joshua/decoder/ff/PhrasePenalty.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/PhrasePenalty.java
index 0000000,3c38e60..9eecd0c
mode 000000,100644..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/PhrasePenalty.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/PhrasePenalty.java
@@@ -1,0 -1,86 +1,87 @@@
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+ package org.apache.joshua.decoder.ff;
+
+ import java.util.ArrayList;
-import java.util.List;
++import java.util.List;
+
-import org.apache.joshua.corpus.Vocabulary;
+ import org.apache.joshua.decoder.JoshuaConfiguration;
+ import org.apache.joshua.decoder.chart_parser.SourcePath;
+ import org.apache.joshua.decoder.ff.state_maintenance.DPState;
++import org.apache.joshua.decoder.ff.tm.OwnerId;
++import org.apache.joshua.decoder.ff.tm.OwnerMap;
+ import org.apache.joshua.decoder.ff.tm.Rule;
+ import org.apache.joshua.decoder.hypergraph.HGNode;
+ import org.apache.joshua.decoder.phrase.Hypothesis;
+ import org.apache.joshua.decoder.segment_file.Sentence;
+
+ /**
+ * This feature just counts rules that are used. You can restrict it with a number of flags:
+ *
+ * -owner OWNER
+ * Only count rules owned by OWNER
+ * -target|-source
+ * Only count the target or source side (plus the LHS)
+ *
+ * TODO: add an option to separately provide a list of rule counts, restrict to counts above a threshold.
+ */
+ public class PhrasePenalty extends StatelessFF {
+
- private int owner = 0;
++ private final OwnerId owner;
+ private float value = 1.0f;
+
+ public PhrasePenalty(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+ super(weights, "PhrasePenalty", args, config);
+ if (parsedArgs.containsKey("owner"))
- this.owner = Vocabulary.id(parsedArgs.get("owner"));
++ this.owner = OwnerMap.register(parsedArgs.get("owner"));
+ else // default
- this.owner = Vocabulary.id("pt");
++ this.owner = OwnerMap.register("pt");
+ }
+
+ @Override
+ public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+ Sentence sentence, Accumulator acc) {
+
+ if (rule != null && rule != Hypothesis.BEGIN_RULE && rule != Hypothesis.END_RULE
- && (owner == 0 || rule.getOwner() == owner))
++ && (rule.getOwner().equals(owner)))
+ acc.add(denseFeatureIndex, value);
+
+ return null;
+ }
+
+ @Override
+ public ArrayList<String> reportDenseFeatures(int index) {
+ denseFeatureIndex = index;
+ ArrayList<String> names = new ArrayList<String>();
+ names.add(name);
+ return names;
+ }
+
+ /**
+ * Returns the *weighted* estimate.
+ *
+ */
+ @Override
+ public float estimateCost(Rule rule, Sentence sentence) {
+ if (rule != null && rule != Hypothesis.BEGIN_RULE && rule != Hypothesis.END_RULE
- && (owner == 0 || rule.getOwner() == owner))
++ && (rule.getOwner().equals(owner)))
+ return weights.getDense(denseFeatureIndex) * value;
+ return 0.0f;
+ }
+ }
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/1bb8a203/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleCountBin.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleCountBin.java
index 0000000,5ba0c66..3ffbf65
mode 000000,100644..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleCountBin.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleCountBin.java
@@@ -1,0 -1,74 +1,77 @@@
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+ package org.apache.joshua.decoder.ff;
+
+ import java.util.List;
+
-import org.apache.joshua.corpus.Vocabulary;
+ import org.apache.joshua.decoder.JoshuaConfiguration;
+ import org.apache.joshua.decoder.chart_parser.SourcePath;
+ import org.apache.joshua.decoder.ff.state_maintenance.DPState;
++import org.apache.joshua.decoder.ff.tm.OwnerId;
++import org.apache.joshua.decoder.ff.tm.OwnerMap;
+ import org.apache.joshua.decoder.ff.tm.Rule;
+ import org.apache.joshua.decoder.hypergraph.HGNode;
+ import org.apache.joshua.decoder.segment_file.Sentence;
+ import org.slf4j.Logger;
+ import org.slf4j.LoggerFactory;
+
+ /*
+ * This feature computes a bin for the rule and activates a feature for it. It requires access to
+ * the index of the RarityPenalty field, from which the rule count can be computed.
+ */
+ public class RuleCountBin extends StatelessFF {
+
+ private static final Logger LOG = LoggerFactory.getLogger(RuleCountBin.class);
+ private int field = -1;
++ private final OwnerId owner;
+
+ public RuleCountBin(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+ super(weights, "RuleCountBin", args, config);
++ owner = OwnerMap.register("pt");
+
+ field = Integer.parseInt(parsedArgs.get("field"));
+ }
+
+ @Override
+ public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+ Sentence sentence, Accumulator acc) {
+
- if (rule.getOwner() != Vocabulary.id("pt"))
++ if (rule.getOwner().equals(owner))
+ return null;
+
+ float rarityPenalty = -rule.getFeatureVector().getSparse(String.format("tm_pt_%d", field));
+ int count = (int) (1.0 - Math.log(rarityPenalty));
+
+ String feature = "RuleCountBin_inf";
+
+ int[] bins = { 1, 2, 4, 8, 16, 32, 64, 128, 1000, 10000 };
+ for (int k : bins) {
+ if (count <= k) {
+ feature = String.format("RuleCountBin_%d", k);
+ break;
+ }
+ }
+
+ LOG.debug("RuleCountBin({}) = {} ==> {}", rarityPenalty, count, feature);
+
+ acc.add(feature, 1.0f);
+
+ return null;
+ }
+ }
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/1bb8a203/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleFF.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleFF.java
index 0000000,909e481..308d38a
mode 000000,100644..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleFF.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/RuleFF.java
@@@ -1,0 -1,123 +1,126 @@@
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+ package org.apache.joshua.decoder.ff;
+
+ import static com.google.common.cache.CacheBuilder.newBuilder;
++import static org.apache.joshua.decoder.ff.tm.OwnerMap.UNKNOWN_OWNER_ID;
+
+ import java.util.List;
+
+ import org.apache.joshua.corpus.Vocabulary;
+ import org.apache.joshua.decoder.JoshuaConfiguration;
+ import org.apache.joshua.decoder.chart_parser.SourcePath;
+ import org.apache.joshua.decoder.ff.state_maintenance.DPState;
++import org.apache.joshua.decoder.ff.tm.OwnerId;
++import org.apache.joshua.decoder.ff.tm.OwnerMap;
+ import org.apache.joshua.decoder.ff.tm.Rule;
+ import org.apache.joshua.decoder.hypergraph.HGNode;
+ import org.apache.joshua.decoder.segment_file.Sentence;
+
+ import com.google.common.cache.Cache;
+
+ /**
+ * This feature fires for rule ids.
+ * Firing can be restricted to rules from a certain owner, and rule ids
+ * can be generated from source side and/or target side.
+ */
+ public class RuleFF extends StatelessFF {
+
+ private enum Sides { SOURCE, TARGET, BOTH };
+
+ private static final String NAME = "RuleFF";
+ // value to fire for features
+ private static final int VALUE = 1;
+ // whether this feature is restricted to a certain grammar/owner
+ private final boolean ownerRestriction;
+ // the grammar/owner this feature is restricted to fire
- private final int owner;
++ private final OwnerId owner;
+ // what part of the rule should be extracted;
+ private final Sides sides;
+ // Strings separating words and rule sides
+ private static final String SEPARATOR = "~";
+ private static final String SIDES_SEPARATOR = "->";
+
+ private final Cache<Rule, String> featureCache;
+
+ public RuleFF(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+ super(weights, NAME, args, config);
+
+ ownerRestriction = (parsedArgs.containsKey("owner")) ? true : false;
- owner = ownerRestriction ? Vocabulary.id(parsedArgs.get("owner")) : 0;
++ owner = ownerRestriction ? OwnerMap.register(parsedArgs.get("owner")) : UNKNOWN_OWNER_ID;
+
+ if (parsedArgs.containsKey("sides")) {
+ final String sideValue = parsedArgs.get("sides");
+ if (sideValue.equalsIgnoreCase("source")) {
+ sides = Sides.SOURCE;
+ } else if (sideValue.equalsIgnoreCase("target")) {
+ sides = Sides.TARGET;
+ } else if (sideValue.equalsIgnoreCase("both")){
+ sides = Sides.BOTH;
+ } else {
+ throw new RuntimeException("Unknown side value.");
+ }
+ } else {
+ sides = Sides.BOTH;
+ }
+
+ // initialize cache
+ if (parsedArgs.containsKey("cacheSize")) {
+ featureCache = newBuilder().maximumSize(Integer.parseInt(parsedArgs.get("cacheSize"))).build();
+ } else {
+ featureCache = newBuilder().maximumSize(config.cachedRuleSize).build();
+ }
+ }
+
+ @Override
+ public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+ Sentence sentence, Accumulator acc) {
+
- if (ownerRestriction && rule.getOwner() != owner) {
++ if (ownerRestriction && !rule.getOwner().equals(owner)) {
+ return null;
+ }
+
+ String featureName = featureCache.getIfPresent(rule);
+ if (featureName == null) {
+ featureName = getRuleString(rule);
+ featureCache.put(rule, featureName);
+ }
+ acc.add(featureName, VALUE);
+
+ return null;
+ }
+
+ /**
+ * Obtains the feature id for the given rule.
+ * @param rule
+ * @return String representing the feature name.s
+ */
+ private String getRuleString(final Rule rule) {
+ final StringBuilder sb = new StringBuilder(Vocabulary.word(rule.getLHS()))
+ .append(SIDES_SEPARATOR);
+ if (sides == Sides.SOURCE || sides == Sides.BOTH) {
+ sb.append(Vocabulary.getWords(rule.getFrench(), SEPARATOR));
+ }
+ sb.append(SIDES_SEPARATOR);
+ if (sides == Sides.TARGET || sides == Sides.BOTH) {
+ sb.append(Vocabulary.getWords(rule.getEnglish(), SEPARATOR));
+ }
+ return sb.toString();
+ }
+ }
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/1bb8a203/joshua-core/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/FragmentLMFF.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/FragmentLMFF.java
index 0000000,fa4c4af..861cf35
mode 000000,100644..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/FragmentLMFF.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/FragmentLMFF.java
@@@ -1,0 -1,365 +1,368 @@@
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+ package org.apache.joshua.decoder.ff.fragmentlm;
+
+ import java.io.IOException;
+ import java.util.ArrayList;
+ import java.util.Collection;
+ import java.util.Collections;
+ import java.util.HashMap;
+ import java.util.List;
+ import java.util.Stack;
+
+ import org.apache.joshua.decoder.JoshuaConfiguration;
+ import org.apache.joshua.decoder.chart_parser.SourcePath;
+ import org.apache.joshua.decoder.ff.FeatureVector;
+ import org.apache.joshua.decoder.ff.StatefulFF;
+ import org.apache.joshua.decoder.ff.state_maintenance.DPState;
++import org.apache.joshua.decoder.ff.tm.OwnerId;
++import org.apache.joshua.decoder.ff.tm.OwnerMap;
+ import org.apache.joshua.decoder.ff.tm.Rule;
+ import org.apache.joshua.decoder.ff.tm.format.HieroFormatReader;
+ import org.apache.joshua.decoder.hypergraph.HGNode;
+ import org.apache.joshua.decoder.hypergraph.HyperEdge;
+ import org.apache.joshua.decoder.segment_file.Sentence;
+ import org.slf4j.Logger;
+ import org.slf4j.LoggerFactory;
+
+ /**
+ * <p>Feature function that reads in a list of language model fragments and matches them against the
+ * hypergraph. This allows for language model fragment "glue" features, which fire when LM fragments
+ * (supplied as input) are assembled. These LM fragments are presumably useful in ensuring
+ * grammaticality and can be independent of the translation model fragments.</p>
+ *
+ * <p>Usage: in the Joshua Configuration file, put</p>
+ *
+ * <code>feature-function = FragmentLM -lm LM_FRAGMENTS_FILE -map RULE_FRAGMENTS_MAP_FILE</code>
+ *
+ * <p>LM_FRAGMENTS_FILE is a pointer to a file containing a list of fragments that it should look for.
+ * The format of the file is one fragment per line in PTB format, e.g.:</p>
+ *
+ * <code>(S NP (VP (VBD said) SBAR) (. .))</code>
+ *
+ * <p>RULE_FRAGMENTS_MAP_FILE points to a file that maps fragments to the flattened SCFG rule format
+ * that Joshua uses. This mapping is necessary because Joshua's rules have been flattened, meaning
+ * that their internal structure has been removed, yet this structure is needed for matching LM
+ * fragments. The format of the file is</p>
+ *
+ * <code>FRAGMENT ||| RULE-TARGET-SIDE</code>
+ *
+ * <p>for example,</p>
+ *
+ * <code>(S (NP (DT the) (NN man)) VP .) ||| the man [VP,1] [.,2] (SBAR (IN that) (S (NP (PRP he)) (VP
+ * (VBD was) (VB done)))) ||| that he was done (VP (VBD said) SBAR) ||| said SBAR</code>
+ *
+ * @author Matt Post post@cs.jhu.edu
+ */
+ public class FragmentLMFF extends StatefulFF {
+
+ private static final Logger LOG = LoggerFactory.getLogger(FragmentLMFF.class);
+
+ /*
+ * When building a fragment from a rule rooted in the hypergraph, this parameter determines how
+ * deep we'll go. Smaller values mean less hypergraph traversal but may also limit the LM
+ * fragments that can be fired.
+ */
+ private int BUILD_DEPTH = 1;
+
+ /*
+ * The maximum depth of a fragment, defined as the longest path from the fragment root to any of
+ * its leaves.
+ */
+ private int MAX_DEPTH = 0;
+
+ /*
+ * This is the minimum depth for lexicalized LM fragments. This allows you to easily exclude small
+ * depth-one fragments that may be overfit to the training data. A depth of 1 (the default) does
+ * not exclude any fragments.
+ */
+ private int MIN_LEX_DEPTH = 1;
+
+ /*
+ * Set to true to activate meta-features.
+ */
+ private boolean OPTS_DEPTH = false;
+
+ /*
+ * This contains a list of the language model fragments, indexed by LHS.
+ */
+ private HashMap<String, ArrayList<Tree>> lmFragments = null;
+
+ private int numFragments = 0;
+
+ /* The location of the file containing the language model fragments */
+ private String fragmentLMFile = "";
+
+ /**
+ * @param weights a {@link org.apache.joshua.decoder.ff.FeatureVector} with weights
+ * @param args arguments passed to the feature function
+ * @param config the {@link org.apache.joshua.decoder.JoshuaConfiguration}
+ */
+ public FragmentLMFF(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+ super(weights, "FragmentLMFF", args, config);
+
+ lmFragments = new HashMap<String, ArrayList<Tree>>();
+
+ fragmentLMFile = parsedArgs.get("lm");
+ BUILD_DEPTH = Integer.parseInt(parsedArgs.get("build-depth"));
+ MAX_DEPTH = Integer.parseInt(parsedArgs.get("max-depth"));
+ MIN_LEX_DEPTH = Integer.parseInt(parsedArgs.get("min-lex-depth"));
+
+ /* Read in the language model fragments */
+ try {
+ Collection<Tree> trees = PennTreebankReader.readTrees(fragmentLMFile);
+ for (Tree fragment : trees) {
+ addLMFragment(fragment);
+
+ // System.err.println(String.format("Read fragment: %s",
+ // lmFragments.get(lmFragments.size()-1)));
+ }
+ } catch (IOException e) {
+ throw new RuntimeException(String.format("* WARNING: couldn't read fragment LM file '%s'",
+ fragmentLMFile), e);
+ }
+ LOG.info("FragmentLMFF: Read {} LM fragments from '{}'", numFragments, fragmentLMFile);
+ }
+
+ /**
+ * Add the provided fragment to the language model, subject to some filtering.
+ *
+ * @param fragment a {@link org.apache.joshua.decoder.ff.fragmentlm.Tree} fragment
+ */
+ public void addLMFragment(Tree fragment) {
+ if (lmFragments == null)
+ return;
+
+ int fragmentDepth = fragment.getDepth();
+
+ if (MAX_DEPTH != 0 && fragmentDepth > MAX_DEPTH) {
+ LOG.warn("Skipping fragment {} (depth {} > {})", fragment, fragmentDepth, MAX_DEPTH);
+ return;
+ }
+
+ if (MIN_LEX_DEPTH > 1 && fragment.isLexicalized() && fragmentDepth < MIN_LEX_DEPTH) {
+ LOG.warn("Skipping fragment {} (lex depth {} < {})", fragment, fragmentDepth, MIN_LEX_DEPTH);
+ return;
+ }
+
+ if (lmFragments.get(fragment.getRule()) == null) {
+ lmFragments.put(fragment.getRule(), new ArrayList<Tree>());
+ }
+ lmFragments.get(fragment.getRule()).add(fragment);
+ numFragments++;
+ }
+
+ /**
+ * This function computes the features that fire when the current rule is applied. The features
+ * that fire are any LM fragments that match the fragment associated with the current rule. LM
+ * fragments may recurse over the tail nodes, following 1-best backpointers until the fragment
+ * either matches or fails.
+ *
+ * @param rule {@link org.apache.joshua.decoder.ff.tm.Rule} to be utilized within computation
+ * @param tailNodes {@link java.util.List} of {@link org.apache.joshua.decoder.hypergraph.HGNode} tail nodes
+ * @param i todo
+ * @param j todo
+ * @param sourcePath information about a path taken through the source {@link org.apache.joshua.lattice.Lattice}
+ * @param sentence {@link org.apache.joshua.lattice.Lattice} input
+ * @param acc {@link org.apache.joshua.decoder.ff.FeatureFunction.Accumulator} object permitting generalization of feature computation
+ * @return the new dynamic programming state (null for stateless features)
+ */
+ @Override
+ public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+ Sentence sentence, Accumulator acc) {
+
+ /*
+ * Get the fragment associated with the target side of this rule.
+ *
+ * This could be done more efficiently. For example, just build the tree fragment once and then
+ * pattern match against it. This would circumvent having to build the tree possibly once every
+ * time you try to apply a rule.
+ */
+ Tree baseTree = Tree.buildTree(rule, tailNodes, BUILD_DEPTH);
+
+ Stack<Tree> nodeStack = new Stack<Tree>();
+ nodeStack.add(baseTree);
+ while (!nodeStack.empty()) {
+ Tree tree = nodeStack.pop();
+ if (tree == null)
+ continue;
+
+ if (lmFragments.get(tree.getRule()) != null) {
+ for (Tree fragment : lmFragments.get(tree.getRule())) {
+ // System.err.println(String.format("Does\n %s match\n %s??\n -> %s", fragment, tree,
+ // match(fragment, tree)));
+
+ if (fragment.getLabel() == tree.getLabel() && match(fragment, tree)) {
+ // System.err.println(String.format(" FIRING: matched %s against %s", fragment, tree));
+ acc.add(fragment.escapedString(), 1);
+ if (OPTS_DEPTH)
+ if (fragment.isLexicalized())
+ acc.add(String.format("FragmentFF_lexdepth%d", fragment.getDepth()), 1);
+ else
+ acc.add(String.format("FragmentFF_depth%d", fragment.getDepth()), 1);
+ }
+ }
+ }
+
+ // We also need to try matching rules against internal nodes of the fragment corresponding to
+ // this
+ // rule
+ if (tree.getChildren() != null)
+ for (Tree childNode : tree.getChildren()) {
+ if (!childNode.isBoundary())
+ nodeStack.add(childNode);
+ }
+ }
+
+ return new FragmentState(baseTree);
+ }
+
+ /**
+ * Matches the fragment against the (possibly partially-built) tree. Assumption
+ *
+ * @param fragment the language model fragment
+ * @param tree the tree to match against (expanded from the hypergraph)
+ * @return
+ */
+ private boolean match(Tree fragment, Tree tree) {
+ // System.err.println(String.format("MATCH(%s,%s)", fragment, tree));
+
+ /* Make sure the root labels match. */
+ if (fragment.getLabel() != tree.getLabel()) {
+ return false;
+ }
+
+ /* Same number of kids? */
+ List<Tree> fkids = fragment.getChildren();
+ if (fkids.size() > 0) {
+ List<Tree> tkids = tree.getChildren();
+ if (fkids.size() != tkids.size()) {
+ return false;
+ }
+
+ /* Do the kids match on all labels? */
+ for (int i = 0; i < fkids.size(); i++)
+ if (fkids.get(i).getLabel() != tkids.get(i).getLabel())
+ return false;
+
+ /* Recursive match. */
+ for (int i = 0; i < fkids.size(); i++) {
+ if (!match(fkids.get(i), tkids.get(i)))
+ return false;
+ }
+ }
+
+ return true;
+ }
+
+ @Override
+ public DPState computeFinal(HGNode tailNodes, int i, int j, SourcePath sourcePath, Sentence sentence,
+ Accumulator acc) {
+ // TODO Auto-generated method stub
+ return null;
+ }
+
+ @Override
+ public float estimateFutureCost(Rule rule, DPState state, Sentence sentence) {
+ // TODO Auto-generated method stub
+ return 0;
+ }
+
+ @Override
+ public float estimateCost(Rule rule, Sentence sentence) {
+ // TODO Auto-generated method stub
+ return 0;
+ }
+
+ public static void main(String[] args) {
+ /* Add an LM fragment, then create a dummy multi-level hypergraph to match the fragment against. */
+ // FragmentLMFF fragmentLMFF = new FragmentLMFF(new FeatureVector(), (StateComputer) null, "");
+ FragmentLMFF fragmentLMFF = new FragmentLMFF(new FeatureVector(),
+ new String[] {"-lm", "test/fragments.txt", "-map", "test/mapping.txt"}, null);
+
+ Tree fragment = Tree.fromString("(S NP (VP (VBD \"said\") SBAR) (. \".\"))");
+
+ Rule ruleS = new HieroFormatReader()
+ .parseLine("[S] ||| the man [VP,1] [.,2] ||| the man [VP,1] [.,2] ||| 0");
+ Rule ruleVP = new HieroFormatReader()
+ .parseLine("[VP] ||| said [SBAR,1] ||| said [SBAR,1] ||| 0");
+ Rule ruleSBAR = new HieroFormatReader()
+ .parseLine("[SBAR] ||| that he was done ||| that he was done ||| 0");
+ Rule rulePERIOD = new HieroFormatReader().parseLine("[.] ||| . ||| . ||| 0");
+
- ruleS.setOwner(0);
- ruleVP.setOwner(0);
- ruleSBAR.setOwner(0);
- rulePERIOD.setOwner(0);
++ final OwnerId owner = OwnerMap.register("0");
++ ruleS.setOwner(owner);
++ ruleVP.setOwner(owner);
++ ruleSBAR.setOwner(owner);
++ rulePERIOD.setOwner(owner);
+
+ HyperEdge edgeSBAR = new HyperEdge(ruleSBAR, 0.0f, 0.0f, null, (SourcePath) null);
+
+ HGNode nodeSBAR = new HGNode(3, 7, ruleSBAR.getLHS(), null, edgeSBAR, 0.0f);
+ ArrayList<HGNode> tailNodesVP = new ArrayList<HGNode>();
+ Collections.addAll(tailNodesVP, nodeSBAR);
+ HyperEdge edgeVP = new HyperEdge(ruleVP, 0.0f, 0.0f, tailNodesVP, (SourcePath) null);
+ HGNode nodeVP = new HGNode(2, 7, ruleVP.getLHS(), null, edgeVP, 0.0f);
+
+ HyperEdge edgePERIOD = new HyperEdge(rulePERIOD, 0.0f, 0.0f, null, (SourcePath) null);
+ HGNode nodePERIOD = new HGNode(7, 8, rulePERIOD.getLHS(), null, edgePERIOD, 0.0f);
+
+ ArrayList<HGNode> tailNodes = new ArrayList<HGNode>();
+ Collections.addAll(tailNodes, nodeVP, nodePERIOD);
+
+ Tree tree = Tree.buildTree(ruleS, tailNodes, 1);
+ boolean matched = fragmentLMFF.match(fragment, tree);
+ LOG.info("Does\n {} match\n {}??\n -> {}", fragment, tree, matched);
+ }
+
+ /**
+ * Maintains a state pointer used by KenLM to implement left-state minimization.
+ *
+ * @author Matt Post post@cs.jhu.edu
+ * @author Juri Ganitkevitch juri@cs.jhu.edu
+ */
+ public class FragmentState extends DPState {
+
+ private Tree tree = null;
+
+ public FragmentState(Tree tree) {
+ this.tree = tree;
+ }
+
+ /**
+ * Every tree is unique.
+ *
+ * Some savings could be had here if we grouped together items with the same string.
+ */
+ @Override
+ public int hashCode() {
+ return tree.hashCode();
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ return (other instanceof FragmentState && this == other);
+ }
+
+ @Override
+ public String toString() {
+ return String.format("[FragmentState %s]", tree);
+ }
+ }
+ }
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/1bb8a203/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/LanguageModelFF.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/LanguageModelFF.java
index 0000000,3fea410..9388ed7
mode 000000,100644..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/LanguageModelFF.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/LanguageModelFF.java
@@@ -1,0 -1,527 +1,495 @@@
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+ package org.apache.joshua.decoder.ff.lm;
+
+ import java.io.IOException;
+ import java.util.ArrayList;
+ import java.util.Arrays;
-import java.util.HashMap;
+ import java.util.LinkedList;
+ import java.util.List;
+
-import com.google.common.primitives.Ints;
-
+ import org.apache.joshua.corpus.Vocabulary;
+ import org.apache.joshua.decoder.JoshuaConfiguration;
+ import org.apache.joshua.decoder.Support;
+ import org.apache.joshua.decoder.chart_parser.SourcePath;
+ import org.apache.joshua.decoder.ff.FeatureVector;
+ import org.apache.joshua.decoder.ff.StatefulFF;
+ import org.apache.joshua.decoder.ff.lm.berkeley_lm.LMGrammarBerkeley;
-import org.apache.joshua.decoder.ff.lm.KenLM;
+ import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+ import org.apache.joshua.decoder.ff.state_maintenance.NgramDPState;
+ import org.apache.joshua.decoder.ff.tm.Rule;
+ import org.apache.joshua.decoder.hypergraph.HGNode;
+ import org.apache.joshua.decoder.segment_file.Sentence;
+ import org.apache.joshua.util.FormatUtils;
+ import org.slf4j.Logger;
+ import org.slf4j.LoggerFactory;
+
++import com.google.common.annotations.VisibleForTesting;
++import com.google.common.primitives.Ints;
++
+ /**
+ * This class performs the following:
+ * <ol>
+ * <li>Gets the additional LM score due to combinations of small items into larger ones by using
+ * rules</li>
+ * <li>Gets the LM state</li>
+ * <li>Gets the left-side LM state estimation score</li>
+ * </ol>
- *
++ *
+ * @author Matt Post post@cs.jhu.edu
+ * @author Juri Ganitkevitch juri@cs.jhu.edu
+ * @author Zhifei Li, zhifei.work@gmail.com
+ */
+ public class LanguageModelFF extends StatefulFF {
+
- private static final Logger LOG = LoggerFactory.getLogger(LanguageModelFF.class);
++ static final Logger LOG = LoggerFactory.getLogger(LanguageModelFF.class);
+
+ public static int LM_INDEX = 0;
+ private int startSymbolId;
+
+ /**
+ * N-gram language model. We assume the language model is in ARPA format for equivalent state:
- *
++ *
+ * <ol>
+ * <li>We assume it is a backoff lm, and high-order ngram implies low-order ngram; absense of
+ * low-order ngram implies high-order ngram</li>
+ * <li>For a ngram, existence of backoffweight => existence a probability Two ways of dealing with
+ * low counts:
+ * <ul>
+ * <li>SRILM: don't multiply zeros in for unknown words</li>
+ * <li>Pharaoh: cap at a minimum score exp(-10), including unknown words</li>
+ * </ul>
+ * </li>
+ * </ol>
+ */
+ protected NGramLanguageModel languageModel;
+
+ /**
+ * We always use this order of ngram, though the LMGrammar may provide higher order probability.
+ */
+ protected final int ngramOrder;
+
+ /*
+ * We cache the weight of the feature since there is only one.
+ */
+ protected float weight;
+ protected String type;
+ protected String path;
+
+ /* Whether this is a class-based LM */
- private boolean isClassLM;
++ protected boolean isClassLM;
+ private ClassMap classMap;
+
- protected class ClassMap {
-
- private final int OOV_id = Vocabulary.getUnknownId();
- private HashMap<Integer, Integer> classMap;
-
- public ClassMap(String file_name) throws IOException {
- this.classMap = new HashMap<Integer, Integer>();
- read(file_name);
- }
-
- public int getClassID(int wordID) {
- return this.classMap.getOrDefault(wordID, OOV_id);
- }
-
- /**
- * Reads a class map from file.
- *
- * @param file_name
- * @throws IOException
- */
- private void read(String file_name) throws IOException {
-
- int lineno = 0;
- for (String line: new org.apache.joshua.util.io.LineReader(file_name, false)) {
- lineno++;
- String[] lineComp = line.trim().split("\\s+");
- try {
- this.classMap.put(Vocabulary.id(lineComp[0]), Vocabulary.id(lineComp[1]));
- } catch (java.lang.ArrayIndexOutOfBoundsException e) {
- LOG.warn("bad vocab line #{} '{}'", lineno, line);
- LOG.warn(e.getMessage(), e);
- }
- }
- }
-
- }
-
+ public LanguageModelFF(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+ super(weights, String.format("lm_%d", LanguageModelFF.LM_INDEX++), args, config);
+
+ this.type = parsedArgs.get("lm_type");
- this.ngramOrder = Integer.parseInt(parsedArgs.get("lm_order"));
++ this.ngramOrder = Integer.parseInt(parsedArgs.get("lm_order"));
+ this.path = parsedArgs.get("lm_file");
+
- if (parsedArgs.containsKey("class_map"))
- try {
- this.isClassLM = true;
- this.classMap = new ClassMap(parsedArgs.get("class_map"));
- } catch (IOException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
++ if (parsedArgs.containsKey("class_map")) {
++ this.isClassLM = true;
++ this.classMap = new ClassMap(parsedArgs.get("class_map"));
++ }
+
+ // The dense feature initialization hasn't happened yet, so we have to retrieve this as sparse
+ this.weight = weights.getSparse(name);
+
+ initializeLM();
+ }
+
+ @Override
+ public ArrayList<String> reportDenseFeatures(int index) {
+ denseFeatureIndex = index;
+
- ArrayList<String> names = new ArrayList<String>();
++ final ArrayList<String> names = new ArrayList<String>(1);
+ names.add(name);
+ return names;
+ }
+
+ /**
+ * Initializes the underlying language model.
+ */
+ protected void initializeLM() {
+ if (type.equals("kenlm")) {
+ this.languageModel = new KenLM(ngramOrder, path);
+
+ } else if (type.equals("berkeleylm")) {
+ this.languageModel = new LMGrammarBerkeley(ngramOrder, path);
+
+ } else {
+ String msg = String.format("* FATAL: Invalid backend lm_type '%s' for LanguageModel", type)
+ + "* Permissible values for 'lm_type' are 'kenlm' and 'berkeleylm'";
+ throw new RuntimeException(msg);
+ }
+
+ Vocabulary.registerLanguageModel(this.languageModel);
+ Vocabulary.id(config.default_non_terminal);
+
+ startSymbolId = Vocabulary.id(Vocabulary.START_SYM);
+ }
+
+ public NGramLanguageModel getLM() {
+ return this.languageModel;
+ }
+
++ public boolean isClassLM() {
++ return this.isClassLM;
++ }
++
+ public String logString() {
- if (languageModel != null)
- return String.format("%s, order %d (weight %.3f)", name, languageModel.getOrder(), weight);
- else
- return "WHOA";
++ return String.format("%s, order %d (weight %.3f), classLm=%s", name, languageModel.getOrder(), weight, isClassLM);
+ }
+
+ /**
- * Computes the features incurred along this edge. Note that these features are unweighted costs
- * of the feature; they are the feature cost, not the model cost, or the inner product of them.
++ * Computes the features incurred along this edge. Note that these features
++ * are unweighted costs of the feature; they are the feature cost, not the
++ * model cost, or the inner product of them.
+ */
+ @Override
- public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
- Sentence sentence, Accumulator acc) {
-
- NgramDPState newState = null;
- if (rule != null) {
- if (config.source_annotations) {
- // Get source side annotations and project them to the target side
- newState = computeTransition(getTags(rule, i, j, sentence), tailNodes, acc);
- }
- else {
- if (this.isClassLM) {
- // Use a class language model
- // Return target side classes
- newState = computeTransition(getClasses(rule), tailNodes, acc);
- }
- else {
- // Default LM
- newState = computeTransition(rule.getEnglish(), tailNodes, acc);
- }
- }
++ public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j,
++ SourcePath sourcePath, Sentence sentence, Accumulator acc) {
+
++ if (rule == null) {
++ return null;
+ }
+
- return newState;
++ int[] words;
++ if (config.source_annotations) {
++ // get source side annotations and project them to the target side
++ words = getTags(rule, i, j, sentence);
++ } else {
++ words = getRuleIds(rule);
++ }
++
++ return computeTransition(words, tailNodes, acc);
++
++ }
++
++ /**
++ * Retrieve ids from rule. These are either simply the rule ids on the target
++ * side, their corresponding class map ids, or the configured source-side
++ * annotation tags.
++ */
++ @VisibleForTesting
++ public int[] getRuleIds(final Rule rule) {
++ if (this.isClassLM) {
++ // map words to class ids
++ return getClasses(rule);
++ }
++ // Regular LM: use rule word ids
++ return rule.getEnglish();
+ }
+
+ /**
+ * Input sentences can be tagged with information specific to the language model. This looks for
+ * such annotations by following a word's alignments back to the source words, checking for
+ * annotations, and replacing the surface word if such annotations are found.
+ * @param rule the {@link org.apache.joshua.decoder.ff.tm.Rule} to use
+ * @param begin todo
+ * @param end todo
+ * @param sentence {@link org.apache.joshua.lattice.Lattice} input
+ * @return todo
+ */
+ protected int[] getTags(Rule rule, int begin, int end, Sentence sentence) {
+ /* Very important to make a copy here, so the original rule is not modified */
+ int[] tokens = Arrays.copyOf(rule.getEnglish(), rule.getEnglish().length);
+ byte[] alignments = rule.getAlignment();
+
+ // System.err.println(String.format("getTags() %s", rule.getRuleString()));
+
+ /* For each target-side token, project it to each of its source-language alignments. If any of those
+ * are annotated, take the first annotation and quit.
+ */
+ if (alignments != null) {
+ for (int i = 0; i < tokens.length; i++) {
+ if (tokens[i] > 0) { // skip nonterminals
+ for (int j = 0; j < alignments.length; j += 2) {
+ if (alignments[j] == i) {
+ String annotation = sentence.getAnnotation((int)alignments[i] + begin, "class");
+ if (annotation != null) {
- // System.err.println(String.format(" word %d source %d abs %d annotation %d/%s",
++ // System.err.println(String.format(" word %d source %d abs %d annotation %d/%s",
+ // i, alignments[i], alignments[i] + begin, annotation, Vocabulary.word(annotation)));
+ tokens[i] = Vocabulary.id(annotation);
+ break;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ return tokens;
+ }
+
- /**
- * Sets the class map if this is a class LM
++ /**
++ * Sets the class map if this is a class LM
+ * @param fileName a string path to a file
+ * @throws IOException if there is an error reading the input file
+ */
+ public void setClassMap(String fileName) throws IOException {
+ this.classMap = new ClassMap(fileName);
+ }
+
+ /**
+ * Replace each word in a rule with the target side classes.
+ * @param rule {@link org.apache.joshua.decoder.ff.tm.Rule} to use when obtaining tokens
+ * @return int[] of tokens
+ */
+ protected int[] getClasses(Rule rule) {
+ if (this.classMap == null) {
+ throw new RuntimeException("The class map is not set. Cannot use the class LM ");
+ }
+ /* Very important to make a copy here, so the original rule is not modified */
+ int[] tokens = Arrays.copyOf(rule.getEnglish(), rule.getEnglish().length);
+ for (int i = 0; i < tokens.length; i++) {
+ if (tokens[i] > 0 ) {
+ tokens[i] = this.classMap.getClassID(tokens[i]);
+ }
+ }
+ return tokens;
+ }
+
+ @Override
+ public DPState computeFinal(HGNode tailNode, int i, int j, SourcePath sourcePath, Sentence sentence,
+ Accumulator acc) {
+ return computeFinalTransition((NgramDPState) tailNode.getDPState(stateIndex), acc);
+ }
+
+ /**
+ * This function computes all the complete n-grams found in the rule, as well as the incomplete
+ * n-grams on the left-hand side.
+ */
+ @Override
+ public float estimateCost(Rule rule, Sentence sentence) {
+
+ float estimate = 0.0f;
+ boolean considerIncompleteNgrams = true;
+
- int[] enWords = rule.getEnglish();
++ int[] enWords = getRuleIds(rule);
+
+ List<Integer> words = new ArrayList<Integer>();
+ boolean skipStart = (enWords[0] == startSymbolId);
+
+ /*
+ * Move through the words, accumulating language model costs each time we have an n-gram (n >=
+ * 2), and resetting the series of words when we hit a nonterminal.
+ */
+ for (int c = 0; c < enWords.length; c++) {
+ int currentWord = enWords[c];
+ if (FormatUtils.isNonterminal(currentWord)) {
+ estimate += scoreChunkLogP(words, considerIncompleteNgrams, skipStart);
+ words.clear();
+ skipStart = false;
+ } else {
+ words.add(currentWord);
+ }
+ }
+ estimate += scoreChunkLogP(words, considerIncompleteNgrams, skipStart);
+
+ return weight * estimate;
+ }
+
+ /**
+ * Estimates the future cost of a rule. For the language model feature, this is the sum of the
+ * costs of the leftmost k-grams, k = [1..n-1].
+ */
+ @Override
+ public float estimateFutureCost(Rule rule, DPState currentState, Sentence sentence) {
+ NgramDPState state = (NgramDPState) currentState;
+
+ float estimate = 0.0f;
+ int[] leftContext = state.getLeftLMStateWords();
+
+ if (null != leftContext) {
+ boolean skipStart = true;
+ if (leftContext[0] != startSymbolId) {
+ skipStart = false;
+ }
+ estimate += scoreChunkLogP(leftContext, true, skipStart);
+ }
+ return weight * estimate;
+ }
+
+ /**
+ * Compute the cost of a rule application. The cost of applying a rule is computed by determining
+ * the n-gram costs for all n-grams created by this rule application, and summing them. N-grams
+ * are created when (a) terminal words in the rule string are followed by a nonterminal (b)
+ * terminal words in the rule string are preceded by a nonterminal (c) we encounter adjacent
+ * nonterminals. In all of these situations, the corresponding boundary words of the node in the
+ * hypergraph represented by the nonterminal must be retrieved.
- *
++ *
+ * IMPORTANT: only complete n-grams are scored. This means that hypotheses with fewer words
+ * than the complete n-gram state remain *unscored*. This fact adds a lot of complication to the
+ * code, including the use of the computeFinal* family of functions, which correct this fact for
+ * sentences that are too short on the final transition.
+ */
+ private NgramDPState computeTransition(int[] enWords, List<HGNode> tailNodes, Accumulator acc) {
+
+ int[] current = new int[this.ngramOrder];
+ int[] shadow = new int[this.ngramOrder];
+ int ccount = 0;
+ float transitionLogP = 0.0f;
+ int[] left_context = null;
+
+ for (int c = 0; c < enWords.length; c++) {
+ int curID = enWords[c];
+
+ if (FormatUtils.isNonterminal(curID)) {
+ int index = -(curID + 1);
+
+ NgramDPState state = (NgramDPState) tailNodes.get(index).getDPState(stateIndex);
+ int[] left = state.getLeftLMStateWords();
+ int[] right = state.getRightLMStateWords();
+
+ // Left context.
+ for (int i = 0; i < left.length; i++) {
+ current[ccount++] = left[i];
+
+ if (left_context == null && ccount == this.ngramOrder - 1)
+ left_context = Arrays.copyOf(current, ccount);
+
+ if (ccount == this.ngramOrder) {
+ // Compute the current word probability, and remove it.
+ float prob = this.languageModel.ngramLogProbability(current, this.ngramOrder);
+ // System.err.println(String.format("-> prob(%s) = %f", Vocabulary.getWords(current), prob));
+ transitionLogP += prob;
+ System.arraycopy(current, 1, shadow, 0, this.ngramOrder - 1);
+ int[] tmp = current;
+ current = shadow;
+ shadow = tmp;
+ --ccount;
+ }
+ }
+ System.arraycopy(right, 0, current, ccount - right.length, right.length);
+ } else { // terminal words
+ current[ccount++] = curID;
+
+ if (left_context == null && ccount == this.ngramOrder - 1)
+ left_context = Arrays.copyOf(current, ccount);
+
+ if (ccount == this.ngramOrder) {
+ // Compute the current word probability, and remove it.s
+ float prob = this.languageModel.ngramLogProbability(current, this.ngramOrder);
+ // System.err.println(String.format("-> prob(%s) = %f", Vocabulary.getWords(current), prob));
+ transitionLogP += prob;
+ System.arraycopy(current, 1, shadow, 0, this.ngramOrder - 1);
+ int[] tmp = current;
+ current = shadow;
+ shadow = tmp;
+ --ccount;
+ }
+ }
+ }
+ // acc.add(name, transitionLogP);
+ acc.add(denseFeatureIndex, transitionLogP);
+
+ if (left_context != null) {
+ return new NgramDPState(left_context, Arrays.copyOfRange(current, ccount - this.ngramOrder
+ + 1, ccount));
+ } else {
+ int[] context = Arrays.copyOf(current, ccount);
+ return new NgramDPState(context, context);
+ }
+ }
+
+ /**
+ * This function differs from regular transitions because we incorporate the cost of incomplete
+ * left-hand ngrams, as well as including the start- and end-of-sentence markers (if they were
+ * requested when the object was created).
- *
++ *
+ * @param state the dynamic programming state
+ * @return the final transition probability (including incomplete n-grams)
+ */
+ private NgramDPState computeFinalTransition(NgramDPState state, Accumulator acc) {
+
+ // System.err.println(String.format("LanguageModel::computeFinalTransition()"));
+
+ float res = 0.0f;
+ LinkedList<Integer> currentNgram = new LinkedList<Integer>();
+ int[] leftContext = state.getLeftLMStateWords();
+ int[] rightContext = state.getRightLMStateWords();
+
+ for (int i = 0; i < leftContext.length; i++) {
+ int t = leftContext[i];
+ currentNgram.add(t);
+
+ if (currentNgram.size() >= 2) { // start from bigram
+ float prob = this.languageModel.ngramLogProbability(Support.toArray(currentNgram),
+ currentNgram.size());
+ res += prob;
+ }
+ if (currentNgram.size() == this.ngramOrder)
+ currentNgram.removeFirst();
+ }
+
+ // Tell the accumulator
+ // acc.add(name, res);
+ acc.add(denseFeatureIndex, res);
+
+ // State is the same
+ return new NgramDPState(leftContext, rightContext);
+ }
+
+
+ /**
+ * Compatibility method for {@link #scoreChunkLogP(int[], boolean, boolean)}
+ */
+ private float scoreChunkLogP(List<Integer> words, boolean considerIncompleteNgrams,
+ boolean skipStart) {
+ return scoreChunkLogP(Ints.toArray(words), considerIncompleteNgrams, skipStart);
+ }
+
+ /**
+ * This function is basically a wrapper for NGramLanguageModel::sentenceLogProbability(). It
+ * computes the probability of a phrase ("chunk"), using lower-order n-grams for the first n-1
+ * words.
- *
++ *
+ * @param words
+ * @param considerIncompleteNgrams
+ * @param skipStart
+ * @return the phrase log probability
+ */
+ private float scoreChunkLogP(int[] words, boolean considerIncompleteNgrams,
+ boolean skipStart) {
+
+ float score = 0.0f;
+ if (words.length > 0) {
+ int startIndex;
+ if (!considerIncompleteNgrams) {
+ startIndex = this.ngramOrder;
+ } else if (skipStart) {
+ startIndex = 2;
+ } else {
+ startIndex = 1;
+ }
+ score = this.languageModel.sentenceLogProbability(words, this.ngramOrder, startIndex);
+ }
+
+ return score;
+ }
+
+ /**
+ * Public method to set LM_INDEX back to 0.
+ * Required if multiple instances of the JoshuaDecoder live in the same JVM.
+ */
+ public static void resetLmIndex() {
+ LM_INDEX = 0;
+ }
+ }
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/1bb8a203/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java
index 0000000,533365c..88dc647
mode 000000,100644..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java
@@@ -1,0 -1,202 +1,193 @@@
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+ package org.apache.joshua.decoder.ff.lm;
+
-import java.util.ArrayList;
++import static org.apache.joshua.util.FormatUtils.isNonterminal;
++
+ import java.util.List;
+ import java.util.concurrent.ConcurrentHashMap;
+
+ import org.apache.joshua.corpus.Vocabulary;
+ import org.apache.joshua.decoder.JoshuaConfiguration;
+ import org.apache.joshua.decoder.chart_parser.SourcePath;
+ import org.apache.joshua.decoder.ff.FeatureVector;
-import org.apache.joshua.decoder.ff.lm.KenLM;
+ import org.apache.joshua.decoder.ff.lm.KenLM.StateProbPair;
+ import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+ import org.apache.joshua.decoder.ff.state_maintenance.KenLMState;
+ import org.apache.joshua.decoder.ff.tm.Rule;
+ import org.apache.joshua.decoder.hypergraph.HGNode;
+ import org.apache.joshua.decoder.segment_file.Sentence;
-import org.apache.joshua.util.FormatUtils;
+
+ /**
+ * Wrapper for KenLM LMs with left-state minimization. We inherit from the regular
- *
++ *
+ * @author Matt Post post@cs.jhu.edu
+ * @author Juri Ganitkevitch juri@cs.jhu.edu
+ */
+ public class StateMinimizingLanguageModel extends LanguageModelFF {
+
+ // maps from sentence numbers to KenLM-side pools used to allocate state
+ private static final ConcurrentHashMap<Integer, Long> poolMap = new ConcurrentHashMap<Integer, Long>();
+
+ public StateMinimizingLanguageModel(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+ super(weights, args, config);
+ this.type = "kenlm";
+ if (parsedArgs.containsKey("lm_type") && ! parsedArgs.get("lm_type").equals("kenlm")) {
+ String msg = "* FATAL: StateMinimizingLanguageModel only supports 'kenlm' lm_type backend"
+ + "* Remove lm_type from line or set to 'kenlm'";
+ throw new RuntimeException(msg);
+ }
+ }
-
- @Override
- public ArrayList<String> reportDenseFeatures(int index) {
- denseFeatureIndex = index;
-
- ArrayList<String> names = new ArrayList<String>();
- names.add(name);
- return names;
- }
+
+ /**
+ * Initializes the underlying language model.
+ */
+ @Override
+ public void initializeLM() {
-
++
+ // Override type (only KenLM supports left-state minimization)
+ this.languageModel = new KenLM(ngramOrder, path);
+
+ Vocabulary.registerLanguageModel(this.languageModel);
+ Vocabulary.id(config.default_non_terminal);
-
++
+ }
-
++
+ /**
- * Estimates the cost of a rule. We override here since KenLM can do it more efficiently
- * than the default {@link LanguageModelFF} class.
- *
- * Most of this function implementation is redundant with compute().
++ * Estimates the cost of a rule. We override here since KenLM can do it more
++ * efficiently than the default {@link LanguageModelFF} class.
+ */
+ @Override
+ public float estimateCost(Rule rule, Sentence sentence) {
-
- int[] ruleWords = rule.getEnglish();
-
- // The IDs we'll pass to KenLM
- long[] words = new long[ruleWords.length];
+
- for (int x = 0; x < ruleWords.length; x++) {
- int id = ruleWords[x];
++ int[] ruleWords = getRuleIds(rule);
+
- if (FormatUtils.isNonterminal(id)) {
- // For the estimate, we can just mark negative values
- words[x] = -1;
++ // map to ken lm ids
++ final long[] words = mapToKenLmIds(ruleWords, null, true);
+
- } else {
- // Terminal: just add it
- words[x] = id;
- }
- }
-
+ // Get the probability of applying the rule and the new state
+ return weight * ((KenLM) languageModel).estimateRule(words);
+ }
-
++
+ /**
+ * Computes the features incurred along this edge. Note that these features are unweighted costs
+ * of the feature; they are the feature cost, not the model cost, or the inner product of them.
+ */
+ @Override
+ public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+ Sentence sentence, Accumulator acc) {
+
- int[] ruleWords = config.source_annotations
- ? getTags(rule, i, j, sentence)
- : rule.getEnglish();
-
- // The IDs we'll pass to KenLM
- long[] words = new long[ruleWords.length];
++ if (rule == null ) {
++ return null;
++ }
+
- for (int x = 0; x < ruleWords.length; x++) {
- int id = ruleWords[x];
++ int[] ruleWords;
++ if (config.source_annotations) {
++ // get source side annotations and project them to the target side
++ ruleWords = getTags(rule, i, j, sentence);
++ } else {
++ ruleWords = getRuleIds(rule);
++ }
+
- if (FormatUtils.isNonterminal(id)) {
- // Nonterminal: retrieve the KenLM long that records the state
- int index = -(id + 1);
- KenLMState state = (KenLMState) tailNodes.get(index).getDPState(stateIndex);
- words[x] = -state.getState();
++ // map to ken lm ids
++ final long[] words = mapToKenLmIds(ruleWords, tailNodes, false);
+
- } else {
- // Terminal: just add it
- words[x] = id;
- }
- }
-
- int sentID = sentence.id();
++ final int sentID = sentence.id();
+ // Since sentId is unique across threads, next operations are safe, but not atomic!
+ if (!poolMap.containsKey(sentID)) {
+ poolMap.put(sentID, KenLM.createPool());
+ }
+
+ // Get the probability of applying the rule and the new state
- StateProbPair pair = ((KenLM) languageModel).probRule(words, poolMap.get(sentID));
++ final StateProbPair pair = ((KenLM) languageModel).probRule(words, poolMap.get(sentID));
+
+ // Record the prob
-// acc.add(name, pair.prob);
+ acc.add(denseFeatureIndex, pair.prob);
+
+ // Return the state
+ return pair.state;
+ }
+
+ /**
++ * Maps given array of word/class ids to KenLM ids. For estimating cost and computing,
++ * state retrieval differs slightly.
++ */
++ private long[] mapToKenLmIds(int[] ids, List<HGNode> tailNodes, boolean isOnlyEstimate) {
++ // The IDs we will to KenLM
++ long[] kenIds = new long[ids.length];
++ for (int x = 0; x < ids.length; x++) {
++ int id = ids[x];
++
++ if (isNonterminal(id)) {
++
++ if (isOnlyEstimate) {
++ // For the estimate, we can just mark negative values
++ kenIds[x] = -1;
++ } else {
++ // Nonterminal: retrieve the KenLM long that records the state
++ int index = -(id + 1);
++ final KenLMState state = (KenLMState) tailNodes.get(index).getDPState(stateIndex);
++ kenIds[x] = -state.getState();
++ }
++
++ } else {
++ // Terminal: just add it
++ kenIds[x] = id;
++ }
++ }
++ return kenIds;
++ }
++
++ /**
+ * Destroys the pool created to allocate state for this sentence. Called from the
+ * {@link org.apache.joshua.decoder.Translation} class after outputting the sentence or k-best list. Hosting
+ * this map here in KenLMFF statically allows pools to be shared across KenLM instances.
- *
++ *
+ * @param sentId a key in the poolmap table to destroy
+ */
+ public void destroyPool(int sentId) {
+ if (poolMap.containsKey(sentId))
+ KenLM.destroyPool(poolMap.get(sentId));
+ poolMap.remove(sentId);
+ }
+
+ /**
+ * This function differs from regular transitions because we incorporate the cost of incomplete
+ * left-hand ngrams, as well as including the start- and end-of-sentence markers (if they were
+ * requested when the object was created).
- *
++ *
+ * KenLM already includes the prefix probabilities (of shorter n-grams on the left-hand side), so
+ * there's nothing that needs to be done.
+ */
+ @Override
+ public DPState computeFinal(HGNode tailNode, int i, int j, SourcePath sourcePath, Sentence sentence,
+ Accumulator acc) {
-
- // KenLMState state = (KenLMState) tailNode.getDPState(getStateIndex());
-
- // This is unnecessary
- // acc.add(name, 0.0f);
-
+ // The state is the same since no rule was applied
+ return new KenLMState();
+ }
+
+ /**
+ * KenLM probs already include the prefix probabilities (they are substracted out when merging
+ * states), so this doesn't need to do anything.
+ */
+ @Override
+ public float estimateFutureCost(Rule rule, DPState currentState, Sentence sentence) {
+ return 0.0f;
+ }
+ }
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/1bb8a203/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/AbstractGrammar.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/AbstractGrammar.java
index 0000000,6c512bd..bc66331
mode 000000,100644..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/AbstractGrammar.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/AbstractGrammar.java
@@@ -1,0 -1,228 +1,229 @@@
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+ package org.apache.joshua.decoder.ff.tm;
+
+ import java.util.Arrays;
+ import java.util.HashSet;
+ import java.util.List;
+
+ import org.apache.joshua.corpus.Vocabulary;
+ import org.apache.joshua.decoder.JoshuaConfiguration;
+ import org.apache.joshua.decoder.ff.FeatureFunction;
+ import org.apache.joshua.decoder.phrase.PhraseTable;
+ import org.apache.joshua.decoder.segment_file.Token;
+ import org.apache.joshua.lattice.Arc;
+ import org.apache.joshua.lattice.Lattice;
+ import org.apache.joshua.lattice.Node;
-
+ import org.slf4j.Logger;
+ import org.slf4j.LoggerFactory;
+
++import cern.colt.Arrays;
++
+ /**
+ * Partial implementation of the <code>Grammar</code> interface that provides logic for sorting a
+ * grammar.
+ * <p>
+ * <em>Note</em>: New classes implementing the <code>Grammar</code> interface should probably
+ * inherit from this class, unless a specific sorting technique different from that implemented by
+ * this class is required.
+ *
+ * @author Zhifei Li
+ * @author Lane Schwartz
+ * @author Matt Post post@cs.jhu.edu
+ */
+ public abstract class AbstractGrammar implements Grammar {
+
+ /** Logger for this class. */
+ private static final Logger LOG = LoggerFactory.getLogger(AbstractGrammar.class);
+ /**
+ * Indicates whether the rules in this grammar have been sorted based on the latest feature
+ * function values.
+ */
+ protected boolean sorted = false;
+
+ /*
+ * The grammar's owner, used to determine which weights are applicable to the dense features found
+ * within.
+ */
- protected int owner = -1;
-
++ protected final OwnerId owner;
++
+ /*
+ * The maximum length of a source-side phrase. Mostly used by the phrase-based decoder.
+ */
+ protected int maxSourcePhraseLength = -1;
+
+ /**
+ * Returns the longest source phrase read.
+ *
+ * @return the longest source phrase read (nonterminal + terminal symbols).
+ */
+ @Override
+ public int getMaxSourcePhraseLength() {
+ return maxSourcePhraseLength;
+ }
+
+ @Override
- public int getOwner() {
++ public OwnerId getOwner() {
+ return owner;
+ }
++
++ public int getSpanLimit() {
++ return spanLimit;
++ }
+
- /* The maximum span of the input this rule can be applied to. */
- protected int spanLimit = 1;
++ /* The maximum span of the input this grammar rules can be applied to. */
++ protected final int spanLimit;
+
- protected JoshuaConfiguration joshuaConfiguration;
++ protected final JoshuaConfiguration joshuaConfiguration;
+
+ /**
- * Constructs an empty, unsorted grammar.
- *
++ * Creates an empty, unsorted grammar with given owner and spanlimit
++ *
+ * @see Grammar#isSorted()
+ * @param config a {@link org.apache.joshua.decoder.JoshuaConfiguration} object
+ */
- public AbstractGrammar(JoshuaConfiguration config) {
- this.joshuaConfiguration = config;
++ public AbstractGrammar(final String owner, final JoshuaConfiguration config, final int spanLimit) {
+ this.sorted = false;
- }
-
- public AbstractGrammar(int owner, int spanLimit) {
- this.sorted = false;
- this.owner = owner;
++ this.owner = OwnerMap.register(owner);
++ this.joshuaConfiguration = config;
+ this.spanLimit = spanLimit;
+ }
+
+ public static final int OOV_RULE_ID = 0;
+
+ /**
+ * Cube-pruning requires that the grammar be sorted based on the latest feature functions. To
+ * avoid synchronization, this method should be called before multiple threads are initialized for
+ * parallel decoding
+ * @param models {@link java.util.List} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s
+ */
+ public void sortGrammar(List<FeatureFunction> models) {
+ Trie root = getTrieRoot();
+ if (root != null) {
+ sort(root, models);
+ setSorted(true);
+ }
+ }
+
+ /* See Javadoc comments for Grammar interface. */
+ public boolean isSorted() {
+ return sorted;
+ }
+
+ /**
+ * Sets the flag indicating whether this grammar is sorted.
+ * <p>
+ * This method is called by {@link org.apache.joshua.decoder.ff.tm.AbstractGrammar#sortGrammar(List)}
+ * to indicate that the grammar has been sorted.</p>
+ *
+ * <p>Its scope is protected so that child classes that override <code>sortGrammar</code> will also
+ * be able to call this method to indicate that the grammar has been sorted.</p>
+ *
+ * @param sorted set to true if the grammar is sorted
+ */
+ protected void setSorted(boolean sorted) {
+ this.sorted = sorted;
+ LOG.debug("This grammar is now sorted: {}", this);
+ }
+
+ /**
+ * Recursively sorts the grammar using the provided feature functions.
+ * <p>
+ * This method first sorts the rules stored at the provided node, then recursively calls itself on
+ * the child nodes of the provided node.
+ *
+ * @param node Grammar node in the <code>Trie</code> whose rules should be sorted.
+ * @param models Feature function models to use during sorting.
+ */
+ private void sort(Trie node, List<FeatureFunction> models) {
+
+ if (node != null) {
+ if (node.hasRules()) {
+ RuleCollection rules = node.getRuleCollection();
+ LOG.debug("Sorting node {}", Arrays.toString(rules.getSourceSide()));
+
+ /* This causes the rules at this trie node to be sorted */
+ rules.getSortedRules(models);
+
+ if (LOG.isDebugEnabled()) {
+ StringBuilder s = new StringBuilder();
+ for (Rule r : rules.getSortedRules(models)) {
+ s.append("\n\t" + r.getLHS() + " ||| " + Arrays.toString(r.getFrench()) + " ||| "
+ + Arrays.toString(r.getEnglish()) + " ||| " + r.getFeatureVector() + " ||| "
+ + r.getEstimatedCost() + " " + r.getClass().getName() + "@"
+ + Integer.toHexString(System.identityHashCode(r)));
+ }
+ LOG.debug("{}", s);
+ }
+ }
+
+ if (node.hasExtensions()) {
+ for (Trie child : node.getExtensions()) {
+ sort(child, models);
+ }
+ } else {
+ LOG.debug("Node has 0 children to extend: {}", node);
+ }
+ }
+ }
+
+ // write grammar to disk
+ public void writeGrammarOnDisk(String file) {
+ }
+
+ /**
+ * Adds OOV rules for all words in the input lattice to the current grammar. Uses addOOVRule() so that
+ * sub-grammars can define different types of OOV rules if needed (as is used in {@link PhraseTable}).
+ *
+ * @param grammar Grammar in the Trie
+ * @param inputLattice the lattice representing the input sentence
+ * @param featureFunctions a list of feature functions used for scoring
+ * @param onlyTrue determine if word is actual OOV.
+ */
+ public static void addOOVRules(Grammar grammar, Lattice<Token> inputLattice,
+ List<FeatureFunction> featureFunctions, boolean onlyTrue) {
+ /*
+ * Add OOV rules; This should be called after the manual constraints have
+ * been set up.
+ */
+ HashSet<Integer> words = new HashSet<Integer>();
+ for (Node<Token> node : inputLattice) {
+ for (Arc<Token> arc : node.getOutgoingArcs()) {
+ // create a rule, but do not add into the grammar trie
+ // TODO: which grammar should we use to create an OOV rule?
+ int sourceWord = arc.getLabel().getWord();
+ if (sourceWord == Vocabulary.id(Vocabulary.START_SYM)
+ || sourceWord == Vocabulary.id(Vocabulary.STOP_SYM))
+ continue;
+
+ // Determine if word is actual OOV.
+ if (onlyTrue && ! Vocabulary.hasId(sourceWord))
+ continue;
+
+ words.add(sourceWord);
+ }
+ }
+
+ for (int sourceWord: words)
+ grammar.addOOVRules(sourceWord, featureFunctions);
+
+ // Sort all the rules (not much to actually do, this just marks it as sorted)
+ grammar.sortGrammar(featureFunctions);
+ }
+ }
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/1bb8a203/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Grammar.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Grammar.java
index 0000000,06252a1..8f90d1b
mode 000000,100644..100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Grammar.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Grammar.java
@@@ -1,0 -1,120 +1,120 @@@
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+ package org.apache.joshua.decoder.ff.tm;
+
+ import java.util.List;
+
+ import org.apache.joshua.decoder.ff.FeatureFunction;
+
+ /**
+ * Grammar is a class for wrapping a trie of TrieGrammar in order to store holistic metadata.
+ *
+ * @author wren ng thornton wren@users.sourceforge.net
+ * @author Zhifei Li, zhifei.work@gmail.com
+ */
+ public interface Grammar {
+
+ /**
+ * Gets the root of the <code>Trie</code> backing this grammar.
+ * <p>
+ * <em>Note</em>: This method should run as a small constant-time function.
+ *
+ * @return the root of the <code>Trie</code> backing this grammar
+ */
+ Trie getTrieRoot();
+
+ /**
+ * After calling this method, the rules in this grammar are guaranteed to be sorted based on the
+ * latest feature function values.
+ * <p>
+ * Cube-pruning requires that the grammar be sorted based on the latest feature functions.
+ *
+ * @param models list of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s
+ */
+ void sortGrammar(List<FeatureFunction> models);
+
+ /**
+ * Determines whether the rules in this grammar have been sorted based on the latest feature
+ * function values.
+ * <p>
+ * This method is needed for the cube-pruning algorithm.
+ *
+ * @return <code>true</code> if the rules in this grammar have been sorted based on the latest
+ * feature function values, <code>false</code> otherwise
+ */
+ boolean isSorted();
+
+ /**
+ * Returns whether this grammar has any valid rules for covering a particular span of a sentence.
+ * Hiero's "glue" grammar will only say True if the span is longer than our span limit, and is
+ * anchored at startIndex==0. Hiero's "regular" grammar will only say True if the span is less
+ * than the span limit. Other grammars, e.g. for rule-based systems, may have different behaviors.
+ *
+ * @param startIndex Indicates the starting index of a phrase in a source input phrase, or a
+ * starting node identifier in a source input lattice
+ * @param endIndex Indicates the ending index of a phrase in a source input phrase, or an ending
+ * node identifier in a source input lattice
+ * @param pathLength Length of the input path in a source input lattice. If a source input phrase
+ * is used instead of a lattice, this value will likely be ignored by the underlying
+ * implementation, but would normally be defined as <code>endIndex-startIndex</code>
+ * @return true if there is a rule for this span
+ */
+ boolean hasRuleForSpan(int startIndex, int endIndex, int pathLength);
+
+ /**
+ * Gets the number of rules stored in the grammar.
+ *
+ * @return the number of rules stored in the grammar
+ */
+ int getNumRules();
+
+ /**
+ * Returns the number of dense features.
+ *
+ * @return the number of dense features
+ */
+ int getNumDenseFeatures();
+
+ /**
+ * Return the grammar's owner.
+ * @return grammar owner
+ */
- int getOwner();
++ OwnerId getOwner();
+
+ /**
+ * Return the maximum source phrase length (terminals + nonterminals)
+ * @return the maximum source phrase length
+ */
+ int getMaxSourcePhraseLength();
+
+ /**
+ * Add an OOV rule for the requested word for the grammar.
+ *
+ * @param word input word to add rules to
+ * @param featureFunctions a {@link java.util.List} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s
+ */
+ void addOOVRules(int word, List<FeatureFunction> featureFunctions);
+
+ /**
+ * Add a rule to the grammar.
+ *
+ * @param rule the {@link org.apache.joshua.decoder.ff.tm.Rule}
+ */
+ void addRule(Rule rule);
+ }