You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by le...@apache.org on 2016/05/16 06:27:22 UTC
[66/66] incubator-joshua git commit: JOSHUA-252 Make it possible to
use Maven to build Joshua
JOSHUA-252 Make it possible to use Maven to build Joshua
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/ab5bb42c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/ab5bb42c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/ab5bb42c
Branch: refs/heads/JOSHUA-252
Commit: ab5bb42c3a5067521e0ea3e842611ce54a726782
Parents: 7f824b4
Author: Lewis John McGibbney <le...@gmail.com>
Authored: Sun May 15 23:31:01 2016 -0700
Committer: Lewis John McGibbney <le...@gmail.com>
Committed: Sun May 15 23:31:01 2016 -0700
----------------------------------------------------------------------
.../org/apache/joshua/corpus/SymbolTable.java | 330 ++++++++++++++++++
.../joshua/decoder/ff/ArityPhrasePenalty.java | 12 +
.../joshua/decoder/ff/FeatureFunction.java | 21 +-
.../joshua/decoder/ff/LabelCombinationFF.java | 12 +
.../joshua/decoder/ff/LabelSubstitutionFF.java | 12 +
.../apache/joshua/decoder/ff/OOVPenalty.java | 12 +
.../apache/joshua/decoder/ff/PhraseModel.java | 12 +
.../apache/joshua/decoder/ff/PhrasePenalty.java | 12 +
.../apache/joshua/decoder/ff/RuleCountBin.java | 12 +
.../org/apache/joshua/decoder/ff/RuleFF.java | 12 +
.../apache/joshua/decoder/ff/RuleLength.java | 2 +-
.../org/apache/joshua/decoder/ff/RuleShape.java | 12 +
.../apache/joshua/decoder/ff/SourcePathFF.java | 12 +
.../apache/joshua/decoder/ff/TargetBigram.java | 12 +
.../apache/joshua/decoder/ff/WordPenalty.java | 12 +
.../decoder/ff/fragmentlm/FragmentLMFF.java | 12 +
.../apache/joshua/decoder/ff/lm/AbstractLM.java | 133 ++++++++
.../apache/joshua/decoder/ff/lm/ArpaFile.java | 335 +++++++++++++++++++
.../apache/joshua/decoder/ff/lm/ArpaNgram.java | 73 ++++
.../joshua/decoder/ff/lm/LanguageModelFF.java | 12 +
.../joshua/decoder/ff/lm/buildin_lm/TrieLM.java | 332 ++++++++++++++++++
.../decoder/ff/lm/buildin_lm/package-info.java | 19 ++
.../joshua/decoder/ff/phrase/Distortion.java | 12 +
.../ff/similarity/EdgePhraseSimilarityFF.java | 12 +
.../joshua/decoder/ff/tm/BilingualRule.java | 167 +++++++++
.../joshua/decoder/ff/tm/MonolingualRule.java | 315 +++++++++++++++++
.../java/org/apache/joshua/lattice/Lattice.java | 106 +++++-
.../java/org/apache/joshua/metrics/BLEU.java | 70 ++--
.../org/apache/joshua/metrics/BLEU_SBP.java | 4 +-
.../apache/joshua/metrics/GradeLevelBLEU.java | 18 +-
.../joshua/metrics/MinimumChangeBLEU.java | 8 +-
.../java/org/apache/joshua/metrics/Precis.java | 26 +-
.../org/apache/joshua/metrics/SourceBLEU.java | 2 +-
.../util/quantization/BooleanQuantizer.java | 45 +++
.../joshua/util/quantization/Quantizer.java | 45 +++
.../quantization/QuantizerConfiguration.java | 119 +++++++
.../util/quantization/QuantizerFactory.java | 50 +++
.../util/quantization/StatelessQuantizer.java | 38 +++
.../joshua/util/quantization/package-info.java | 19 ++
.../apache/joshua/corpus/CorpusArrayTest.java | 304 +++++++++--------
.../apache/joshua/corpus/VocabularyTest.java | 2 -
.../joshua/corpus/vocab/VocabularyTest.java | 110 +++---
.../joshua/decoder/DecoderThreadTest.java | 65 ++--
.../decoder/ff/ArityPhrasePenaltyFFTest.java | 128 +++----
.../joshua/decoder/ff/lm/ArpaFileTest.java | 48 +--
.../org/apache/joshua/packed/CountRules.java | 2 +-
.../org/apache/joshua/packed/PrintRules.java | 6 +-
.../org/apache/joshua/packed/VocabTest.java | 3 +-
.../system/MultithreadedTranslationTests.java | 48 ++-
.../system/StructuredTranslationTest.java | 12 +-
.../org/apache/joshua/util/io/BinaryTest.java | 7 +-
.../java/org/apache/joshua/zmert/BLEUTest.java | 10 +-
52 files changed, 2786 insertions(+), 428 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ab5bb42c/src/main/java/org/apache/joshua/corpus/SymbolTable.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/corpus/SymbolTable.java b/src/main/java/org/apache/joshua/corpus/SymbolTable.java
new file mode 100644
index 0000000..d8b1694
--- /dev/null
+++ b/src/main/java/org/apache/joshua/corpus/SymbolTable.java
@@ -0,0 +1,330 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.corpus;
+
+import java.util.Collection;
+
+/**
+ * Represents a symbol table capable of mapping between strings and
+ * symbols.
+ *
+ * @author Lane Schwartz
+ * @author Zhifei Li
+ * @version $LastChangedDate: 2009-11-24 23:07:43 -0600 (Tue, 24 Nov 2009) $
+ */
+public interface SymbolTable {
+
+ //TODO Remove all hard-coded references to nonterminals
+
+ /**
+ * The unknown word's ID will be the size of the vocabulary,
+ * ensuring that it is outside of the vocabulary. Note that
+ * for vocabularies which have not been fixed yet, this
+ * means the actual value is volatile and therefore a word
+ * ID can only be compared against UNKNOWN_WORD at the time
+ * the word ID is generated (otherwise unknown words can
+ * become "known" if new words are added to the vocabulary
+ * before testing).
+ * <p>
+ * Negative IDs are reserved for non-terminals.
+ *
+ * Zero is reserved as the UNKNOWN_WORD.
+ */
+ int UNKNOWN_WORD = 1;
+
+ /** String representation for out-of-vocabulary words. */
+ String UNKNOWN_WORD_STRING = "<unk>";
+
+ /**
+ * Integer representation of the bare (non-indexed) nonterminal X,
+ * which represents a wild-card gap in a phrase.
+ * <p>
+ * All nonterminals are guaranteed to be represented by negative integers.
+ */
+ int X = -1;
+
+ /**
+ * String representation of the bare (non-indexed) nonterminal X,
+ * which represents a wild-card gap in a phrase.
+ */
+ String X_STRING = "[X]";
+
+
+
+ /**
+ * String representation of the nonterminal X with index 1,
+ * which represents a wild-card gap in a phrase.
+ */
+ String X1_STRING = "[X,1]";
+
+
+
+ /**
+ * String representation of the nonterminal X with index 2,
+ * which represents a wild-card gap in a phrase.
+ */
+ String X2_STRING = "[X,2]";
+
+ /**
+ * Integer representation of the nonterminal S.
+ * <p>
+ * All nonterminals are guaranteed to be represented by negative integers.
+ */
+ int S = -4;
+
+ /**
+ * String representation of the nonterminal S.
+ */
+ String S_STRING = "[S]";
+
+ /**
+ * Integer representation of the nonterminal S with index 1.
+ * <p>
+ * All nonterminals are guaranteed to be represented by negative integers.
+ */
+ int S1 = -5;
+
+ /**
+ * String representation of the nonterminal S with index 1.
+ */
+ String S1_STRING = "[S,1]";
+
+ /**
+ * Gets a unique integer identifier for the nonterminal.
+ * <p>
+ * The integer returned is guaranteed to be a negative number.
+ *
+ * If the nonterminal is {@link #X_STRING},
+ * then the value returned must be {@link #X}.
+ *
+ * Otherwise, the value returned must be a negative number
+ * whose value is less than {@link #X}.
+ *
+ * @param nonterminal Nonterminal symbol
+ * @return a unique integer identifier for the nonterminal
+ */
+ int addNonterminal(String nonterminal);
+
+ /**
+ * Gets a unique integer identifier for the terminal.
+ *
+ * @param terminal Terminal symbol
+ * @return a unique integer identifier for the terminal
+ */
+ int addTerminal(String terminal);
+
+ /**
+ * Gets the unique integer identifiers for the words.
+ *
+ * @param words Array of symbols
+ * @return the unique integer identifiers for the words
+ */
+ int[] addTerminals(String[] words);
+
+ /**
+ * Gets the unique integer identifiers for the words
+ * in the sentence.
+ *
+ * @param sentence Space-delimited string of symbols
+ * @return the unique integer identifiers for the words
+ * in the sentence
+ */
+ int[] addTerminals(String sentence);
+
+ /**
+ * Gets an integer identifier for the word.
+ * <p>
+ * If the word is in the vocabulary, the integer returned
+ * will uniquely identify that word.
+ * <p>
+ * If the word is not in the vocabulary, the integer returned
+ * by <code>getUnknownWordID</code> may be returned.
+ *
+ * Alternatively, implementations may, if they choose, add
+ * unknown words and assign them a symbol ID instead of
+ * returning <code>getUnknownWordID</code>.
+ *
+ * @see #getUnknownWordID
+ * @return the unique integer identifier for wordString,
+ * or the result of <code>getUnknownWordID</code>
+ * if wordString is not in the vocabulary
+ */
+ int getID(String wordString);
+
+ /**
+ * Gets the integer identifiers for all words in the provided
+ * sentence.
+ * <p>
+ * The sentence will be split (on spaces) into words, then
+ * the integer identifier for each word will be retrieved
+ * using <code>getID</code>.
+ *
+ * @see #getID(String)
+ * @param sentence String of words, separated by spaces.
+ * @return Array of integer identifiers for each word in
+ * the sentence
+ */
+ int[] getIDs(String sentence);
+
+ /**
+ * Gets the String that corresponds to the specified integer
+ * identifier.
+ * <p>
+ * If the identifier is in the symbol vocabulary, the String
+ * returned will correspond to that identifier.
+ *
+ * Otherwise, the String returned by <code>getUnknownWord</code>
+ * will be returned.
+ *
+ * @return the String that corresponds to the specified
+ * integer identifier, or the result of
+ * <code>getUnknownWord</code> if the identifier
+ * does not correspond to a word in the vocabulary
+ */
+ String getTerminal(int wordID);
+
+ /**
+ * Gets the String that corresponds to the specified integer
+ * identifier.
+ * <p>
+ * This method can be called for terminals or nonterminals.
+ *
+ * @param tokenID Integer identifier
+ * @return the String that corresponds to the specified
+ * integer identifier
+ */
+ String getWord(int tokenID);
+
+ /**
+ * Gets the String that corresponds to the sequence of
+ * specified integer identifiers.
+ *
+ * @param ids Sequence of integer identifiers
+ * @return the String that corresponds to the sequence of
+ * specified integer identifiers
+ */
+ String getWords(int[] ids);
+
+ /**
+ *
+ * @param wordIDs
+ * @return
+ */
+ String getTerminals(int[] wordIDs);
+
+ /**
+ * Gets a collection over all symbol identifiers for the
+ * vocabulary.
+ *
+ * @return a collection over all symbol identifiers for the
+ * vocabulary
+ */
+ Collection<Integer> getAllIDs();
+
+ /**
+ * Gets the list of all words represented by this vocabulary.
+ *
+ * @return the list of all words represented by this
+ * vocabulary
+ */
+ Collection<String> getWords();
+
+ /**
+ * Gets the number of unique words in the vocabulary.
+ *
+ * @return the number of unique words in the vocabulary.
+ */
+ int size();
+
+ /**
+ * Gets the integer symbol representation of the unknown
+ * word.
+ *
+ * @return the integer symbol representation of the unknown
+ * word.
+ */
+ int getUnknownWordID();
+
+ /**
+ * Gets the string representation of the unknown word.
+ *
+ * @return the string representation of the unknown word.
+ */
+ String getUnknownWord();
+
+ /**
+ * Returns <code>true</code> if the symbol id represents a
+ * nonterminal, <code>false</code> otherwise.
+ *
+ * @param id
+ * @return <code>true</code> if the symbol id represents a
+ * nonterminal, <code>false</code> otherwise.
+ */
+ boolean isNonterminal(int id);
+
+ /**
+ * Gets the lowest-valued allowable terminal symbol id in
+ * this table.
+ *
+ * @return the lowest-valued allowable terminal symbol id
+ * in this table.
+ */
+ int getLowestID();
+
+
+ /**
+ * Gets the highest-valued allowable terminal symbol id in
+ * this table.
+ * <p>
+ * NOTE: This may or may not return the same value as
+ * <code>size</code>.
+ *
+ * @return the highest-valued allowable terminal symbol id
+ * in this table.
+ */
+ int getHighestID();
+
+ /**
+ *
+ *
+ * @param id
+ * @return
+ */
+ int getTargetNonterminalIndex(int id);//first convert id to its String mapping, then call the function below
+
+ /**
+ *
+ *
+ * @param word
+ * @return
+ */
+ int getTargetNonterminalIndex(String word);
+
+ /**
+ *
+ *
+ * @param wordIDs
+ * @param ntIndexIncrements
+ * @return
+ */
+ String getWords(int[] wordIDs, boolean ntIndexIncrements);
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ab5bb42c/src/main/java/org/apache/joshua/decoder/ff/ArityPhrasePenalty.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/ArityPhrasePenalty.java b/src/main/java/org/apache/joshua/decoder/ff/ArityPhrasePenalty.java
index bb57a6e..25f363d 100644
--- a/src/main/java/org/apache/joshua/decoder/ff/ArityPhrasePenalty.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/ArityPhrasePenalty.java
@@ -69,4 +69,16 @@ public class ArityPhrasePenalty extends StatelessFF {
return null;
}
+
+ @Override
+ public double estimateLogP(Rule rule, int sentID) {
+ // TODO Auto-generated method stub
+ return 0;
+ }
+
+ @Override
+ public double getWeight() {
+ // TODO Auto-generated method stub
+ return 0;
+ }
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ab5bb42c/src/main/java/org/apache/joshua/decoder/ff/FeatureFunction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/FeatureFunction.java b/src/main/java/org/apache/joshua/decoder/ff/FeatureFunction.java
index fc1e15b..c6112e5 100644
--- a/src/main/java/org/apache/joshua/decoder/ff/FeatureFunction.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/FeatureFunction.java
@@ -72,7 +72,7 @@ public abstract class FeatureFunction {
* names, for templates that define multiple features.
*/
protected String name = null;
-
+
/*
* The list of features each function can contribute, along with the dense feature IDs.
*/
@@ -93,14 +93,14 @@ public abstract class FeatureFunction {
* instantiated
*/
protected FeatureVector weights;
-
+
/* The config */
protected JoshuaConfiguration config;
public String getName() {
return name;
}
-
+
// Whether the feature has state.
public abstract boolean isStateful();
@@ -112,7 +112,7 @@ public abstract class FeatureFunction {
this.parsedArgs = FeatureFunction.parseArgs(args);
}
-
+
/**
* Any feature function can use this to report dense features names to the master code. The
* parameter tells the feature function the index of the first available dense feature ID; the feature
@@ -304,6 +304,15 @@ public abstract class FeatureFunction {
}
/**
+ * It is used when initializing translation grammars (for
+ * pruning purpose, and to get stateless logP for each rule).
+ * This is also required to sort the rules (required by Cube-pruning).
+ */
+ public abstract double estimateLogP(Rule rule, int sentID);
+
+ public abstract double getWeight();
+
+ /**
* Accumulator objects allow us to generalize feature computation.
* ScoreAccumulator takes (feature,value) pairs and simple stores the weighted
* sum (for decoding). FeatureAccumulator records the named feature values
@@ -326,7 +335,7 @@ public abstract class FeatureFunction {
public void add(String name, float value) {
score += value * weights.getSparse(name);
}
-
+
@Override
public void add(int id, float value) {
score += value * weights.getDense(id);
@@ -348,7 +357,7 @@ public abstract class FeatureFunction {
public void add(String name, float value) {
features.increment(name, value);
}
-
+
@Override
public void add(int id, float value) {
features.increment(id, value);
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ab5bb42c/src/main/java/org/apache/joshua/decoder/ff/LabelCombinationFF.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/LabelCombinationFF.java b/src/main/java/org/apache/joshua/decoder/ff/LabelCombinationFF.java
index 1c02853..f80e0b7 100644
--- a/src/main/java/org/apache/joshua/decoder/ff/LabelCombinationFF.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/LabelCombinationFF.java
@@ -60,4 +60,16 @@ public class LabelCombinationFF extends StatelessFF {
return null;
}
+ @Override
+ public double estimateLogP(Rule rule, int sentID) {
+ // TODO Auto-generated method stub
+ return 0;
+ }
+
+ @Override
+ public double getWeight() {
+ // TODO Auto-generated method stub
+ return 0;
+ }
+
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ab5bb42c/src/main/java/org/apache/joshua/decoder/ff/LabelSubstitutionFF.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/LabelSubstitutionFF.java b/src/main/java/org/apache/joshua/decoder/ff/LabelSubstitutionFF.java
index fb64b26..2c247fe 100644
--- a/src/main/java/org/apache/joshua/decoder/ff/LabelSubstitutionFF.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/LabelSubstitutionFF.java
@@ -129,4 +129,16 @@ public class LabelSubstitutionFF extends StatelessFF {
return null;
}
+ @Override
+ public double estimateLogP(Rule rule, int sentID) {
+ // TODO Auto-generated method stub
+ return 0;
+ }
+
+ @Override
+ public double getWeight() {
+ // TODO Auto-generated method stub
+ return 0;
+ }
+
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ab5bb42c/src/main/java/org/apache/joshua/decoder/ff/OOVPenalty.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/OOVPenalty.java b/src/main/java/org/apache/joshua/decoder/ff/OOVPenalty.java
index 96999c2..0d0e0f7 100644
--- a/src/main/java/org/apache/joshua/decoder/ff/OOVPenalty.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/OOVPenalty.java
@@ -102,4 +102,16 @@ public class OOVPenalty extends StatelessFF {
private float getValue(int lhs) {
return oovWeights.containsKey(lhs) ? oovWeights.get(lhs) : defaultValue;
}
+
+ @Override
+ public double estimateLogP(Rule rule, int sentID) {
+ // TODO Auto-generated method stub
+ return 0;
+ }
+
+ @Override
+ public double getWeight() {
+ // TODO Auto-generated method stub
+ return 0;
+ }
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ab5bb42c/src/main/java/org/apache/joshua/decoder/ff/PhraseModel.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/PhraseModel.java b/src/main/java/org/apache/joshua/decoder/ff/PhraseModel.java
index 120ab4b..62792dc 100644
--- a/src/main/java/org/apache/joshua/decoder/ff/PhraseModel.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/PhraseModel.java
@@ -132,4 +132,16 @@ public class PhraseModel extends StatelessFF {
public String toString() {
return name + " " + Vocabulary.word(ownerID);
}
+
+ @Override
+ public double estimateLogP(Rule rule, int sentID) {
+ // TODO Auto-generated method stub
+ return 0;
+ }
+
+ @Override
+ public double getWeight() {
+ // TODO Auto-generated method stub
+ return 0;
+ }
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ab5bb42c/src/main/java/org/apache/joshua/decoder/ff/PhrasePenalty.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/PhrasePenalty.java b/src/main/java/org/apache/joshua/decoder/ff/PhrasePenalty.java
index 3c38e60..a185286 100644
--- a/src/main/java/org/apache/joshua/decoder/ff/PhrasePenalty.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/PhrasePenalty.java
@@ -83,4 +83,16 @@ public class PhrasePenalty extends StatelessFF {
return weights.getDense(denseFeatureIndex) * value;
return 0.0f;
}
+
+ @Override
+ public double estimateLogP(Rule rule, int sentID) {
+ // TODO Auto-generated method stub
+ return 0;
+ }
+
+ @Override
+ public double getWeight() {
+ // TODO Auto-generated method stub
+ return 0;
+ }
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ab5bb42c/src/main/java/org/apache/joshua/decoder/ff/RuleCountBin.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/RuleCountBin.java b/src/main/java/org/apache/joshua/decoder/ff/RuleCountBin.java
index 4d99668..e75ea12 100644
--- a/src/main/java/org/apache/joshua/decoder/ff/RuleCountBin.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/RuleCountBin.java
@@ -67,4 +67,16 @@ public class RuleCountBin extends StatelessFF {
return null;
}
+
+ @Override
+ public double estimateLogP(Rule rule, int sentID) {
+ // TODO Auto-generated method stub
+ return 0;
+ }
+
+ @Override
+ public double getWeight() {
+ // TODO Auto-generated method stub
+ return 0;
+ }
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ab5bb42c/src/main/java/org/apache/joshua/decoder/ff/RuleFF.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/RuleFF.java b/src/main/java/org/apache/joshua/decoder/ff/RuleFF.java
index 1ff6b80..bc6d67b 100644
--- a/src/main/java/org/apache/joshua/decoder/ff/RuleFF.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/RuleFF.java
@@ -85,4 +85,16 @@ public class RuleFF extends StatelessFF {
}
return ruleString.replaceAll("[ =]", "~");
}
+
+ @Override
+ public double estimateLogP(Rule rule, int sentID) {
+ // TODO Auto-generated method stub
+ return 0;
+ }
+
+ @Override
+ public double getWeight() {
+ // TODO Auto-generated method stub
+ return 0;
+ }
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ab5bb42c/src/main/java/org/apache/joshua/decoder/ff/RuleLength.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/RuleLength.java b/src/main/java/org/apache/joshua/decoder/ff/RuleLength.java
index e02b12b..59b1c20 100644
--- a/src/main/java/org/apache/joshua/decoder/ff/RuleLength.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/RuleLength.java
@@ -31,7 +31,7 @@ import org.apache.joshua.decoder.segment_file.Sentence;
* This feature computes three feature templates: a feature indicating the length of the rule's
* source side, its target side, and a feature that pairs them.
*/
-public class RuleLength extends StatelessFF {
+public abstract class RuleLength extends StatelessFF {
public RuleLength(FeatureVector weights, String[] args, JoshuaConfiguration config) {
super(weights, "RuleLength", args, config);
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ab5bb42c/src/main/java/org/apache/joshua/decoder/ff/RuleShape.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/RuleShape.java b/src/main/java/org/apache/joshua/decoder/ff/RuleShape.java
index ac5ffa4..a514021 100644
--- a/src/main/java/org/apache/joshua/decoder/ff/RuleShape.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/RuleShape.java
@@ -70,4 +70,16 @@ public class RuleShape extends StatelessFF {
return null;
}
+
+ @Override
+ public double estimateLogP(Rule rule, int sentID) {
+ // TODO Auto-generated method stub
+ return 0;
+ }
+
+ @Override
+ public double getWeight() {
+ // TODO Auto-generated method stub
+ return 0;
+ }
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ab5bb42c/src/main/java/org/apache/joshua/decoder/ff/SourcePathFF.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/SourcePathFF.java b/src/main/java/org/apache/joshua/decoder/ff/SourcePathFF.java
index 22eaa8f..d757303 100644
--- a/src/main/java/org/apache/joshua/decoder/ff/SourcePathFF.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/SourcePathFF.java
@@ -60,4 +60,16 @@ public final class SourcePathFF extends StatelessFF {
acc.add(denseFeatureIndex, sourcePath.getPathCost());
return null;
}
+
+ @Override
+ public double estimateLogP(Rule rule, int sentID) {
+ // TODO Auto-generated method stub
+ return 0;
+ }
+
+ @Override
+ public double getWeight() {
+ // TODO Auto-generated method stub
+ return 0;
+ }
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ab5bb42c/src/main/java/org/apache/joshua/decoder/ff/TargetBigram.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/TargetBigram.java b/src/main/java/org/apache/joshua/decoder/ff/TargetBigram.java
index 689df3c..5661ce7 100644
--- a/src/main/java/org/apache/joshua/decoder/ff/TargetBigram.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/TargetBigram.java
@@ -212,4 +212,16 @@ public class TargetBigram extends StatefulFF {
return sb.substring(0, sb.length() - 1);
}
+
+ @Override
+ public double estimateLogP(Rule rule, int sentID) {
+ // TODO Auto-generated method stub
+ return 0;
+ }
+
+ @Override
+ public double getWeight() {
+ // TODO Auto-generated method stub
+ return 0;
+ }
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ab5bb42c/src/main/java/org/apache/joshua/decoder/ff/WordPenalty.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/WordPenalty.java b/src/main/java/org/apache/joshua/decoder/ff/WordPenalty.java
index 0063cc4..2a40088 100644
--- a/src/main/java/org/apache/joshua/decoder/ff/WordPenalty.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/WordPenalty.java
@@ -75,4 +75,16 @@ public final class WordPenalty extends StatelessFF {
return weights.getDense(denseFeatureIndex) * OMEGA * (rule.getEnglish().length - rule.getArity());
return 0.0f;
}
+
+ @Override
+ public double estimateLogP(Rule rule, int sentID) {
+ // TODO Auto-generated method stub
+ return 0;
+ }
+
+ @Override
+ public double getWeight() {
+ // TODO Auto-generated method stub
+ return 0;
+ }
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ab5bb42c/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/FragmentLMFF.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/FragmentLMFF.java b/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/FragmentLMFF.java
index 8f474ac..e438778 100644
--- a/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/FragmentLMFF.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/FragmentLMFF.java
@@ -353,4 +353,16 @@ public class FragmentLMFF extends StatefulFF {
}
}
+ @Override
+ public double estimateLogP(Rule rule, int sentID) {
+ // TODO Auto-generated method stub
+ return 0;
+ }
+
+ @Override
+ public double getWeight() {
+ // TODO Auto-generated method stub
+ return 0;
+ }
+
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ab5bb42c/src/main/java/org/apache/joshua/decoder/ff/lm/AbstractLM.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/lm/AbstractLM.java b/src/main/java/org/apache/joshua/decoder/ff/lm/AbstractLM.java
new file mode 100644
index 0000000..79560fd
--- /dev/null
+++ b/src/main/java/org/apache/joshua/decoder/ff/lm/AbstractLM.java
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.lm;
+
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.Support;
+import org.apache.joshua.corpus.SymbolTable;
+
+
+import java.util.List;
+
+/**
+ * This class implements NGramLanguageModel by creating wrappers
+ * around the necessary functions to capture common errors. Most
+ * methods are declared final, in an attempt to limit what subclasses
+ * may be defined.
+ *
+ * @author Zhifei Li, <zh...@gmail.com>
+ * @version $LastChangedDate: 2009-12-30 10:10:38 -0600 (Wed, 30 Dec 2009) $
+ */
+public abstract class AbstractLM extends DefaultNGramLanguageModel {
+
+ public AbstractLM(int symbolTable, int order) {
+ super(symbolTable, order);
+ }
+
+
+ public final double sentenceLogProbability(
+ List<Integer> sentence, int order, int startIndex
+ ) {
+ //return super.sentenceLogProbability(sentence.stream().toArray(int[]::new) , order, startIndex);
+ return (Double) null;
+ }
+
+
+ public final float ngramLogProbability(int[] ngram) {
+ return super.ngramLogProbability(ngram);
+ }
+
+
+ public final float ngramLogProbability(int[] ngram, int order) {
+ if (ngram.length > order) {
+ throw new RuntimeException("ngram length is greather than the max order");
+ }
+ // if (ngram.length==1 && "we".equals(symbolTable.getWord(ngram[0]))) {
+ // System.err.println("Something weird is about to happen");
+ // }
+
+ int historySize = ngram.length - 1;
+ if (historySize >= order || historySize < 0) {
+ // BUG: use logger or exception. Don't zero default
+ throw new RuntimeException("Error: history size is " + historySize);
+ // return 0;
+ }
+ double probability = ngramLogProbability_helper(ngram, order);
+// if (probability < -JoshuaConfiguration.lm_ceiling_cost) {
+// probability = -JoshuaConfiguration.lm_ceiling_cost;
+// }
+ return (float) probability;
+ }
+
+ protected abstract float ngramLogProbability_helper(int[] ngram, int order);
+
+
+ /**
+ * @deprecated this function is much slower than the int[]
+ * version
+ */
+ @Deprecated
+ public final double logProbOfBackoffState(List<Integer> ngram, int order, int qtyAdditionalBackoffWeight) {
+ return logProbabilityOfBackoffState(
+ Support.subIntArray(ngram, 0, ngram.size()),
+ order, qtyAdditionalBackoffWeight);
+ }
+
+
+ public final double logProbabilityOfBackoffState(int[] ngram, int order, int qtyAdditionalBackoffWeight) {
+ if (ngram.length > order) {
+ throw new RuntimeException("ngram length is greather than the max order");
+ }
+ if (ngram[ngram.length-1] != LanguageModelFF.LM_INDEX) {
+ throw new RuntimeException("last wrd is not <bow>");
+ }
+ if (qtyAdditionalBackoffWeight > 0) {
+ return logProbabilityOfBackoffState_helper(
+ ngram, order, qtyAdditionalBackoffWeight);
+ } else {
+ return 0.0;
+ }
+ }
+
+
+ protected abstract double logProbabilityOfBackoffState_helper(
+ int[] ngram, int order, int qtyAdditionalBackoffWeight);
+
+
+ // BUG: We should have different classes based on the configuration in use
+ public int[] leftEquivalentState(int[] originalState, int order,
+ double[] cost
+ ) {
+// if (JoshuaConfiguration.use_left_equivalent_state)
+// throw new UnsupportedOperationException("getLeftEquivalentState is not overwritten by a concrete class");
+
+ return originalState;
+ }
+
+
+ // BUG: We should have different classes based on the configuration in use
+ public int[] rightEquivalentState(int[] originalState, int order) {
+// if ( !JoshuaConfiguration.use_right_equivalent_state
+// || originalState.length != this.ngramOrder-1) {
+ return originalState;
+// } else {
+// throw new UnsupportedOperationException("getRightEquivalentState is not overwritten by a concrete class");
+// }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ab5bb42c/src/main/java/org/apache/joshua/decoder/ff/lm/ArpaFile.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/lm/ArpaFile.java b/src/main/java/org/apache/joshua/decoder/ff/lm/ArpaFile.java
new file mode 100644
index 0000000..5e66afa
--- /dev/null
+++ b/src/main/java/org/apache/joshua/decoder/ff/lm/ArpaFile.java
@@ -0,0 +1,335 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.lm;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+import java.util.Scanner;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.zip.GZIPInputStream;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.util.Regex;
+import org.apache.joshua.util.io.LineReader;
+
+/**
+ * Utility class for reading ARPA language model files.
+ *
+ * @author Lane Schwartz
+ */
+public class ArpaFile implements Iterable<ArpaNgram> {
+
+ /** Logger for this class. */
+ private static final Logger logger =
+ Logger.getLogger(ArpaFile.class.getName());
+
+ /** Regular expression representing a blank line. */
+ public static final Regex BLANK_LINE = new Regex("^\\s*$");
+
+ /**
+ * Regular expression representing a line
+ * starting a new section of n-grams in an ARPA language model file.
+ */
+ public static final Regex NGRAM_HEADER = new Regex("^\\\\\\d-grams:\\s*$");
+
+ /**
+ * Regular expression representing a line
+ * ending an ARPA language model file.
+ */
+ // NOTE(review): the Java literal "\\\\s*" is the regex "\\s*" -- a literal
+ // backslash followed by zero or more 's' characters, not the whitespace
+ // class \s* used by the patterns above. The exact marker "\end\" still
+ // matches, but trailing whitespace after it will not; this looks like one
+ // missing backslash before "s*" -- verify against real ARPA files.
+ public static final Regex NGRAM_END = new Regex("^\\\\end\\\\s*$");
+
+ /** ARPA file for this object. */
+ private final File arpaFile;
+
+ /** The vocabulary associated with this object. */
+ private final Vocabulary vocab;
+
+ /**
+ * Constructs an object that represents an ARPA language model file.
+ *
+ * @param arpaFileName File name of an ARPA language model file
+ * @param vocab Symbol table to be used by this object
+ */
+ public ArpaFile(String arpaFileName, Vocabulary vocab) {
+ this.arpaFile = new File(arpaFileName);
+ this.vocab = vocab;
+ }
+
+ /**
+ * Constructs an ArpaFile with a fresh Vocabulary, scanning the file once
+ * and registering the word tokens it finds.
+ *
+ * @param arpaFileName File name of an ARPA language model file
+ * @throws IOException if the file cannot be read
+ */
+ public ArpaFile(String arpaFileName) throws IOException {
+ this.arpaFile = new File(arpaFileName);
+ this.vocab = new Vocabulary();
+
+ // final Scanner scanner = new Scanner(arpaFile);
+
+ // // Eat initial header lines
+ // while (scanner.hasNextLine()) {
+ // String line = scanner.nextLine();
+ // logger.finest("Discarding line: " + line);
+ // if (NGRAM_HEADER.matches(line)) {
+ // break;
+ // }
+ // }
+
+ // int ngramOrder = 1;
+
+ LineReader grammarReader = new LineReader(arpaFileName);
+
+ try {
+ for (String line : grammarReader) {
+
+
+ // while (scanner.hasNext()) {
+ //
+ // String line = scanner.nextLine();
+
+ String[] parts = Regex.spaces.split(line);
+ if (parts.length > 1) {
+ // NOTE(review): "parts" was already split on whitespace, so parts[1]
+ // contains no spaces and this inner split always yields exactly one
+ // token -- only the second field of each line is registered; confirm
+ // that skipping the remaining fields is intended.
+ String[] words = Regex.spaces.split(parts[1]);
+
+ for (String word : words) {
+ if (logger.isLoggable(Level.FINE)) logger.fine("Adding to vocab: " + word);
+ // NOTE(review): class-qualified (static) call -- the word goes into
+ // the shared static Vocabulary state rather than directly into the
+ // instance created above; verify that is the intended behavior.
+ Vocabulary.addAll(word);
+ }
+
+ } else {
+ logger.info(line);
+ }
+
+ }
+ } finally {
+ grammarReader.close();
+ }
+
+ //
+ // boolean lineIsHeader = NGRAM_HEADER.matches(line);
+ //
+ // while (lineIsHeader || BLANK_LINE.matches(line)) {
+ //
+ // if (lineIsHeader) {
+ // ngramOrder++;
+ // }
+ //
+ // if (scanner.hasNext()) {
+ // line = scanner.nextLine().trim();
+ // lineIsHeader = NGRAM_HEADER.matches(line);
+ // } else {
+ // logger.severe("Ran out of lines!");
+ // return;
+ // }
+ // }
+
+
+ //
+ // // Add word to vocab
+ // if (logger.isLoggable(Level.FINE)) logger.fine("Adding word to vocab: " + parts[ngramOrder]);
+ // vocab.addTerminal(parts[ngramOrder]);
+ //
+ // // Add context words to vocab
+ // for (int i=1; i<ngramOrder; i++) {
+ // if (logger.isLoggable(Level.FINE)) logger.fine("Adding context word to vocab: " + parts[i]);
+ // vocab.addTerminal(parts[i]);
+ // }
+
+ // }
+
+ logger.info("Done constructing ArpaFile");
+
+ }
+
+ /**
+ * Gets the {@link org.apache.joshua.corpus.Vocabulary}
+ * associated with this object.
+ *
+ * @return the symbol table associated with this object
+ */
+ public Vocabulary getVocab() {
+ return vocab;
+ }
+
+ /**
+ * Gets the total number of n-grams
+ * in this ARPA language model file.
+ * Note: streams and parses the entire file on every call.
+ *
+ * @return total number of n-grams
+ * in this ARPA language model file
+ */
+ @SuppressWarnings("unused")
+ public int size() {
+
+ logger.fine("Counting n-grams in ARPA file");
+ int count=0;
+
+ // The loop variable is intentionally unused (hence @SuppressWarnings
+ // above); only the number of iterations matters.
+ for (ArpaNgram ngram : this) {
+ count++;
+ }
+ logger.fine("Done counting n-grams in ARPA file");
+
+ return count;
+ }
+
+ /**
+ * Reads the file header and returns the N of the last "ngram N=count"
+ * line encountered before the first n-gram section (presumably the
+ * highest order, assuming the header lists orders in increasing
+ * sequence -- TODO confirm).
+ *
+ * @return the declared model order, or 0 if no header line matched
+ * @throws FileNotFoundException if the ARPA file does not exist
+ */
+ public int getOrder() throws FileNotFoundException {
+
+ Pattern pattern = Pattern.compile("^ngram (\\d+)=\\d+$");
+ if (logger.isLoggable(Level.FINEST)) logger.finest("Pattern is " + pattern.toString());
+ // NOTE(review): this Scanner is never closed (the warning is suppressed),
+ // so the file handle is held until garbage collection.
+ @SuppressWarnings("resource")
+ final Scanner scanner = new Scanner(arpaFile);
+
+ int order = 0;
+
+ // Eat initial header lines
+ while (scanner.hasNextLine()) {
+ String line = scanner.nextLine();
+
+ if (NGRAM_HEADER.matches(line)) {
+ break;
+ } else {
+ Matcher matcher = pattern.matcher(line);
+ if (matcher.matches()) {
+ if (logger.isLoggable(Level.FINEST)) logger.finest("DOES match: \'" + line + "\'");
+ order = Integer.valueOf(matcher.group(1));
+ } else if (logger.isLoggable(Level.FINEST)) {
+ logger.finest("Doesn't match: \'" + line + "\'");
+ }
+ }
+ }
+
+ return order;
+ }
+
+ /**
+ * Gets an iterator capable of iterating
+ * over all n-grams in the ARPA file.
+ * Transparently decompresses files whose names end in "gz".
+ *
+ * @return an iterator capable of iterating
+ * over all n-grams in the ARPA file
+ */
+ @SuppressWarnings("resource")
+ public Iterator<ArpaNgram> iterator() {
+
+ try {
+ final Scanner scanner;
+
+ if (arpaFile.getName().endsWith("gz")) {
+ InputStream in = new GZIPInputStream(
+ new FileInputStream(arpaFile));
+ scanner = new Scanner(in);
+ } else {
+ scanner = new Scanner(arpaFile);
+ }
+
+ // Eat initial header lines
+ while (scanner.hasNextLine()) {
+ String line = scanner.nextLine();
+ logger.finest("Discarding line: " + line);
+ if (NGRAM_HEADER.matches(line)) {
+ break;
+ }
+ }
+
+ return new Iterator<ArpaNgram>() {
+
+ // Line buffered by hasNext() for the following next() call.
+ String nextLine = null;
+ // Current section's n-gram order; bumped each time a section
+ // header (or the end marker) is crossed.
+ int ngramOrder = 1;
+ // int id = 0;
+
+ // NOTE(review): hasNext() advances the underlying scanner and caches
+ // the line in nextLine -- it is NOT idempotent. Calling hasNext()
+ // twice without an intervening next() silently skips an n-gram.
+ public boolean hasNext() {
+
+ if (scanner.hasNext()) {
+
+ String line = scanner.nextLine();
+
+ boolean lineIsHeader = NGRAM_HEADER.matches(line) || NGRAM_END.matches(line);
+
+ while (lineIsHeader || BLANK_LINE.matches(line)) {
+
+ if (lineIsHeader) {
+ ngramOrder++;
+ }
+
+ if (scanner.hasNext()) {
+ line = scanner.nextLine().trim();
+ lineIsHeader = NGRAM_HEADER.matches(line) || NGRAM_END.matches(line);
+ } else {
+ nextLine = null;
+ return false;
+ }
+ }
+
+ nextLine = line;
+ return true;
+
+ } else {
+ nextLine = null;
+ return false;
+ }
+
+ }
+
+ // Parses the buffered line as "logprob w1 ... wN [backoff]" where the
+ // predicted word is the last word of the n-gram (index ngramOrder).
+ public ArpaNgram next() {
+ if (nextLine!=null) {
+
+ String[] parts = Regex.spaces.split(nextLine);
+
+ float value = Float.valueOf(parts[0]);
+
+ int word = Vocabulary.id(parts[ngramOrder]);
+
+ int[] context = new int[ngramOrder-1];
+ for (int i=1; i<ngramOrder; i++) {
+ context[i-1] = Vocabulary.id(parts[i]);
+ }
+
+ float backoff;
+ if (parts.length > ngramOrder+1) {
+ backoff = Float.valueOf(parts[parts.length-1]);
+ } else {
+ backoff = ArpaNgram.DEFAULT_BACKOFF;
+ }
+
+ nextLine = null;
+ return new ArpaNgram(word, context, value, backoff);
+
+ } else {
+ throw new NoSuchElementException();
+ }
+ }
+
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+
+ };
+ } catch (FileNotFoundException e) {
+ logger.severe(e.toString());
+ // NOTE(review): returning null from iterator() causes a
+ // NullPointerException in for-each callers; consider an empty
+ // iterator or rethrowing as unchecked.
+ return null;
+ } catch (IOException e) {
+ logger.severe(e.toString());
+ return null;
+ }
+
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ab5bb42c/src/main/java/org/apache/joshua/decoder/ff/lm/ArpaNgram.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/lm/ArpaNgram.java b/src/main/java/org/apache/joshua/decoder/ff/lm/ArpaNgram.java
new file mode 100644
index 0000000..d0077d1
--- /dev/null
+++ b/src/main/java/org/apache/joshua/decoder/ff/lm/ArpaNgram.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.lm;
+
+/**
+ * Represents a single n-gram line
+ * from an ARPA language model file.
+ *
+ * @author Lane Schwartz
+ */
+public class ArpaNgram {
+
+
+ /** Indicates an invalid probability value. */
+ public static final float INVALID_VALUE = Float.NaN;
+
+ /** Default backoff value. */
+ public static final float DEFAULT_BACKOFF = 0.0f;
+
+ // Immutable record of one ARPA line: predicted word, its conditioning
+ // context, the probability value, and the backoff weight.
+ private final int word;
+ private final int[] context;
+ private final float value;
+ private final float backoff;
+ // private final int id;
+
+ /**
+ * Constructs an n-gram record.
+ *
+ * @param word symbol id of the predicted (final) word
+ * @param context symbol ids of the conditioning context words
+ * @param value probability value read from the ARPA line
+ * (in ARPA files this is a log probability)
+ * @param backoff backoff weight, or DEFAULT_BACKOFF if absent
+ */
+ public ArpaNgram(int word, int[] context, float value, float backoff) {
+ this.word = word;
+ this.context = context;
+ this.value = value;
+ this.backoff = backoff;
+ // this.id = id;
+ }
+
+ // public int getID() {
+ // return id;
+ // }
+
+ /** Returns the order of this n-gram (context length plus one). */
+ public int order() {
+ return context.length + 1;
+ }
+
+ /** Returns the symbol id of the predicted word. */
+ public int getWord() {
+ return word;
+ }
+
+ /** Returns the context word ids (the internal array, not a copy). */
+ public int[] getContext() {
+ return context;
+ }
+
+ /** Returns the probability value for this n-gram. */
+ public float getValue() {
+ return value;
+ }
+
+ /** Returns the backoff weight for this n-gram. */
+ public float getBackoff() {
+ return backoff;
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ab5bb42c/src/main/java/org/apache/joshua/decoder/ff/lm/LanguageModelFF.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/lm/LanguageModelFF.java b/src/main/java/org/apache/joshua/decoder/ff/lm/LanguageModelFF.java
index d69d552..f2daffd 100644
--- a/src/main/java/org/apache/joshua/decoder/ff/lm/LanguageModelFF.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/lm/LanguageModelFF.java
@@ -517,4 +517,16 @@ public class LanguageModelFF extends StatefulFF {
public static void resetLmIndex() {
LM_INDEX = 0;
}
+
+ // NOTE(review): auto-generated stub -- always returns 0. Confirm whether a
+ // real estimate is required here before this lands.
+ @Override
+ public double estimateLogP(Rule rule, int sentID) {
+ // TODO Auto-generated method stub
+ return 0;
+ }
+
+ // NOTE(review): auto-generated stub -- always returns 0; verify intent.
+ @Override
+ public double getWeight() {
+ // TODO Auto-generated method stub
+ return 0;
+ }
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ab5bb42c/src/main/java/org/apache/joshua/decoder/ff/lm/buildin_lm/TrieLM.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/lm/buildin_lm/TrieLM.java b/src/main/java/org/apache/joshua/decoder/ff/lm/buildin_lm/TrieLM.java
new file mode 100644
index 0000000..654561c
--- /dev/null
+++ b/src/main/java/org/apache/joshua/decoder/ff/lm/buildin_lm/TrieLM.java
@@ -0,0 +1,332 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.lm.buildin_lm;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.Map;
+import java.util.Scanner;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.apache.joshua.corpus.SymbolTable;
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.ff.lm.AbstractLM;
+import org.apache.joshua.decoder.ff.lm.ArpaFile;
+import org.apache.joshua.decoder.ff.lm.ArpaNgram;
+import org.apache.joshua.util.Bits;
+import org.apache.joshua.util.Regex;
+
+/**
+ * Relatively memory-compact language model
+ * stored as a reversed-word-order trie.
+ * <p>
+ * The trie itself represents language model context.
+ * <p>
+ * Conceptually, each node in the trie stores a map
+ * from conditioning word to log probability.
+ * <p>
+ * Additionally, each node in the trie stores
+ * the backoff weight for that context.
+ *
+ * @author Lane Schwartz
+ * @see <a href="http://www.speech.sri.com/projects/srilm/manpages/ngram-discount.7.html">SRILM ngram-discount documentation</a>
+ */
+public class TrieLM extends AbstractLM { //DefaultNGramLanguageModel {
+
+ /** Logger for this class. */
+ private static Logger logger =
+ Logger.getLogger(TrieLM.class.getName());
+
+ /**
+ * Node ID for the root node.
+ */
+ private static final int ROOT_NODE_ID = 0;
+
+
+ /**
+ * Maps from (node id, word id for child) --> node id of child.
+ * The (node id, word id) pair is packed into one long key via
+ * Bits.encodeAsLong.
+ */
+ private final Map<Long,Integer> children;
+
+ /**
+ * Maps from (node id, word id for lookup word) -->
+ * log prob of lookup word given context
+ *
+ * (the context is defined by where you are in the tree).
+ */
+ private final Map<Long,Float> logProbs;
+
+ /**
+ * Maps from (node id) -->
+ * backoff weight for that context
+ *
+ * (the context is defined by where you are in the tree).
+ */
+ private final Map<Integer,Float> backoffs;
+
+ /** Convenience constructor: reads the ARPA file named by {@code file}. */
+ public TrieLM(Vocabulary vocab, String file) throws FileNotFoundException {
+ this(new ArpaFile(file,vocab));
+ }
+
+ /**
+ * Constructs a language model object from the specified ARPA file.
+ *
+ * @param arpaFile
+ * @throws FileNotFoundException
+ */
+ public TrieLM(ArpaFile arpaFile) throws FileNotFoundException {
+ super(arpaFile.getVocab().size(), arpaFile.getOrder());
+
+ // NOTE(review): size() streams the whole ARPA file once just to presize
+ // the maps; the loop below then reads the file a second time.
+ int ngramCounts = arpaFile.size();
+ if (logger.isLoggable(Level.FINE)) logger.fine("ARPA file contains " + ngramCounts + " n-grams");
+
+ this.children = new HashMap<Long,Integer>(ngramCounts);
+ this.logProbs = new HashMap<Long,Float>(ngramCounts);
+ this.backoffs = new HashMap<Integer,Float>(ngramCounts);
+
+ // Next fresh node id; incremented whenever a new trie node is created.
+ int nodeCounter = 0;
+
+ int lineNumber = 0;
+ for (ArpaNgram ngram : arpaFile) {
+ lineNumber += 1;
+ if (lineNumber%100000==0) logger.info("Line: " + lineNumber);
+
+ if (logger.isLoggable(Level.FINEST)) logger.finest(ngram.order() + "-gram: (" + ngram.getWord() + " | " + Arrays.toString(ngram.getContext()) + ")");
+ int word = ngram.getWord();
+
+ int[] context = ngram.getContext();
+
+ {
+ // Find where the log prob should be stored
+ // Walk (creating as needed) the reversed-context path from the root.
+ int contextNodeID = ROOT_NODE_ID;
+ {
+ for (int i=context.length-1; i>=0; i--) {
+ long key = Bits.encodeAsLong(contextNodeID, context[i]);
+ int childID;
+ if (children.containsKey(key)) {
+ childID = children.get(key);
+ } else {
+ childID = ++nodeCounter;
+ if (logger.isLoggable(Level.FINEST)) logger.finest("children.put(" + contextNodeID + ":"+context[i] + " , " + childID + ")");
+ children.put(key, childID);
+ }
+ contextNodeID = childID;
+ }
+ }
+
+ // Store the log prob for this n-gram at this node in the trie
+ {
+ long key = Bits.encodeAsLong(contextNodeID, word);
+ float logProb = ngram.getValue();
+ if (logger.isLoggable(Level.FINEST)) logger.finest("logProbs.put(" + contextNodeID + ":"+word + " , " + logProb);
+ this.logProbs.put(key, logProb);
+ }
+ }
+
+ {
+ // Find where the backoff should be stored
+ // The backoff node's path is the full n-gram reversed: first the
+ // predicted word, then the context words from most- to least-recent.
+ int backoffNodeID = ROOT_NODE_ID;
+ {
+ long backoffNodeKey = Bits.encodeAsLong(backoffNodeID, word);
+ int wordChildID;
+ if (children.containsKey(backoffNodeKey)) {
+ wordChildID = children.get(backoffNodeKey);
+ } else {
+ wordChildID = ++nodeCounter;
+ if (logger.isLoggable(Level.FINEST)) logger.finest("children.put(" + backoffNodeID + ":"+word + " , " + wordChildID + ")");
+ children.put(backoffNodeKey, wordChildID);
+ }
+ backoffNodeID = wordChildID;
+
+ for (int i=context.length-1; i>=0; i--) {
+ long key = Bits.encodeAsLong(backoffNodeID, context[i]);
+ int childID;
+ if (children.containsKey(key)) {
+ childID = children.get(key);
+ } else {
+ childID = ++nodeCounter;
+ if (logger.isLoggable(Level.FINEST)) logger.finest("children.put(" + backoffNodeID + ":"+context[i] + " , " + childID + ")");
+ children.put(key, childID);
+ }
+ backoffNodeID = childID;
+ }
+ }
+
+ // Store the backoff for this n-gram at this node in the trie
+ {
+ float backoff = ngram.getBackoff();
+ if (logger.isLoggable(Level.FINEST)) logger.finest("backoffs.put(" + backoffNodeID + ":" +word+" , " + backoff + ")");
+ this.backoffs.put(backoffNodeID, backoff);
+ }
+ }
+
+ }
+ }
+
+
+ /** TrieLM does not support backoff-state queries; always throws. */
+ @Override
+ protected double logProbabilityOfBackoffState_helper(
+ int[] ngram, int order, int qtyAdditionalBackoffWeight
+ ) {
+ throw new UnsupportedOperationException("probabilityOfBackoffState_helper undefined for TrieLM");
+ }
+
+ @Override
+ protected float ngramLogProbability_helper(int[] ngram, int order) {
+
+// float logProb = (float) -JoshuaConfiguration.lm_ceiling_cost;//Float.NEGATIVE_INFINITY; // log(0.0f)
+ float backoff = 0.0f; // log(1.0f)
+
+ // Walk the trie from the most-recent context word backwards, matching
+ // the longest stored suffix of the query n-gram.
+ int i = ngram.length - 1;
+ int word = ngram[i];
+ i -= 1;
+
+ int nodeID = ROOT_NODE_ID;
+
+ while (true) {
+
+ {
+ long key = Bits.encodeAsLong(nodeID, word);
+ if (logProbs.containsKey(key)) {
+// logProb = logProbs.get(key);
+ backoff = 0.0f; // log(1.0f) -- reset accumulated backoff on a hit
+ }
+ }
+
+ if (i < 0) {
+ break;
+ }
+
+ {
+ long key = Bits.encodeAsLong(nodeID, ngram[i]);
+
+ if (children.containsKey(key)) {
+ nodeID = children.get(key);
+
+ // NOTE(review): assumes every reachable node has a stored backoff;
+ // a missing entry would throw NullPointerException on unboxing --
+ // verify against the constructor's insertion pattern.
+ backoff += backoffs.get(nodeID);
+
+ i -= 1;
+
+ } else {
+ break;
+ }
+ }
+
+ }
+
+// double result = logProb + backoff;
+// if (result < -JoshuaConfiguration.lm_ceiling_cost) {
+// result = -JoshuaConfiguration.lm_ceiling_cost;
+// }
+//
+// return result;
+ // NOTE(review): unboxing (Float) null to the primitive float return type
+ // ALWAYS throws NullPointerException -- as written this method can never
+ // return normally. The intended computation is in the commented-out lines
+ // above; this must be restored before the method is usable.
+ return (Float) null;
+ }
+
+ /** Exposes the child-pointer map (primarily for tests/inspection). */
+ public Map<Long,Integer> getChildren() {
+ return this.children;
+ }
+
+ /**
+ * Command-line driver: args[0] = ARPA file, args[1] = text file to score,
+ * args[2] = n-gram order. Scores each sentence in the text file.
+ */
+ public static void main(String[] args) throws IOException {
+
+ logger.info("Constructing ARPA file");
+ ArpaFile arpaFile = new ArpaFile(args[0]);
+
+ logger.info("Getting symbol table");
+ Vocabulary vocab = arpaFile.getVocab();
+
+ logger.info("Constructing TrieLM");
+ TrieLM lm = new TrieLM(arpaFile);
+
+ int n = Integer.valueOf(args[2]);
+ logger.info("N-gram order will be " + n);
+
+ // NOTE(review): this Scanner is never closed.
+ Scanner scanner = new Scanner(new File(args[1]));
+
+ LinkedList<String> wordList = new LinkedList<String>();
+ LinkedList<String> window = new LinkedList<String>();
+
+ logger.info("Starting to scan " + args[1]);
+ while (scanner.hasNext()) {
+
+ logger.info("Getting next line...");
+ String line = scanner.nextLine();
+ logger.info("Line: " + line);
+
+ String[] words = Regex.spaces.split(line);
+ wordList.clear();
+
+ // Wrap the sentence in boundary markers before scoring.
+ wordList.add("<s>");
+ for (String word : words) {
+ wordList.add(word);
+ }
+ wordList.add("</s>");
+
+ ArrayList<Integer> sentence = new ArrayList<Integer>();
+ // int[] ids = new int[wordList.size()];
+ for (int i=0, size=wordList.size(); i<size; i++) {
+ sentence.add(vocab.id(wordList.get(i)));
+ // ids[i] = ;
+ }
+
+
+
+ // Slide an n-word window over the sentence, logging each window's score.
+ while (! wordList.isEmpty()) {
+ window.clear();
+
+ {
+ int i=0;
+ for (String word : wordList) {
+ if (i>=n) break;
+ window.add(word);
+ i++;
+ }
+ wordList.remove();
+ }
+
+ {
+ int i=0;
+ int[] wordIDs = new int[window.size()];
+ for (String word : window) {
+ wordIDs[i] = vocab.id(word);
+ i++;
+ }
+
+ logger.info("logProb " + window.toString() + " = " + lm.ngramLogProbability(wordIDs, n));
+ }
+ }
+
+ double logProb = lm.sentenceLogProbability(sentence, n, 2);//.ngramLogProbability(ids, n);
+ double prob = Math.exp(logProb);
+
+ logger.info("Total logProb = " + logProb);
+ logger.info("Total prob = " + prob);
+ }
+
+ }
+
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ab5bb42c/src/main/java/org/apache/joshua/decoder/ff/lm/buildin_lm/package-info.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/lm/buildin_lm/package-info.java b/src/main/java/org/apache/joshua/decoder/ff/lm/buildin_lm/package-info.java
new file mode 100644
index 0000000..6c84703
--- /dev/null
+++ b/src/main/java/org/apache/joshua/decoder/ff/lm/buildin_lm/package-info.java
@@ -0,0 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.lm.buildin_lm;
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ab5bb42c/src/main/java/org/apache/joshua/decoder/ff/phrase/Distortion.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/phrase/Distortion.java b/src/main/java/org/apache/joshua/decoder/ff/phrase/Distortion.java
index cf0af8b..c9a3214 100644
--- a/src/main/java/org/apache/joshua/decoder/ff/phrase/Distortion.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/phrase/Distortion.java
@@ -68,4 +68,16 @@ public class Distortion extends StatelessFF {
return null;
}
+
+ // NOTE(review): auto-generated stub -- always returns 0. Confirm whether a
+ // real estimate is required here before this lands.
+ @Override
+ public double estimateLogP(Rule rule, int sentID) {
+ // TODO Auto-generated method stub
+ return 0;
+ }
+
+ // NOTE(review): auto-generated stub -- always returns 0; verify intent.
+ @Override
+ public double getWeight() {
+ // TODO Auto-generated method stub
+ return 0;
+ }
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ab5bb42c/src/main/java/org/apache/joshua/decoder/ff/similarity/EdgePhraseSimilarityFF.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/similarity/EdgePhraseSimilarityFF.java b/src/main/java/org/apache/joshua/decoder/ff/similarity/EdgePhraseSimilarityFF.java
index 41cac0d..6ac6b42 100644
--- a/src/main/java/org/apache/joshua/decoder/ff/similarity/EdgePhraseSimilarityFF.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/similarity/EdgePhraseSimilarityFF.java
@@ -274,4 +274,16 @@ public class EdgePhraseSimilarityFF extends StatefulFF implements SourceDependen
return (count == 0 ? 0 : similarity / count);
}
+ // NOTE(review): auto-generated stub -- always returns 0. Confirm whether a
+ // real estimate is required here before this lands.
+ @Override
+ public double estimateLogP(Rule rule, int sentID) {
+ // TODO Auto-generated method stub
+ return 0;
+ }
+
+ // NOTE(review): auto-generated stub -- always returns 0; verify intent.
+ @Override
+ public double getWeight() {
+ // TODO Auto-generated method stub
+ return 0;
+ }
+
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ab5bb42c/src/main/java/org/apache/joshua/decoder/ff/tm/BilingualRule.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/tm/BilingualRule.java b/src/main/java/org/apache/joshua/decoder/ff/tm/BilingualRule.java
new file mode 100644
index 0000000..6e35e2d
--- /dev/null
+++ b/src/main/java/org/apache/joshua/decoder/ff/tm/BilingualRule.java
@@ -0,0 +1,167 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.tm;
+
+import java.util.Arrays;
+import java.util.Map;
+
+import org.apache.joshua.corpus.SymbolTable;
+
+
+/**
+ * Normally, the feature score in the rule should be *cost* (i.e.,
+ * -LogP), so that the feature weight should be positive
+ *
+ * @author Zhifei Li, <zh...@gmail.com>
+ * @version $LastChangedDate: 2010-01-20 19:46:54 -0600 (Wed, 20 Jan 2010) $
+ */
+public class BilingualRule extends MonolingualRule {
+
+ // Target-language (English) right-hand side as symbol ids.
+ private int[] english;
+
+ //===============================================================
+ // Constructors
+ //===============================================================
+
+ /**
+ * Constructs a new rule using the provided parameters. The
+ * owner and rule id for this rule are undefined.
+ *
+ * @param lhs Left-hand side of the rule.
+ * @param sourceRhs Source language right-hand side of the rule.
+ * @param targetRhs Target language right-hand side of the rule.
+ * @param featureScores Feature value scores for the rule.
+ * @param arity Number of nonterminals in the source language
+ * right-hand side.
+ * @param owner
+ * @param latticeCost
+ * @param ruleID
+ */
+ public BilingualRule(int lhs, int[] sourceRhs, int[] targetRhs, float[] featureScores, int arity, int owner, float latticeCost, int ruleID) {
+ super(lhs, sourceRhs, featureScores, arity, owner, latticeCost, ruleID);
+ this.english = targetRhs;
+ }
+
+ //called by class who does not care about lattice_cost, rule_id, and owner
+ public BilingualRule(int lhs, int[] sourceRhs, int[] targetRhs, float[] featureScores, int arity) {
+ super(lhs, sourceRhs, featureScores, arity);
+ this.english = targetRhs;
+ }
+
+
+ //===============================================================
+ // Attributes
+ //===============================================================
+
+ /** Replaces the target-language side of this rule. */
+ public final void setEnglish(int[] eng) {
+ this.english = eng;
+ }
+
+ /** Returns the target-language side (the internal array, not a copy). */
+ public final int[] getEnglish() {
+ return this.english;
+ }
+
+
+ //===============================================================
+ // Serialization Methods
+ //===============================================================
+ // TODO: remove these methods
+
+ // Caching this method significantly improves performance
+ // We mark it transient because it is, though cf java.io.Serializable
+ // NOTE(review): this cache is shared by all three toString variants below;
+ // whichever variant is called first fixes the cached text, so a later call
+ // with a different symbol table silently returns the first rendering.
+ private transient String cachedToString = null;
+
+ /** Renders the rule using the given nonterminal and word symbol tables. */
+ public String toString(Map<Integer,String> ntVocab, SymbolTable sourceVocab, SymbolTable targetVocab) {
+ if (null == this.cachedToString) {
+ StringBuffer sb = new StringBuffer("[");
+ sb.append(ntVocab.get(this.getLHS()));
+ sb.append("] ||| ");
+ sb.append(sourceVocab.getWords(this.getFrench(),true));
+ sb.append(" ||| ");
+ sb.append(targetVocab.getWords(this.english,false));
+ //sb.append(java.util.Arrays.toString(this.english));
+ sb.append(" |||");
+ for (int i = 0; i < this.getFeatureScores().length; i++) {
+ // sb.append(String.format(" %.12f", this.getFeatureScores()[i]));
+ sb.append(' ');
+ sb.append(Float.toString(this.getFeatureScores()[i]));
+ }
+ this.cachedToString = sb.toString();
+ }
+ return this.cachedToString;
+ }
+
+
+ //print the rule in terms of Integers
+ public String toString() {
+ if (null == this.cachedToString) {
+ StringBuffer sb = new StringBuffer();
+ sb.append(this.getClass().getName() + "@" + Integer.toHexString(System.identityHashCode(this)));
+ sb.append("~~~");
+ sb.append(this.getLHS());
+ sb.append(" ||| ");
+ sb.append(Arrays.toString(this.getFrench()));
+ sb.append(" ||| ");
+ sb.append(Arrays.toString(this.english));
+ sb.append(" |||");
+ for (int i = 0; i < this.getFeatureScores().length; i++) {
+ sb.append(String.format(" %.4f", this.getFeatureScores()[i]));
+ }
+ this.cachedToString = sb.toString();
+ }
+ return this.cachedToString;
+ }
+
+
+ /** Renders the rule using a single symbol table for all symbols. */
+ public String toString(SymbolTable symbolTable) {
+ if (null == this.cachedToString) {
+ StringBuffer sb = new StringBuffer();
+ sb.append(symbolTable.getWord(this.getLHS()));
+ sb.append(" ||| ");
+ sb.append(symbolTable.getWords(this.getFrench()));
+ sb.append(" ||| ");
+ sb.append(symbolTable.getWords(this.english));
+ sb.append(" |||");
+ for (int i = 0; i < this.getFeatureScores().length; i++) {
+ sb.append(String.format(" %.4f", this.getFeatureScores()[i]));
+ }
+ this.cachedToString = sb.toString();
+ }
+ return this.cachedToString;
+ }
+
+ /**
+ * Renders LHS and both right-hand sides without feature scores; falls back
+ * to raw integer ids when symbolTable is null. Uses convertToString,
+ * presumably inherited from MonolingualRule -- not cached.
+ */
+ public String toStringWithoutFeatScores(SymbolTable symbolTable) {
+ StringBuffer sb = new StringBuffer();
+ if(symbolTable==null)
+ sb.append(this.getLHS());
+ else
+ sb.append(symbolTable.getWord(this.getLHS()));
+
+ return sb.append(" ||| ")
+ .append(convertToString(this.getFrench(), symbolTable))
+ .append(" ||| ")
+ .append(convertToString(this.getEnglish(), symbolTable))
+ .toString();
+ }
+
+
+
+
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ab5bb42c/src/main/java/org/apache/joshua/decoder/ff/tm/MonolingualRule.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/tm/MonolingualRule.java b/src/main/java/org/apache/joshua/decoder/ff/tm/MonolingualRule.java
new file mode 100644
index 0000000..812e669
--- /dev/null
+++ b/src/main/java/org/apache/joshua/decoder/ff/tm/MonolingualRule.java
@@ -0,0 +1,315 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.tm;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.logging.Logger;
+
+import org.apache.joshua.corpus.SymbolTable;
+import org.apache.joshua.decoder.ff.FeatureFunction;
+
+/**
+ * this class implements MonolingualRule
+ *
+ * @author Zhifei Li, <zh...@gmail.com>
+ * @version $LastChangedDate: 2010-02-10 09:59:38 -0600 (Wed, 10 Feb 2010) $
+ */
+public class MonolingualRule extends Rule {
+
+ private static final Logger logger =
+ Logger.getLogger(MonolingualRule.class.getName());
+
+ //===============================================================
+ // Instance Fields
+ //===============================================================
+
+ /* The string format of Rule is:
+ * [Phrase] ||| french ||| english ||| feature scores
+ */
+ private int ruleID;
+ private int lhs; // tag of this rule
+ private int[] pFrench; //pointer to the RuleCollection, as all the rules under it share the same Source side
+ private int arity;
+ private float[] featScores; // the feature scores for this rule
+
+ /* a feature function will be fired for this rule
+ * only if the owner of the rule matches the owner of the feature function
+ */
+ private int owner;
+
+ // TODO: consider remove this from the general class, and
+ // create a new specific Rule class
+ private float latticeCost;
+
+ /**
+ * estimate_cost depends on rule itself: statelesscost +
+ * transition_cost(non-stateless/non-contexual* models),
+ * we need this variable in order to provide sorting for
+ * cube-pruning
+ */
+ private float est_cost = 0;
+
+ //===============================================================
+ // Static Fields
+ //===============================================================
+
+ // TODO: Ideally, we shouldn't have to have dummy rule IDs
+ // and dummy owners. How can this need be eliminated?
+ public static final int DUMMY_RULE_ID = 1;
+ public static final int DUMMY_OWNER = 1;
+
+
+ //===============================================================
+ // Constructors
+ //===============================================================
+
+ /**
+ * Constructs a new rule using the provided parameters. The
+ * owner and rule id for this rule are undefined.
+ *
+ * @param lhs Left-hand side of the rule.
+ * @param sourceRhs Source language right-hand side of the rule.
+ * @param featureScores Feature value scores for the rule.
+ * @param arity Number of nonterminals in the source language
+ * right-hand side.
+ * @param owner
+ * @param latticeCost
+ * @param ruleID
+ */
+ public MonolingualRule(int lhs, int[] sourceRhs, float[] featureScores, int arity, int owner, float latticeCost, int ruleID) {
+ this.lhs = lhs;
+ this.pFrench = sourceRhs;
+ this.featScores = featureScores;
+ this.arity = arity;
+ this.latticeCost = latticeCost;
+ this.ruleID = ruleID;
+ this.owner = owner;
+ }
+
+
+ // called by class who does not care about lattice_cost,
+ // rule_id, and owner
+ public MonolingualRule(int lhs_, int[] source_rhs, float[] feature_scores, int arity_) {
+ this.lhs = lhs_;
+ this.pFrench = source_rhs;
+ this.featScores = feature_scores;
+ this.arity = arity_;
+
+ //==== dummy values
+ this.latticeCost = 0;
+ this.ruleID = DUMMY_RULE_ID;
+ this.owner = DUMMY_OWNER;
+ }
+
+
+ //===============================================================
+ // Attributes
+ //===============================================================
+
+ public final void setRuleID(int id) { this.ruleID = id; }
+
+ public final int getRuleID() { return this.ruleID; }
+
+
+ public final void setArity(int arity) { this.arity = arity; }
+
+ public final int getArity() { return this.arity; }
+
+
+ public final void setOwner(int owner) { this.owner = owner; }
+
+ public final int getOwner() { return this.owner; }
+
+
+ public final void setLHS(int lhs) { this.lhs = lhs; }
+
+ public final int getLHS() { return this.lhs; }
+
+
+ public void setEnglish(int[] eng) {
+ //TODO: do nothing
+ }
+
+ public int[] getEnglish() {
+ //TODO
+ return null;
+ }
+
+
+ public final void setFrench(int[] french) { this.pFrench = french; }
+
+ public final int[] getFrench() { return this.pFrench; }
+
+
+ public final void setFeatureScores(float[] scores) {
+ this.featScores = scores;
+ }
+
+ public final float[] getFeatureScores() {
+ return this.featScores;
+ }
+
+
+ public final void setLatticeCost(float cost) { this.latticeCost = cost; }
+
+ public final float getLatticeCost() { return this.latticeCost; }
+
+
+ public final float getEstCost() {
+ if (est_cost <= Double.NEGATIVE_INFINITY) {
+ logger.warning("The est cost is neg infinity; must be bad rule; rule is:\n" + toString());
+ }
+ return est_cost;
+ }
+
+
+ /**
+ * Set a lower-bound estimate inside the rule returns full
+ * estimate.
+ */
+ public final float estimateRuleCost(List<FeatureFunction> featureFunctions) {
+ if (null == featureFunctions) {
+ return 0;
+ } else {
+ float estcost = 0.0f;
+ for (FeatureFunction ff : featureFunctions) {
+ double mdcost = - ff.estimateLogP(this, -1) * ff.getWeight();
+ estcost += mdcost;
+ }
+
+ this.est_cost = estcost;
+ return estcost;
+ }
+ }
+
+ //===============================================================
+ // Methods
+ //===============================================================
+
+ public float incrementFeatureScore(int column, double score) {
+ synchronized(this) {
+ featScores[column] += score;
+ return featScores[column];
+ }
+ }
+
+
+ public void setFeatureCost(int column, float score) {
+ synchronized(this) {
+ featScores[column] = score;
+ }
+ }
+
+
+ public float getFeatureCost(int column) {
+ synchronized(this) {
+ return featScores[column];
+ }
+ }
+
+ //===============================================================
+ // Serialization Methods
+ //===============================================================
+ // BUG: These are all far too redundant. Should be refactored to share.
+
+ // Caching this method significantly improves performance
+ // We mark it transient because it is, though cf
+ // java.io.Serializable
+ private transient String cachedToString = null;
+
+ @Deprecated
+ public String toString(Map<Integer,String> ntVocab, SymbolTable sourceVocab, SymbolTable targetVocab) {
+ if (null == this.cachedToString) {
+ StringBuffer sb = new StringBuffer();
+ sb.append(ntVocab.get(this.lhs));
+ sb.append(" ||| ");
+ sb.append(sourceVocab.getWords(this.pFrench,true));
+ sb.append(" |||");
+ for (int i = 0; i < this.featScores.length; i++) {
+ //sb.append(String.format(" %.4f", this.feat_scores[i]));
+ sb.append(' ').append(Float.toString(this.featScores[i]));
+ }
+ this.cachedToString = sb.toString();
+ }
+ return this.cachedToString;
+ }
+
+
+ //print the rule in terms of Ingeters
+ @Deprecated
+ public String toString() {
+ if (null == this.cachedToString) {
+ StringBuffer sb = new StringBuffer();
+ sb.append(this.lhs);
+ sb.append(" ||| ");
+ sb.append(Arrays.toString(this.pFrench));
+ sb.append(" |||");
+ for (int i = 0; i < this.featScores.length; i++) {
+ sb.append(String.format(" %.4f", this.featScores[i]));
+ }
+ this.cachedToString = sb.toString();
+ }
+ return this.cachedToString;
+ }
+
+
+ //do not use cachedToString
+ @Deprecated
+ public String toString(SymbolTable symbolTable) {
+ StringBuffer sb = new StringBuffer();
+ sb.append(symbolTable.getWord(this.lhs));
+ sb.append(" ||| ");
+ sb.append(symbolTable.getWords(this.pFrench));
+ sb.append(" |||");
+ for (int i = 0; i < this.featScores.length; i++) {
+ sb.append(String.format(" %.4f", this.featScores[i]));
+ }
+ return sb.toString();
+ }
+
+
+ @Deprecated
+ public String toStringWithoutFeatScores(SymbolTable symbolTable) {
+ StringBuffer sb = new StringBuffer();
+ if(symbolTable==null)
+ sb.append(this.getLHS());
+ else
+ sb.append(symbolTable.getWord(this.getLHS()));
+
+ return sb.append(" ||| ")
+ .append(convertToString(this.getFrench(), symbolTable))
+ .toString();
+ }
+
+ public String convertToString(int[] words, SymbolTable symbolTable){
+ StringBuffer sb = new StringBuffer();
+ for (int i = 0; i < words.length; i++) {
+ if(symbolTable!=null)
+ sb.append( symbolTable.getWord(words[i]) );
+ else
+ sb.append(words[i]);
+
+ if(i<words.length-1)
+ sb.append(" ");
+ }
+ return sb.toString();
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ab5bb42c/src/main/java/org/apache/joshua/lattice/Lattice.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/lattice/Lattice.java b/src/main/java/org/apache/joshua/lattice/Lattice.java
index 98938d8..1adefa8 100644
--- a/src/main/java/org/apache/joshua/lattice/Lattice.java
+++ b/src/main/java/org/apache/joshua/lattice/Lattice.java
@@ -25,6 +25,7 @@ import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Stack;
+import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -62,7 +63,7 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
/** Logger for this class. */
private static final Logger logger = Logger.getLogger(Lattice.class.getName());
-
+
JoshuaConfiguration config = null;
/**
@@ -75,7 +76,7 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
*/
public Lattice(List<Node<Value>> nodes, JoshuaConfiguration config) {
this.nodes = nodes;
-// this.distances = calculateAllPairsShortestPath();
+ // this.distances = calculateAllPairsShortestPath();
this.latticeHasAmbiguity = true;
}
@@ -83,7 +84,7 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
// Node<Value> sink = new Node<Value>(nodes.size());
// nodes.add(sink);
this.nodes = nodes;
-// this.distances = calculateAllPairsShortestPath();
+ // this.distances = calculateAllPairsShortestPath();
this.latticeHasAmbiguity = isAmbiguous;
}
@@ -114,7 +115,7 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
i++;
}
-// this.distances = calculateAllPairsShortestPath();
+ // this.distances = calculateAllPairsShortestPath();
}
public final boolean hasMoreThanOnePath() {
@@ -155,7 +156,7 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
public static Lattice<Token> createTokenLatticeFromPLF(String data, JoshuaConfiguration config) {
ArrayList<Node<Token>> nodes = new ArrayList<Node<Token>>();
-
+
// This matches a sequence of tuples, which describe arcs leaving this node
Pattern nodePattern = Pattern.compile("(.+?)\\(\\s*(\\(.+?\\),\\s*)\\s*\\)(.*)");
@@ -320,7 +321,7 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
// System.err.println(String.format("DISTANCE(%d,%d) = %f", from, to, costs[from][to]));
if (distances == null)
this.distances = calculateAllPairsShortestPath();
-
+
return distances.get(from, to);
}
@@ -448,22 +449,22 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
* @param lattice
*/
public void insert(int i, int j, List<Node<Value>> newNodes) {
-
+
nodes.get(i).setOutgoingArcs(newNodes.get(0).getOutgoingArcs());
-
+
newNodes.remove(0);
nodes.remove(j);
Collections.reverse(newNodes);
-
+
for (Node<Value> node: newNodes)
nodes.add(j, node);
-
+
this.latticeHasAmbiguity = false;
for (int x = 0; x < nodes.size(); x++) {
nodes.get(x).setID(x);
this.latticeHasAmbiguity |= (nodes.get(x).getOutgoingArcs().size() > 1);
}
-
+
this.distances = null;
}
@@ -481,35 +482,104 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
ArrayList<Arc<Value>> arcs = new ArrayList<Arc<Value>>();
for (Arc<Value> arc: node.getOutgoingArcs()) {
arcs.add(arc);
-
+
if (! ingraph.containsKey(arc.getHead()))
ingraph.put(arc.getHead(), new ArrayList<Arc<Value>>());
ingraph.get(arc.getHead()).add(arc);
-
+
outgraph.put(node, arcs);
}
}
-
+
ArrayList<Node<Value>> sortedNodes = new ArrayList<Node<Value>>();
Stack<Node<Value>> stack = new Stack<Node<Value>>();
stack.push(nodes.get(0));
-
+
while (! stack.empty()) {
Node<Value> node = stack.pop();
sortedNodes.add(node);
for (Arc<Value> arc: outgraph.get(node)) {
outgraph.get(node).remove(arc);
ingraph.get(arc.getHead()).remove(arc);
-
+
if (ingraph.get(arc.getHead()).size() == 0)
sortedNodes.add(arc.getHead());
}
}
-
+
int id = 0;
for (Node<Value> node : sortedNodes)
node.setID(id++);
-
+
this.nodes = sortedNodes;
}
+
+ /**
+ * Constructs a lattice from a given string representation.
+ *
+ * @param data String representation of a lattice.
+ * @return A lattice that corresponds to the given string.
+ */
+ public static Lattice<String> createFromString(String data) {
+
+ Map<Integer,Node<String>> nodes = new HashMap<Integer,Node<String>>();
+
+ Pattern nodePattern = Pattern.compile("(.+?)\\((\\(.+?\\),)\\)(.*)");
+ Pattern arcPattern = Pattern.compile("\\('(.+?)',(\\d+.\\d+),(\\d+)\\),(.*)");
+
+ Matcher nodeMatcher = nodePattern.matcher(data);
+
+ int nodeID = -1;
+
+ while (nodeMatcher.matches()) {
+
+ String nodeData = nodeMatcher.group(2);
+ String remainingData = nodeMatcher.group(3);
+
+ nodeID++;
+
+ Node<String> currentNode;
+ if (nodes.containsKey(nodeID)) {
+ currentNode = nodes.get(nodeID);
+ } else {
+ currentNode = new Node<String>(nodeID);
+ nodes.put(nodeID, currentNode);
+ }
+
+ if (logger.isLoggable(Level.FINE)) logger.fine("Node " + nodeID + ":");
+
+ Matcher arcMatcher = arcPattern.matcher(nodeData);
+
+ while (arcMatcher.matches()) {
+ String arcLabel = arcMatcher.group(1);
+ double arcWeight = Double.valueOf(arcMatcher.group(2));
+ int destinationNodeID = nodeID + Integer.valueOf(arcMatcher.group(3));
+
+ Node<String> destinationNode;
+ if (nodes.containsKey(destinationNodeID)) {
+ destinationNode = nodes.get(destinationNodeID);
+ } else {
+ destinationNode = new Node<String>(destinationNodeID);
+ nodes.put(destinationNodeID, destinationNode);
+ }
+
+ String remainingArcs = arcMatcher.group(4);
+
+ if (logger.isLoggable(Level.FINE)) logger.fine("\t" + arcLabel + " " + arcWeight + " " + destinationNodeID);
+
+ currentNode.addArc(destinationNode, (float) arcWeight, arcLabel);
+
+ arcMatcher = arcPattern.matcher(remainingArcs);
+ }
+
+ nodeMatcher = nodePattern.matcher(remainingData);
+ }
+
+ List<Node<String>> nodeList = new ArrayList<Node<String>>(nodes.values());
+ Collections.sort(nodeList, new NodeIdentifierComparator());
+
+ if (logger.isLoggable(Level.FINE)) logger.fine(nodeList.toString());
+
+ return new Lattice<String>(nodeList, new JoshuaConfiguration());
+ }
}