You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/09/17 12:27:54 UTC
[05/14] incubator-joshua git commit: Joshua 7 configuration system
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f2edda0f/joshua-core/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/FragmentLMFF.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/FragmentLMFF.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/FragmentLMFF.java
index 5332135..412cf60 100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/FragmentLMFF.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/fragmentlm/FragmentLMFF.java
@@ -27,8 +27,8 @@ import java.util.HashMap;
import java.util.List;
import java.util.Stack;
-import org.apache.joshua.decoder.JoshuaConfiguration;
import org.apache.joshua.decoder.chart_parser.SourcePath;
+import org.apache.joshua.decoder.ff.Accumulator;
import org.apache.joshua.decoder.ff.FeatureVector;
import org.apache.joshua.decoder.ff.StatefulFF;
import org.apache.joshua.decoder.ff.state_maintenance.DPState;
@@ -38,6 +38,8 @@ import org.apache.joshua.decoder.segment_file.Sentence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import com.typesafe.config.Config;
+
/**
* <p>Feature function that reads in a list of language model fragments and matches them against the
* hypergraph. This allows for language model fragment "glue" features, which fire when LM fragments
@@ -106,15 +108,15 @@ public class FragmentLMFF extends StatefulFF {
* @param args arguments passed to the feature function
* @param config the {@link org.apache.joshua.decoder.JoshuaConfiguration}
*/
- public FragmentLMFF(FeatureVector weights, String[] args, JoshuaConfiguration config) {
- super(weights, "FragmentLMFF", args, config);
+ public FragmentLMFF(Config featureConfig, FeatureVector weights) {
+ super("FragmentLMFF", featureConfig, weights);
lmFragments = new HashMap<>();
- fragmentLMFile = parsedArgs.get("lm");
- BUILD_DEPTH = Integer.parseInt(parsedArgs.get("build-depth"));
- MAX_DEPTH = Integer.parseInt(parsedArgs.get("max-depth"));
- MIN_LEX_DEPTH = Integer.parseInt(parsedArgs.get("min-lex-depth"));
+ fragmentLMFile = featureConfig.getString("lm");
+ BUILD_DEPTH = featureConfig.getInt("build-depth");
+ MAX_DEPTH = featureConfig.getInt("max-depth");
+ MIN_LEX_DEPTH = featureConfig.getInt("min-lex-depth");
/* Read in the language model fragments */
try {
@@ -169,7 +171,7 @@ public class FragmentLMFF extends StatefulFF {
* @param j todo
* @param sourcePath information about a path taken through the source {@link org.apache.joshua.lattice.Lattice}
* @param sentence {@link org.apache.joshua.lattice.Lattice} input
- * @param acc {@link org.apache.joshua.decoder.ff.FeatureFunction.Accumulator} object permitting generalization of feature computation
+ * @param acc {@link org.apache.joshua.decoder.ff.Accumulator} object permitting generalization of feature computation
* @return the new dynamic programming state (null for stateless features)
*/
@Override
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f2edda0f/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/LanguageModelFF.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/LanguageModelFF.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/LanguageModelFF.java
index 0b522cb..e30242e 100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/LanguageModelFF.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/LanguageModelFF.java
@@ -27,9 +27,10 @@ import java.util.LinkedList;
import java.util.List;
import org.apache.joshua.corpus.Vocabulary;
-import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.Decoder;
import org.apache.joshua.decoder.Support;
import org.apache.joshua.decoder.chart_parser.SourcePath;
+import org.apache.joshua.decoder.ff.Accumulator;
import org.apache.joshua.decoder.ff.FeatureMap;
import org.apache.joshua.decoder.ff.FeatureVector;
import org.apache.joshua.decoder.ff.StatefulFF;
@@ -45,6 +46,7 @@ import org.slf4j.LoggerFactory;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.primitives.Ints;
+import com.typesafe.config.Config;
/**
* This class performs the following:
@@ -95,8 +97,9 @@ public class LanguageModelFF extends StatefulFF {
/**
* We cache the weight of the feature since there is only one.
*/
- protected String type;
+ protected final String type;
protected final String path;
+ protected final boolean useSourceAnnotations;
/** Whether this is a class-based LM */
protected boolean isClassLM;
@@ -105,21 +108,23 @@ public class LanguageModelFF extends StatefulFF {
/** Whether this feature function fires LM oov indicators */
protected boolean withOovFeature;
- public LanguageModelFF(FeatureVector weights, String[] args, JoshuaConfiguration config) {
- super(weights, NAME_PREFIX + LM_INDEX, args, config);
+ public LanguageModelFF(Config featureConfig, FeatureVector weights) {
+ super(NAME_PREFIX + LM_INDEX, featureConfig, weights);
this.oovFeatureId = FeatureMap.hashFeature(NAME_PREFIX + LM_INDEX + OOV_SUFFIX);
LM_INDEX++;
- this.type = parsedArgs.get("lm_type");
- this.ngramOrder = Integer.parseInt(parsedArgs.get("lm_order"));
- this.path = parsedArgs.get("lm_file");
+ this.type = featureConfig.getString("lm_type");
+ this.ngramOrder = featureConfig.getInt("lm_order");
+ this.path = featureConfig.getString("lm_file");
+ this.useSourceAnnotations = featureConfig.hasPath("source_annotations") ?
+ featureConfig.getBoolean("source_annotations") : false;
- if (parsedArgs.containsKey("class_map")) {
+ if (featureConfig.hasPath("class_map")) {
this.isClassLM = true;
- this.classMap = new ClassMap(parsedArgs.get("class_map"));
+ this.classMap = new ClassMap(featureConfig.getString("class_map"));
}
- if (parsedArgs.containsKey("oov_feature")) {
+ if (featureConfig.hasPath("oov_feature")) {
this.withOovFeature = true;
}
@@ -146,7 +151,9 @@ public class LanguageModelFF extends StatefulFF {
}
Vocabulary.registerLanguageModel(this.languageModel);
- Vocabulary.id(config.default_non_terminal);
+ // TODO(fhieber): looking up the default non-terminal via the Decoder's default flags should not really happen here, but it works like this.
+ final String defaultNonTerminal = Decoder.getDefaultFlags().getString("default_non_terminal");
+ Vocabulary.id(defaultNonTerminal);
startSymbolId = Vocabulary.id(Vocabulary.START_SYM);
}
@@ -177,7 +184,7 @@ public class LanguageModelFF extends StatefulFF {
}
int[] words;
- if (config.source_annotations) {
+ if (useSourceAnnotations) {
// get source side annotations and project them to the target side
words = getTags(rule, i, j, sentence);
} else {
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f2edda0f/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java
index 8e54a2d..2be8b0f 100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java
@@ -24,9 +24,10 @@ import java.util.List;
import java.util.UUID;
import org.apache.joshua.corpus.Vocabulary;
-import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.Decoder;
import org.apache.joshua.decoder.KenLMPool;
import org.apache.joshua.decoder.chart_parser.SourcePath;
+import org.apache.joshua.decoder.ff.Accumulator;
import org.apache.joshua.decoder.ff.FeatureVector;
import org.apache.joshua.decoder.ff.lm.KenLM.StateProbPair;
import org.apache.joshua.decoder.ff.state_maintenance.DPState;
@@ -35,6 +36,8 @@ import org.apache.joshua.decoder.ff.tm.Rule;
import org.apache.joshua.decoder.hypergraph.HGNode;
import org.apache.joshua.decoder.segment_file.Sentence;
+import com.typesafe.config.Config;
+
/**
* Wrapper for KenLM LMs with left-state minimization. We inherit from the regular
*
@@ -43,13 +46,10 @@ import org.apache.joshua.decoder.segment_file.Sentence;
*/
public class StateMinimizingLanguageModel extends LanguageModelFF {
- public StateMinimizingLanguageModel(FeatureVector weights, String[] args, JoshuaConfiguration config) {
- super(weights, args, config);
- this.type = "kenlm";
- if (parsedArgs.containsKey("lm_type") && ! parsedArgs.get("lm_type").equals("kenlm")) {
- String msg = "* FATAL: StateMinimizingLanguageModel only supports 'kenlm' lm_type backend"
- + "* Remove lm_type from line or set to 'kenlm'";
- throw new RuntimeException(msg);
+ public StateMinimizingLanguageModel(Config featureConfig, FeatureVector weights) {
+ super(featureConfig, weights);
+ if (!featureConfig.getString("lm_type").equals("kenlm")) {
+ throw new RuntimeException("StateMinimizingLanguageModel only supports 'lm_type = kenlm'");
}
}
@@ -63,7 +63,9 @@ public class StateMinimizingLanguageModel extends LanguageModelFF {
this.languageModel = new KenLM(ngramOrder, path);
Vocabulary.registerLanguageModel(this.languageModel);
- Vocabulary.id(config.default_non_terminal);
+ // TODO(fhieber): looking up the default non-terminal via the Decoder's default flags should not really happen here, but it works like this.
+ final String defaultNonTerminal = Decoder.getDefaultFlags().getString("default_non_terminal");
+ Vocabulary.id(defaultNonTerminal);
}
@@ -100,7 +102,7 @@ public class StateMinimizingLanguageModel extends LanguageModelFF {
}
int[] ruleWords;
- if (config.source_annotations) {
+ if (useSourceAnnotations) {
// get source side annotations and project them to the target side
ruleWords = getTags(rule, i, j, sentence);
} else {
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f2edda0f/joshua-core/src/main/java/org/apache/joshua/decoder/ff/phrase/Distortion.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/phrase/Distortion.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/phrase/Distortion.java
index 4309820..a2b5209 100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/phrase/Distortion.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/phrase/Distortion.java
@@ -20,8 +20,8 @@ package org.apache.joshua.decoder.ff.phrase;
import java.util.List;
-import org.apache.joshua.decoder.JoshuaConfiguration;
import org.apache.joshua.decoder.chart_parser.SourcePath;
+import org.apache.joshua.decoder.ff.Accumulator;
import org.apache.joshua.decoder.ff.FeatureVector;
import org.apache.joshua.decoder.ff.StatelessFF;
import org.apache.joshua.decoder.ff.state_maintenance.DPState;
@@ -30,16 +30,12 @@ import org.apache.joshua.decoder.hypergraph.HGNode;
import org.apache.joshua.decoder.phrase.Hypothesis;
import org.apache.joshua.decoder.segment_file.Sentence;
+import com.typesafe.config.Config;
+
public class Distortion extends StatelessFF {
- public Distortion(FeatureVector weights, String[] args, JoshuaConfiguration config) {
- super(weights, "Distortion", args, config);
-
- if (! config.search_algorithm.equals("stack")) {
- String msg = "* FATAL: Distortion feature only application for phrase-based decoding. "
- + "Use -search phrase or remove this feature";
- throw new RuntimeException(msg);
- }
+ public Distortion(Config featureConfig, FeatureVector weights) {
+ super("Distortion", featureConfig, weights);
}
@Override
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f2edda0f/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/AbstractGrammar.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/AbstractGrammar.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/AbstractGrammar.java
index 91cf00f..f6f48cf 100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/AbstractGrammar.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/AbstractGrammar.java
@@ -19,21 +19,14 @@
package org.apache.joshua.decoder.ff.tm;
import java.util.Arrays;
-import java.util.HashSet;
import java.util.List;
-import org.apache.joshua.corpus.Vocabulary;
-import org.apache.joshua.decoder.JoshuaConfiguration;
import org.apache.joshua.decoder.ff.FeatureFunction;
-import org.apache.joshua.decoder.phrase.PhraseTable;
-import org.apache.joshua.decoder.segment_file.Token;
-import org.apache.joshua.lattice.Arc;
-import org.apache.joshua.lattice.Lattice;
-import org.apache.joshua.lattice.Node;
-
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import com.typesafe.config.Config;
+
/**
* Partial implementation of the <code>Grammar</code> interface that provides logic for sorting a
* grammar.
@@ -89,21 +82,22 @@ public abstract class AbstractGrammar implements Grammar {
/* The maximum span of the input this grammar rules can be applied to. */
protected final int spanLimit;
- protected final JoshuaConfiguration joshuaConfiguration;
+ protected final Config config;
/**
- * Creates an empty, unsorted grammar with given owner and spanlimit
+ * Creates an empty grammar whose owner and span limit are read from the
+ * "owner" and "span_limit" entries of the given config. The grammar is initially not sorted.
*
* @see Grammar#isSorted()
* @param owner the associated decoder-wide {@link org.apache.joshua.decoder.ff.tm.OwnerMap}
* @param config a {@link org.apache.joshua.decoder.JoshuaConfiguration} object
* @param spanLimit the maximum span of the input grammar rule(s) can be applied to.
*/
- public AbstractGrammar(final String owner, final JoshuaConfiguration config, final int spanLimit) {
+ public AbstractGrammar(final Config config) {
+ this.config = config;
+ this.owner = OwnerMap.register(config.getString("owner"));
+ this.spanLimit = config.getInt("span_limit");
this.sorted = false;
- this.owner = OwnerMap.register(owner);
- this.joshuaConfiguration = config;
- this.spanLimit = spanLimit;
}
public static final int OOV_RULE_ID = 0;
@@ -183,48 +177,4 @@ public abstract class AbstractGrammar implements Grammar {
}
}
}
-
- // write grammar to disk
- public void writeGrammarOnDisk(String file) {
- }
-
- /**
- * Adds OOV rules for all words in the input lattice to the current grammar. Uses addOOVRule() so that
- * sub-grammars can define different types of OOV rules if needed (as is used in {@link PhraseTable}).
- *
- * @param grammar Grammar in the Trie
- * @param inputLattice the lattice representing the input sentence
- * @param featureFunctions a list of feature functions used for scoring
- * @param onlyTrue determine if word is actual OOV.
- */
- public static void addOOVRules(Grammar grammar, Lattice<Token> inputLattice,
- List<FeatureFunction> featureFunctions, boolean onlyTrue) {
- /*
- * Add OOV rules; This should be called after the manual constraints have
- * been set up.
- */
- HashSet<Integer> words = new HashSet<>();
- for (Node<Token> node : inputLattice) {
- for (Arc<Token> arc : node.getOutgoingArcs()) {
- // create a rule, but do not add into the grammar trie
- // TODO: which grammar should we use to create an OOV rule?
- int sourceWord = arc.getLabel().getWord();
- if (sourceWord == Vocabulary.id(Vocabulary.START_SYM)
- || sourceWord == Vocabulary.id(Vocabulary.STOP_SYM))
- continue;
-
- // Determine if word is actual OOV.
- if (onlyTrue && ! Vocabulary.hasId(sourceWord))
- continue;
-
- words.add(sourceWord);
- }
- }
-
- for (int sourceWord: words)
- grammar.addOOVRules(sourceWord, featureFunctions);
-
- // Sort all the rules (not much to actually do, this just marks it as sorted)
- grammar.sortGrammar(featureFunctions);
- }
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f2edda0f/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/CreateGlueGrammar.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/CreateGlueGrammar.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/CreateGlueGrammar.java
index e8242f6..46e4bb9 100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/CreateGlueGrammar.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/CreateGlueGrammar.java
@@ -28,7 +28,6 @@ import java.util.HashSet;
import java.util.Set;
import org.apache.joshua.corpus.Vocabulary;
-import org.apache.joshua.decoder.JoshuaConfiguration;
import org.apache.joshua.util.io.LineReader;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
@@ -47,7 +46,7 @@ public class CreateGlueGrammar {
private String grammarPath;
@Option(name = "--goal", aliases = {"-goal"}, required = false, usage = "specify custom GOAL symbol. Default: 'GOAL'")
- private final String goalSymbol = cleanNonTerminal(new JoshuaConfiguration().goal_symbol);
+ private final String goalSymbol = "GOAL";
/* Rule templates */
// [GOAL] ||| <s> ||| <s> ||| 0
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f2edda0f/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Grammar.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Grammar.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Grammar.java
index 8497c17..67271de 100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Grammar.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Grammar.java
@@ -20,6 +20,7 @@ package org.apache.joshua.decoder.ff.tm;
import java.util.List;
+import org.apache.joshua.decoder.DecoderConfig;
import org.apache.joshua.decoder.ff.FeatureFunction;
/**
@@ -98,11 +99,8 @@ public interface Grammar {
/**
* Add an OOV rule for the requested word for the grammar.
- *
- * @param word input word to add rules to
- * @param featureFunctions a {@link java.util.List} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s
*/
- void addOOVRules(int word, List<FeatureFunction> featureFunctions);
+ void addOOVRules(int word, DecoderConfig config);
/**
* Add a rule to the grammar.
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f2edda0f/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/GrammarReader.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/GrammarReader.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/GrammarReader.java
index 70e786c..4d6f483 100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/GrammarReader.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/GrammarReader.java
@@ -21,7 +21,6 @@ package org.apache.joshua.decoder.ff.tm;
import java.io.IOException;
import java.util.Iterator;
-import org.apache.joshua.decoder.Decoder;
import org.apache.joshua.decoder.ff.tm.format.HieroFormatReader;
import org.apache.joshua.decoder.ff.tm.format.MosesFormatReader;
import org.apache.joshua.util.io.LineReader;
@@ -136,7 +135,7 @@ public abstract class GrammarReader<R extends Rule> implements Iterable<R>, Iter
advanceReader();
- if (Decoder.VERBOSE >= 1) {
+ if (true) {
int newProgress = (reader != null) ? reader.progress() : 100;
//TODO: review this code. It is better to print progress based on time gap (like for every 1s or 2sec) than %!
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f2edda0f/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/SentenceFilteredGrammar.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/SentenceFilteredGrammar.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/SentenceFilteredGrammar.java
deleted file mode 100644
index 4f545b7..0000000
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/SentenceFilteredGrammar.java
+++ /dev/null
@@ -1,366 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.joshua.decoder.ff.tm;
-
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.Map.Entry;
-
-import org.apache.joshua.decoder.ff.tm.hash_based.ExtensionIterator;
-import org.apache.joshua.decoder.ff.tm.hash_based.MemoryBasedBatchGrammar;
-import org.apache.joshua.decoder.segment_file.Sentence;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * This class implements dynamic sentence-level filtering. This is accomplished with a parallel
- * trie, a subset of the original trie, that only contains trie paths that are reachable from
- * traversals of the current sentence.
- *
- * @author Matt Post post@cs.jhu.edu
- */
-public class SentenceFilteredGrammar extends MemoryBasedBatchGrammar {
-
- private static final Logger LOG = LoggerFactory.getLogger(SentenceFilteredGrammar.class);
-
- private final AbstractGrammar baseGrammar;
- private final SentenceFilteredTrie filteredTrie;
- private final int[] tokens;
- private final Sentence sentence;
-
- /**
- * Construct a new sentence-filtered grammar. The main work is done in the enclosed trie (obtained
- * from the base grammar, which contains the complete grammar).
- *
- * @param baseGrammar a new {@link org.apache.joshua.decoder.ff.tm.AbstractGrammar} to populate
- * @param sentence {@link org.apache.joshua.lattice.Lattice} input
- */
- SentenceFilteredGrammar(AbstractGrammar baseGrammar, Sentence sentence) {
- super(OwnerMap.getOwner(baseGrammar.getOwner()), baseGrammar.joshuaConfiguration, baseGrammar.getSpanLimit());
- this.baseGrammar = baseGrammar;
- this.sentence = sentence;
- this.tokens = sentence.getWordIDs();
-
- int origCount = getNumRules(baseGrammar.getTrieRoot());
- long startTime = System.currentTimeMillis();
-
- /* Filter the rules; returns non-null object */
- this.filteredTrie = filter(baseGrammar.getTrieRoot());
- int filteredCount = getNumRules();
-
- float seconds = (System.currentTimeMillis() - startTime) / 1000.0f;
-
- LOG.debug("Sentence-level filtering of sentence {} ({} -> {} rules) in {} seconds",
- sentence.id(), origCount, filteredCount, seconds);
- }
-
- @Override
- public Trie getTrieRoot() {
- return filteredTrie;
- }
-
- /**
- * This function is poorly named: it doesn't mean whether a rule exists in the grammar for the
- * current span, but whether the grammar is permitted to apply rules to the current span (a
- * grammar-level parameter). As such we can just chain to the underlying grammar.
- */
- @Override
- public boolean hasRuleForSpan(int startIndex, int endIndex, int pathLength) {
- return baseGrammar.hasRuleForSpan(startIndex, endIndex, pathLength);
- }
-
- @Override
- public int getNumRules() {
- return getNumRules(getTrieRoot());
- }
-
- /**
- * A convenience function that counts the number of rules in a grammar's trie.
- *
- * @param node the {@link org.apache.joshua.decoder.ff.tm.Trie} implementation for which to count rules
- * @return the number of rules
- */
- public int getNumRules(Trie node) {
- int numRules = 0;
- if (node != null) {
- if (node.getRuleCollection() != null)
- numRules += node.getRuleCollection().getRules().size();
-
- if (node.getExtensions() != null)
- for (Trie child : node.getExtensions())
- numRules += getNumRules(child);
- }
-
- return numRules;
- }
-
- /**
- * What is the algorithm?
- *
- * Take the first word of the sentence, and start at the root of the trie. There are two things to
- * consider: (a) word matches and (b) nonterminal matches.
- *
- * For a word match, simply follow that arc along the trie. We create a parallel arc in our
- * filtered grammar to represent it. Each arc in the filtered trie knows about its
- * corresponding/underlying node in the unfiltered grammar trie.
- *
- * A nonterminal is always permitted to match. The question then is how much of the input sentence
- * we imagine it consumed. The answer is that it could have been any amount. So the recursive call
- * has to be a set of calls, one each to the next trie node with different lengths of the sentence
- * remaining.
- *
- * A problem occurs when we have multiple sequential nonterminals. For scope-3 grammars, there can
- * be four sequential nonterminals (in the case when they are grounded by terminals on both ends
- * of the nonterminal chain). We'd like to avoid looking at all possible ways to split up the
- * subsequence, because with respect to filtering rules, they are all the same.
- *
- * We accomplish this with the following restriction: for purposes of grammar filtering, only the
- * first in a sequence of nonterminal traversals can consume more than one word. Each of the
- * subsequent ones would have to consume just one word. We then just have to record in the
- * recursive call whether the last traversal was a nonterminal or not.
- *
- * @param unfilteredTrieRoot todo
- * @return the root of the filtered trie
- */
- private SentenceFilteredTrie filter(Trie unfilteredTrieRoot) {
- SentenceFilteredTrie filteredTrieRoot = new SentenceFilteredTrie(unfilteredTrieRoot);
-
- // System.err.println(String.format("FILTERING TO SENTENCE\n %s\n",
- // Vocabulary.getWords(tokens)));
-
- /*
- * The root of the trie is where rule applications start, so we simply try all possible
- * positions in the sentence.
- */
- for (int i = 0; i < tokens.length; i++) {
- filter(i, filteredTrieRoot, false);
- }
-
- return filteredTrieRoot;
- }
-
- /**
- * Matches rules against the sentence. Intelligently handles chains of sequential nonterminals.
- * Marks arcs that are traversable for this sentence.
- *
- * @param i the position in the sentence to start matching
- * @param trie the trie node to match against
- * @param lastWasNT true if the match that brought us here was against a nonterminal
- */
- private void filter(int i, SentenceFilteredTrie trieNode, boolean lastWasNT) {
- if (i >= tokens.length)
- return;
-
- /* Make sure the underlying unfiltered node has children. */
- Trie unfilteredTrieNode = trieNode.unfilteredTrieNode;
- if (unfilteredTrieNode.getChildren() == null) {
- // trieNode.path.retreat();
- return;
- }
-
- /* Match a word */
- Trie trie = unfilteredTrieNode.match(tokens[i]);
- if (trie != null) {
- /*
- * The current filtered node might already have an arc for this label. If so, retrieve it
- * (since we still need to follow it); if not, create it.
- */
- SentenceFilteredTrie nextFilteredTrie = trieNode.match(tokens[i]);
- if (nextFilteredTrie == null) {
- nextFilteredTrie = new SentenceFilteredTrie(trie);
- trieNode.children.put(tokens[i], nextFilteredTrie);
- }
-
- /*
- * Now continue, trying to match the child node against the next position in the sentence. The
- * third argument records that this match was not against a nonterminal.
- */
- filter(i + 1, nextFilteredTrie, false);
- }
-
- /*
- * Now we attempt to match nonterminals. Any nonterminal is permitted to match any region of the
- * sentence, up to the maximum span for that grammar. So we enumerate all children of the
- * current (unfiltered) trie grammar node, looking for nonterminals (items whose label value is
- * less than 0), then recurse.
- *
- * There is one subtlely. Adjacent nonterminals in a grammar rule can match a span (i, j) in (j
- * - i - 1) ways, but for purposes of determining whether a rule fits, this is all wasted
- * effort. To handle this, we allow the first nonterminal in a sequence to record 1, 2, 3, ...
- * terminals (up to the grammar's span limit, or the rest of the sentence, whichever is
- * shorter). Subsequent adjacent nonterminals are permitted to consume only a single terminal.
- */
- HashMap<Integer, ? extends Trie> children = unfilteredTrieNode.getChildren();
- if (children != null) {
- for (int label : children.keySet()) {
- if (label < 0) {
- SentenceFilteredTrie nextFilteredTrie = trieNode.match(label);
- if (nextFilteredTrie == null) {
- nextFilteredTrie = new SentenceFilteredTrie(unfilteredTrieNode.match(label));
- trieNode.children.put(label, nextFilteredTrie);
- }
-
- /*
- * Recurse. If the last match was a nonterminal, we can only consume one more token.
- *
- * TODO: This goes too far by looking at the whole sentence; each grammar has a maximum
- * span limit which should be consulted. What we should be doing is passing the point
- * where we started matching the current sentence, so we can apply this span limit, which
- * is easily accessible (baseGrammar.spanLimit).
- */
- int maxJ = lastWasNT ? (i + 1) : tokens.length;
- for (int j = i + 1; j <= maxJ; j++) {
- filter(j, nextFilteredTrie, true);
- }
- }
- }
- }
- }
-
- /**
- * Alternate filter that uses regular expressions, walking the grammar trie and matching the
- * source side of each rule collection against the input sentence. Failed matches are discarded,
- * and trie nodes extending from that position need not be explored.
- *
- * @param unfilteredTrie todo
- * @return the root of the filtered trie if any rules were retained, otherwise null
- */
- @SuppressWarnings("unused")
- private SentenceFilteredTrie filter_regexp(Trie unfilteredTrie) {
- SentenceFilteredTrie trie = null;
-
- /* Case 1: keep the trie node if it has a rule collection that matches the sentence */
- if (unfilteredTrie.hasRules())
- if (matchesSentence(unfilteredTrie))
- trie = new SentenceFilteredTrie(unfilteredTrie);
- else
- return null;
-
- /* Case 2: keep the trie node if it has children who have valid rule collections */
- if (unfilteredTrie.hasExtensions())
- for (Entry<Integer, ? extends Trie> arc : unfilteredTrie.getChildren().entrySet()) {
- Trie unfilteredChildTrie = arc.getValue();
- SentenceFilteredTrie nextTrie = filter_regexp(unfilteredChildTrie);
- if (nextTrie != null) {
- if (trie == null)
- trie = new SentenceFilteredTrie(unfilteredTrie);
- trie.children.put(arc.getKey(), nextTrie);
- }
- }
-
- return trie;
- }
-
- private boolean matchesSentence(Trie childTrie) {
- Rule rule = childTrie.getRuleCollection().getRules().get(0);
- return rule.matches(sentence);
- }
-
- /**
- * Implements a filtered trie, by sitting on top of a base trie and annotating nodes that match
- * the given input sentence.
- *
- * @author Matt Post post@cs.jhu.edu
- *
- */
- public class SentenceFilteredTrie implements Trie {
-
- /* The underlying unfiltered trie node. */
- private final Trie unfilteredTrieNode;
-
- /* The child nodes in the filtered trie. */
- private HashMap<Integer, SentenceFilteredTrie> children = null;
-
- /**
- * Constructor.
- *
- * @param unfilteredTrieNode todo
- */
- public SentenceFilteredTrie(Trie unfilteredTrieNode) {
- this.unfilteredTrieNode = unfilteredTrieNode;
- this.children = new HashMap<>();
- }
-
- @Override
- public SentenceFilteredTrie match(int wordID) {
- if (children != null)
- return children.get(wordID);
- return null;
- }
-
- @Override
- public boolean hasExtensions() {
- return children != null;
- }
-
- @Override
- public Collection<SentenceFilteredTrie> getExtensions() {
- if (children != null)
- return children.values();
-
- return null;
- }
-
- @Override
- public HashMap<Integer, SentenceFilteredTrie> getChildren() {
- return children;
- }
-
- @Override
- public boolean hasRules() {
- // Chain to the underlying unfiltered node.
- return unfilteredTrieNode.hasRules();
- }
-
- @Override
- public RuleCollection getRuleCollection() {
- // Chain to the underlying unfiltered node, since the rule collection just varies by target
- // side.
- return unfilteredTrieNode.getRuleCollection();
- }
-
- /**
- * Counts the number of rules.
- *
- * @return the number of rules rooted at this node.
- */
- public int getNumRules() {
- int numRules = 0;
- if (getTrieRoot() != null)
- if (getTrieRoot().getRuleCollection() != null)
- numRules += getTrieRoot().getRuleCollection().getRules().size();
-
- for (SentenceFilteredTrie node : getExtensions())
- numRules += node.getNumRules();
-
- return numRules;
- }
-
- @Override
- public Iterator<Integer> getTerminalExtensionIterator() {
- return new ExtensionIterator(children, true);
- }
-
- @Override
- public Iterator<Integer> getNonterminalExtensionIterator() {
- return new ExtensionIterator(children, false);
- }
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f2edda0f/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/format/MosesFormatReader.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/format/MosesFormatReader.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/format/MosesFormatReader.java
index 4b549fb..9c6f386 100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/format/MosesFormatReader.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/format/MosesFormatReader.java
@@ -25,7 +25,6 @@ import org.apache.joshua.decoder.ff.tm.OwnerId;
import org.apache.joshua.decoder.ff.tm.OwnerMap;
import org.apache.joshua.decoder.ff.tm.Rule;
import org.apache.joshua.util.Constants;
-import org.apache.joshua.util.FormatUtils;
import org.apache.joshua.util.io.LineReader;
/***
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f2edda0f/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
deleted file mode 100644
index 92566da..0000000
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
+++ /dev/null
@@ -1,248 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.joshua.decoder.ff.tm.hash_based;
-
-import static org.apache.joshua.decoder.ff.tm.GrammarReader.createReader;
-
-import java.io.IOException;
-import java.io.Reader;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-
-import org.apache.joshua.corpus.Vocabulary;
-import org.apache.joshua.decoder.JoshuaConfiguration;
-import org.apache.joshua.decoder.JoshuaConfiguration.OOVItem;
-import org.apache.joshua.decoder.ff.FeatureFunction;
-import org.apache.joshua.decoder.ff.FeatureVector;
-import org.apache.joshua.decoder.ff.tm.AbstractGrammar;
-import org.apache.joshua.decoder.ff.tm.GrammarReader;
-import org.apache.joshua.decoder.ff.tm.Rule;
-import org.apache.joshua.decoder.ff.tm.Trie;
-import org.apache.joshua.decoder.ff.tm.format.HieroFormatReader;
-import org.apache.joshua.util.FormatUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * This class implements a memory-based bilingual BatchGrammar.
- * <p>
- * The rules are stored in a trie. Each trie node has: (1) RuleBin: a list of rules matching the
- * french sides so far (2) A HashMap of next-layer trie nodes, the next french word used as the key
- * in HashMap
- *
- * @author Zhifei Li zhifei.work@gmail.com
- * @author Matt Post post@cs.jhu.edu
- */
-public class MemoryBasedBatchGrammar extends AbstractGrammar {
-
- private static final Logger LOG = LoggerFactory.getLogger(MemoryBasedBatchGrammar.class);
-
- /* The number of rules read. */
- private int qtyRulesRead = 0;
-
- /* The number of distinct source sides. */
- private int qtyRuleBins = 0;
-
- /* The trie root. */
- private final MemoryBasedTrie root = new MemoryBasedTrie();
-
- /* The file containing the grammar. */
- private String grammarFile;
-
- /**
- * Constructor used by Decoder mostly. Default spanLimit of 20
- * @param owner the associated decoder-wide {@link org.apache.joshua.decoder.ff.tm.OwnerMap}
- * @param config a {@link org.apache.joshua.decoder.JoshuaConfiguration} object
- * @param spanLimit the maximum span of the input grammar rule(s) can be applied to.
- */
- public MemoryBasedBatchGrammar(String owner, JoshuaConfiguration config, int spanLimit) {
- super(owner, config, spanLimit);
- }
-
- public MemoryBasedBatchGrammar(String formatKeyword, String grammarFile, String owner,
- String defaultLHSSymbol, int spanLimit, JoshuaConfiguration joshuaConfiguration)
- throws IOException {
-
- super(owner, joshuaConfiguration, spanLimit);
- Vocabulary.id(defaultLHSSymbol);
- this.grammarFile = grammarFile;
-
- // ==== loading grammar
- try(GrammarReader<Rule> reader = createReader(formatKeyword, grammarFile, getOwner());) {
- for (Rule rule : reader) {
- if (rule != null) {
- addRule(rule);
- }
- }
- }
-
- this.printGrammar();
- }
-
- // ===============================================================
- // Methods
- // ===============================================================
-
- @Override
- public int getNumRules() {
- return this.qtyRulesRead;
- }
-
- /**
- * if the span covered by the chart bin is greater than the limit, then return false
- */
- public boolean hasRuleForSpan(int i, int j, int pathLength) {
- if (this.spanLimit == -1) { // mono-glue grammar
- return (i == 0);
- } else {
- // System.err.println(String.format("%s HASRULEFORSPAN(%d,%d,%d)/%d = %s",
- // Vocabulary.word(this.owner), i, j, pathLength, spanLimit, pathLength <= this.spanLimit));
- return (pathLength <= this.spanLimit);
- }
- }
-
- public Trie getTrieRoot() {
- return this.root;
- }
-
- /**
- * Adds a rule to the grammar.
- */
- public void addRule(Rule rule) {
-
- this.qtyRulesRead++;
-
- // === identify the position, and insert the trie nodes as necessary
- MemoryBasedTrie pos = root;
- int[] french = rule.getSource();
-
- maxSourcePhraseLength = Math.max(maxSourcePhraseLength, french.length);
-
- for (int curSymID : french) {
- /*
- * Note that the nonTerminal symbol in the french is not cleaned (i.e., will be sth like
- * [X,1]), but the symbol in the Trie has to be cleaned, so that the match does not care about
- * the markup (i.e., [X,1] or [X,2] means the same thing, that is X) if
- * (Vocabulary.nt(french[k])) { curSymID = modelReader.cleanNonTerminal(french[k]); if
- * (logger.isLoggable(Level.FINEST)) logger.finest("Amended to: " + curSymID); }
- */
-
- MemoryBasedTrie nextLayer = (MemoryBasedTrie) pos.match(curSymID);
- if (null == nextLayer) {
- nextLayer = new MemoryBasedTrie();
- if (pos.hasExtensions() == false) {
- pos.childrenTbl = new HashMap<>();
- }
- pos.childrenTbl.put(curSymID, nextLayer);
- }
- pos = nextLayer;
- }
-
- // === add the rule into the trie node
- if (!pos.hasRules()) {
- pos.ruleBin = new MemoryBasedRuleBin(rule.getArity(), rule.getSource());
- this.qtyRuleBins++;
- }
- pos.ruleBin.addRule(rule);
- }
-
- protected void printGrammar() {
- LOG.info("MemoryBasedBatchGrammar: Read {} rules with {} distinct source sides from '{}'",
- this.qtyRulesRead, this.qtyRuleBins, grammarFile);
- }
-
- /***
- * Takes an input word and creates an OOV rule in the current grammar for that word.
- *
- * @param sourceWord integer representation of word
- * @param featureFunctions {@link java.util.List} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s
- */
- @Override
- public void addOOVRules(int sourceWord, List<FeatureFunction> featureFunctions) {
-
- // TODO: _OOV shouldn't be outright added, since the word might not be OOV for the LM (but now
- // almost
- // certainly is)
- final int targetWord = this.joshuaConfiguration.mark_oovs ? Vocabulary.id(Vocabulary
- .word(sourceWord) + "_OOV") : sourceWord;
-
- final int[] sourceWords = { sourceWord };
- final int[] targetWords = { targetWord };
- final byte[] alignment = { 0, 0 };
- final FeatureVector features = new FeatureVector(0);
-
- if (this.joshuaConfiguration.oovList != null && this.joshuaConfiguration.oovList.size() != 0) {
-
- for (OOVItem item : this.joshuaConfiguration.oovList) {
- final Rule oovRule = new Rule(
- Vocabulary.id(item.label),
- sourceWords,
- targetWords,
- 0,
- features,
- alignment,
- getOwner());
- addRule(oovRule);
- oovRule.estimateRuleCost(featureFunctions);
- }
-
- } else {
-
- final Rule oovRule = new Rule(
- Vocabulary.id(this.joshuaConfiguration.default_non_terminal),
- sourceWords,
- targetWords,
- 0,
- features,
- alignment,
- getOwner());
- addRule(oovRule);
- oovRule.estimateRuleCost(featureFunctions);
-
- }
- }
-
- /**
- * Adds a default set of glue rules.
- *
- * @param featureFunctions an {@link java.util.ArrayList} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s
- */
- public void addGlueRules(ArrayList<FeatureFunction> featureFunctions) {
- final HieroFormatReader reader = new HieroFormatReader(getOwner());
-
- String goalNT = FormatUtils.cleanNonTerminal(joshuaConfiguration.goal_symbol);
- String defaultNT = FormatUtils.cleanNonTerminal(joshuaConfiguration.default_non_terminal);
-
- String[] ruleStrings = new String[] {
- String.format("[%s] ||| %s ||| %s ||| 0", goalNT, Vocabulary.START_SYM,
- Vocabulary.START_SYM),
- String.format("[%s] ||| [%s,1] [%s,2] ||| [%s,1] [%s,2] ||| -1", goalNT, goalNT, defaultNT,
- goalNT, defaultNT),
- String.format("[%s] ||| [%s,1] %s ||| [%s,1] %s ||| 0", goalNT, goalNT,
- Vocabulary.STOP_SYM, goalNT, Vocabulary.STOP_SYM) };
-
- for (String ruleString : ruleStrings) {
- Rule rule = reader.parseLine(ruleString);
- addRule(rule);
- rule.estimateRuleCost(featureFunctions);
- }
- }
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f2edda0f/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/TextGrammar.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/TextGrammar.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/TextGrammar.java
new file mode 100644
index 0000000..5923965
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/TextGrammar.java
@@ -0,0 +1,216 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.tm.hash_based;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Optional;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.DecoderConfig;
+import org.apache.joshua.decoder.ff.FeatureFunction;
+import org.apache.joshua.decoder.ff.FeatureVector;
+import org.apache.joshua.decoder.ff.tm.AbstractGrammar;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.ff.tm.Trie;
+import org.apache.joshua.decoder.ff.tm.format.HieroFormatReader;
+import org.apache.joshua.util.FormatUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.base.Throwables;
+import com.typesafe.config.Config;
+
+/**
+ * This class implements a memory-based bilingual BatchGrammar.
+ * <p>
+ * The rules are stored in a trie. Each trie node has: (1) RuleBin: a list of rules matching the
+ * french sides so far (2) A HashMap of next-layer trie nodes, the next french word used as the key
+ * in HashMap
+ *
+ * @author Zhifei Li zhifei.work@gmail.com
+ * @author Matt Post post@cs.jhu.edu
+ */
+public class TextGrammar extends AbstractGrammar {
+
+ private static final Logger LOG = LoggerFactory.getLogger(TextGrammar.class);
+
+ /* The number of rules read. */
+ private int qtyRulesRead = 0;
+
+ /* The number of distinct source sides. */
+ private int qtyRuleBins = 0;
+
+ /* The trie root. */
+ private final MemoryBasedTrie root = new MemoryBasedTrie();
+
+ /* The path containing the grammar. */
+ private final Optional<String> path;
+
+ public TextGrammar(final Config config) {
+ super(config);
+ this.path = config.hasPath("path") ? Optional.of(config.getString("path")) : Optional.empty();
+
+ // if path is configured, actually load the grammar
+ if (this.path.isPresent()) {
+ this.loadGrammar(this.path.get());
+ this.printGrammar();
+ }
+ }
+
+ private void loadGrammar(final String path) {
+ try(final HieroFormatReader reader = new HieroFormatReader(path, getOwner());) {
+ for (Rule rule : reader) {
+ if (rule != null) {
+ addRule(rule);
+ }
+ }
+ } catch (IOException e) {
+ Throwables.propagate(e);
+ }
+ }
+
+ @Override
+ public int getNumRules() {
+ return this.qtyRulesRead;
+ }
+
+ /**
+ * Returns false if pathLength exceeds the span limit; with a span limit of -1 (mono-glue grammar), returns true only for spans starting at position 0.
+ */
+ public boolean hasRuleForSpan(int i, int j, int pathLength) {
+ if (this.spanLimit == -1) { // mono-glue grammar
+ return (i == 0);
+ } else {
+ // System.err.println(String.format("%s HASRULEFORSPAN(%d,%d,%d)/%d = %s",
+ // Vocabulary.word(this.owner), i, j, pathLength, spanLimit, pathLength <= this.spanLimit));
+ return (pathLength <= this.spanLimit);
+ }
+ }
+
+ public Trie getTrieRoot() {
+ return this.root;
+ }
+
+ /**
+ * Adds a rule to the grammar.
+ */
+ public void addRule(Rule rule) {
+
+ this.qtyRulesRead++;
+
+ // === identify the position, and insert the trie nodes as necessary
+ MemoryBasedTrie pos = root;
+ int[] french = rule.getSource();
+
+ maxSourcePhraseLength = Math.max(maxSourcePhraseLength, french.length);
+
+ for (int curSymID : french) {
+ /*
+ * Note that the nonTerminal symbol in the french is not cleaned (i.e., will be sth like
+ * [X,1]), but the symbol in the Trie has to be cleaned, so that the match does not care about
+ * the markup (i.e., [X,1] or [X,2] means the same thing, that is X) if
+ * (Vocabulary.nt(french[k])) { curSymID = modelReader.cleanNonTerminal(french[k]); if
+ * (logger.isLoggable(Level.FINEST)) logger.finest("Amended to: " + curSymID); }
+ */
+
+ MemoryBasedTrie nextLayer = (MemoryBasedTrie) pos.match(curSymID);
+ if (null == nextLayer) {
+ nextLayer = new MemoryBasedTrie();
+ if (pos.hasExtensions() == false) {
+ pos.childrenTbl = new HashMap<>();
+ }
+ pos.childrenTbl.put(curSymID, nextLayer);
+ }
+ pos = nextLayer;
+ }
+
+ // === add the rule into the trie node
+ if (!pos.hasRules()) {
+ pos.ruleBin = new MemoryBasedRuleBin(rule.getArity(), rule.getSource());
+ this.qtyRuleBins++;
+ }
+ pos.ruleBin.addRule(rule);
+ }
+
+ protected void printGrammar() {
+ LOG.info("{}: Read {} rules with {} distinct source sides from '{}'",
+ this.getClass().getName(), this.qtyRulesRead, this.qtyRuleBins, path);
+ }
+
+ /***
+ * Takes an input word and creates an OOV rule in the current grammar for that word.
+ *
+ * @param sourceWord integer representation of word
+ * @param config the {@link org.apache.joshua.decoder.DecoderConfig} supplying the "mark_oovs" and "default_non_terminal" flags and the feature functions used to estimate the rule cost
+ */
+ @Override
+ public void addOOVRules(int sourceWord, DecoderConfig config) {
+
+ // TODO: _OOV shouldn't be outright added, since the word might not be OOV for the LM (but now
+ // almost
+ // certainly is)
+ final int targetWord = config.getFlags().getBoolean("mark_oovs") ? Vocabulary.id(Vocabulary
+ .word(sourceWord) + "_OOV") : sourceWord;
+
+ final int[] sourceWords = { sourceWord };
+ final int[] targetWords = { targetWord };
+ final byte[] alignment = { 0, 0 };
+ final FeatureVector features = new FeatureVector(0);
+
+ final Rule oovRule = new Rule(
+ Vocabulary.id(config.getFlags().getString("default_non_terminal")),
+ sourceWords,
+ targetWords,
+ 0,
+ features,
+ alignment,
+ getOwner());
+ addRule(oovRule);
+ oovRule.estimateRuleCost(config.getFeatureFunctions());
+ }
+
+ /**
+ * Adds a default set of glue rules.
+ *
+ * @param featureFunctions a {@link java.util.List} of {@link org.apache.joshua.decoder.ff.FeatureFunction}s used to estimate the cost of each glue rule
+ */
+ public void addGlueRules(List<FeatureFunction> featureFunctions, Config config) {
+ String goalNT = FormatUtils.cleanNonTerminal(config.getString("goal_symbol"));
+ String defaultNT = FormatUtils.cleanNonTerminal(config.getString("default_non_terminal"));
+
+ String[] ruleStrings = new String[] {
+ String.format("[%s] ||| %s ||| %s ||| 0", goalNT, Vocabulary.START_SYM,
+ Vocabulary.START_SYM),
+ String.format("[%s] ||| [%s,1] [%s,2] ||| [%s,1] [%s,2] ||| -1", goalNT, goalNT, defaultNT,
+ goalNT, defaultNT),
+ String.format("[%s] ||| [%s,1] %s ||| [%s,1] %s ||| 0", goalNT, goalNT,
+ Vocabulary.STOP_SYM, goalNT, Vocabulary.STOP_SYM) };
+
+ try(final HieroFormatReader reader = new HieroFormatReader(getOwner());) {
+ for (String ruleString : ruleStrings) {
+ Rule rule = reader.parseLine(ruleString);
+ addRule(rule);
+ rule.estimateRuleCost(featureFunctions);
+ }
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f2edda0f/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/TextGrammarFactory.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/TextGrammarFactory.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/TextGrammarFactory.java
new file mode 100644
index 0000000..e72f703
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/TextGrammarFactory.java
@@ -0,0 +1,148 @@
+package org.apache.joshua.decoder.ff.tm.hash_based;
+
+import static java.util.Collections.emptyList;
+import static org.apache.joshua.decoder.ff.tm.OwnerMap.UNKNOWN_OWNER;
+import static org.apache.joshua.util.Constants.CUSTOM_OWNER;
+import static org.apache.joshua.util.Constants.GLUE_OWNER;
+import static org.apache.joshua.util.Constants.OOV_OWNER;
+
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.DecoderConfig;
+import org.apache.joshua.decoder.SearchAlgorithm;
+import org.apache.joshua.decoder.ff.tm.Grammar;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.ff.tm.format.HieroFormatReader;
+import org.apache.joshua.decoder.phrase.Hypothesis;
+import org.apache.joshua.decoder.phrase.PhraseTable;
+import org.apache.joshua.decoder.segment_file.Sentence;
+import org.apache.joshua.decoder.segment_file.Token;
+import org.apache.joshua.lattice.Arc;
+import org.apache.joshua.lattice.Node;
+import org.apache.joshua.util.FormatUtils;
+
+import com.google.common.collect.ImmutableMap;
+import com.typesafe.config.Config;
+import com.typesafe.config.ConfigFactory;
+
+/**
+ * Provides some static functions to create default/backoff/oov/glue TextGrammars
+ * that are dynamically created during decoding.
+ */
+public class TextGrammarFactory {
+
+ public static TextGrammar createGlueTextGrammar(String goalSymbol, String defaultNonTerminal) {
+ final Config config = ConfigFactory.parseMap(
+ ImmutableMap.of("owner", GLUE_OWNER, "span_limit", "-1"), "Glue Grammar Config");
+ final TextGrammar glueGrammar = new TextGrammar(config);
+ final HieroFormatReader reader = new HieroFormatReader(glueGrammar.getOwner());
+ final String goalNT = FormatUtils.cleanNonTerminal(goalSymbol);
+ final String defaultNT = FormatUtils.cleanNonTerminal(defaultNonTerminal);
+
+ final String[] ruleStrings = new String[] {
+ String.format("[%s] ||| %s ||| %s ||| 0", goalNT, Vocabulary.START_SYM,
+ Vocabulary.START_SYM),
+ String.format("[%s] ||| [%s,1] [%s,2] ||| [%s,1] [%s,2] ||| -1", goalNT, goalNT, defaultNT,
+ goalNT, defaultNT),
+ String.format("[%s] ||| [%s,1] %s ||| [%s,1] %s ||| 0", goalNT, goalNT,
+ Vocabulary.STOP_SYM, goalNT, Vocabulary.STOP_SYM) };
+
+ for (String ruleString : ruleStrings) {
+ Rule rule = reader.parseLine(ruleString);
+ glueGrammar.addRule(rule);
+ // glue rules do not have any features
+ rule.estimateRuleCost(emptyList());
+ }
+ return glueGrammar;
+ }
+
+ public static Grammar createCustomGrammar(SearchAlgorithm searchAlgorithm) {
+ final Config config = ConfigFactory.parseMap(
+ ImmutableMap.of("owner", CUSTOM_OWNER, "span_limit", "20"), "Custom Grammar Config");
+ switch (searchAlgorithm) {
+ case stack:
+ return new PhraseTable(config);
+ case cky:
+ return new TextGrammar(config);
+ default:
+ return null;
+ }
+ }
+
+ public static Grammar addEpsilonDeletingGrammar(String goalSymbol, String defaultNonTerminal) {
+ final Config config = ConfigFactory.parseMap(
+ ImmutableMap.of("owner", "lattice", "span_limit", "-1"), "Epsilon Grammar Config");
+ final TextGrammar latticeGrammar = new TextGrammar(config);
+ final HieroFormatReader reader = new HieroFormatReader(latticeGrammar.getOwner());
+ final String goalNT = FormatUtils.cleanNonTerminal(goalSymbol);
+ final String defaultNT = FormatUtils.cleanNonTerminal(defaultNonTerminal);
+
+ //FIXME: arguments changed to match string format on best effort basis. Author please review.
+ final String ruleString = String.format("[%s] ||| [%s,1] <eps> ||| [%s,1] ||| ", goalNT, defaultNT, defaultNT);
+
+ final Rule rule = reader.parseLine(ruleString);
+ latticeGrammar.addRule(rule);
+ rule.estimateRuleCost(emptyList());
+ return latticeGrammar;
+ }
+
+ public static Grammar createOovGrammarForSentence(final Sentence sentence, DecoderConfig config) {
+ final Config grammarConfig = ConfigFactory.parseMap(
+ ImmutableMap.of("owner", OOV_OWNER, "span_limit", "20"), "OOV grammar config");
+ final TextGrammar oovGrammar = new TextGrammar(grammarConfig);
+ final Set<Integer> words = getOovCandidateWords(sentence, config.getFlags().getBoolean("true_oovs_only"));
+ for (int sourceWord: words) {
+ oovGrammar.addOOVRules(sourceWord, config);
+ }
+ // Sort all the rules (not much to actually do, this just marks it as sorted)
+ oovGrammar.sortGrammar(config.getFeatureFunctions());
+ return oovGrammar;
+ }
+
+ public static PhraseTable createOovPhraseTable(Sentence sentence, DecoderConfig config) {
+ final Config grammarConfig = ConfigFactory.parseMap(
+ ImmutableMap.of("owner", OOV_OWNER, "span_limit", "0"), "OOV phrase table config");
+ final PhraseTable oovPhraseTable = new PhraseTable(grammarConfig);
+ final Set<Integer> words = getOovCandidateWords(sentence, config.getFlags().getBoolean("true_oovs_only"));
+ for (int sourceWord: words) {
+ oovPhraseTable.addOOVRules(sourceWord, config);
+ }
+ // Sort all the rules (not much to actually do, this just marks it as sorted)
+ oovPhraseTable.sortGrammar(config.getFeatureFunctions());
+ return oovPhraseTable;
+ }
+
+ /**
+ * Returns a set of integer ids for which OOV rules will be created.
+ * The set is determined by the flag trueOovsOnly.
+ */
+ private static Set<Integer> getOovCandidateWords(final Sentence sentence, boolean trueOovsOnly) {
+ final Set<Integer> words = new HashSet<>();
+ for (Node<Token> node : sentence.getLattice()) {
+ for (Arc<Token> arc : node.getOutgoingArcs()) {
+ int sourceWord = arc.getLabel().getWord();
+ if (sourceWord == Vocabulary.id(Vocabulary.START_SYM)
+ || sourceWord == Vocabulary.id(Vocabulary.STOP_SYM))
+ continue;
+
+ // Determine if word is actual OOV.
+ if (trueOovsOnly && ! Vocabulary.hasId(sourceWord))
+ continue;
+
+ words.add(sourceWord);
+ }
+ }
+ return words;
+ }
+
+ public static PhraseTable createEndRulePhraseTable(Sentence sentence, DecoderConfig config) {
+ final Config grammarConfig = ConfigFactory.parseMap(
+ ImmutableMap.of("owner", UNKNOWN_OWNER, "span_limit", "0"), "End Rule Phrase Table Config");
+ final PhraseTable endRulePhraseTable = new PhraseTable(grammarConfig);
+ endRulePhraseTable.addRule(Hypothesis.END_RULE);
+ return endRulePhraseTable;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f2edda0f/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/packed/PackedGrammar.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/packed/PackedGrammar.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/packed/PackedGrammar.java
index d98d76f..de809f6 100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/packed/PackedGrammar.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/packed/PackedGrammar.java
@@ -80,7 +80,7 @@ import java.util.List;
import java.util.Map;
import org.apache.joshua.corpus.Vocabulary;
-import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.DecoderConfig;
import org.apache.joshua.decoder.ff.FeatureFunction;
import org.apache.joshua.decoder.ff.FeatureVector;
import org.apache.joshua.decoder.ff.tm.AbstractGrammar;
@@ -99,8 +99,10 @@ import org.slf4j.LoggerFactory;
import com.google.common.base.Supplier;
import com.google.common.base.Suppliers;
+import com.google.common.base.Throwables;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
+import com.typesafe.config.Config;
public class PackedGrammar extends AbstractGrammar {
@@ -117,49 +119,50 @@ public class PackedGrammar extends AbstractGrammar {
// Testing shows there's up to ~95% hit rate when cache size is 5000 Trie nodes.
private final Cache<Trie, List<Rule>> cached_rules;
- private final String grammarDir;
-
- private JoshuaConfiguration config;
-
- public PackedGrammar(String grammar_dir, int span_limit, String owner, String type,
- JoshuaConfiguration joshuaConfiguration) throws IOException {
- super(owner, joshuaConfiguration, span_limit);
+ private final String path;
- this.grammarDir = grammar_dir;
- this.config = joshuaConfiguration;
+ public PackedGrammar(Config config) {
+ super(config);
- // Read the vocabulary.
- vocabFile = new File(grammar_dir + File.separator + VOCABULARY_FILENAME);
- LOG.info("Reading vocabulary: {}", vocabFile);
- if (!Vocabulary.read(vocabFile)) {
- throw new RuntimeException("mismatches or collisions while reading on-disk vocabulary");
- }
-
- // Read the config
- String configFile = grammar_dir + File.separator + "config";
- if (new File(configFile).exists()) {
- LOG.info("Reading packed config: {}", configFile);
- readConfig(configFile);
- }
+ this.path = config.getString("path");
+ vocabFile = new File(path + File.separator + VOCABULARY_FILENAME);
- // Read the quantizer setup.
- LOG.info("Reading encoder configuration: {}{}encoding", grammar_dir, File.separator);
- encoding = new EncoderConfiguration();
- encoding.load(grammar_dir + File.separator + "encoding");
-
- final List<String> listing = Arrays.asList(new File(grammar_dir).list());
- sort(listing); // File.list() has arbitrary sort order
- slices = new ArrayList<>();
- for (String prefix : listing) {
- if (prefix.startsWith("slice_") && prefix.endsWith(".source"))
- slices.add(new PackedSlice(grammar_dir + File.separator + prefix.substring(0, 11)));
+ try {
+ // Read the vocabulary.
+ LOG.info("Reading vocabulary: {}", vocabFile);
+ if (!Vocabulary.read(vocabFile)) {
+ throw new RuntimeException("mismatches or collisions while reading on-disk vocabulary");
+ }
+
+ // Read the config
+ String configFile = path + File.separator + "config";
+ if (new File(configFile).exists()) {
+ LOG.info("Reading packed config: {}", configFile);
+ readConfig(configFile);
+ }
+
+ // Read the quantizer setup.
+ LOG.info("Reading encoder configuration: {}{}encoding", path, File.separator);
+ encoding = new EncoderConfiguration();
+ encoding.load(path + File.separator + "encoding");
+
+ final List<String> listing = Arrays.asList(new File(path).list());
+ sort(listing); // File.list() has arbitrary sort order
+ slices = new ArrayList<>();
+ for (String prefix : listing) {
+ if (prefix.startsWith("slice_") && prefix.endsWith(".source"))
+ slices.add(new PackedSlice(path + File.separator + prefix.substring(0, 11)));
+ }
+ } catch (IOException e) {
+ Throwables.propagate(e);
}
long count = 0;
for (PackedSlice s : slices)
count += s.estimated.length;
root = new PackedRoot(slices);
- cached_rules = CacheBuilder.newBuilder().maximumSize(joshuaConfiguration.cachedRuleSize).build();
+ int cacheSize = config.getInt("rule_cache_size");
+ cached_rules = CacheBuilder.newBuilder().maximumSize(cacheSize).build();
LOG.info("Loaded {} rules", count);
}
@@ -854,7 +857,7 @@ public class PackedGrammar extends AbstractGrammar {
@Override
public int[] getSource() {
int phrase[] = new int[src.length + 1];
- int ntid = Vocabulary.id(PackedGrammar.this.joshuaConfiguration.default_non_terminal);
+ int ntid = Vocabulary.id(PackedGrammar.this.config.getString("default_non_terminal"));
phrase[0] = ntid;
System.arraycopy(src, 0, phrase, 1, src.length);
return phrase;
@@ -956,9 +959,9 @@ public class PackedGrammar extends AbstractGrammar {
}
}
}
-
+
@Override
- public void addOOVRules(int word, List<FeatureFunction> featureFunctions) {
+ public void addOOVRules(int word, DecoderConfig config) {
throw new RuntimeException("PackedGrammar.addOOVRules(): I can't add OOV rules");
}
@@ -989,17 +992,15 @@ public class PackedGrammar extends AbstractGrammar {
if (! isSupportedVersion(version)) {
String message = String.format("The grammar at %s was packed with packer version %d, which is incompatible with the current config",
- this.grammarDir, version);
+ this.path, version);
throw new RuntimeException(message);
}
}
- /*
- * Determines whether the current grammar is a supported version. For hierarchical decoding,
- * no changes have occurred, so any version past 2 (the default) is supported. For phrase-
- * based decoding, version 4 is required.
- */
+ /**
+ * With Joshua 7 we require newly packed grammars for everything.
+ */
private boolean isSupportedVersion(int version) {
- return (config.search_algorithm.equals("cky") && version >= 2) || (version >= 4);
+ return version >= 3; // TODO(fhieber): fix this once we ship Joshua
}
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f2edda0f/joshua-core/src/main/java/org/apache/joshua/decoder/hypergraph/GrammarBuilderWalkerFunction.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/hypergraph/GrammarBuilderWalkerFunction.java b/joshua-core/src/main/java/org/apache/joshua/decoder/hypergraph/GrammarBuilderWalkerFunction.java
index a1132e8..085b239 100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/hypergraph/GrammarBuilderWalkerFunction.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/hypergraph/GrammarBuilderWalkerFunction.java
@@ -22,16 +22,17 @@ import java.io.PrintStream;
import java.util.HashSet;
import org.apache.joshua.corpus.Vocabulary;
-import org.apache.joshua.decoder.JoshuaConfiguration;
import org.apache.joshua.decoder.ff.FeatureVector;
import org.apache.joshua.decoder.ff.tm.Grammar;
import org.apache.joshua.decoder.ff.tm.OwnerMap;
import org.apache.joshua.decoder.ff.tm.Rule;
-import org.apache.joshua.decoder.ff.tm.hash_based.MemoryBasedBatchGrammar;
+import org.apache.joshua.decoder.ff.tm.hash_based.TextGrammar;
import org.apache.joshua.util.FormatUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import com.typesafe.config.Config;
+
/**
* This walker function builds up a new context-free grammar by visiting each node in a hypergraph.
* For a quick overview, see Chris Dyer's 2010 NAACL paper
@@ -49,13 +50,13 @@ public class GrammarBuilderWalkerFunction implements WalkerFunction {
private static final Logger LOG = LoggerFactory.getLogger(GrammarBuilderWalkerFunction.class);
- private final MemoryBasedBatchGrammar grammar;
+ private final TextGrammar grammar;
private final PrintStream outStream;
private final int goalSymbol;
private final HashSet<Rule> rules;
- public GrammarBuilderWalkerFunction(String goal, JoshuaConfiguration joshuaConfiguration, String owner) {
- grammar = new MemoryBasedBatchGrammar(owner, joshuaConfiguration, 1000);
+ public GrammarBuilderWalkerFunction(String goal, Config config) {
+ grammar = new TextGrammar(config);
outStream = null;
goalSymbol = Vocabulary.id(goal);
rules = new HashSet<>();
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f2edda0f/joshua-core/src/main/java/org/apache/joshua/decoder/hypergraph/KBestExtractor.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/hypergraph/KBestExtractor.java b/joshua-core/src/main/java/org/apache/joshua/decoder/hypergraph/KBestExtractor.java
index cb79bf9..6494143 100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/hypergraph/KBestExtractor.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/hypergraph/KBestExtractor.java
@@ -36,14 +36,12 @@ import java.util.PriorityQueue;
import org.apache.joshua.corpus.Vocabulary;
import org.apache.joshua.decoder.BLEU;
-import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.DecoderConfig;
import org.apache.joshua.decoder.StructuredTranslation;
import org.apache.joshua.decoder.StructuredTranslationFactory;
-import org.apache.joshua.decoder.ff.FeatureFunction;
import org.apache.joshua.decoder.ff.FeatureVector;
import org.apache.joshua.decoder.ff.fragmentlm.Tree;
import org.apache.joshua.decoder.ff.state_maintenance.DPState;
-import org.apache.joshua.decoder.ff.tm.OwnerMap;
import org.apache.joshua.decoder.ff.tm.Rule;
import org.apache.joshua.decoder.io.DeNormalize;
import org.apache.joshua.decoder.segment_file.Sentence;
@@ -96,7 +94,6 @@ import org.apache.joshua.util.FormatUtils;
* @author Matt Post post@cs.jhu.edu
*/
public class KBestExtractor {
- private final JoshuaConfiguration joshuaConfiguration;
private final String outputFormat;
private final HashMap<HGNode, VirtualNode> virtualNodesTable = new HashMap<>();
@@ -117,33 +114,25 @@ public class KBestExtractor {
/* The input sentence */
private final Sentence sentence;
- /* The weights being used to score the forest */
- private final FeatureVector weights;
-
- /* The feature functions */
- private final List<FeatureFunction> featureFunctions;
+ /* The decoderConfig */
+ private final DecoderConfig config;
/* BLEU statistics of the references */
private BLEU.References references = null;
public KBestExtractor(
- Sentence sentence,
- List<FeatureFunction> featureFunctions,
- FeatureVector weights,
- boolean isMonolingual,
- JoshuaConfiguration joshuaConfiguration) {
-
- this.featureFunctions = featureFunctions;
+ final Sentence sentence,
+ final DecoderConfig config,
+ boolean isMonolingual) {
- this.joshuaConfiguration = joshuaConfiguration;
- this.outputFormat = this.joshuaConfiguration.outputFormat;
- this.extractUniqueNbest = joshuaConfiguration.use_unique_nbest;
+ this.config = config;
+ this.outputFormat = config.getFlags().getString("output_format");
+ this.extractUniqueNbest = config.getFlags().getBoolean("use_unique_nbest");
- this.weights = weights;
this.defaultSide = (isMonolingual ? Side.SOURCE : Side.TARGET);
this.sentence = sentence;
- if (joshuaConfiguration.rescoreForest) {
+ if (config.getFlags().getBoolean("rescore_forest")) {
references = new BLEU.References(sentence.references());
}
}
@@ -281,7 +270,7 @@ public class KBestExtractor {
private String maybeProjectCase(String hypothesis, DerivationState state) {
String output = hypothesis;
- if (joshuaConfiguration.project_case) {
+ if (config.getFlags().getBoolean("project_case")) {
String[] tokens = hypothesis.split("\\s+");
List<List<Integer>> points = state.getWordAlignmentList();
for (int i = 0; i < points.size(); i++) {
@@ -518,7 +507,7 @@ public class KBestExtractor {
+ virtualTailNode.nbests.get(newRanks[i] - 1).getModelCost();
nextState.setCost(cost);
- if (joshuaConfiguration.rescoreForest)
+ if (config.getFlags().getBoolean("rescore_forest"))
nextState.bleu = nextState.computeBLEU();
candHeap.add(nextState);
@@ -632,7 +621,7 @@ public class KBestExtractor {
cost = hyperEdge.getBestDerivationScore();
DerivationState state = new DerivationState(parentNode, hyperEdge, ranks, cost, edgePos);
- if (joshuaConfiguration.rescoreForest)
+ if (config.getFlags().getBoolean("rescore_forest"))
state.bleu = state.computeBLEU();
return state;
@@ -738,7 +727,7 @@ public class KBestExtractor {
* @return float representing model cost plus the BLEU score
*/
public float getCost() {
- return cost - weights.getOrDefault(hashFeature("BLEU")) * bleu;
+ return cost - config.getWeights().getOrDefault(hashFeature("BLEU")) * bleu;
}
public String toString() {
@@ -839,7 +828,7 @@ public class KBestExtractor {
}
public FeatureVector getFeatures() {
- final FeatureVectorExtractor extractor = new FeatureVectorExtractor(featureFunctions, sentence);
+ final FeatureVectorExtractor extractor = new FeatureVectorExtractor(config.getFeatureFunctions(), sentence);
visit(extractor);
return extractor.getFeatures();
}
@@ -1019,7 +1008,7 @@ public class KBestExtractor {
for (int i = 0; i < indent * 2; i++)
sb.append(" ");
- final FeatureVectorExtractor extractor = new FeatureVectorExtractor(featureFunctions, sentence);
+ final FeatureVectorExtractor extractor = new FeatureVectorExtractor(config.getFeatureFunctions(), sentence);
extractor.before(state, indent, tailNodeIndex);
final FeatureVector transitionFeatures = extractor.getFeatures();
@@ -1033,7 +1022,7 @@ public class KBestExtractor {
sb.append(" ").append(dpState);
}
sb.append(" ||| ").append(transitionFeatures);
- sb.append(" ||| ").append(weights.innerProduct(transitionFeatures));
+ sb.append(" ||| ").append(config.getWeights().innerProduct(transitionFeatures));
if (rule.getAlignment() != null)
sb.append(" ||| ").append(Arrays.toString(rule.getAlignment()));
sb.append("\n");
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f2edda0f/joshua-core/src/main/java/org/apache/joshua/decoder/io/TranslationRequestStream.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/io/TranslationRequestStream.java b/joshua-core/src/main/java/org/apache/joshua/decoder/io/TranslationRequestStream.java
index afb63ab..dcf50ad 100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/io/TranslationRequestStream.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/io/TranslationRequestStream.java
@@ -22,12 +22,12 @@ import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
-import com.google.gson.stream.JsonReader;
-
-import org.apache.joshua.decoder.JoshuaConfiguration;
-import org.apache.joshua.decoder.JoshuaConfiguration.INPUT_TYPE;
+import org.apache.joshua.decoder.InputType;
import org.apache.joshua.decoder.segment_file.Sentence;
+import com.google.gson.stream.JsonReader;
+import com.typesafe.config.Config;
+
/**
* This class iterates over an input stream, looking for inputs to translate. By default, it
* expects plain-text input, which can be plain sentences or PLF-encoded lattices. If
@@ -48,7 +48,7 @@ import org.apache.joshua.decoder.segment_file.Sentence;
* @author orluke
*/
public class TranslationRequestStream {
- private final JoshuaConfiguration joshuaConfiguration;
+ private final Config config;
private int sentenceNo = -1;
/* Plain text or JSON input */
@@ -57,10 +57,10 @@ public class TranslationRequestStream {
/* Whether the request has been killed by a broken client connection. */
private volatile boolean isShutDown = false;
- public TranslationRequestStream(BufferedReader reader, JoshuaConfiguration joshuaConfiguration) {
- this.joshuaConfiguration = joshuaConfiguration;
+ public TranslationRequestStream(BufferedReader reader, Config config) {
+ this.config = config;
- if (joshuaConfiguration.input_type == INPUT_TYPE.json) {
+ if (InputType.valueOf(config.getString("serverSettings.input_type")) == InputType.json) {
this.requestHandler = new JSONStreamHandler(reader);
} else {
this.requestHandler = new PlaintextStreamHandler(reader);
@@ -103,7 +103,7 @@ public class TranslationRequestStream {
if (line == null)
return null;
- return new Sentence(line, -1, joshuaConfiguration);
+ return new Sentence(line, -1, config);
}
}
@@ -121,7 +121,7 @@ public class TranslationRequestStream {
String line = reader.readLine();
if (line != null) {
- return new Sentence(line, sentenceNo, joshuaConfiguration);
+ return new Sentence(line, sentenceNo, config);
}
return null;
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f2edda0f/joshua-core/src/main/java/org/apache/joshua/decoder/phrase/Candidate.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/phrase/Candidate.java b/joshua-core/src/main/java/org/apache/joshua/decoder/phrase/Candidate.java
index 9f0dec1..a1745f2 100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/phrase/Candidate.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/phrase/Candidate.java
@@ -18,6 +18,8 @@
*/
package org.apache.joshua.decoder.phrase;
+import static org.apache.joshua.decoder.chart_parser.ComputeNodeResult.computeNodeResult;
+
/***
* A candidate represents a translation hypothesis that may possibly be added to the translation
* hypergraph. It groups together (a) a set of translation hypotheses all having the same coverage
@@ -37,19 +39,16 @@ import java.util.Arrays;
import java.util.List;
import org.apache.joshua.corpus.Span;
-import org.apache.joshua.decoder.chart_parser.ComputeNodeResult;
+import org.apache.joshua.decoder.DecoderConfig;
import org.apache.joshua.decoder.chart_parser.NodeResult;
-import org.apache.joshua.decoder.ff.FeatureFunction;
import org.apache.joshua.decoder.ff.state_maintenance.DPState;
import org.apache.joshua.decoder.ff.tm.Rule;
import org.apache.joshua.decoder.hypergraph.HGNode;
import org.apache.joshua.decoder.segment_file.Sentence;
-import static org.apache.joshua.decoder.chart_parser.ComputeNodeResult.computeNodeResult;
-
public class Candidate implements Comparable<Candidate> {
- private List<FeatureFunction> featureFunctions;
+ private final DecoderConfig config;
private Sentence sentence;
// the set of hypotheses that can be paired with phrases from this span
@@ -115,9 +114,9 @@ public class Candidate implements Comparable<Candidate> {
getHypothesis(), getPhraseNode().bestHyperedge.getRule().getTargetWords(), getSpan());
}
- public Candidate(List<FeatureFunction> featureFunctions, Sentence sentence,
+ public Candidate(DecoderConfig config, Sentence sentence,
List<Hypothesis> hypotheses, PhraseNodes phrases, float delta, int[] ranks) {
- this.featureFunctions = featureFunctions;
+ this.config = config;
this.sentence = sentence;
this.hypotheses = hypotheses;
this.phrases = phrases;
@@ -161,7 +160,7 @@ public class Candidate implements Comparable<Candidate> {
*/
public Candidate extendHypothesis() {
if (ranks[0] < hypotheses.size() - 1) {
- return new Candidate(featureFunctions, sentence, hypotheses, phrases, future_delta, new int[] { ranks[0] + 1, ranks[1] });
+ return new Candidate(config, sentence, hypotheses, phrases, future_delta, new int[] { ranks[0] + 1, ranks[1] });
}
return null;
}
@@ -173,7 +172,7 @@ public class Candidate implements Comparable<Candidate> {
*/
public Candidate extendPhrase() {
if (ranks[1] < phrases.size() - 1) {
- return new Candidate(featureFunctions, sentence, hypotheses, phrases, future_delta, new int[] { ranks[0], ranks[1] + 1 });
+ return new Candidate(config, sentence, hypotheses, phrases, future_delta, new int[] { ranks[0], ranks[1] + 1 });
}
return null;
@@ -232,7 +231,7 @@ public class Candidate implements Comparable<Candidate> {
if (computedResult == null) {
// add the rule
// TODO: sourcepath
- computedResult = computeNodeResult(featureFunctions, getRule(), getTailNodes(), getLastCovered(), getPhraseEnd(), null, sentence);
+ computedResult = computeNodeResult(config, getRule(), getTailNodes(), getLastCovered(), getPhraseEnd(), null, sentence);
}
return computedResult;
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f2edda0f/joshua-core/src/main/java/org/apache/joshua/decoder/phrase/PhraseChart.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/phrase/PhraseChart.java b/joshua-core/src/main/java/org/apache/joshua/decoder/phrase/PhraseChart.java
index 8355eb5..6564e7b 100644
--- a/joshua-core/src/main/java/org/apache/joshua/decoder/phrase/PhraseChart.java
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/phrase/PhraseChart.java
@@ -18,13 +18,14 @@
*/
package org.apache.joshua.decoder.phrase;
-import java.util.ArrayList;
+import static org.apache.joshua.decoder.chart_parser.ComputeNodeResult.computeNodeResult;
+
+import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
-import org.apache.joshua.decoder.chart_parser.ComputeNodeResult;
+import org.apache.joshua.decoder.DecoderConfig;
import org.apache.joshua.decoder.chart_parser.NodeResult;
-import org.apache.joshua.decoder.ff.FeatureFunction;
import org.apache.joshua.decoder.ff.tm.Rule;
import org.apache.joshua.decoder.ff.tm.RuleCollection;
import org.apache.joshua.decoder.hypergraph.HGNode;
@@ -33,7 +34,7 @@ import org.apache.joshua.decoder.segment_file.Sentence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import static org.apache.joshua.decoder.chart_parser.ComputeNodeResult.computeNodeResult;
+import com.google.common.collect.ImmutableList;
/**
* This class represents a bundle of phrase tables that have been read in,
@@ -50,13 +51,13 @@ public class PhraseChart {
private final List<PhraseNodes> entries;
// number of translation options
- private int numOptions = 20;
+ private final int numOptions;
// The feature functions
- private final List<FeatureFunction> features;
+ private final DecoderConfig config;
// The input sentence
- private Sentence sentence;
+ private final Sentence sentence;
/**
* Create a new PhraseChart object, which represents all phrases that are
@@ -68,13 +69,13 @@ public class PhraseChart {
* @param source input to {@link org.apache.joshua.lattice.Lattice}
* @param num_options number of translation options (typically set to 20)
*/
- public PhraseChart(PhraseTable[] tables, List<FeatureFunction> features, Sentence source,
+ public PhraseChart(ImmutableList<PhraseTable> tables, DecoderConfig config, Sentence source,
int num_options) {
float startTime = System.currentTimeMillis();
this.numOptions = num_options;
- this.features = features;
+ this.config = config;
this.sentence = source;
max_source_phrase_length = 0;
@@ -193,7 +194,7 @@ public class PhraseChart {
* performance gains --- the more common the word, the more translations options it is
* likely to have (often into the tens of thousands).
*/
- List<Rule> rules = to.getSortedRules(features);
+ List<Rule> rules = to.getSortedRules(config.getFeatureFunctions());
// TODO: I think this is a race condition
if (numOptions > 0 && rules.size() > numOptions)
@@ -208,7 +209,7 @@ public class PhraseChart {
// Turn each rule into an HGNode, add them one by one
for (Rule rule: rules) {
- NodeResult result = computeNodeResult(features, rule, null, i, j, null, sentence);
+ NodeResult result = computeNodeResult(config, rule, null, i, j, null, sentence);
HyperEdge edge = new HyperEdge(rule, result.getViterbiCost(), result.getTransitionCost(), null, null);
HGNode phraseNode = new HGNode(i, j, rule.getLHS(), result.getDPStates(), edge, result.getPruningEstimate());
nodes.add(phraseNode);