Posted to commits@joshua.apache.org by le...@apache.org on 2016/05/16 06:26:35 UTC
[19/66] [partial] incubator-joshua git commit: JOSHUA-252 Make it possible to use Maven to build Joshua
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/main/java/org/apache/joshua/decoder/JoshuaConfiguration.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/JoshuaConfiguration.java b/src/main/java/org/apache/joshua/decoder/JoshuaConfiguration.java
new file mode 100644
index 0000000..7a3de23
--- /dev/null
+++ b/src/main/java/org/apache/joshua/decoder/JoshuaConfiguration.java
@@ -0,0 +1,710 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package joshua.decoder;
+
+import static joshua.util.FormatUtils.cleanNonTerminal;
+import static joshua.util.FormatUtils.markup;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.logging.Logger;
+
+import joshua.decoder.ff.StatefulFF;
+import joshua.decoder.ff.fragmentlm.Tree;
+import joshua.util.FormatUtils;
+import joshua.util.Regex;
+import joshua.util.io.LineReader;
+
+/**
+ * Configuration for the Joshua decoder, typically read from a configuration file.
+ *
+ * When adding new features to Joshua, any new configurable parameters should be added to this
+ * class.
+ *
+ * @author Zhifei Li, <zh...@gmail.com>
+ * @author Matt Post <po...@cs.jhu.edu>
+ */
+public class JoshuaConfiguration {
+
+ // whether to construct a StructuredTranslation object for each request instead of
+ // printing to stdout. Used when the Decoder is used from Java directly.
+ public Boolean use_structured_output = false;
+
+ // If set to true, Joshua will lowercase the input, creating an annotation that marks the
+ // original case
+ public boolean lowercase = false;
+
+ // If set to true, Joshua will recapitalize the output by projecting the case from aligned
+ // source-side words
+ public boolean project_case = false;
+
+ // List of grammar files to read
+ public ArrayList<String> tms = new ArrayList<String>();
+
+ // A rule cache for commonly used tries to avoid excess object allocations
+ // Testing shows there's up to ~95% hit rate when cache size is 5000 Trie nodes.
+ public Integer cachedRuleSize = new Integer(5000);
+
+ /*
+ * The file to read the weights from (part of the sparse features implementation). Weights can
+ * also just be listed in the main config file.
+ */
+ public String weights_file = "";
+
+ // Default symbols. The symbol here should be enclosed in square brackets.
+ public String default_non_terminal = FormatUtils.markup("X");
+ public String goal_symbol = FormatUtils.markup("GOAL");
+
+ /*
+ * A list of OOV symbols in the form
+ *
+ * [X1] weight [X2] weight [X3] weight ...
+ *
+ * where the [X] symbols are nonterminals, each followed by its weight. For each OOV word w in the
+ * input sentence, Joshua will create rules of the form
+ *
+ * X1 -> w (weight)
+ *
+ * If this is empty, an unweighted default_non_terminal is used.
+ */
+
+ public class OOVItem implements Comparable<OOVItem> {
+ public String label;
+ public float weight;
+
+ OOVItem(String l, float w) {
+ label = l;
+ weight = w;
+ }
+
+ @Override
+ public int compareTo(OOVItem other) {
+ if (weight > other.weight)
+ return -1;
+ else if (weight < other.weight)
+ return 1;
+ return 0;
+ }
+ }
+ public ArrayList<OOVItem> oovList = null;
+
+ /*
+ * Whether to segment OOVs into a lattice
+ */
+ public boolean segment_oovs = false;
+
+ /*
+ * Enable lattice decoding.
+ */
+ public boolean lattice_decoding = false;
+
+ /*
+ * If false, sorting of the complete grammar is done at load time. If true, grammar tries are not
+ * sorted till they are first accessed. Amortized sorting means you get your first translation
+ * much, much quicker (good for debugging), but that per-sentence decoding is a bit slower.
+ */
+ public boolean amortized_sorting = true;
+
+ // syntax-constrained decoding
+ public boolean constrain_parse = false;
+ public boolean use_pos_labels = false;
+
+ // oov-specific
+ public boolean true_oovs_only = false;
+
+ /* Dynamic sentence-level filtering. */
+ public boolean filter_grammar = false;
+
+ /* The cube pruning pop limit. Set to 0 for exhaustive pruning. */
+ public int pop_limit = 100;
+
+ /* Maximum sentence length. Sentences longer than this are truncated. */
+ public int maxlen = 200;
+
+ /*
+ * N-best configuration.
+ */
+ // Make sure output strings in the n-best list are unique.
+ public boolean use_unique_nbest = true;
+
+ /* Include the phrasal alignments in the output (not word-level alignments at the moment). */
+ public boolean include_align_index = false;
+
+ /* The number of hypotheses to output by default. */
+ public int topN = 1;
+
+ /**
+ * This string describes the format of each line of output from the decoder (i.e., the
+ * translations). The string can include arbitrary text and also variables. The following
+ * variables are available:
+ *
+ * <pre>
+ * - %i the 0-indexed sentence number
+ * - %e the source string
+ * - %s the translated sentence
+ * - %S the translated sentence with some basic capitalization and denormalization
+ * - %t the synchronous derivation
+ * - %f the list of feature values (as name=value pairs)
+ * - %c the model cost
+ * - %w the weight vector
+ * - %a the alignments between source and target words (currently unimplemented)
+ * - %d a verbose, many-line version of the derivation
+ * </pre>
+ */
+ public String outputFormat = "%i ||| %s ||| %f ||| %c";
+
+ /* The number of decoding threads to use (-threads). */
+ public int num_parallel_decoders = 1;
+
+ // Pattern for dumping hypergraphs to disk (see the dump-hypergraph parameter)
+ public String hypergraphFilePattern = "";
+
+ /*
+ * When true, _OOV is appended to all words that are passed through (useful for something like
+ * transliteration on the target side).
+ */
+ public boolean mark_oovs = false;
+
+ /* Enables synchronous parsing. */
+ public boolean parse = false; // perform synchronous parsing
+
+ private final Logger logger = Logger.getLogger(JoshuaConfiguration.class.getName());
+
+ /* A list of the feature functions. */
+ public ArrayList<String> features = new ArrayList<String>();
+
+ /* A list of weights found in the main config file (instead of in a separate weights file) */
+ public ArrayList<String> weights = new ArrayList<String>();
+
+ /* Determines whether to expect JSON input or plain lines */
+ public enum INPUT_TYPE { plain, json };
+ public INPUT_TYPE input_type = INPUT_TYPE.plain;
+
+ /* Type of server. Not sure we need to keep the regular TCP one around. */
+ public enum SERVER_TYPE { none, TCP, HTTP };
+ public SERVER_TYPE server_type = SERVER_TYPE.TCP;
+
+ /* If set, Joshua will start a (multi-threaded, per "threads") TCP/IP server on this port. */
+ public int server_port = 0;
+
+ /*
+ * Whether to do forest rescoring. If set to true, the references are expected on STDIN along with
+ * the input sentences in the following format:
+ *
+ * input sentence ||| ||| reference1 ||| reference2 ...
+ *
+ * (The second field is reserved for the output sentence for alignment and forced decoding).
+ */
+
+ public boolean rescoreForest = false;
+ public float rescoreForestWeight = 10.0f;
+
+ /*
+ * Location of fragment mapping file, which maps flattened SCFG rules to their internal
+ * representation.
+ */
+ public String fragmentMapFile = null;
+
+ /*
+ * Whether to use soft syntactic constraint decoding / fuzzy matching, which allows any
+ * nonterminal to be substituted for any other nonterminal (except for OOV and GOAL).
+ */
+ public boolean fuzzy_matching = false;
+
+ public static final String SOFT_SYNTACTIC_CONSTRAINT_DECODING_PROPERTY_NAME = "fuzzy_matching";
+
+ /***
+ * Phrase-based decoding parameters.
+ */
+
+ /* The search algorithm: currently either "cky" or "stack" */
+ public String search_algorithm = "cky";
+
+ /* The distortion limit */
+ public int reordering_limit = 8;
+
+ /* The number of target sides considered for each source side (after sorting by model weight) */
+ public int num_translation_options = 20;
+
+ /* If true, decode using a dot chart (standard CKY+); if false, use the much more efficient
+ * version of Sennrich (SSST 2014)
+ */
+ public boolean use_dot_chart = true;
+
+ /* Moses compatibility */
+ public boolean moses = false;
+
+ /* If true, just print out the weights found in the config file, and exit. */
+ public boolean show_weights_and_quit = false;
+
+ /* Read input from a file (Moses compatible flag) */
+ public String input_file = null;
+
+ /* Write n-best output to this file */
+ public String n_best_file = null;
+
+ /* Whether to look at source side for special annotations */
+ public boolean source_annotations = false;
+
+ /* Weights overridden from the command line */
+ public String weight_overwrite = "";
+
+ /**
+ * This method resets the state of JoshuaConfiguration back to the state after initialization.
+ * This is useful when, for example, making multiple calls to the decoder within the same Java
+ * program; otherwise, loading the configuration multiple times without resetting can lead to
+ * errors caused by inconsistent state.
+ *
+ * This suggests that it may be an even better idea to refactor the code and make
+ * JoshuaConfiguration an object that is created and passed as an argument, rather than a
+ * shared static object. This is just a suggestion for the next step.
+ *
+ */
+ public void reset() {
+ logger.info("Resetting the JoshuaConfiguration to its defaults ...");
+ logger.info("\n\tResetting the StatefulFF global state index ...");
+ logger.info("\n\t...done");
+ StatefulFF.resetGlobalStateIndex();
+ tms = new ArrayList<String>();
+ weights_file = "";
+ default_non_terminal = "[X]";
+ oovList = new ArrayList<OOVItem>();
+ oovList.add(new OOVItem(default_non_terminal, 1.0f));
+ goal_symbol = "[GOAL]";
+ amortized_sorting = true;
+ constrain_parse = false;
+ use_pos_labels = false;
+ true_oovs_only = false;
+ filter_grammar = false;
+ pop_limit = 100;
+ maxlen = 200;
+ use_unique_nbest = false;
+ include_align_index = false;
+ topN = 1;
+ outputFormat = "%i ||| %s ||| %f ||| %c";
+ num_parallel_decoders = 1;
+ hypergraphFilePattern = "";
+ mark_oovs = false;
+ // oracleFile = null;
+ parse = false; // perform synchronous parsing
+ features = new ArrayList<String>();
+ weights = new ArrayList<String>();
+ server_port = 0;
+
+ reordering_limit = 8;
+ num_translation_options = 20;
+ logger.info("...done");
+ }
+
+ // ===============================================================
+ // Methods
+ // ===============================================================
+
+ /**
+ * To process command-line options, we write them to a file that looks like the config file, and
+ * then call readConfigFile() on it. It would be more general to define a class that sits on a
+ * stream and knows how to chop it up, but this was quicker to implement.
+ */
+ public void processCommandLineOptions(String[] options) {
+ try {
+ File tmpFile = File.createTempFile("options", null, null);
+ PrintWriter out = new PrintWriter(new FileWriter(tmpFile));
+
+ for (int i = 0; i < options.length; i++) {
+ String key = options[i].substring(1);
+ if (i + 1 == options.length || options[i + 1].startsWith("-")) {
+ // if this is the last item, or if the next item
+ // is another flag, then this is a boolean flag
+ out.println(key + " = true");
+
+ } else {
+ out.print(key + " =");
+ while (i + 1 < options.length && ! options[i + 1].startsWith("-")) {
+ out.print(String.format(" %s", options[i + 1]));
+ i++;
+ }
+ out.println();
+ }
+ }
+ out.close();
+ this.readConfigFile(tmpFile.getCanonicalPath());
+
+ tmpFile.delete();
+
+ } catch (IOException e) {
+ e.printStackTrace();
+ System.exit(1);
+ }
+ }
+
+ public void readConfigFile(String configFile) throws IOException {
+
+ LineReader configReader = new LineReader(configFile, false);
+ try {
+ for (String line : configReader) {
+ line = line.trim(); // .toLowerCase();
+
+ if (Regex.commentOrEmptyLine.matches(line))
+ continue;
+
+ /*
+ * There are two kinds of substantive (non-comment, non-blank) lines: parameters and feature
+ * values. Parameters match the pattern "key = value"; all other substantive lines are
+ * interpreted as features.
+ */
+
+ if (line.indexOf("=") != -1) { // parameters; (not feature function)
+ String[] fds = Regex.equalsWithSpaces.split(line, 2);
+ if (fds.length < 2) {
+ Decoder.LOG(1, String.format("* WARNING: skipping config file line '%s'", line));
+ continue;
+ }
+
+ String parameter = normalize_key(fds[0]);
+
+ if (parameter.equals(normalize_key("lm"))) {
+ /* This is deprecated. It supports old LM lines of the form
+ *
+ * lm = berkeleylm 5 false false 100 lm.gz
+ *
+ * LMs are now loaded as general feature functions, so we transform that to either
+ *
+ * feature-function = LanguageModel -lm_order 5 -lm_type berkeleylm -lm_file lm.gz
+ *
+ * If the line were state minimizing:
+ *
+ * lm = kenlm 5 true false 100 lm.gz
+ *
+ * feature-function = StateMinimizingLanguageModel -lm_order 5 -lm_file lm.gz
+ */
+
+ String[] tokens = fds[1].split("\\s+");
+ if (tokens[2].equals("true"))
+ features.add(String.format("feature_function = StateMinimizingLanguageModel -lm_type kenlm -lm_order %s -lm_file %s",
+ tokens[1], tokens[5]));
+ else
+ features.add(String.format("feature_function = LanguageModel -lm_type %s -lm_order %s -lm_file %s",
+ tokens[0], tokens[1], tokens[5]));
+
+ } else if (parameter.equals(normalize_key("tm"))) {
+ /* If found, convert old format:
+ * tm = TYPE OWNER MAXSPAN PATH
+ * to new format
+ * tm = TYPE -owner OWNER -maxspan MAXSPAN -path PATH
+ */
+ String tmLine = fds[1];
+
+ String[] tokens = fds[1].split("\\s+");
+ if (! tokens[1].startsWith("-")) { // old format
+ tmLine = String.format("%s -owner %s -maxspan %s -path %s", tokens[0], tokens[1], tokens[2], tokens[3]);
+ Decoder.LOG(1, String.format("WARNING: Converting deprecated TM line from '%s' -> '%s'", fds[1], tmLine));
+ }
+ tms.add(tmLine);
+
+ } else if (parameter.equals("v")) {
+ Decoder.VERBOSE = Integer.parseInt(fds[1]);
+
+ } else if (parameter.equals(normalize_key("parse"))) {
+ parse = Boolean.parseBoolean(fds[1]);
+ logger.finest(String.format("parse: %s", parse));
+
+ } else if (parameter.equals(normalize_key("dump-hypergraph"))) {
+ hypergraphFilePattern = fds[1].trim();
+ logger
+ .finest(String.format(" hypergraph dump file format: %s", hypergraphFilePattern));
+
+ } else if (parameter.equals(normalize_key("oov-list"))) {
+ if (new File(fds[1]).exists()) {
+ oovList = new ArrayList<OOVItem>();
+ try {
+ File file = new File(fds[1]);
+ BufferedReader br = new BufferedReader(new FileReader(file));
+ try {
+ String str = br.readLine();
+ while (str != null) {
+ String[] tokens = str.trim().split("\\s+");
+
+ oovList.add(new OOVItem(FormatUtils.markup(tokens[0]),
+ (float) Math.log(Float.parseFloat(tokens[1]))));
+
+ str = br.readLine();
+ }
+ br.close();
+ } catch(IOException e){
+ System.out.println(e);
+ }
+ } catch(IOException e){
+ System.out.println(e);
+ }
+ Collections.sort(oovList);
+
+ } else {
+ String[] tokens = fds[1].trim().split("\\s+");
+ if (tokens.length % 2 != 0) {
+ System.err.println(String.format("* FATAL: invalid format for '%s'", fds[0]));
+ System.exit(1);
+ }
+
+ oovList = new ArrayList<OOVItem>();
+
+ for (int i = 0; i < tokens.length; i += 2)
+ oovList.add(new OOVItem(FormatUtils.markup(tokens[i]),
+ (float) Math.log(Float.parseFloat(tokens[i + 1]))));
+
+ Collections.sort(oovList);
+ }
+
+ } else if (parameter.equals(normalize_key("lattice-decoding"))) {
+ lattice_decoding = true;
+
+ } else if (parameter.equals(normalize_key("segment-oovs"))) {
+ segment_oovs = true;
+ lattice_decoding = true;
+
+ } else if (parameter.equals(normalize_key("default-non-terminal"))) {
+ default_non_terminal = markup(cleanNonTerminal(fds[1].trim()));
+ logger.finest(String.format("default_non_terminal: %s", default_non_terminal));
+
+ } else if (parameter.equals(normalize_key("goal-symbol"))) {
+ goal_symbol = markup(cleanNonTerminal(fds[1].trim()));
+ logger.finest("goalSymbol: " + goal_symbol);
+
+ } else if (parameter.equals(normalize_key("weights-file"))) {
+ weights_file = fds[1];
+
+ } else if (parameter.equals(normalize_key("constrain_parse"))) {
+ constrain_parse = Boolean.parseBoolean(fds[1]);
+
+ } else if (parameter.equals(normalize_key("true_oovs_only"))) {
+ true_oovs_only = Boolean.parseBoolean(fds[1]);
+
+ } else if (parameter.equals(normalize_key("filter-grammar"))) {
+ filter_grammar = Boolean.parseBoolean(fds[1]);
+
+ } else if (parameter.equals(normalize_key("amortize"))) {
+ amortized_sorting = Boolean.parseBoolean(fds[1]);
+
+ } else if (parameter.equals(normalize_key("use_pos_labels"))) {
+ use_pos_labels = Boolean.parseBoolean(fds[1]);
+
+ } else if (parameter.equals(normalize_key("use_unique_nbest"))) {
+ use_unique_nbest = Boolean.valueOf(fds[1]);
+ logger.finest(String.format("use_unique_nbest: %s", use_unique_nbest));
+
+ } else if (parameter.equals(normalize_key("output-format"))) {
+ outputFormat = fds[1];
+ logger.finest(String.format("output-format: %s", outputFormat));
+
+ } else if (parameter.equals(normalize_key("include_align_index"))) {
+ include_align_index = Boolean.valueOf(fds[1]);
+ logger.finest(String.format("include_align_index: %s", include_align_index));
+
+ } else if (parameter.equals(normalize_key("top_n"))) {
+ topN = Integer.parseInt(fds[1]);
+ logger.finest(String.format("topN: %s", topN));
+
+ } else if (parameter.equals(normalize_key("num_parallel_decoders"))
+ || parameter.equals(normalize_key("threads"))) {
+ num_parallel_decoders = Integer.parseInt(fds[1]);
+ if (num_parallel_decoders <= 0) {
+ throw new IllegalArgumentException(
+ "Must specify a positive number for num_parallel_decoders");
+ }
+ logger.finest(String.format("num_parallel_decoders: %s", num_parallel_decoders));
+
+ } else if (parameter.equals(normalize_key("mark_oovs"))) {
+ mark_oovs = Boolean.valueOf(fds[1]);
+ logger.finest(String.format("mark_oovs: %s", mark_oovs));
+
+ } else if (parameter.equals(normalize_key("pop-limit"))) {
+ pop_limit = Integer.parseInt(fds[1]);
+ logger.finest(String.format("pop-limit: %s", pop_limit));
+
+ } else if (parameter.equals(normalize_key("input-type"))) {
+ if (fds[1].equals("json"))
+ input_type = INPUT_TYPE.json;
+ else if (fds[1].equals("plain"))
+ input_type = INPUT_TYPE.plain;
+ else {
+ System.err.println(String.format("* FATAL: invalid input-type '%s'", fds[1]));
+ System.exit(1);
+ }
+ logger.info(String.format(" input-type: %s", input_type));
+
+ } else if (parameter.equals(normalize_key("server-type"))) {
+ if (fds[1].toLowerCase().equals("tcp"))
+ server_type = SERVER_TYPE.TCP;
+ else if (fds[1].toLowerCase().equals("http"))
+ server_type = SERVER_TYPE.HTTP;
+
+ logger.info(String.format(" server-type: %s", server_type));
+
+ } else if (parameter.equals(normalize_key("server-port"))) {
+ server_port = Integer.parseInt(fds[1]);
+ logger.info(String.format(" server-port: %d", server_port));
+
+ } else if (parameter.equals(normalize_key("rescore-forest"))) {
+ rescoreForest = true;
+ logger.info(String.format(" rescore-forest: %s", rescoreForest));
+
+ } else if (parameter.equals(normalize_key("rescore-forest-weight"))) {
+ rescoreForestWeight = Float.parseFloat(fds[1]);
+ logger.info(String.format(" rescore-forest-weight: %f", rescoreForestWeight));
+
+ } else if (parameter.equals(normalize_key("maxlen"))) {
+ // reset the maximum length
+ maxlen = Integer.parseInt(fds[1]);
+
+ } else if (parameter.equals("c") || parameter.equals("config")) {
+ // this was used to send in the config file, just ignore it
+ ;
+
+ } else if (parameter.equals(normalize_key("feature-function"))) {
+ // add the feature to the list of features for later processing
+ features.add("feature_function = " + fds[1]);
+
+ } else if (parameter.equals(normalize_key("maxlen"))) {
+ // duplicate handler: reset the maximum length (maxlen is also handled above)
+ maxlen = Integer.parseInt(fds[1]);
+
+ } else if (parameter
+ .equals(normalize_key(SOFT_SYNTACTIC_CONSTRAINT_DECODING_PROPERTY_NAME))) {
+ fuzzy_matching = Boolean.parseBoolean(fds[1]);
+ logger.finest(String.format("fuzzy_matching: %s", fuzzy_matching));
+
+ } else if (parameter.equals(normalize_key("fragment-map"))) {
+ fragmentMapFile = fds[1];
+ Tree.readMapping(fragmentMapFile);
+
+ /** PHRASE-BASED PARAMETERS **/
+ } else if (parameter.equals(normalize_key("search"))) {
+ search_algorithm = fds[1];
+
+ if (!search_algorithm.equals("cky") && !search_algorithm.equals("stack")) {
+ throw new RuntimeException(
+ "-search must be one of 'stack' (for phrase-based decoding) " +
+ "or 'cky' (for hierarchical / syntactic decoding)");
+ }
+
+ if (search_algorithm.equals("cky") && include_align_index) {
+ throw new RuntimeException(
+ "include_align_index is currently not supported with cky search");
+ }
+
+ } else if (parameter.equals(normalize_key("reordering-limit"))) {
+ reordering_limit = Integer.parseInt(fds[1]);
+
+ } else if (parameter.equals(normalize_key("num-translation-options"))) {
+ num_translation_options = Integer.parseInt(fds[1]);
+
+ } else if (parameter.equals(normalize_key("no-dot-chart"))) {
+ use_dot_chart = false;
+
+ } else if (parameter.equals(normalize_key("moses"))) {
+ moses = true; // triggers some Moses-specific compatibility options
+
+ } else if (parameter.equals(normalize_key("show-weights"))) {
+ show_weights_and_quit = true;
+
+ } else if (parameter.equals(normalize_key("n-best-list"))) {
+ // for Moses compatibility
+ String[] tokens = fds[1].split("\\s+");
+ n_best_file = tokens[0];
+ if (tokens.length > 1)
+ topN = Integer.parseInt(tokens[1]);
+
+ } else if (parameter.equals(normalize_key("input-file"))) {
+ // for Moses compatibility
+ input_file = fds[1];
+
+ } else if (parameter.equals(normalize_key("weight-file"))) {
+ // for Moses, ignore
+
+ } else if (parameter.equals(normalize_key("weight-overwrite"))) {
+ weight_overwrite = fds[1];
+
+ } else if (parameter.equals(normalize_key("source-annotations"))) {
+ // Check source sentence
+ source_annotations = true;
+
+ } else if (parameter.equals(normalize_key("cached-rules-size"))) {
+ // Set the rule cache size
+ cachedRuleSize = Integer.parseInt(fds[1]);
+ } else if (parameter.equals(normalize_key("lowercase"))) {
+ lowercase = true;
+
+ } else if (parameter.equals(normalize_key("project-case"))) {
+ project_case = true;
+
+ } else {
+
+ if (parameter.equals(normalize_key("use-sent-specific-tm"))
+ || parameter.equals(normalize_key("add-combined-cost"))
+ || parameter.equals(normalize_key("use-tree-nbest"))
+ || parameter.equals(normalize_key("use-kenlm"))
+ || parameter.equals(normalize_key("useCubePrune"))
+ || parameter.equals(normalize_key("useBeamAndThresholdPrune"))
+ || parameter.equals(normalize_key("regexp-grammar"))) {
+ logger.warning(String.format("WARNING: ignoring deprecated parameter '%s'", fds[0]));
+
+ } else {
+ logger.warning("FATAL: unknown configuration parameter '" + fds[0] + "'");
+ System.exit(1);
+ }
+ }
+
+ Decoder.LOG(1, String.format(" %s = '%s'", normalize_key(fds[0]), fds[1]));
+
+ } else {
+ /*
+ * Lines that don't have an equals sign and are not blank lines, empty lines, or comments,
+ * are feature values, which can be present in this file
+ */
+
+ weights.add(line);
+ }
+ }
+ } finally {
+ configReader.close();
+ }
+ }
+
+ /**
+ * Checks for invalid variable configurations
+ */
+ public void sanityCheck() {
+ }
+
+ /**
+ * Normalizes parameter names by removing underscores and hyphens and lowercasing. This defines
+ * equivalence classes on external use of parameter names, permitting arbitrary_under_scores and
+ * camelCasing in parameter names without forcing the user to memorize them all. Here are some
+ * examples of equivalent ways to refer to parameter names:
+ *
+ * {pop-limit, poplimit, PopLimit, popLimit, pop_lim_it} {lmfile, lm-file, LM-FILE, lm_file}
+ */
+ public static String normalize_key(String text) {
+ return text.replaceAll("[-_]", "").toLowerCase();
+ }
+}
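As a rough illustration of the parameter handling in the class above (not part of the patch; the flag values are made up for the example), normalize_key() collapses hyphens, underscores, and case, and processCommandLineOptions() rewrites command-line flags as config-file lines before re-reading them:

    // All of these select the same parameter, since normalize_key() strips
    // "-" and "_" and lowercases: "poplimit"
    JoshuaConfiguration.normalize_key("pop-limit");
    JoshuaConfiguration.normalize_key("pop_limit");
    JoshuaConfiguration.normalize_key("PopLimit");

    // processCommandLineOptions() writes the flags to a temporary config file
    // and then calls readConfigFile() on it. For example, the flags
    //   -threads 4 -top-n 10 -mark_oovs
    // become the lines:
    //   threads = 4
    //   top-n = 10
    //   mark_oovs = true
    JoshuaConfiguration config = new JoshuaConfiguration();
    config.processCommandLineOptions(new String[] { "-threads", "4", "-top-n", "10", "-mark_oovs" });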
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/main/java/org/apache/joshua/decoder/JoshuaDecoder.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/JoshuaDecoder.java b/src/main/java/org/apache/joshua/decoder/JoshuaDecoder.java
new file mode 100644
index 0000000..841f517
--- /dev/null
+++ b/src/main/java/org/apache/joshua/decoder/JoshuaDecoder.java
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package joshua.decoder;
+
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.PrintStream;
+import java.net.InetSocketAddress;
+import java.util.logging.Logger;
+
+import com.sun.net.httpserver.HttpServer;
+
+import joshua.decoder.JoshuaConfiguration.SERVER_TYPE;
+import joshua.decoder.io.TranslationRequestStream;
+import joshua.server.TcpServer;
+import joshua.server.ServerThread;
+
+/**
+ * Implements decoder initialization, including interaction with <code>JoshuaConfiguration</code>
+ * and <code>DecoderThread</code>.
+ *
+ * @author Zhifei Li, <zh...@gmail.com>
+ * @author wren ng thornton <wr...@users.sourceforge.net>
+ * @author Lane Schwartz <do...@users.sourceforge.net>
+ */
+public class JoshuaDecoder {
+
+ private static final Logger logger = Logger.getLogger(JoshuaDecoder.class.getName());
+
+ // ===============================================================
+ // Main
+ // ===============================================================
+ public static void main(String[] args) throws IOException {
+
+ JoshuaConfiguration joshuaConfiguration = new JoshuaConfiguration();
+ ArgsParser userArgs = new ArgsParser(args,joshuaConfiguration);
+
+ String logFile = System.getenv().get("JOSHUA") + "/logging.properties";
+ try {
+ java.util.logging.LogManager.getLogManager().readConfiguration(new FileInputStream(logFile));
+ } catch (IOException e) {
+ logger.warning("Couldn't initialize logging properties from '" + logFile + "'");
+ }
+
+ long startTime = System.currentTimeMillis();
+
+ /* Step-0: some sanity checking */
+ joshuaConfiguration.sanityCheck();
+
+ /* Step-1: initialize the decoder, test-set independent */
+ Decoder decoder = new Decoder(joshuaConfiguration, userArgs.getConfigFile());
+
+ Decoder.LOG(1, String.format("Model loading took %d seconds",
+ (System.currentTimeMillis() - startTime) / 1000));
+ Decoder.LOG(1, String.format("Memory used %.1f MB", ((Runtime.getRuntime().totalMemory() - Runtime
+ .getRuntime().freeMemory()) / 1000000.0)));
+
+ /* Step-2: Decoding */
+ // create a server if requested, which will create TranslationRequest objects
+ if (joshuaConfiguration.server_port > 0) {
+ int port = joshuaConfiguration.server_port;
+ if (joshuaConfiguration.server_type == SERVER_TYPE.TCP) {
+ new TcpServer(decoder, port, joshuaConfiguration).start();
+
+ } else if (joshuaConfiguration.server_type == SERVER_TYPE.HTTP) {
+ HttpServer server = HttpServer.create(new InetSocketAddress(port), 0);
+ Decoder.LOG(1, String.format("** HTTP Server running and listening on port %d.", port));
+ server.createContext("/", new ServerThread(null, decoder, joshuaConfiguration));
+ server.setExecutor(null); // creates a default executor
+ server.start();
+ } else {
+ System.err.println("* FATAL: unknown server type");
+ System.exit(1);
+ }
+ return;
+ }
+
+ // Create the n-best output stream
+ FileWriter out = null;
+ if (joshuaConfiguration.n_best_file != null)
+ out = new FileWriter(joshuaConfiguration.n_best_file);
+
+ // Create a TranslationRequest object, reading from a file if requested, or from STDIN
+ InputStream input = (joshuaConfiguration.input_file != null)
+ ? new FileInputStream(joshuaConfiguration.input_file)
+ : System.in;
+
+ BufferedReader reader = new BufferedReader(new InputStreamReader(input));
+ TranslationRequestStream fileRequest = new TranslationRequestStream(reader, joshuaConfiguration);
+ decoder.decodeAll(fileRequest, new PrintStream(System.out));
+
+ if (joshuaConfiguration.n_best_file != null)
+ out.close();
+
+ Decoder.LOG(1, "Decoding completed.");
+ Decoder.LOG(1, String.format("Memory used %.1f MB", ((Runtime.getRuntime().totalMemory() - Runtime
+ .getRuntime().freeMemory()) / 1000000.0)));
+
+ /* Step-3: clean up */
+ decoder.cleanUp();
+ Decoder.LOG(1, String.format("Total running time: %d seconds",
+ (System.currentTimeMillis() - startTime) / 1000));
+ }
+}
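For embedded use, the essentials of main() above can be driven directly from Java. A minimal sketch, assuming an args array like the one normally passed to main(), with imports matching those at the top of the file and error handling omitted:

    JoshuaConfiguration config = new JoshuaConfiguration();
    ArgsParser userArgs = new ArgsParser(args, config);

    // Load the models once, then stream translation requests through the decoder.
    Decoder decoder = new Decoder(config, userArgs.getConfigFile());
    BufferedReader reader = new BufferedReader(new InputStreamReader(System.in));
    TranslationRequestStream request = new TranslationRequestStream(reader, config);
    decoder.decodeAll(request, new PrintStream(System.out));
    decoder.cleanUp();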
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/main/java/org/apache/joshua/decoder/MetaDataException.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/MetaDataException.java b/src/main/java/org/apache/joshua/decoder/MetaDataException.java
new file mode 100644
index 0000000..932059c
--- /dev/null
+++ b/src/main/java/org/apache/joshua/decoder/MetaDataException.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package joshua.decoder;
+
+/*
+ * This class is used to capture metadata commands given to Joshua on input and pass them to the
+ * decoder.
+ */
+
+public class MetaDataException extends Exception {
+ private String type = null;
+ private String tokenString = null;
+
+ public MetaDataException(String message) {
+ int firstSpace = message.indexOf(' ');
+ if (firstSpace != -1) {
+ this.type = message.substring(1, firstSpace);
+ this.tokenString = message.substring(firstSpace + 1);
+ } else if (message.length() > 0) {
+ this.type = message.substring(1);
+ this.tokenString = "";
+ }
+ }
+
+ public String type() {
+ return this.type;
+ }
+
+ public String tokenString() {
+ return this.tokenString;
+ }
+
+ public String[] tokens(String regex) {
+ return this.tokenString.split(regex);
+ }
+
+ public String[] tokens() {
+ return this.tokens("\\s+");
+ }
+}
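A small sketch of how the constructor above splits a metadata line; the example string is made up, and the leading marker character (whatever it is on real input) is simply dropped:

    MetaDataException meta = new MetaDataException("@weights lm 0.5 tm 1.0");
    meta.type();         // "weights"  (text between the marker and the first space)
    meta.tokenString();  // "lm 0.5 tm 1.0"
    meta.tokens();       // {"lm", "0.5", "tm", "1.0"}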
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/main/java/org/apache/joshua/decoder/NbestMinRiskReranker.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/NbestMinRiskReranker.java b/src/main/java/org/apache/joshua/decoder/NbestMinRiskReranker.java
new file mode 100644
index 0000000..9596ae0
--- /dev/null
+++ b/src/main/java/org/apache/joshua/decoder/NbestMinRiskReranker.java
@@ -0,0 +1,441 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package joshua.decoder;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map.Entry;
+import java.util.Scanner;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.PriorityBlockingQueue;
+import java.util.concurrent.TimeUnit;
+
+import joshua.util.Ngram;
+import joshua.util.Regex;
+
+
+/**
+ * This class implements n-best minimum Bayes risk (MBR) reranking using BLEU as the gain function.
+ * <p>
+ * It assumes that each string in the n-best list is unique. In Hiero, due to spurious ambiguity, a
+ * string may correspond to many possible derivations, and ideally the probability of a string
+ * should be the sum of all the derivations leading to that string. But, in practice, one normally
+ * uses a Viterbi approximation: the probability of a string is the probability of its best
+ * derivation. So, anyone who wants to deal with spurious ambiguity should do so before calling
+ * this class.
+ *
+ * @author Zhifei Li, <zh...@gmail.com>
+ */
+public class NbestMinRiskReranker {
+
+ // TODO: this functionality is not implemented yet; default is to produce 1best without any
+ // feature scores;
+ boolean produceRerankedNbest = false;
+
+ double scalingFactor = 1.0;
+
+ static int bleuOrder = 4;
+ static boolean doNgramClip = true;
+
+ static boolean useGoogleLinearCorpusGain = false;
+
+ final PriorityBlockingQueue<RankerResult> resultsQueue =
+ new PriorityBlockingQueue<RankerResult>();
+
+ public NbestMinRiskReranker(boolean produceRerankedNbest, double scalingFactor) {
+ this.produceRerankedNbest = produceRerankedNbest;
+ this.scalingFactor = scalingFactor;
+ }
+
+
+ public String processOneSent(List<String> nbest, int sentID) {
+ System.err.println("Now process sentence " + sentID);
+
+ // step-0: preprocess
+ // assumption: each hyp has the format:
+ // "sent_id ||| hyp_itself ||| feature scores ||| linear-combination-of-feature-scores(this should be logP)"
+
+ /* Quit if you find an empty hypothesis. */
+ if (nbest.size() == 1) {
+ String[] fields = Regex.threeBarsWithSpace.split(nbest.get(0));
+ if (fields[1].equals("") || Regex.spaces.matches(fields[1])) {
+ System.err.println(String.format("-> sentence is empty"));
+ return "";
+ }
+ }
+
+ List<String> hypsItself = new ArrayList<String>();
+ // ArrayList<String> l_feat_scores = new ArrayList<String>();
+ List<Double> baselineScores = new ArrayList<Double>(); // linear combination of all baseline
+ // features
+ List<HashMap<String, Integer>> ngramTbls = new ArrayList<HashMap<String, Integer>>();
+ List<Integer> sentLens = new ArrayList<Integer>();
+
+ for (String hyp : nbest) {
+ String[] fds = Regex.threeBarsWithSpace.split(hyp);
+ int tSentID = Integer.parseInt(fds[0]);
+ if (sentID != tSentID) {
+ throw new RuntimeException("sentence_id does not match");
+ }
+ String hypothesis = (fds.length >= 4) ? fds[1] : "";
+ hypsItself.add(hypothesis);
+
+ String[] words = Regex.spaces.split(hypothesis);
+ sentLens.add(words.length);
+
+ HashMap<String, Integer> ngramTbl = new HashMap<String, Integer>();
+ Ngram.getNgrams(ngramTbl, 1, bleuOrder, words);
+ ngramTbls.add(ngramTbl);
+
+ // l_feat_scores.add(fds[2]);
+
+ // The value of finalIndex is expected to be 3,
+ // unless the hyp_itself is empty,
+ // in which case finalIndex will be 2.
+ int finalIndex = fds.length - 1;
+ baselineScores.add(Double.parseDouble(fds[finalIndex]));
+
+ }
+
+ // step-1: get normalized distribution
+
+ /**
+ * values in baselineScores will be changed to normalized probabilities
+ * */
+ computeNormalizedProbs(baselineScores, scalingFactor);
+
+ List<Double> normalizedProbs = baselineScores;
+
+ // === required by google linear corpus gain
+ HashMap<String, Double> posteriorCountsTbl = null;
+ if (useGoogleLinearCorpusGain) {
+ posteriorCountsTbl = new HashMap<String, Double>();
+ getGooglePosteriorCounts(ngramTbls, normalizedProbs, posteriorCountsTbl);
+ }
+
+
+ // step-2: rerank the nbest
+ /**
+ * TODO: zhifei: the re-ranking currently takes O(n^2), where n is the size of the nbest. We can
+ * speed this up significantly (to O(n)) by first estimating a model on the nbest, and then
+ * reranking the nbest using the estimated model.
+ * */
+ double bestGain = -1000000000;// set as worst gain
+ String bestHyp = null;
+ List<Double> gains = new ArrayList<Double>();
+ for (int i = 0; i < hypsItself.size(); i++) {
+ String curHyp = hypsItself.get(i);
+ int curHypLen = sentLens.get(i);
+ HashMap<String, Integer> curHypNgramTbl = ngramTbls.get(i);
+ // double cur_gain = computeGain(cur_hyp, l_hyp_itself, l_normalized_probs);
+ double curGain = 0;
+ if (useGoogleLinearCorpusGain) {
+ curGain = computeExpectedLinearCorpusGain(curHypLen, curHypNgramTbl, posteriorCountsTbl);
+ } else {
+ curGain =
+ computeExpectedGain(curHypLen, curHypNgramTbl, ngramTbls, sentLens, normalizedProbs);
+ }
+
+ gains.add(curGain);
+ if (i == 0 || curGain > bestGain) { // maximize
+ bestGain = curGain;
+ bestHyp = curHyp;
+ }
+ }
+
+ // step-3: output the 1best or nbest
+ if (this.produceRerankedNbest) {
+ // TODO: sort the list and write the reranked nbest; Use Collections.sort(List list,
+ // Comparator c)
+ } else {
+ /*
+ * this.out.write(best_hyp); this.out.write("\n"); out.flush();
+ */
+ }
+
+ System.err.println("best gain: " + bestGain);
+ if (null == bestHyp) {
+ throw new RuntimeException("mbr reranked one best is null, must be wrong");
+ }
+ return bestHyp;
+ }
+
+
+ /**
+ * based on a list of log-probabilities in nbestLogProbs, obtain a normalized distribution, and
+ * put the normalized probability (real value in [0,1]) into nbestLogProbs
+ * */
+ // get a normalized distribution and put it back into nbestLogProbs
+ static public void computeNormalizedProbs(List<Double> nbestLogProbs, double scalingFactor) {
+
+ // === get normalization constant, remember features, remember the combined linear score
+ double normalizationConstant = Double.NEGATIVE_INFINITY;// log-semiring
+
+ for (double logp : nbestLogProbs) {
+ normalizationConstant = addInLogSemiring(normalizationConstant, logp * scalingFactor, 0);
+ }
+ // System.out.println("normalization_constant (logP) is " + normalization_constant);
+
+ // === get normalized prob for each hyp
+ double tSum = 0;
+ for (int i = 0; i < nbestLogProbs.size(); i++) {
+
+ double normalizedProb =
+ Math.exp(nbestLogProbs.get(i) * scalingFactor - normalizationConstant);
+ tSum += normalizedProb;
+ nbestLogProbs.set(i, normalizedProb);
+
+ if (Double.isNaN(normalizedProb)) {
+ throw new RuntimeException("prob is NaN, must be wrong\nnbest_logps.get(i): "
+ + nbestLogProbs.get(i) + "; scaling_factor: " + scalingFactor
+ + "; normalization_constant:" + normalizationConstant);
+ }
+ // logger.info("probability: " + normalized_prob);
+ }
+
+ // sanity check
+ if (Math.abs(tSum - 1.0) > 1e-4) {
+ throw new RuntimeException("probabilities do not sum to one, must be wrong");
+ }
+
+ }
+
+
+ // Gain(e) = negative risk = \sum_{e'} G(e, e')P(e')
+ // curHyp: e
+ // trueHyp: e'
+ public double computeExpectedGain(int curHypLen, HashMap<String, Integer> curHypNgramTbl,
+ List<HashMap<String, Integer>> ngramTbls, List<Integer> sentLens, List<Double> nbestProbs) {
+
+ // ### get normalization constant, remember features, remember the combined linear score
+ double gain = 0;
+
+ for (int i = 0; i < nbestProbs.size(); i++) {
+ HashMap<String, Integer> trueHypNgramTbl = ngramTbls.get(i);
+ double trueProb = nbestProbs.get(i);
+ int trueLen = sentLens.get(i);
+ gain +=
+ trueProb
+ * BLEU.computeSentenceBleu(trueLen, trueHypNgramTbl, curHypLen, curHypNgramTbl,
+ doNgramClip, bleuOrder);
+ }
+ // System.out.println("Gain is " + gain);
+ return gain;
+ }
+
+ // Gain(e) = negative risk = \sum_{e'} G(e, e')P(e')
+ // curHyp: e
+ // trueHyp: e'
+ static public double computeExpectedGain(String curHyp, List<String> nbestHyps,
+ List<Double> nbestProbs) {
+ // ### get normalization constant, remember features, remember the combined linear score
+ double gain = 0;
+
+ for (int i = 0; i < nbestHyps.size(); i++) {
+ String trueHyp = nbestHyps.get(i);
+ double trueProb = nbestProbs.get(i);
+ gain += trueProb * BLEU.computeSentenceBleu(trueHyp, curHyp, doNgramClip, bleuOrder);
+ }
+ // System.out.println("Gain is " + gain);
+ return gain;
+ }
+
+ void getGooglePosteriorCounts(List<HashMap<String, Integer>> ngramTbls,
+ List<Double> normalizedProbs, HashMap<String, Double> posteriorCountsTbl) {
+ // TODO
+ }
+
+ double computeExpectedLinearCorpusGain(int curHypLen, HashMap<String, Integer> curHypNgramTbl,
+ HashMap<String, Double> posteriorCountsTbl) {
+ // TODO
+ double[] thetas = {-1, 1, 1, 1, 1};
+
+ double res = 0;
+ res += thetas[0] * curHypLen;
+ for (Entry<String, Integer> entry : curHypNgramTbl.entrySet()) {
+ String key = entry.getKey();
+ String[] tem = Regex.spaces.split(key);
+
+ double post_prob = posteriorCountsTbl.get(key);
+ res += entry.getValue() * post_prob * thetas[tem.length];
+ }
+ return res;
+ }
+
+ // OR: return Math.log(Math.exp(x) + Math.exp(y));
+ static private double addInLogSemiring(double x, double y, int addMode) {// prevent over-flow
+ if (addMode == 0) { // sum
+ if (x == Double.NEGATIVE_INFINITY) {// if y is also n-infinity, then return n-infinity
+ return y;
+ }
+ if (y == Double.NEGATIVE_INFINITY) {
+ return x;
+ }
+
+ if (y <= x) {
+ return x + Math.log(1 + Math.exp(y - x));
+ } else {
+ return y + Math.log(1 + Math.exp(x - y));
+ }
+ } else if (addMode == 1) { // viter-min
+ return (x <= y) ? x : y;
+ } else if (addMode == 2) { // viter-max
+ return (x >= y) ? x : y;
+ } else {
+ throw new RuntimeException("invalid add mode");
+ }
+ }
+
+
+
+ public static void main(String[] args) throws IOException {
+
+ // If you don't know what to use for scaling factor, try using 1
+
+ if (args.length < 2) {
+ System.err
+ .println("usage: java NbestMinRiskReranker <produce_reranked_nbest> <scaling_factor> [numThreads]");
+ return;
+ }
+ long startTime = System.currentTimeMillis();
+ boolean produceRerankedNbest = Boolean.valueOf(args[0].trim());
+ double scalingFactor = Double.parseDouble(args[1].trim());
+ int numThreads = (args.length > 2) ? Integer.parseInt(args[2].trim()) : 1;
+
+
+ NbestMinRiskReranker mbrReranker =
+ new NbestMinRiskReranker(produceRerankedNbest, scalingFactor);
+
+ System.err.println("##############running mbr reranking");
+
+ int oldSentID = -1;
+ List<String> nbest = new ArrayList<String>();
+
+ Scanner scanner = new Scanner(System.in, "UTF-8");
+
+ if (numThreads == 1) {
+
+ while (scanner.hasNextLine()) {
+ String line = scanner.nextLine();
+ String[] fds = Regex.threeBarsWithSpace.split(line);
+ int newSentID = Integer.parseInt(fds[0]);
+ if (oldSentID != -1 && oldSentID != newSentID) {
+ if (nbest.size() > 0) {
+ String best_hyp = mbrReranker.processOneSent(nbest, oldSentID);// nbest: list of unique
+ // strings
+ System.out.println(best_hyp);
+ } else {
+ System.out.println();
+ }
+ nbest.clear();
+ }
+ oldSentID = newSentID;
+ if (!fds[1].matches("^\\s*$")) nbest.add(line);
+ }
+
+ // last nbest
+ if (oldSentID >= 0) {
+ String bestHyp = mbrReranker.processOneSent(nbest, oldSentID);
+ System.out.println(bestHyp);
+ nbest.clear();
+ }
+
+ } else {
+
+ ExecutorService threadPool = Executors.newFixedThreadPool(numThreads);
+
+ while (scanner.hasNextLine()) {
+ String line = scanner.nextLine();
+ String[] fds = Regex.threeBarsWithSpace.split(line);
+ int newSentID = Integer.parseInt(fds[0]);
+ if (oldSentID != -1 && oldSentID != newSentID) {
+
+ threadPool.execute(mbrReranker.new RankerTask(nbest, oldSentID));
+
+ nbest.clear();
+ }
+ oldSentID = newSentID;
+ nbest.add(line);
+ }
+
+ // last nbest
+ threadPool.execute(mbrReranker.new RankerTask(nbest, oldSentID));
+ nbest.clear();
+
+ threadPool.shutdown();
+
+ try {
+ threadPool.awaitTermination(Integer.MAX_VALUE, TimeUnit.SECONDS);
+
+ while (!mbrReranker.resultsQueue.isEmpty()) {
+ RankerResult result = mbrReranker.resultsQueue.remove();
+ String best_hyp = result.toString();
+ System.out.println(best_hyp);
+ }
+
+
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ }
+
+ }
+
+ scanner.close();
+
+ System.err.println("Total running time (seconds) is "
+ + (System.currentTimeMillis() - startTime) / 1000.0);
+ }
+
+ private class RankerTask implements Runnable {
+
+ final List<String> nbest;
+ final int sentID;
+
+ RankerTask(final List<String> nbest, final int sentID) {
+ this.nbest = new ArrayList<String>(nbest);
+ this.sentID = sentID;
+ }
+
+ public void run() {
+ String result = processOneSent(nbest, sentID);
+ resultsQueue.add(new RankerResult(result, sentID));
+ }
+
+ }
+
+ private static class RankerResult implements Comparable<RankerResult> {
+ final String result;
+ final Integer sentenceNumber;
+
+ RankerResult(String result, int sentenceNumber) {
+ this.result = result;
+ this.sentenceNumber = sentenceNumber;
+ }
+
+ public int compareTo(RankerResult o) {
+ return sentenceNumber.compareTo(o.sentenceNumber);
+ }
+
+ public String toString() {
+ return result;
+ }
+ }
+}
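To make the normalization step above concrete, here is a small sketch (made-up scores, java.util imports assumed) of computeNormalizedProbs(), which exponentiates the scaled log-scores and renormalizes them in place:

    List<Double> scores = new ArrayList<Double>(Arrays.asList(-2.0, -2.5, -4.0));
    NbestMinRiskReranker.computeNormalizedProbs(scores, 1.0);
    // scores now holds the posterior distribution, roughly [0.57, 0.35, 0.08];
    // the method itself checks that the values sum to one.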
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/main/java/org/apache/joshua/decoder/StructuredTranslation.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/StructuredTranslation.java b/src/main/java/org/apache/joshua/decoder/StructuredTranslation.java
new file mode 100644
index 0000000..7b2185f
--- /dev/null
+++ b/src/main/java/org/apache/joshua/decoder/StructuredTranslation.java
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package joshua.decoder;
+
+import static java.util.Arrays.asList;
+import static java.util.Collections.emptyList;
+import static joshua.decoder.hypergraph.ViterbiExtractor.getViterbiFeatures;
+import static joshua.decoder.hypergraph.ViterbiExtractor.getViterbiString;
+import static joshua.decoder.hypergraph.ViterbiExtractor.getViterbiWordAlignmentList;
+import static joshua.util.FormatUtils.removeSentenceMarkers;
+
+import java.util.List;
+import java.util.Map;
+
+import joshua.decoder.ff.FeatureFunction;
+import joshua.decoder.hypergraph.HyperGraph;
+import joshua.decoder.segment_file.Sentence;
+
+/**
+ * StructuredTranslation provides more structured access to translation
+ * results than the Translation class.
+ * Members of instances of this class can be used upstream.
+ * <br/>
+ * TODO:
+ * Enable K-Best extraction.
+ *
+ * @author fhieber
+ */
+public class StructuredTranslation {
+
+ private final Sentence sourceSentence;
+ private final String translationString;
+ private final List<String> translationTokens;
+ private final float translationScore;
+ private final List<List<Integer>> translationWordAlignments;
+ private final Map<String,Float> translationFeatures;
+ private final float extractionTime;
+
+ public StructuredTranslation(final Sentence sourceSentence,
+ final HyperGraph hypergraph,
+ final List<FeatureFunction> featureFunctions) {
+
+ final long startTime = System.currentTimeMillis();
+
+ this.sourceSentence = sourceSentence;
+ this.translationString = removeSentenceMarkers(getViterbiString(hypergraph));
+ this.translationTokens = extractTranslationTokens();
+ this.translationScore = extractTranslationScore(hypergraph);
+ this.translationFeatures = getViterbiFeatures(hypergraph, featureFunctions, sourceSentence).getMap();
+ this.translationWordAlignments = getViterbiWordAlignmentList(hypergraph);
+ this.extractionTime = (System.currentTimeMillis() - startTime) / 1000.0f;
+ }
+
+ private float extractTranslationScore(final HyperGraph hypergraph) {
+ if (hypergraph == null) {
+ return 0;
+ } else {
+ return hypergraph.goalNode.getScore();
+ }
+ }
+
+ private List<String> extractTranslationTokens() {
+ if (translationString.isEmpty()) {
+ return emptyList();
+ } else {
+ return asList(translationString.split("\\s+"));
+ }
+ }
+
+ // Getters to use upstream
+
+ public Sentence getSourceSentence() {
+ return sourceSentence;
+ }
+
+ public int getSentenceId() {
+ return sourceSentence.id();
+ }
+
+ public String getTranslationString() {
+ return translationString;
+ }
+
+ public List<String> getTranslationTokens() {
+ return translationTokens;
+ }
+
+ public float getTranslationScore() {
+ return translationScore;
+ }
+
+ /**
+ * Returns a list of target to source alignments.
+ */
+ public List<List<Integer>> getTranslationWordAlignments() {
+ return translationWordAlignments;
+ }
+
+ public Map<String,Float> getTranslationFeatures() {
+ return translationFeatures;
+ }
+
+ /**
+ * Time taken to build output information from the hypergraph.
+ */
+ public Float getExtractionTime() {
+ return extractionTime;
+ }
+}
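When use_structured_output is enabled in JoshuaConfiguration, this class gives callers programmatic access to the result instead of a formatted output string. A sketch, assuming an instance st obtained from a decoded sentence:

    String hyp                = st.getTranslationString();   // Viterbi string, sentence markers removed
    List<String> tokens       = st.getTranslationTokens();
    float score               = st.getTranslationScore();    // goal node score, 0 if decoding failed
    Map<String, Float> feats  = st.getTranslationFeatures();
    List<List<Integer>> align = st.getTranslationWordAlignments(); // target-to-source alignments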
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/main/java/org/apache/joshua/decoder/Support.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/Support.java b/src/main/java/org/apache/joshua/decoder/Support.java
new file mode 100644
index 0000000..af33ec5
--- /dev/null
+++ b/src/main/java/org/apache/joshua/decoder/Support.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package joshua.decoder;
+
+import java.util.List;
+
+/**
+ * @author Zhifei Li, <zh...@gmail.com>
+ */
+public class Support {
+
+ public static double findMin(double a, double b) {
+ return (a <= b) ? a : b;
+ }
+
+ public static double findMax(double a, double b) {
+ return (a > b) ? a : b;
+ }
+
+
+ public static int[] toArray(List<Integer> in) {
+ return subIntArray(in, 0, in.size());
+ }
+
+ /**
+ * @param start inclusive
+ * @param end exclusive
+ */
+ public static int[] subIntArray(List<Integer> in, int start, int end) {
+ int[] res = new int[end - start];
+ for (int i = start; i < end; i++) {
+ res[i - start] = in.get(i);
+ }
+ return res;
+ }
+
+ public static long current_time() {
+ return 0;
+ // return System.currentTimeMillis();
+ // return System.nanoTime();
+ }
+
+ // Only used in LMGrammarJAVA
+ public static long getMemoryUse() {
+ putOutTheGarbage();
+ long totalMemory = Runtime.getRuntime().totalMemory();// all the memory I get from the system
+ putOutTheGarbage();
+ long freeMemory = Runtime.getRuntime().freeMemory();
+ return (totalMemory - freeMemory) / 1024;// in terms of kb
+ }
+
+ private static void putOutTheGarbage() {
+ collectGarbage();
+ collectGarbage();
+ }
+
+ private static void collectGarbage() {
+ long fSLEEP_INTERVAL = 100;
+ try {
+ System.gc();
+ Thread.sleep(fSLEEP_INTERVAL);
+ System.runFinalization();
+ Thread.sleep(fSLEEP_INTERVAL);
+
+ } catch (InterruptedException ex) {
+ ex.printStackTrace();
+ }
+ }
+}
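A quick sketch of the list-to-array helpers above (start is inclusive, end exclusive; java.util imports assumed):

    List<Integer> ids = Arrays.asList(4, 8, 15, 16, 23);
    int[] all = Support.toArray(ids);            // {4, 8, 15, 16, 23}
    int[] mid = Support.subIntArray(ids, 1, 4);  // {8, 15, 16}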
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/main/java/org/apache/joshua/decoder/Translation.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/Translation.java b/src/main/java/org/apache/joshua/decoder/Translation.java
new file mode 100644
index 0000000..8004d9f
--- /dev/null
+++ b/src/main/java/org/apache/joshua/decoder/Translation.java
@@ -0,0 +1,202 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package joshua.decoder;
+
+import static joshua.decoder.hypergraph.ViterbiExtractor.getViterbiFeatures;
+import static joshua.decoder.hypergraph.ViterbiExtractor.getViterbiString;
+import static joshua.decoder.hypergraph.ViterbiExtractor.getViterbiWordAlignments;
+import static joshua.util.FormatUtils.removeSentenceMarkers;
+
+import java.io.BufferedWriter;
+import java.io.IOException;
+import java.io.StringWriter;
+import java.util.List;
+
+import joshua.decoder.ff.FeatureFunction;
+import joshua.decoder.ff.FeatureVector;
+import joshua.decoder.ff.lm.StateMinimizingLanguageModel;
+import joshua.decoder.hypergraph.HyperGraph;
+import joshua.decoder.hypergraph.KBestExtractor;
+import joshua.decoder.io.DeNormalize;
+import joshua.decoder.segment_file.Sentence;
+
+/**
+ * This class represents translated input objects (sentences or lattices). It is aware of the source
+ * sentence and id and contains the decoded hypergraph. Translation objects are returned by
+ * DecoderThread instances to the InputHandler, where they are assembled in order for output.
+ *
+ * @author Matt Post <po...@cs.jhu.edu>
+ */
+
+public class Translation {
+ private Sentence source;
+
+ /**
+ * This stores the output of the translation so we don't have to hold onto the hypergraph while we
+ * wait for the outputs to be assembled.
+ */
+ private String output = null;
+
+ private StructuredTranslation structuredTranslation = null;
+
+ public Translation(Sentence source, HyperGraph hypergraph,
+ List<FeatureFunction> featureFunctions, JoshuaConfiguration joshuaConfiguration) {
+ this.source = source;
+
+ if (joshuaConfiguration.use_structured_output) {
+
+ structuredTranslation = new StructuredTranslation(
+ source, hypergraph, featureFunctions);
+ this.output = structuredTranslation.getTranslationString();
+
+ } else {
+
+ StringWriter sw = new StringWriter();
+ BufferedWriter out = new BufferedWriter(sw);
+
+ try {
+ if (hypergraph != null) {
+ if (!joshuaConfiguration.hypergraphFilePattern.equals("")) {
+ hypergraph.dump(String.format(joshuaConfiguration.hypergraphFilePattern, source.id()), featureFunctions);
+ }
+
+ long startTime = System.currentTimeMillis();
+
+ // We must set this weight to zero; otherwise we get an error when we try to retrieve it
+ // without checking
+ Decoder.weights.increment("BLEU", 0);
+
+ if (joshuaConfiguration.topN == 0) {
+
+ /* construct Viterbi output */
+ final String best = getViterbiString(hypergraph);
+
+ Decoder.LOG(1, String.format("Translation %d: %.3f %s", source.id(), hypergraph.goalNode.getScore(),
+ best));
+
+ /*
+ * Setting topN to 0 turns off k-best extraction, in which case we build the output string
+ * directly; we substitute the output string, sentence number, and model score, plus the
+ * word alignments and feature values if the output format requests them.
+ */
+ String translation = joshuaConfiguration.outputFormat
+ .replace("%s", removeSentenceMarkers(best))
+ .replace("%S", DeNormalize.processSingleLine(best))
+ .replace("%c", String.format("%.3f", hypergraph.goalNode.getScore()))
+ .replace("%i", String.format("%d", source.id()));
+
+ if (joshuaConfiguration.outputFormat.contains("%a")) {
+ translation = translation.replace("%a", getViterbiWordAlignments(hypergraph));
+ }
+
+ if (joshuaConfiguration.outputFormat.contains("%f")) {
+ final FeatureVector features = getViterbiFeatures(hypergraph, featureFunctions, source);
+ translation = translation.replace("%f", joshuaConfiguration.moses ? features.mosesString() : features.toString());
+ }
+
+ out.write(translation);
+ out.newLine();
+
+ } else {
+
+ final KBestExtractor kBestExtractor = new KBestExtractor(
+ source, featureFunctions, Decoder.weights, false, joshuaConfiguration);
+ kBestExtractor.lazyKBestExtractOnHG(hypergraph, joshuaConfiguration.topN, out);
+
+ if (joshuaConfiguration.rescoreForest) {
+ Decoder.weights.increment("BLEU", joshuaConfiguration.rescoreForestWeight);
+ kBestExtractor.lazyKBestExtractOnHG(hypergraph, joshuaConfiguration.topN, out);
+
+ Decoder.weights.increment("BLEU", -joshuaConfiguration.rescoreForestWeight);
+ kBestExtractor.lazyKBestExtractOnHG(hypergraph, joshuaConfiguration.topN, out);
+ }
+ }
+
+ float seconds = (float) (System.currentTimeMillis() - startTime) / 1000.0f;
+ Decoder.LOG(1, String.format("Input %d: %d-best extraction took %.3f seconds", id(),
+ joshuaConfiguration.topN, seconds));
+
+ } else {
+
+ // Failed translations and blank lines get empty formatted outputs
+ // @formatter:off
+ String outputString = joshuaConfiguration.outputFormat
+ .replace("%s", source.source())
+ .replace("%e", "")
+ .replace("%S", "")
+ .replace("%t", "()")
+ .replace("%i", Integer.toString(source.id()))
+ .replace("%f", "")
+ .replace("%c", "0.000");
+ // @formatter:on
+
+ out.write(outputString);
+ out.newLine();
+ }
+
+ out.flush();
+ } catch (IOException e) {
+ e.printStackTrace();
+ System.exit(1);
+ }
+
+ this.output = sw.toString();
+
+ }
+
+ /*
+ * KenLM hack. If a KenLM-backed StateMinimizingLanguageModel is in use, we need to tell KenLM
+ * to delete the pool used to create chart objects for this sentence.
+ */
+ for (FeatureFunction feature : featureFunctions) {
+ if (feature instanceof StateMinimizingLanguageModel) {
+ ((StateMinimizingLanguageModel) feature).destroyPool(getSourceSentence().id());
+ break;
+ }
+ }
+
+ }
+
+ public Sentence getSourceSentence() {
+ return this.source;
+ }
+
+ public int id() {
+ return source.id();
+ }
+
+ @Override
+ public String toString() {
+ return output;
+ }
+
+ /**
+ * Returns the StructuredTranslation object, which is only created when
+ * JoshuaConfiguration.use_structured_output is set to true.
+ *
+ * @return the StructuredTranslation object
+ * @throws RuntimeException if the StructuredTranslation object was not created
+ */
+ public StructuredTranslation getStructuredTranslation() {
+ if (structuredTranslation == null) {
+ throw new RuntimeException("No StructuredTranslation object created. You should set JoshuaConfiguration.use_structured_output = true");
+ }
+ return structuredTranslation;
+ }
+
+}
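A hedged sketch of how a caller might consume a Translation once decoding completes. The decode()
entry point and the config variable are illustrative assumptions, not part of this file; only
id(), toString(), and getStructuredTranslation() come from the class above:

    Translation t = decoder.decode(sentence);             // hypothetical decoder entry point
    System.err.println("sentence " + t.id() + ": " + t);  // toString() returns the formatted output

    if (config.use_structured_output) {                   // assumed JoshuaConfiguration instance
      StructuredTranslation st = t.getStructuredTranslation();
      // st carries the translation string (plus features/alignments) for API consumers
    }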
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/main/java/org/apache/joshua/decoder/Translations.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/Translations.java b/src/main/java/org/apache/joshua/decoder/Translations.java
new file mode 100644
index 0000000..e6ba9e6
--- /dev/null
+++ b/src/main/java/org/apache/joshua/decoder/Translations.java
@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package joshua.decoder;
+
+import java.util.LinkedList;
+import joshua.decoder.io.TranslationRequestStream;
+
+/**
+ * This class represents a streaming sequence of translations. It is returned by the main entry
+ * point to the Decoder object, the call to decodeAll. The translations here are parallel to the
+ * input sentences in the corresponding TranslationRequest object. Because of parallelization, the
+ * translated sentences might be computed out of order. Each Translation is sent to this
+ * Translations object by a DecoderThreadRunner via the record() function, which places the
+ * Translation in the right place. When the next translation in the sequence becomes available,
+ * any thread blocked in next() is notified.
+ *
+ * @author Matt Post <po...@cs.jhu.edu>
+ */
+public class Translations {
+
+ /* The source sentences to be translated. */
+ private TranslationRequestStream request = null;
+
+ /*
+ * This records the index of the sentence at the head of the underlying list. The iterator's
+ * next() blocks when the value at this position in the translations LinkedList is null.
+ */
+ private int currentID = 0;
+
+ /* The set of translated sentences. */
+ private LinkedList<Translation> translations = null;
+
+ private boolean spent = false;
+
+ public Translations(TranslationRequestStream request) {
+ this.request = request;
+ this.translations = new LinkedList<Translation>();
+ }
+
+ /**
+ * This is called when null is received from the TranslationRequest, indicating that there are no
+ * more input sentences to be translated. That in turn means that the request size will no longer
+ * grow. We then notify any waiting thread if the last translation we handed out was in fact the final one.
+ */
+ public void finish() {
+ synchronized (this) {
+ spent = true;
+ if (currentID == request.size()) {
+ this.notifyAll();
+ }
+ }
+ }
+
+ /**
+ * This is called whenever a translation is completed by one of the decoder threads. There may be
+ * a current output thread waiting for the current translation, which is determined by checking if
+ * the ID of the translation is the same as the one being waited for (currentID). If so, the
+ * thread waiting for it is notified.
+ *
+ * @param translation the completed translation
+ */
+ public void record(Translation translation) {
+ synchronized (this) {
+
+ /* Pad the set of translations with nulls to accommodate the new translation. */
+ int offset = translation.id() - currentID;
+ while (offset >= translations.size())
+ translations.add(null);
+ translations.set(offset, translation);
+
+ /*
+ * If the id of the current translation is at the head of the list (first element), then we
+ * have the next Translation to be returned, and we should notify anyone waiting on next(),
+ * which will then remove the item and increment the currentID.
+ */
+ if (translation.id() == currentID) {
+ this.notify();
+ }
+ }
+ }
+
+ /**
+ * Returns the next Translation, blocking if necessary until it's available, since the next
+ * Translation might not have been produced yet.
+ */
+ public Translation next() {
+ synchronized (this) {
+
+ /*
+ * If there are no more input sentences, and we've already distributed what we then know is
+ * the last one, we're done.
+ */
+ if (spent && currentID == request.size())
+ return null;
+
+ /*
+ * Otherwise, there is another sentence. If it's not available already, we need to wait for
+ * it.
+ */
+ if (translations.size() == 0 || translations.peek() == null) {
+ try {
+ this.wait();
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ }
+ }
+
+ /* We now have the sentence and can return it. */
+ currentID++;
+ return translations.poll();
+ }
+ }
+}
\ No newline at end of file
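A minimal sketch of the producer/consumer protocol this class implements; the request object and
the decoder threads are assumed to exist elsewhere. Producers call record() in whatever order
sentences finish (and finish() once the input is exhausted), while a single consumer drains
results in input order via next():

    Translations translations = new Translations(request);  // request: a TranslationRequestStream

    // In each DecoderThreadRunner (producer), when a sentence finishes decoding:
    //   translations.record(completedTranslation);
    // When the input stream is exhausted:
    //   translations.finish();

    // In the output thread (consumer): next() blocks until the next in-order result is ready.
    Translation t;
    while ((t = translations.next()) != null) {
      System.out.println(t);
    }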
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/main/java/org/apache/joshua/decoder/chart_parser/Cell.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/chart_parser/Cell.java b/src/main/java/org/apache/joshua/decoder/chart_parser/Cell.java
new file mode 100644
index 0000000..d8d16d8
--- /dev/null
+++ b/src/main/java/org/apache/joshua/decoder/chart_parser/Cell.java
@@ -0,0 +1,291 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package joshua.decoder.chart_parser;
+
+import static com.google.common.base.Preconditions.checkNotNull;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.Map.Entry;
+import java.util.logging.Logger;
+
+import joshua.decoder.ff.FeatureFunction;
+import joshua.decoder.ff.state_maintenance.DPState;
+import joshua.decoder.ff.tm.Rule;
+import joshua.decoder.hypergraph.HGNode;
+import joshua.decoder.hypergraph.HyperEdge;
+
+/**
+ * This class implements three functions: (1) combining smaller items into larger ones using
+ * rules, and creating items and hyperedges to construct a hypergraph; (2) evaluating the model
+ * score for items; and (3) cube pruning. Note: the Cell creates items, but not all items will
+ * be used in the hypergraph.
+ *
+ * @author Matt Post <po...@cs.jhu.edu>
+ * @author Zhifei Li, <zh...@gmail.com>
+ */
+class Cell {
+
+ // The chart this cell belongs to
+ private Chart chart = null;
+
+ // The top-level (goal) symbol
+ private int goalSymbol;
+
+ // to maintain uniqueness of nodes
+ private HashMap<HGNode.Signature, HGNode> nodesSigTbl = new LinkedHashMap<HGNode.Signature, HGNode>();
+
+ // super nodes, indexed by LHS symbol
+ private Map<Integer, SuperNode> superNodesTbl = new HashMap<Integer, SuperNode>();
+
+ /**
+ * A sorted view of the values in nodesSigTbl; built lazily, only when needed.
+ */
+ private List<HGNode> sortedNodes = null;
+
+ // ===============================================================
+ // Static fields
+ // ===============================================================
+ private static final Logger logger = Logger.getLogger(Cell.class.getName());
+
+ // ===============================================================
+ // Constructor
+ // ===============================================================
+
+ public Cell(Chart chart, int goalSymID) {
+ this.chart = chart;
+ this.goalSymbol = goalSymID;
+ }
+
+ public Cell(Chart chart, int goal_sym_id, int constraint_symbol_id) {
+ this(chart, goal_sym_id);
+ }
+
+ // ===============================================================
+ // Package-protected methods
+ // ===============================================================
+
+ public Set<Integer> getKeySet() {
+ return superNodesTbl.keySet();
+ }
+
+ public SuperNode getSuperNode(int lhs) {
+ return superNodesTbl.get(lhs);
+ }
+
+ /**
+ * This function loops over all items in the top-level bin (covering the input sentence from
+ * <s> ... </s>), looking for items with the goal LHS. Each of these is added to the goal bin.
+ * The goal bin contains exactly one item, which itself has many incoming hyperedges. Only the
+ * goal bin should call this function.
+ */
+ // note that the input bin is bin[0][n], not the goal bin
+ boolean transitToGoal(Cell bin, List<FeatureFunction> featureFunctions, int sentenceLength) {
+ this.sortedNodes = new ArrayList<HGNode>();
+ HGNode goalItem = null;
+
+ for (HGNode antNode : bin.getSortedNodes()) {
+ if (antNode.lhs == this.goalSymbol) {
+ float logP = antNode.bestHyperedge.getBestDerivationScore();
+ List<HGNode> antNodes = new ArrayList<HGNode>();
+ antNodes.add(antNode);
+
+ float finalTransitionLogP = ComputeNodeResult.computeFinalCost(featureFunctions, antNodes,
+ 0, sentenceLength, null, this.chart.getSentence());
+
+ List<HGNode> previousItems = new ArrayList<HGNode>();
+ previousItems.add(antNode);
+
+ HyperEdge dt = new HyperEdge(null, logP + finalTransitionLogP, finalTransitionLogP,
+ previousItems, null);
+
+ if (null == goalItem) {
+ goalItem = new HGNode(0, sentenceLength + 1, this.goalSymbol, null, dt, logP
+ + finalTransitionLogP);
+ this.sortedNodes.add(goalItem);
+ } else {
+ goalItem.addHyperedgeInNode(dt);
+ }
+ } // End if item.lhs == this.goalSymID
+ } // End foreach Item in bin.get_sorted_items()
+
+ int itemsInGoalBin = getSortedNodes().size();
+ if (1 != itemsInGoalBin) {
+ logger.severe("the goal_bin does not have exactly one item");
+ return false;
+ }
+
+ return true;
+ }
+
+ /**
+ * A note about pruning: when a hyperedge gets created, it first needs to pass the
+ * shouldPruneEdge filter. If it does not trigger a new node (i.e., it is merged into an old
+ * node), it does not trigger pruningNodes. If it does trigger a new node (either because its
+ * signature is new or because its logP is better than the old node's logP), it triggers
+ * pruningNodes, which might cause *other* nodes to be pruned as well.
+ */
+
+ /**
+ * Creates a new hyperedge and adds it to the chart, subject to pruning. The logic of this
+ * function is as follows: if the pruner permits the edge to be added, we build the new edge,
+ * which ends in an HGNode. If this is the first time we've built an HGNode for this point in the
+ * graph, it gets added automatically. Otherwise, we add the hyperedge to the existing HGNode,
+ * possibly updating the HGNode's cache of the best incoming hyperedge.
+ *
+ * @return the new hypernode, or null if the cell was pruned.
+ */
+ HGNode addHyperEdgeInCell(ComputeNodeResult result, Rule rule, int i, int j, List<HGNode> ants,
+ SourcePath srcPath, boolean noPrune) {
+
+// System.err.println(String.format("ADD_EDGE(%d-%d): %s", i, j, rule.getRuleString()));
+// if (ants != null) {
+// for (int xi = 0; xi < ants.size(); xi++) {
+// System.err.println(String.format(" -> TAIL %s", ants.get(xi)));
+// }
+// }
+
+ List<DPState> dpStates = result.getDPStates();
+ float pruningEstimate = result.getPruningEstimate();
+ float transitionLogP = result.getTransitionCost();
+ float finalizedTotalLogP = result.getViterbiCost();
+
+ /**
+ * Here, the edge has passed pre-pruning. The edge will be added to the chart in one of two
+ * ways:
+ *
+ * 1. If there is no existing node, a new one gets created and the edge is its only incoming
+ * hyperedge.
+ *
+ * 2. If there is an existing node, the edge will be added to its list of incoming hyperedges,
+ * possibly becoming the best incoming hyperedge for that node.
+ */
+
+ HyperEdge hyperEdge = new HyperEdge(rule, finalizedTotalLogP, transitionLogP, ants, srcPath);
+ HGNode newNode = new HGNode(i, j, rule.getLHS(), dpStates, hyperEdge, pruningEstimate);
+
+ /**
+ * Each node has a list of hyperedges. We need to check whether the node already exists; if it
+ * does, we just add the hyperedges, which may change the best logP of the node.
+ */
+ HGNode oldNode = this.nodesSigTbl.get(newNode.signature());
+ if (null != oldNode) { // have an item with same states, combine items
+ this.chart.nMerged++;
+
+ /**
+ * The position of oldNode in this.heapItems may change. Strictly speaking, we should remove
+ * the oldNode and re-insert it, but that takes linear time and is too expensive.
+ */
+ if (newNode.getScore() > oldNode.getScore()) { // merge old to new: semiring plus
+
+ newNode.addHyperedgesInNode(oldNode.hyperedges);
+ // This will update the HashMap, so that the oldNode is destroyed.
+ addNewNode(newNode);
+ } else {// merge new to old, does not trigger pruningItems
+ oldNode.addHyperedgesInNode(newNode.hyperedges);
+ }
+
+ } else { // first time item
+ this.chart.nAdded++; // however, this item may not be used in the future due to pruning in
+ // the hyper-graph
+ addNewNode(newNode);
+ }
+
+ return newNode;
+ }
+
+ List<HGNode> getSortedNodes() {
+ ensureSorted();
+ return this.sortedNodes;
+ }
+
+ Map<Integer, SuperNode> getSortedSuperItems() {
+ ensureSorted();
+ return this.superNodesTbl;
+ }
+
+ // ===============================================================
+ // Private Methods
+ // ===============================================================
+
+ /**
+ * This function gets called in two cases: (1) a new hyperedge leads to a non-existing node
+ * signature; (2) a new hyperedge's signature matches an old node's signature, but the old
+ * node's best logP is worse than the new hyperedge's logP.
+ */
+ private void addNewNode(HGNode node) {
+ this.nodesSigTbl.put(node.signature(), node); // add/replace the item
+ this.sortedNodes = null; // reset the list
+
+// System.err.println(String.format("** NEW NODE %s %d %d", Vocabulary.word(node.lhs), node.i, node.j));
+
+ // Since this.sortedNodes == null, this is not strictly necessary: we will always call
+ // ensureSorted() to reconstruct this.superNodesTbl.
+ // Add a SuperNode if necessary.
+ SuperNode si = this.superNodesTbl.get(node.lhs);
+ if (null == si) {
+ si = new SuperNode(node.lhs);
+ this.superNodesTbl.put(node.lhs, si);
+ }
+ si.nodes.add(node);// TODO what about the dead items?
+ }
+
+ /**
+ * Gets a sorted list of nodes in the cell, and also makes sure the list of nodes in any
+ * SuperNode is sorted. This is called only when necessary, which means the list is not always
+ * sorted; it is mainly needed for the goal bin and cube pruning.
+ */
+ private void ensureSorted() {
+ if (null == this.sortedNodes) {
+
+ // get sortedNodes.
+ this.sortedNodes = new ArrayList<>(this.nodesSigTbl.size());
+ for (HGNode node : this.nodesSigTbl.values()) {
+ this.sortedNodes.add(node);
+ }
+
+ // sort the nodes in decreasing-logP order
+ this.sortedNodes.sort(HGNode.inverseLogPComparator);
+
+ // TODO: we cannot create new SuperNodes here because the DotItems link to them.
+ // Thus, we clear the nodes from the existing SuperNodes.
+ for (SuperNode superNode : this.superNodesTbl.values()) {
+ superNode.nodes.clear();
+ }
+
+ for (HGNode node : this.sortedNodes) {
+ SuperNode superNode = this.superNodesTbl.get(node.lhs);
+ checkNotNull(superNode, "A SuperNode must exist for every node's LHS");
+ superNode.nodes.add(node);
+ }
+
+ // Remove SuperNodes that no longer contain any nodes due to pruning
+ for (Iterator<Entry<Integer, SuperNode>> it = this.superNodesTbl.entrySet().iterator(); it.hasNext(); ) {
+ Entry<Integer, SuperNode> entry = it.next();
+ if (entry.getValue().nodes.isEmpty()) {
+ it.remove();
+ }
+ }
+ }
+ }
+}
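A self-contained sketch (not Joshua code) of the node-merging rule that addHyperEdgeInCell applies
when two derivations share a signature: the node with the better Viterbi score survives and
absorbs the other's incoming hyperedges. Names here are illustrative:

    import java.util.ArrayList;
    import java.util.List;

    class MergeSketch {
      static class Node {
        double score;
        List<String> incomingEdges = new ArrayList<>();
        Node(double score) { this.score = score; }
      }

      /** Returns the surviving node: the higher-scoring one, carrying all incoming edges. */
      static Node merge(Node oldNode, Node newNode) {
        if (newNode.score > oldNode.score) {
          newNode.incomingEdges.addAll(oldNode.incomingEdges);  // new node replaces the old one
          return newNode;
        }
        oldNode.incomingEdges.addAll(newNode.incomingEdges);    // old node absorbs the new edge(s)
        return oldNode;
      }
    }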